#include <ARCH/SETUP.H>
#include <SPAD/IRQ.H>
#include <SPAD/ALLOC.H>
#include <SPAD/SYSLOG.H>
#include <STRING.H>
#include <KERNEL/VMDEF.H>
#include <KERNEL/SMP/SHARE.H>
#include <KERNEL/SMP/IPI.H>
#include <KERNEL/ASM.H>
#include <ARCH/MOV.H>

#include <KERNEL/SMP/MATRIX.H>

#if __KERNEL_SUPPORT_SMP

/*
 * A queue of ASTs posted from one CPU to another CPU at one matrix SPL level.
 * The ASTs are chained through their tmp1 field (see DISPATCH_ASTS).
 */
typedef struct {
	/*
	 * Address of the link slot where the next AST pointer will be stored;
	 * initialized to &first. Bit 0 set means the queue has been submitted
	 * to the target CPU (set in XCPU_BATCH_AST, tested in DISPATCH_ASTS).
	 */
	void *last;
	/* first AST of the chain; only meaningful while the queue is non-empty */
	AST *first;
	/* AST used to schedule XCPU_BATCH_AST for this queue; fn != NULL while it is pending */
	AST delay;
} QUEUE;	/* this must be 4 pointers, hardcoded in the assembler */

/*
 * The queues between one pair of CPUs, one QUEUE per matrix SPL level.
 * The structure is padded to exactly 32 QUEUE slots; XCPU_BATCH_AST depends
 * on that when it splits a queue index using "& 31" and ">> 5".
 */
typedef struct {
	QUEUE q[SPL_MATRIX_TOP];
	QUEUE pad[32 - SPL_MATRIX_TOP];
} ELEMENT;	/* this must be 32 QUEUEs, hardcoded in the assembler */

/*
 * Size in bytes of one matrix row (VAL_CPU_ID_LIMIT ELEMENTs), rounded up to
 * a multiple of PAGE_CLUSTER_SIZE. Used by MATRIX_INIT as the size of each
 * row allocation. The mask arithmetic assumes PAGE_CLUSTER_SIZE is a power
 * of two -- TODO confirm.
 */
#define ELEMENTS_SIZE	((sizeof(ELEMENT) * VAL_CPU_ID_LIMIT + PAGE_CLUSTER_SIZE - 1) & ~(size_t)(PAGE_CLUSTER_SIZE - 1))

/*
 * The matrix itself: one row pointer per CPU. Each row is an array of
 * VAL_CPU_ID_LIMIT ELEMENTs. row[] is declared with one entry but is
 * allocated with VAL_CPU_ID_LIMIT entries (see MATRIX_SIZE).
 */
typedef struct {
	ELEMENT *row[1];
} MATRIX;

/* Allocation size of MATRIX with row[] extended to VAL_CPU_ID_LIMIT pointers. */
#define MATRIX_SIZE	(sizeof(MATRIX) - sizeof(ELEMENT *) + sizeof(ELEMENT *) * VAL_CPU_ID_LIMIT)

/* Shared-pointer record named "MATRIX"; registered by MATRIX_INIT, looked up by name in MATRIX_INIT_AP. */
static KERNEL_SHARED matrix_share = { "MATRIX", NULL };
/* The matrix; NULL until MATRIX_INIT or MATRIX_INIT_AP has run. */
static MATRIX *matrix = NULL;
/* Cached matrix->row[VAL_CPU_ID], set in MATRIX_INIT_COMMON. */
static ELEMENT *my_row = NULL;

/* Runs the ASTs queued for this CPU at matrix level n. */
static void DISPATCH_ASTS(int n);

/* One AST per matrix SPL level; posted by MATRIX_IPI_RT_IRQ, handled by MATRIX_IPI_AST. */
static AST MATRIX_IPI_ARRAY[SPL_MATRIX_TOP];

static AST_STUB MATRIX_IPI_AST;

/*
 * IRQ handler for the matrix IPI. DATA is used as the index of the entry in
 * MATRIX_IPI_ARRAY to post (presumably the matrix SPL level the IPI was sent
 * for -- confirm against the handler registration).
 */
DECL_RT_IRQ_HANDLER(MATRIX_IPI_RT_IRQ)
{
	AST *a;
#if __DEBUG >= 2
	if (__unlikely(!matrix))
		KERNEL$SUICIDE("MATRIX_IPI_RT_IRQ: MATRIX NOT INITIALIZED");
#endif
	a = &MATRIX_IPI_ARRAY[(unsigned long)DATA];
	/*
	 * fn doubles as the "already posted" flag: it is set here and cleared
	 * by MATRIX_IPI_AST, so the AST is not posted twice.
	 */
	if (__likely(!a->fn)) {
		a->fn = MATRIX_IPI_AST;
		IRQ_POST_AST(a);
	}
	IRQ_RETURN;
}

/*
 * AST posted by MATRIX_IPI_RT_IRQ. Derives the matrix level from the
 * position of RQ inside MATRIX_IPI_ARRAY and dispatches the ASTs queued
 * for this CPU at that level.
 */
static __DECL_IRQ_VAST(MATRIX_IPI_AST, MATRIX_IPI_ARRAY)
{
	int n = RQ - MATRIX_IPI_ARRAY;
#if __DEBUG >= 2
	if (__unlikely(KERNEL$SPL != __SPL_X_VAR(n)))
		KERNEL$SUICIDE("MATRIX_IPI_AST AT SPL %08X, SHOULD BE %08X", KERNEL$SPL, __SPL_X_VAR(n));
#endif
	/*
	 * Mark the AST as not posted before scanning the queues, so that the
	 * IRQ handler can post it again for work that arrives from now on.
	 * The barrier is presumably a compiler barrier keeping the store ahead
	 * of the queue reads in DISPATCH_ASTS -- confirm __barrier() semantics.
	 */
	RQ->fn = NULL;
	__barrier();
	DISPATCH_ASTS(n);
	RETURN;
}

/*
 * For every CPU c, take over the queue that c addresses to this CPU at
 * matrix level n (if it has been submitted) and call each AST in it.
 */
static void DISPATCH_ASTS(int n)
{
	__cpu_id_t c = 0;
	do {
		QUEUE *q = &matrix->row[c][VAL_CPU_ID].q[n];
		void *l = q->last;
		/*
		 * Read before the queue is claimed; it is only used after the
		 * compare-and-exchange below has confirmed that q->last still
		 * held the value l.
		 * NOTE(review): l and a are plain loads with no barrier between
		 * them -- confirm that their relative order cannot matter.
		 */
		AST *a = q->first;
		/* bit 0 clear: the sender has not submitted this queue */
		if (__likely(!((unsigned long)l & 1)))
			continue;
		/*
		 * Claim the chain by resetting the queue to its empty state
		 * (last == &first). A non-zero result is treated as "q->last
		 * changed under us" and the queue is skipped.
		 */
		if (__unlikely(__LOCK_CMPXCHGP(&q->last, l, &q->first) != 0))
			continue;
		do {
			/* fetch the link before the call; the AST is not touched afterwards */
			AST *next = (AST *)a->tmp1;
			__PREFETCHT0(next);
			CALL_AST(a);
			a = next;
		} while (a);
	/* note: "continue" above jumps to this condition, so c still advances */
	} while (++c < VAL_CPU_ID_LIMIT);
}

static AST_STUB XCPU_BATCH_AST;

/*
 * Queue the AST RQ for execution on TARGET_CPU at matrix level SPL - 1.
 * RQ, TARGET_CPU and SPL are not declared here; they presumably come from
 * the __DECL_XCPU_AST() macro. The AST is appended to this CPU's queue for
 * the target and, if not already pending, the queue's delay AST is
 * scheduled to run XCPU_BATCH_AST.
 */
__DECL_XCPU_AST()
{
	QUEUE *q;
	void *l;

#if __DEBUG >= 2
	if (__unlikely(KERNEL$SPL != SPL_X(SPL_TOP)))
		KERNEL$SUICIDE("XCPU_AST AT SPL %08X", KERNEL$SPL);
	if (__unlikely(!matrix))
		KERNEL$SUICIDE("XCPU_AST: MATRIX NOT INITIALIZED");
#endif
	q = &my_row[TARGET_CPU].q[SPL - 1];
	/* RQ becomes the new tail of the chain */
	RQ->tmp1 = 0;
	again:
	l = q->last;
	if (__unlikely((unsigned long)l & 1)) {
		/*
		 * The queue is marked as submitted. Try to take it back by
		 * clearing bit 0; if q->last changed in the meantime (a non-zero
		 * result), start over with the new value.
		 */
		if (__unlikely(__LOCK_CMPXCHGP(&q->last, l, (char *)l - 1) != 0))
			goto again;
		l = q->last;
	}
	/* link RQ into the slot "last" points at, then make RQ's own link the new slot */
	*(AST **)l = RQ;
	q->last = &RQ->tmp1;

	/* delay.fn != NULL means the batch AST is already scheduled */
	if (!q->delay.fn) {
		q->delay.fn = XCPU_BATCH_AST;
		RETURN_AST(&q->delay);
	}

	RETURN;
}

/*
 * Handler of QUEUE.delay, scheduled by XCPU_AST. Marks the queue as
 * submitted and sends an IPI to the CPU the queue is addressed to.
 */
static __DECL_VAST_P(XCPU_BATCH_AST, my_row)
{
	int n;
	__cpu_id_t cpu;
	QUEUE *q;

	q = GET_STRUCT(RQ, QUEUE, delay);
	/* my_row viewed as a flat QUEUE array: low 5 bits of the index are the level (32 QUEUEs per ELEMENT) */
	n = (q - (QUEUE *)my_row) & 31;
#if __DEBUG >= 2
	if (__unlikely(KERNEL$SPL != __SPL_X_VAR(n)))
		KERNEL$SUICIDE("XCPU_BATCH_AST AT SPL %08X, SHOULD BE %08X", KERNEL$SPL, __SPL_X_VAR(n));
#endif
	RAISE_SPL(SPL_TOP);
	/* allow XCPU_AST to schedule this AST again */
	q->delay.fn = NULL;
#if __DEBUG >= 2
	if (__unlikely(((unsigned long)q->last & 1) != 0))
		KERNEL$SUICIDE("XCPU_BATCH_AST: QUEUE IS ALREADY SUBMITTED");
	if (__unlikely(__unlikely(q->last == &q->first)))
		KERNEL$SUICIDE("XCPU_BATCH_AST: QUEUE IS EMPTY");
#endif
	/* set bit 0 of "last": this is the "submitted" mark DISPATCH_ASTS tests for */
	q->last = (char *)q->last + 1;
	/* the remaining bits of the flat index are the ELEMENT index, i.e. the target CPU */
	cpu = (q - (QUEUE *)my_row) >> 5;
	SEND_IPI(cpu, IPI_AST + n, __SPL_X_VAR(n));
	RETURN;
}

__COLD_ATTR__ static void MATRIX_INIT_COMMON(void)
{
	memset(MATRIX_IPI_ARRAY, 0, sizeof MATRIX_IPI_ARRAY);
	my_row = matrix->row[VAL_CPU_ID];
}

__COLD_ATTR__ int MATRIX_INIT(void)
{
	size_t elements_size = ELEMENTS_SIZE;
	__node_id_t n;
	__cpu_id_t c, d;
	int s;
	int er;

	if (VAL_CPU_ID != 0)
		KERNEL$SUICIDE("MATRIX_INIT ON NON-BOOT CPU %u", (unsigned)VAL_CPU_ID);

	matrix = KERNEL$ALLOC_CONTIG_AREA(MATRIX_SIZE, AREA_DATA);
	if (__IS_ERR(matrix)) {
		er = __PTR_ERR(matrix);
		matrix = NULL;
		goto err;
	}
	memset(matrix, 0, MATRIX_SIZE);

	for (n = 0; n < VAL_NODE_ID_LIMIT; n++) {
		ELEMENT *e = NULL;
		size_t size_left = 0;
		for (c = 0; c < VAL_CPU_ID_LIMIT; c++) if (CPU_NODE(c) == n) {
			if (size_left < sizeof(ELEMENT) * VAL_CPU_ID_LIMIT) {
				size_left = elements_size;
				e = KERNEL$ALLOC_CONTIG_AREA(elements_size, AREA_DATA | AREA_ALIGN | AREA_NODE, PAGE_CLUSTER_SIZE, n);
				if (__IS_ERR(e)) {
					er = __PTR_ERR(e);
					goto err;
				}
			}
			matrix->row[c] = e;
			e += VAL_CPU_ID_LIMIT;
			size_left -= sizeof(ELEMENT) * VAL_CPU_ID_LIMIT;
		}
	}
	for (c = 0; c < VAL_CPU_ID_LIMIT; c++) {
		if (!matrix->row[c])
			KERNEL$SUICIDE("CPU %u HAS INVALID NODE %u", (unsigned)c, (unsigned)CPU_NODE(c));
		for (d = 0; d < VAL_CPU_ID_LIMIT; d++) {
			for (s = 0; s < SPL_MATRIX_TOP; s++) {
				QUEUE *q = &matrix->row[c][d].q[s];
				memset(q, 0, sizeof(QUEUE));
				q->last = &q->first;
			}
		}
	}

	matrix_share.ptr = matrix;
	REGISTER_SHARED_POINTER(&matrix_share);
	MATRIX_INIT_COMMON();
	return 0;

	err:
	if (matrix) {
		for (c = 0; c < VAL_CPU_ID_LIMIT; c++)
			if (matrix->row[c] && !((unsigned long)matrix->row[c] & (PAGE_CLUSTER_SIZE - 1)))
				KERNEL$FREE_CONTIG_AREA(matrix->row[c], elements_size);
		KERNEL$FREE_CONTIG_AREA(matrix, MATRIX_SIZE);
		matrix = NULL;
	}
	KERNEL$SYSLOG(__SYSLOG_SW_ERROR, "KERNEL", "UNABLE TO ALLOCATE CPU MATRIX");
	return er;
}

/*
 * Attach a non-boot CPU to the matrix created by MATRIX_INIT: look the
 * matrix up through the shared pointer named "MATRIX" and run the common
 * per-CPU initialization. Panics when called on CPU 0 or when the lookup
 * yields NULL.
 */
__COLD_ATTR__ void MATRIX_INIT_AP(void)
{
	if (!VAL_CPU_ID)
		KERNEL$SUICIDE("MATRIX_INIT_AP ON BOOT CPU");
	matrix = FIND_SHARED_POINTER("MATRIX");
	/*
	 * MATRIX_INIT_COMMON dereferences matrix immediately, so fail with a
	 * clear message instead of a NULL dereference.
	 * NOTE(review): if FIND_SHARED_POINTER can report failure some other
	 * way (e.g. an error-encoded pointer), that case is still unchecked.
	 */
	if (__unlikely(!matrix))
		KERNEL$SUICIDE("MATRIX_INIT_AP: MATRIX NOT INITIALIZED");
	MATRIX_INIT_COMMON();
}

#else

/*
 * Variant for kernels built without SMP support: a cross-CPU AST request
 * can never be served, so it is a fatal error.
 * NOTE(review): this uses "cpu", while the SMP variant of the same
 * __DECL_XCPU_AST() function uses "TARGET_CPU" -- verify that "cpu" is
 * really in scope in this configuration.
 */
__DECL_XCPU_AST()
{
	KERNEL$SUICIDE("XCPU_AST REQUESTED FOR CPU %u", (unsigned)cpu);
}

#endif
