#include <SPAD/AC.H>
#include <SPAD/WQ.H>
#include <SPAD/LIBC.H>
#include <MALLOC.H>
#include <KERNEL/VM_ARCH.H>
#include <SPAD/SYNC.H>
#include <SPAD/TIMER.H>
#include <ARCH/MOV.H>
#include <SPAD/DL.H>
#include <DLFCN.H>
#include <KERNEL/PARAMS.H>
#include <KERNEL/SLAB.H>
#include <KERNEL/SMP/SHARE.H>
#include <SPAD/SPINLOCK.H>

#include <KERNEL/VM.H>
#include <SPAD/ALLOC.H>

/* Divisor used by KERNEL$VM_OOMKILL to derive the default cache reserve from
   the unmapped user memory size (it divides; it is not applied as a percentage) */
#define CACHE_RESERVE_PCT		8
/* Minimum time between two increments of a free target (see can_increase) */
#define TARGET_INCREASE_LATENCY		(JIFFIES_PER_SECOND)
/* Age of the last increment after which the timer may decrement a free target (see can_decrease) */
#define TARGET_DECREASE_LATENCY		(TARGET_INCREASE_LATENCY + 120 * JIFFIES_PER_SECOND)
/* Interval after which freemem_timer_fn decays user_free_target */
#define USER_TARGET_DECREASE_LATENCY	(JIFFIES_PER_SECOND * 5)

/* Divisor used by VM_REFRESH_WATERMARKS to derive the maximum free targets */
#define FREE_MEMORY_PCT		16

#define VM_JIFFIES		(JIFFIES_PER_SECOND / 40)
/* Period of freemem_timer */
#define FREEMEM_JIFFIES		(JIFFIES_PER_SECOND * 5)
#define REBALANCE_TIME		(JIFFIES_PER_SECOND * 30)
/* Number of freemem_timer ticks between two VM_REFRESH_WATERMARKS calls */
#define WATERMARK_REFRESH_N	12

/* With artificial allocation failures enabled, run the timer every jiffy */
#if __DEBUG_ALLOC_KERNEL_PAGE_SHORTAGE > 0 || __DEBUG_ALLOC_USER_PAGE_SHORTAGE > 0
#undef FREEMEM_JIFFIES
#define FREEMEM_JIFFIES		1
#endif

/* Free-page poisoning: KERNEL$FREE_USER_PAGE stamps three magic words (and the
   caller address) into each freed page, ZONE_ALLOC verifies the magic words.
   The offsets are indices into the page viewed as an array of __u32. */
#if __DEBUG >= 2
#define TEST_MEMORY
#define TEST_OFFSET_1		0
#define TEST_OFFSET_2		(__offsetof(struct __slpage, __n_alloc) / 4)
#define TEST_OFFSET_3		((PAGE_CLUSTER_SIZE - 4) / 4)
#define CALLER_OFFSET		((PAGE_CLUSTER_SIZE - 16) / 4)
#define MAGIC_1			0x055D4929U
#define MAGIC_2			0xEF8E8D2CU
#define MAGIC_3			0xE6D581BBU
#endif

/*
 * One memory zone: two lists of free pages plus their combined count.
 * KERNEL$FREE_USER_PAGE puts a page on list 0 when p->node == VAL_NODE_ID
 * and on list 1 otherwise; ZONE_ALLOC picks the list given by its "cache"
 * argument first and falls back to the other one.
 */
struct pg_zone {
	LIST_HEAD freelist[2];
	unsigned long n_free;	/* number of pages on both free lists */
};

/* Release handlers stored in PAGE.release; they double as page state tags */
static IO_STUB NO_PAGE_RELEASE;
static IO_STUB KERNEL_PAGE_RELEASE;
static IO_STUB UNALLOCATED_PAGE_RELEASE;
static IO_STUB RELEASE_FREE_PAGE;

/* Page counters per VM type; the extra slot N_VM_TYPES is used for frees
   that have no real type to account against */
static unsigned long VM_STATS[N_VM_TYPES + 1];

/* Zones and their adaptive free-page targets. The targets are raised when an
   allocation fails and lowered again by freemem_timer_fn. */
static struct pg_zone KERNEL_ZONE;
static unsigned long kernel_free_target;
static unsigned long max_kernel_free_target;
static u_jiffies_lo_t last_kernel_target_incr = 0;
static struct pg_zone IO_ZONE;
static unsigned long io_free_target;
static unsigned long max_io_free_target;
static u_jiffies_lo_t last_io_target_incr = 0;
#if KERNEL_HIGH_ZONE_COMPILE
static struct pg_zone HIGH_ZONE;
#endif
/* Extra free-page target derived from process allocation rates (KERNEL$MAY_ALLOC) */
static unsigned long user_free_target;
static u_jiffies_lo_t last_user_target_decr = 0;
/* When non-zero, overrides the computed maximum targets (VM_REFRESH_WATERMARKS) */
unsigned long VM_FREEMEM_WATERMARK;
/* Countdown of timer ticks until the next VM_REFRESH_WATERMARKS */
static int VM_FREEMEM_WATERMARK_REFRESH;

/* Woken whenever a page is freed and periodically by freemem_timer_fn */
WQ_DECL(KERNEL$FREEMEM_WAIT, "KERNEL$FREEMEM_WAIT");
/* Returned by KERNEL$MAY_DIRTY to throttle writers */
WQ_DECL(WRITEBACK_WAIT, "KERNEL$WRITEBACK_WAIT");
/* Returned by KERNEL$MAY_ALLOC; woken when free pages reach the targets */
WQ_DECL(FREEMEM_ABOVE, "KERNEL$FREEMEM_ABOVE");
/* Returned by KERNEL$MAY_ALLOC when the VM entity hard limit would be exceeded */
WQ_DECL(FREE_ENTITIES, "KERNEL$FREE_ENTITIES");

/* Events that MEMORY_BALANCE uses to kick BALANCE_THREAD / CACHE_FREE_THREAD */
static MTX_DECL(BALANCE_EVENT, "KERNEL$BALANCE_EVENT");
static MTX_DECL(CACHE_FREE_EVENT, "KERNEL$CACHE_FREE_EVENT");

static DECL_TIMER(freemem_timer);

/* Set when VM_ARCH_NEW_PBANK failed in BALANCE_THREAD; cleared by VM_ADDED_PAGES */
static char BALANCE_CANT_EXTEND = 0;
/* -1 after progress was made; incremented (up to 1) each time BALANCE_THREAD
   completes a full scan cycle */
static __s8 BALANCE_NO_PROGRESS = -1;

/* True when at least TARGET_INCREASE_LATENCY passed since timestamp x */
#define can_increase(x)		((u_jiffies_lo_t)(KERNEL$GET_JIFFIES_LO() - (x)) >= TARGET_INCREASE_LATENCY)
/* True when at least TARGET_DECREASE_LATENCY passed between x and j */
#define can_decrease(j, x)	((u_jiffies_lo_t)((j) - (x)) >= TARGET_DECREASE_LATENCY)

#define VM_OVERLIMIT()		(__unlikely(VM_ENTITIES > VM_ENTITIES_SOFT_LIMIT))
/* Total free pages (plus IO_IN_PROGRESS) are at or below the io + user
   targets, or the VM entity soft limit is exceeded */
#if !KERNEL_HIGH_ZONE_COMPILE
#define FREE_UNDER_MIN()	(KERNEL_ZONE.n_free + IO_ZONE.n_free + IO_IN_PROGRESS <= io_free_target + user_free_target || VM_OVERLIMIT())
#else
#define FREE_UNDER_MIN()	(KERNEL_ZONE.n_free + IO_ZONE.n_free + HIGH_ZONE.n_free + IO_IN_PROGRESS <= io_free_target + user_free_target || VM_OVERLIMIT())
#endif
/* Kernel zone free pages (plus 1/32 of IO_IN_PROGRESS) are at or below the kernel target */
#define BALANCE_UNDER_MIN()	(KERNEL_ZONE.n_free + IO_IN_PROGRESS / 32 <= kernel_free_target)

/* Handler/tag of a page that has been released while RELEASE_PAGE was waiting for it */
static IO_STUB RELEASED_PAGE_RELEASE;
static int RELEASE_PAGE(PAGE *p);
/* Page currently being processed by RELEASE_PAGE (NULL when none);
   KERNEL$FREE_USER_PAGE keeps this page off the free lists */
static PAGE *PAGE_TO_RELEASE = NULL;

#if __KERNEL_SUPPORT_SMP

/* Per-node VM data for SMP builds: a list of buckets guarded by a spinlock.
   NOTE(review): not referenced in the visible part of the file — confirm usage. */
struct vmnode {
	LIST_HEAD buckets;
	SPINLOCK lock;
};	/* !!! FIXME: align it to cacheline */

static struct vmnode *vmnodes;

/* Shared-object descriptor naming the vmnodes pointer "VM$NODE" */
static KERNEL_SHARED vmnodes_share = { "VM$NODE", &vmnodes };

#endif

/*
 * Return the zone that page descriptor p belongs to.
 * The zone is decided by the physical bank containing the page: banks flagged
 * AREA_KERNEL_ZONE map to KERNEL_ZONE, high banks (when compiled in) map to
 * HIGH_ZONE, everything else maps to IO_ZONE.
 */
static struct pg_zone *VM_GET_PAGE_ZONE(PAGE *p)
{
	unsigned long bank_idx = (long)(p - KERNEL_PAGE_MAP) >> (PG_BANK_BITS - PG_CLUSTER_BITS);
	struct pg_zone *zone = &IO_ZONE;

	if (__unlikely(VM_ARCH->PBANKS[bank_idx] & AREA_KERNEL_ZONE)) {
		zone = &KERNEL_ZONE;
	}
#if KERNEL_HIGH_ZONE_COMPILE
	else if (__unlikely(VM_ARCH_IS_PBANK_HIGH(bank_idx))) {
		zone = &HIGH_ZONE;
	}
#endif
	return zone;
}

/* Reap the slab caches and hand back whatever KERNEL$SLAB_REAP reports. */
static int OOM(void)
{
	int reap_result = KERNEL$SLAB_REAP();
	return reap_result;
}

/*
 * Kick the background threads when free memory is low.
 * Must be called at SPL_TOP (enforced in debug builds).
 * Sets BALANCE_EVENT when the kernel zone is under its target and
 * CACHE_FREE_EVENT when total free memory is under the io + user targets
 * (or the VM entity soft limit is exceeded).
 */
void MEMORY_BALANCE(void)
{
#if __DEBUG >= 1
	if (__unlikely(KERNEL$SPL != SPL_X(SPL_TOP))) KERNEL$SUICIDE("MEMORY_BALANCE AT SPL %08X", KERNEL$SPL);
#endif
	if (__unlikely(BALANCE_UNDER_MIN())) {
		/* BALANCE_THREAD waits on this event */
		MTX_SET(&BALANCE_EVENT, 0);
	}
	if (__likely(FREE_UNDER_MIN())) {
		/* CACHE_FREE_THREAD waits on this event */
		MTX_SET(&CACHE_FREE_EVENT, 0);
	}
}

#ifdef TEST_MEMORY
/*
 * Print one 16-byte hexdump row of addr[offset .. offset + 15]:
 * the offset, the bytes in hex, then the bytes as characters with
 * everything outside 32..126 shown as '.'.
 */
__COLD_ATTR__ static void DISPLAY_ROW(__u8 *addr, unsigned offset)
{
	const __u8 *row = addr + offset;
	unsigned col;

	__critical_printf("%06X: ", offset);
	for (col = 0; col < 16; col++) {
		unsigned char byte = row[col];
		__critical_printf("%02X ", byte);
	}
	__critical_printf(" ");
	for (col = 0; col < 16; col++) {
		unsigned char ch = row[col];
		if (!(ch >= 32 && ch < 127)) {
			ch = '.';
		}
		__critical_printf("%c", ch);
	}
	__critical_printf("\n");
}

/*
 * Report a corrupted free page and stop the kernel (does not return).
 * p is the page descriptor, v the mapping of the page contents.
 * Dumps the first 64 and the last 32 bytes of the page, then resolves the
 * address stored at CALLER_OFFSET (written by KERNEL$FREE_USER_PAGE) and the
 * release handler of the preceding page descriptor to symbol names.
 */
__COLD_ATTR__ static __NORET_ATTR__ void TEST_PAGE_FAILED(PAGE *p, __u32 *v)
{
	void *caller;
	const char *str_c;
	unsigned long off_c;
	void *prev_rele;
	const char *str_p;
	unsigned long off_p;
	int i;
	for (i = 0; i < 64; i += 16) DISPLAY_ROW((__u8 *)v, i);
	for (i = 32; i > 0; i -= 16) DISPLAY_ROW((__u8 *)v, PAGE_CLUSTER_SIZE - i);
	caller = *(void **)&v[CALLER_OFFSET];
	str_c = KERNEL$DL_GET_SYMBOL_NAME(caller, &off_c, 0);
	/* the page at physical address 0 has no predecessor in the page map */
	if (PAGE_2_PHYS(p) != 0) prev_rele = p[-1].release;
	else prev_rele = NULL;
	str_p = KERNEL$DL_GET_SYMBOL_NAME(prev_rele, &off_p, 0);
	KERNEL$SUICIDE("TEST_PAGE_FAILED: PAGE %"__64_format"X ON FREELIST CORRUPTED (MAGIC_1 %08X, MAGIC_2 %08X, MAGIC_3 %08X), FREED AT %s+%lX. PREVIOUS PAGE RELEASE %s+%lX", (__u64)PAGE_2_PHYS(p), v[TEST_OFFSET_1], v[TEST_OFFSET_2], v[TEST_OFFSET_3], str_c ? str_c : "?", off_c, str_p ? str_p : "?", off_p);
}
#endif

/*
 * Take one free page off zone z; returns NULL when the zone has no free page.
 * cache selects the free list tried first; the other list is used when the
 * preferred one is empty.
 * Does not change SPL itself; the visible callers raise SPL to SPL_TOP first.
 * The returned page still carries its free-page release handler; the caller
 * sets the new one.
 */
static PAGE *ZONE_ALLOC(struct pg_zone *z, int cache)
{
	PAGE *p;
#if __DEBUG >= 1
	/* n_free must be zero exactly when both free lists are empty */
	if (__unlikely(!z->n_free != (LIST_EMPTY(&z->freelist[0])
		&& LIST_EMPTY(&z->freelist[1])
		)))
		KERNEL$SUICIDE("ZONE_ALLOC: FREELIST SKEW, N_FREE == %ld", z->n_free);
#endif
	if (__unlikely(!z->n_free)) return NULL;
	z->n_free--;
	p = LIST_STRUCT(z->freelist[cache].next, PAGE, hash_entry);
	/* next pointing back at the list head means the preferred list is empty */
	if (__unlikely(p == LIST_STRUCT(&z->freelist[cache], PAGE, hash_entry)))
		p = LIST_STRUCT(z->freelist[cache ^ 1].next, PAGE, hash_entry);
	DEL_FROM_LIST(&p->hash_entry);
#ifdef TEST_MEMORY
	{
		/* verify the magic words that KERNEL$FREE_USER_PAGE stamped into the page */
		__u32 *v = KERNEL$MAP_PHYSICAL_PAGE(p);
		if (__unlikely(v[TEST_OFFSET_1] != MAGIC_1) ||
		    __unlikely(v[TEST_OFFSET_2] != MAGIC_2) ||
		    __unlikely(v[TEST_OFFSET_3] != MAGIC_3)) {
			TEST_PAGE_FAILED(p, v);
		}
		KERNEL$UNMAP_PHYSICAL_BANK(v);
	}
#endif
	return p;
}

/*
 * Allocate one page, trying IO_ZONE, then KERNEL_ZONE, then (if compiled in)
 * HIGH_ZONE, always preferring free list 1.
 * type is the VM_STATS slot charged for the page.
 * Returns the page with release set to NO_PAGE_RELEASE, or NULL when all
 * zones are empty. On failure the balancer is kicked and io_free_target is
 * bumped by one (rate-limited by can_increase, capped by max_io_free_target).
 * Runs at SPL_TOP internally and restores the caller's SPL before returning.
 */
PAGE *KERNEL$ALLOC_IO_PAGE(int type)
{
	PAGE *p;
	int spl = KERNEL$SPL;
#if __DEBUG_ALLOC_USER_PAGE_SHORTAGE > 0
	/* debug aid: fail randomly to exercise the callers' error paths */
	if (__unlikely((random() & 0xff) < __DEBUG_ALLOC_USER_PAGE_SHORTAGE) && __likely(PROC_INITIALIZED))
		return NULL;
#endif
	RAISE_SPL(SPL_TOP);
	if (__likely((p = ZONE_ALLOC(&IO_ZONE, 1)) != NULL)) {
#if !KERNEL_HIGH_ZONE_COMPILE
		if (__unlikely(IO_ZONE.n_free + KERNEL_ZONE.n_free <= io_free_target + user_free_target))
#else
		if (__unlikely(IO_ZONE.n_free + KERNEL_ZONE.n_free + HIGH_ZONE.n_free <= io_free_target + user_free_target))
#endif
			mb: MEMORY_BALANCE();	/* "mb": success, but memory is getting low */
		is:				/* "is": common success tail */
		VM_STATS[type]++;
		p->release = NO_PAGE_RELEASE;
		LOWER_SPLX(spl);
		return p;
	}
	TEST_SPLX(spl, SPL_X(SPL_TOP));
	if (__likely((p = ZONE_ALLOC(&KERNEL_ZONE, 1)) != NULL)) {
		if (__unlikely(KERNEL_ZONE.n_free <= kernel_free_target)) goto mb;
		goto is;
	}
	TEST_SPLX(spl, SPL_X(SPL_TOP));
#if KERNEL_HIGH_ZONE_COMPILE
	if (__likely((p = ZONE_ALLOC(&HIGH_ZONE, 1)) != NULL)) {
		if (__unlikely(IO_ZONE.n_free + KERNEL_ZONE.n_free + HIGH_ZONE.n_free <= io_free_target + user_free_target)) goto mb;
		goto is;
	}
	TEST_SPLX(spl, SPL_X(SPL_TOP));
#endif
	/* nothing free anywhere: wake the balancer and raise the target */
	MEMORY_BALANCE();
	if (__likely(io_free_target < max_io_free_target)) {
		if (can_increase(last_io_target_incr)) {
			io_free_target++;
			last_io_target_incr = KERNEL$GET_JIFFIES_LO();
		}
	}
	LOWER_SPLX(spl);
	return NULL;
}

/*
 * Allocate one page for user data.
 * With a high zone compiled in, tries HIGH_ZONE, then IO_ZONE, then
 * KERNEL_ZONE, always preferring free list 0; without it this is simply
 * KERNEL$ALLOC_IO_PAGE.
 * type is the VM_STATS slot charged for the page.
 * Returns the page with release set to NO_PAGE_RELEASE, or NULL when all
 * zones are empty (the balancer is kicked and io_free_target may be raised).
 */
PAGE *KERNEL$ALLOC_USER_PAGE(int type)
{
#if KERNEL_HIGH_ZONE_COMPILE
	PAGE *p;
	int spl = KERNEL$SPL;
#if __DEBUG_ALLOC_USER_PAGE_SHORTAGE > 0
	/* debug aid: fail randomly to exercise the callers' error paths */
	if (__unlikely((random() & 0xff) < __DEBUG_ALLOC_USER_PAGE_SHORTAGE) && __likely(PROC_INITIALIZED))
		return NULL;
#endif
	RAISE_SPL(SPL_TOP);
	if (__likely((p = ZONE_ALLOC(&HIGH_ZONE, 0)) != NULL)) {
		if (__unlikely(HIGH_ZONE.n_free + IO_ZONE.n_free + KERNEL_ZONE.n_free <= io_free_target + user_free_target))
			mb: MEMORY_BALANCE();	/* "mb": success, but memory is getting low */
		is:				/* "is": common success tail */
		VM_STATS[type]++;
		p->release = NO_PAGE_RELEASE;
		LOWER_SPLX(spl);
		return p;
	}
	TEST_SPLX(spl, SPL_X(SPL_TOP));
	if (__likely((p = ZONE_ALLOC(&IO_ZONE, 0)) != NULL)) {
		/* NOTE(review): unlike the other total-free checks, this one does not add user_free_target — confirm intent */
		if (__unlikely(IO_ZONE.n_free + KERNEL_ZONE.n_free <= io_free_target)) goto mb;
		goto is;
	}
	TEST_SPLX(spl, SPL_X(SPL_TOP));
	if (__likely((p = ZONE_ALLOC(&KERNEL_ZONE, 0)) != NULL)) {
		if (__unlikely(KERNEL_ZONE.n_free <= kernel_free_target)) goto mb;
		goto is;
	}
	TEST_SPLX(spl, SPL_X(SPL_TOP));
	/* nothing free anywhere: wake the balancer and raise the target */
	MEMORY_BALANCE();
	if (__likely(io_free_target < max_io_free_target)) {
		if (can_increase(last_io_target_incr)) {
			io_free_target++;
			last_io_target_incr = KERNEL$GET_JIFFIES_LO();
		}
	}
	LOWER_SPLX(spl);
	return NULL;
#else
	return KERNEL$ALLOC_IO_PAGE(type);
#endif
}

/*
 * Allocate one page from KERNEL_ZONE only and return its virtual address
 * (PAGE_2_VIRT), or NULL when the kernel zone is empty.
 * type is the VM_STATS slot charged for the page; the page's release handler
 * becomes KERNEL_PAGE_RELEASE.
 * On failure the balancer is kicked and kernel_free_target is bumped by one
 * (rate-limited, capped by max_kernel_free_target); io_free_target is pulled
 * up so that it never stays below kernel_free_target.
 */
void *KERNEL$ALLOC_KERNEL_PAGE(int type)
{
	PAGE *p;
	int spl = KERNEL$SPL;
#if __DEBUG_ALLOC_KERNEL_PAGE_SHORTAGE > 0
	/* debug aid: fail randomly to exercise the callers' error paths */
	if (__unlikely((random() & 0xff) < __DEBUG_ALLOC_KERNEL_PAGE_SHORTAGE) && __likely(PROC_INITIALIZED))
		return NULL;
#endif
	RAISE_SPL(SPL_TOP);
	if (__likely((p = ZONE_ALLOC(&KERNEL_ZONE, 0)) != NULL)) {
		/* kick the balancer when either the kernel zone or total free memory is low */
#if !KERNEL_HIGH_ZONE_COMPILE
		if (__unlikely(KERNEL_ZONE.n_free <= kernel_free_target) ||
		    __unlikely(IO_ZONE.n_free + KERNEL_ZONE.n_free <= io_free_target + user_free_target))
#else
		if (__unlikely(KERNEL_ZONE.n_free <= kernel_free_target) ||
		    __unlikely(IO_ZONE.n_free + KERNEL_ZONE.n_free + HIGH_ZONE.n_free <= io_free_target + user_free_target))
#endif
			MEMORY_BALANCE();
		VM_STATS[type]++;
		p->release = KERNEL_PAGE_RELEASE;
		LOWER_SPLX(spl);
		return PAGE_2_VIRT(p);
	}
	TEST_SPLX(spl, SPL_X(SPL_TOP));
	MEMORY_BALANCE();
	if (__likely(kernel_free_target < max_kernel_free_target)) {
		if (can_increase(last_kernel_target_incr)) {
			kernel_free_target++;
			last_kernel_target_incr = KERNEL$GET_JIFFIES_LO();
			if (kernel_free_target > io_free_target) io_free_target = kernel_free_target, last_io_target_incr = last_kernel_target_incr;
		}
	}
	LOWER_SPLX(spl);
	return NULL;
}

/*
 * Return page p to the free list of its zone and decrement VM_STATS[type].
 * type == N_VM_TYPES selects the extra VM_STATS slot, which is exempt from
 * the debug underflow check.
 * If p is the page RELEASE_PAGE is currently working on (PAGE_TO_RELEASE), it
 * is not put on a free list; it is only marked RELEASED_PAGE_RELEASE so that
 * RELEASE_PAGE can take ownership of it.
 * Wakes KERNEL$FREEMEM_WAIT, and FREEMEM_ABOVE once total free memory has
 * reached the io + user targets.
 */
void KERNEL$FREE_USER_PAGE(PAGE *p, int type)
{
	int spl;
	struct pg_zone *z;
	LIST_HEAD *l;
#ifdef TEST_MEMORY
	{
		/* poison the page: magic words checked by ZONE_ALLOC, plus our caller for diagnostics */
		__u32 *v = KERNEL$MAP_PHYSICAL_PAGE(p);
		__MOVNTI32(&v[TEST_OFFSET_1], MAGIC_1);
		__MOVNTI32(&v[TEST_OFFSET_2], MAGIC_2);
		__MOVNTI32(&v[TEST_OFFSET_3], MAGIC_3);
#ifdef __GNUC__
		__MOVNTIPTR((void **)&v[CALLER_OFFSET], __builtin_return_address(0));
#endif
		__MOVNTI_FLUSH();
		KERNEL$UNMAP_PHYSICAL_BANK(v);
	}
#endif
	spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);
#if __DEBUG >= 1
	/* a page tagged RELEASE_FREE_PAGE is already on a free list */
	if (__unlikely(p->release == RELEASE_FREE_PAGE))
		KERNEL$SUICIDE("KERNEL$FREE_USER_PAGE: FREEING FREE PAGE %"__64_format"X", (__u64)PAGE_2_PHYS(p));
	p->lockdown = NULL;
	if (__unlikely(!VM_STATS[type]) && __likely(type != N_VM_TYPES))
		KERNEL$SUICIDE("KERNEL$FREE_USER_PAGE: VM_STATS[%d] UNDERFLOW", type);
#endif
	VM_STATS[type]--;
	if (__unlikely(p == PAGE_TO_RELEASE)) {
		p->release = &RELEASED_PAGE_RELEASE;
		goto lspl_ret;
	}
	p->release = RELEASE_FREE_PAGE;
	z = VM_GET_PAGE_ZONE(p);
	/* remember the zone so that n_free can be fixed up if the page is pulled off the list later */
	p->fnode = z;
	/* list 0 for pages with node == VAL_NODE_ID, list 1 for all others */
	l = &z->freelist[p->node != VAL_NODE_ID];
	ADD_TO_LIST(l, &p->hash_entry);
	z->n_free++;
	lspl_ret:
	LOWER_SPLX(spl);
	WQ_WAKE_ALL(&KERNEL$FREEMEM_WAIT);
#if !KERNEL_HIGH_ZONE_COMPILE
	if (__likely(KERNEL_ZONE.n_free + IO_ZONE.n_free >= io_free_target + user_free_target))
#else
	if (__likely(KERNEL_ZONE.n_free + IO_ZONE.n_free + HIGH_ZONE.n_free >= io_free_target + user_free_target))
#endif
		WQ_WAKE_ALL(&FREEMEM_ABOVE);
}

void KERNEL$FREE_KERNEL_PAGE(void *ptr, int type)
{
#if __DEBUG >= 1
	if ((unsigned long)ptr & (PAGE_CLUSTER_SIZE - 1))
		KERNEL$SUICIDE("KERNEL$FREE_KERNEL_PAGE: UNALIGNED POINTER %p", ptr);
#endif
	KERNEL$FREE_USER_PAGE(VIRT_2_PAGE_ALIGNED(ptr), type);
}

/*
 * Move the accounting of one page from VM_STATS[type_from] to
 * VM_STATS[type_to]. Must be called at SPL_TOP; debug builds verify the SPL
 * and that the source counter is not already zero.
 */
void KERNEL_TRANSFER_VM_STATE(int type_from, int type_to)
{
#if __DEBUG >= 1
	if (__unlikely(KERNEL$SPL != SPL_X(SPL_TOP))) {
		KERNEL$SUICIDE("KERNEL_TRANSFER_VM_STATE AT SPL %08X", KERNEL$SPL);
	}
	if (__unlikely(VM_STATS[type_from] == 0)) {
		KERNEL$SUICIDE("KERNEL_TRANSFER_VM_STATE: VM_STATS[%d] UNDERFLOW", type_from);
	}
#endif
	--VM_STATS[type_from];
	++VM_STATS[type_to];
}

/*
 * Kernel-zone balancer thread; never returns.
 * Walks the page map one page per iteration, skipping banks outside the
 * kernel zone, and tries to release every page that is neither free,
 * unallocated, wired (NO_PAGE_RELEASE) nor a kernel page. Released pages go
 * back to the free list. When the kernel zone is above its target the thread
 * sleeps on BALANCE_EVENT.
 * After a full cycle it records lack of progress in BALANCE_NO_PROGRESS and
 * tries to extend the kernel zone with a new bank; if that fails
 * (BALANCE_CANT_EXTEND) it asks the cache to shrink on every iteration.
 */
static long BALANCE_THREAD(void *p_)
{
	int r;
	unsigned long pgidx = 0;		/* current position in KERNEL_PAGE_MAP */
	unsigned long last_cycle_pgidx = 0;	/* position where the current scan cycle started */

	while (1) {
		KERNEL$THREAD_MAY_BLOCK();

		MTX_SET(&BALANCE_EVENT, 1);
		if (!BALANCE_UNDER_MIN()) {
			if (__likely(!(KERNEL_ZONE.n_free <= kernel_free_target))) {
				/* comfortably above target: reset progress tracking and start a new cycle here */
				BALANCE_NO_PROGRESS = -1;
				last_cycle_pgidx = pgidx;
			}
			MTX_WAIT_SYNC(&BALANCE_EVENT);
			OOM();
		}

		if (VM_GET_PAGE_ZONE(&KERNEL_PAGE_MAP[pgidx]) != &KERNEL_ZONE) {
			/* skip to the first page of the next bank */
			pgidx = (pgidx + PG_BANK / PG_CLUSTER) & ~(unsigned long)(PG_BANK / PG_CLUSTER - 1);
			goto cont;
		}
		/* pages in these states cannot (or need not) be released */
		if (KERNEL_PAGE_MAP[pgidx].release == UNALLOCATED_PAGE_RELEASE ||
		    KERNEL_PAGE_MAP[pgidx].release == NO_PAGE_RELEASE ||
		    KERNEL_PAGE_MAP[pgidx].release == KERNEL_PAGE_RELEASE ||
		    KERNEL_PAGE_MAP[pgidx].release == RELEASE_FREE_PAGE) {
			pgidx++;
			goto cont;
		}
		MTX_LOCK_SYNC(&VM_MUTEX);
		r = RELEASE_PAGE(&KERNEL_PAGE_MAP[pgidx]);
		MTX_UNLOCK(&VM_MUTEX);
		if (__unlikely(r != 0)) {
			pgidx++;
			goto cont;
		}
		/* the page is ours now; hand it to the free list */
		BALANCE_NO_PROGRESS = -1;
		KERNEL$FREE_USER_PAGE(&KERNEL_PAGE_MAP[pgidx], N_VM_TYPES);
		pgidx++;

		cont:

		/* wrap around at the end of the directly mapped banks */
		if (__unlikely((pgidx >> (PG_BANK_BITS - PG_CLUSTER_BITS)) >= VM_ARCH->N_DIRECT_PBANKS)) {
			pgidx = 0;
		}

		if (__unlikely(pgidx == last_cycle_pgidx)) {
			/* a whole cycle completed */
			if (__likely(BALANCE_NO_PROGRESS <= 0)) {
				BALANCE_NO_PROGRESS++;
				if (__unlikely(BALANCE_NO_PROGRESS > 0)) {
					/* let waiters re-evaluate (KERNEL$OOM looks at BALANCE_NO_PROGRESS) */
					WQ_WAKE_ALL(&KERNEL$FREEMEM_WAIT);
				}
			}
			if (__likely(!BALANCE_CANT_EXTEND)) {
				int r;
				MTX_LOCK_SYNC(&VM_MUTEX);
				r = VM_ARCH_NEW_PBANK(AREA_DATA | AREA_KERNEL_ZONE, 0, 0);
				if (__unlikely(r)) {
					BALANCE_CANT_EXTEND = 1;
				} else {
					BALANCE_NO_PROGRESS = -1;
				}
				MTX_UNLOCK(&VM_MUTEX);
			}
		}

		if (__unlikely(BALANCE_CANT_EXTEND)) {
			unsigned long u;
			RAISE_SPL(SPL_DEV);
			u = CACHE_UPCALL(kernel_free_target + 1);
			LOWER_SPL(SPL_ZERO);
			OOM();
			if (__unlikely(u != 0) && BALANCE_NO_PROGRESS > 0)
				KERNEL$SLEEP(0);	/* If we can't make any progress, don't hog the CPU */
		}
	}
	return -EIO;	/* zap warning */
}

/* Clear BALANCE_CANT_EXTEND so that BALANCE_THREAD tries VM_ARCH_NEW_PBANK again. */
void VM_ADDED_PAGES(void)
{
	BALANCE_CANT_EXTEND = 0;
}

/*
 * Cache shrinker thread; never returns.
 * Sleeps on CACHE_FREE_EVENT while total free memory is above the io + user
 * targets, then asks the cache (CACHE_UPCALL) to free the larger of
 *   - the shortfall of free pages against the targets, and
 *   - the number of VM entities above the soft limit,
 * but always at least one. Afterwards the slab caches are reaped; when the
 * upcall returns non-zero the thread yields with KERNEL$SLEEP(0).
 */
static long CACHE_FREE_THREAD(void *p_)
{
	/* This must not take VM_MUTEX */
	while (1) {
		long v, to_free, u;
		KERNEL$THREAD_MAY_BLOCK();

		MTX_SET(&CACHE_FREE_EVENT, 1);
		if (!FREE_UNDER_MIN()) {
			MTX_WAIT_SYNC(&CACHE_FREE_EVENT);
		}
		RAISE_SPL(SPL_DEV);
		to_free = 1;
		/* HIGH_ZONE exists only when the high zone is compiled in (same split as FREE_UNDER_MIN) */
#if !KERNEL_HIGH_ZONE_COMPILE
		v = io_free_target + user_free_target - (KERNEL_ZONE.n_free + IO_ZONE.n_free + IO_IN_PROGRESS);
#else
		v = io_free_target + user_free_target - (KERNEL_ZONE.n_free + IO_ZONE.n_free + HIGH_ZONE.n_free + IO_IN_PROGRESS);
#endif
		if (__likely(v > to_free)) to_free = v;
		v = VM_ENTITIES - VM_ENTITIES_SOFT_LIMIT;
		if (__unlikely(v > to_free)) to_free = v;
		u = CACHE_UPCALL(to_free);
		LOWER_SPL(SPL_ZERO);
		OOM();
		if (__unlikely(u != 0))
			KERNEL$SLEEP(0);
	}
	return -EIO;	/* zap warning */
}

/*
 * Release handler of a page that sits on a free list.
 * If the page is still free, it is taken off its zone's free list (n_free is
 * decremented), tagged RELEASED_PAGE_RELEASE and the request completes with
 * status 0. If the handler changed in the meantime, the request is restarted
 * through KERNEL$WAKE_PAGE_RELEASE.
 */
static DECL_IOCALL(RELEASE_FREE_PAGE, SPL_TOP, PAGE_RELEASE_REQUEST)
{
	PAGE *pg = RQ->pg;
	if (pg->release != &RELEASE_FREE_PAGE) RETURN_IORQ_LSTAT(RQ, KERNEL$WAKE_PAGE_RELEASE);
	/* fnode holds the zone recorded by KERNEL$FREE_USER_PAGE */
	if (!((struct pg_zone *)pg->fnode)->n_free--)
		KERNEL$SUICIDE("RELEASE_FREE_PAGE: N_FREE UNDERFLOW");
	DEL_FROM_LIST(&pg->hash_entry);
	pg->release = &RELEASED_PAGE_RELEASE;
	RQ->status = 0;
	RETURN_AST(RQ);
}

/* Release handler of reserved/unallocated pages: always fails with -ENOENT. */
static DECL_IOCALL(UNALLOCATED_PAGE_RELEASE, SPL_TOP, PAGE_RELEASE_REQUEST)
{
	RQ->status = -ENOENT;
	RETURN_AST(RQ);
}

/* Release handler installed on freshly allocated IO/user pages: always fails with -EIO. */
static DECL_IOCALL(NO_PAGE_RELEASE, SPL_TOP, PAGE_RELEASE_REQUEST)
{
	RQ->status = -EIO;
	RETURN_AST(RQ);
}

/* Release handler of kernel pages: always fails with -EEXIST. */
static DECL_IOCALL(KERNEL_PAGE_RELEASE, SPL_TOP, PAGE_RELEASE_REQUEST)
{
	RQ->status = -EEXIST;
	RETURN_AST(RQ);
}

/* Release handler of a page that has already been released: succeeds with status 0. */
static DECL_IOCALL(RELEASED_PAGE_RELEASE, SPL_TOP, PAGE_RELEASE_REQUEST)
{
	RQ->status = 0;
	RETURN_AST(RQ);
}

/*
 * Entry point of a page release request: dispatch the request to the page's
 * current release handler. While KERNEL$LOCKUP_LEVEL is at least
 * LOCKUP_LEVEL_ONE_PASS the request is parked on KERNEL$LOCKUP_EVENTS and
 * re-enters here when that queue is woken.
 */
DECL_IOCALL(KERNEL$WAKE_PAGE_RELEASE, SPL_BOTTOM, PAGE_RELEASE_REQUEST)
{
	if (__unlikely(KERNEL$LOCKUP_LEVEL >= LOCKUP_LEVEL_ONE_PASS) /* && __likely((RQ->status & RQS_PRIORITY_MASK) < RQS_ADD_PRIORITY * KERNEL_NO_RESTART_PRIORITY)*/) {
		WQ_WAIT(&KERNEL$LOCKUP_EVENTS, RQ, KERNEL$WAKE_PAGE_RELEASE);
		RETURN;
	}
	RETURN_IORQ_LSTAT(RQ, RQ->pg->release);
}

/*
 * Synchronously ask the current owner of page p to give it up.
 * Returns 0 on success, in which case the page is left tagged
 * NO_PAGE_RELEASE and belongs to the caller; otherwise returns the status of
 * the release request.
 * Only one release may be in flight at a time (PAGE_TO_RELEASE); the visible
 * callers in BALANCE_THREAD and KERNEL$ALLOC_CONTIG_AREA hold VM_MUTEX.
 */
static int RELEASE_PAGE(PAGE *p)
{
	PAGE_RELEASE_REQUEST prr;
#if __DEBUG >= 1
	if (__unlikely(PAGE_TO_RELEASE != NULL))
		KERNEL$SUICIDE("RELEASE_PAGE: ANOTHER RELEASE ALREADY PENDING");
#endif
	/* makes KERNEL$FREE_USER_PAGE mark the page released instead of putting it on a free list */
	PAGE_TO_RELEASE = p;
	prr.pg = p;
	SYNC_IO_CANCELABLE(&prr, KERNEL$WAKE_PAGE_RELEASE);
	/* the page was released even if the request itself reported an error */
	if (p->release == RELEASED_PAGE_RELEASE) prr.status = 0;
	if (!prr.status) {
#if __DEBUG >= 1
		if (p->release != RELEASED_PAGE_RELEASE)
			KERNEL$SUICIDE("RELEASE_PAGE: PAGE NOT RELEASED");
#endif
		p->release = NO_PAGE_RELEASE;
	}
	PAGE_TO_RELEASE = NULL;
	return prr.status;
}

/* Held around RELEASE_PAGE and VM_ARCH_NEW_PBANK in BALANCE_THREAD and for the
   whole scan in KERNEL$ALLOC_CONTIG_AREA; CACHE_FREE_THREAD must not take it */
MTX_DECL(VM_MUTEX, "KERNEL$VM_MUTEX");

/*
 * Allocate a physically contiguous run of page clusters covering "size" bytes.
 *
 * flags:  AREA_* bits. The bits in FLAGS (ISADMA/PCIDMA/PCIDMA64/CODE/DATA)
 *         must all be present on every bank the area touches.
 *         AREA_ALIGN - an "unsigned long" alignment in bytes follows; it must
 *                      be a non-zero power of two.
 *         AREA_NODE  - a "__node_id_t" node follows (after the alignment, if
 *                      both are given).
 * Returns the virtual address of the area (KMAP_PAGE_2_VIRT), or
 * __ERR_PTR(-ENOMEM) when no suitable run exists and no new bank can be added.
 *
 * The page map is scanned for a run of pages on the requested node that can be
 * released; each page is then taken over with RELEASE_PAGE. When the scan
 * reaches the end of the directly mapped banks a new bank is requested; if
 * that fails the scan is repeated once without the node restriction.
 * Runs at SPL_ZERO with VM_MUTEX held; the caller's SPL is restored on return.
 * NOTE(review): size == 0 yields npages == 0, which the end-of-banks check
 * below does not handle specially — confirm callers never pass it.
 */
__COLD_ATTR__ void *KERNEL$ALLOC_CONTIG_AREA(unsigned long size, int flags, ...)
{
	void *p;
	const int FLAGS = AREA_ISADMA | AREA_PCIDMA | AREA_PCIDMA64 | AREA_CODE | AREA_DATA;
	unsigned long npages, pgidx, i, j;
	unsigned long align = 1;
	va_list va;
	int spl = KERNEL$SPL;
	__node_id_t node = VAL_NODE_ID;
	__node_id_t node_mask = ~(__node_id_t)0;

	LOWER_SPL(SPL_ZERO);

	va_start(va, flags);

	if (flags & AREA_ALIGN) align = va_arg(va, unsigned long);
	if (__unlikely(!align) || __unlikely((align & (align - 1)) != 0))
		KERNEL$SUICIDE("KERNEL$ALLOC_CONTIG_AREA: INVALID ALIGN %lX", align);
	/* convert the byte alignment into a page-index mask (0 = no constraint) */
	align /= PAGE_CLUSTER_SIZE;
	if (align) align--;

	if (flags & AREA_NODE) {
		node = va_arg(va, __node_id_t);
		node_mask = ~(__node_id_t)0;
	}

	va_end(va);

	MTX_LOCK_SYNC(&VM_MUTEX);
	npages = (size + PAGE_CLUSTER_SIZE - 1) / PAGE_CLUSTER_SIZE;

	scan_again:
	pgidx = 0;

	next_page:
	KERNEL$THREAD_MAY_BLOCK();

	/* round the candidate start up to the requested alignment */
	pgidx = (pgidx + align) & ~(unsigned long)align;

	if (__unlikely(((pgidx + npages - 1) >> (PG_BANK_BITS - PG_CLUSTER_BITS)) >= VM_ARCH->N_DIRECT_PBANKS)) {
		/* ran off the end of memory: try to grow it, then retry on any node */
		if (!VM_ARCH_NEW_PBANK(flags & FLAGS, node, node_mask))
			goto scan_again;
		if (node_mask) {
			node_mask = 0;
			goto scan_again;
		}
		p = __ERR_PTR(-ENOMEM);
		goto ret;
	}

	/* pass 1: check that every page of the run is eligible */
	for (i = 0; i < npages; i++) {
		int pbank_flags;
		KERNEL$THREAD_MAY_BLOCK();
		pbank_flags = VM_ARCH->PBANKS[(pgidx + i) >> (PG_BANK_BITS - PG_CLUSTER_BITS)];
		if ((pbank_flags & (flags & FLAGS)) != (flags & FLAGS)) {
			/* unsuitable bank: continue at the start of the next one */
			pgidx = (pgidx + PG_BANK / PG_CLUSTER) & ~(unsigned long)(PG_BANK / PG_CLUSTER - 1);
			goto next_page;
		}

		if ((KERNEL_PAGE_MAP[pgidx + i].node & node_mask) != (node & node_mask) ||
		    KERNEL_PAGE_MAP[pgidx + i].release == UNALLOCATED_PAGE_RELEASE ||
		    KERNEL_PAGE_MAP[pgidx + i].release == NO_PAGE_RELEASE ||
		    KERNEL_PAGE_MAP[pgidx + i].release == KERNEL_PAGE_RELEASE) {
			pgidx += i + 1;
			goto next_page;
		}
	}

	/* pass 2: take the pages over; on failure give back what we already got */
	for (i = 0; i < npages; i++) {
		KERNEL$THREAD_MAY_BLOCK();
		if (__unlikely(RELEASE_PAGE(&KERNEL_PAGE_MAP[pgidx + i]) != 0)) {
			for (j = 0; j < i; j++)
				KERNEL$FREE_USER_PAGE(&KERNEL_PAGE_MAP[pgidx + j], N_VM_TYPES);
			pgidx += i + 1;
			goto next_page;
		}
	}

	RAISE_SPL(SPL_TOP);
	VM_STATS[VM_TYPE_WIRED_MAPPED] += npages;
	LOWER_SPL(SPL_ZERO);

	p = KMAP_PAGE_2_VIRT(KERNEL_PAGE_MAP + pgidx);

	ret:
	/* single exit: the mutex must be dropped on the -ENOMEM path as well */
	MTX_UNLOCK(&VM_MUTEX);
	RAISE_SPLX(spl);
	return p;
}

/*
 * Free an area obtained from KERNEL$ALLOC_CONTIG_AREA.
 * ptr must be aligned to PAGE_CLUSTER_SIZE; size is rounded up to whole page
 * clusters and every cluster is freed as a VM_TYPE_WIRED_MAPPED kernel page.
 */
__COLD_ATTR__ void KERNEL$FREE_CONTIG_AREA(void *ptr, unsigned long size)
{
	unsigned long remaining;
	if (__unlikely((unsigned long)ptr & (PAGE_CLUSTER_SIZE - 1))) {
		KERNEL$SUICIDE("KERNEL$FREE_CONTIG_AREA: UNALIGNED POINTER PASSED: %p, %lX", ptr, size);
	}
	remaining = (size + PAGE_CLUSTER_SIZE - 1) / PAGE_CLUSTER_SIZE;
	for (; remaining != 0; remaining--) {
		KERNEL$FREE_KERNEL_PAGE(ptr, VM_TYPE_WIRED_MAPPED);
		ptr = (char *)ptr + PAGE_CLUSTER_SIZE;
	}
}

/*
 * Lockdown handler installed in PAGE.lockdown: adjusts the DMA lock count kept
 * in p->flags. Must be called at SPL_VSPACE (checked in debug builds).
 * lock == 0: take one reference. Returns NULL on success, or &p->wait when
 *            the counter is saturated and the caller has to wait.
 * lock != 0: drop one reference; always returns NULL. Waiters on p->wait are
 *            woken when the counter leaves the saturated state and when it
 *            reaches zero.
 */
static WQ *PAGE_LOCKDOWN(PAGE *p, int lock)
{
#if __DEBUG >= 1
	if (__unlikely(KERNEL$SPL != SPL_X(SPL_VSPACE)))
		KERNEL$SUICIDE("PAGE_LOCKDOWN AT SPL %08X", KERNEL$SPL);
#endif
	if (!lock) {
		/* all counter bits set: cannot take another reference */
		if (__unlikely((p->flags & PAGE_DMALOCKCOUNT) == PAGE_DMALOCKCOUNT)) return &p->wait;
		p->flags += PAGE_DMALOCKCOUNT_1;
		return NULL;
	} else {
#if __DEBUG >= 1
		if (__unlikely(!(p->flags & PAGE_DMALOCKCOUNT)))
			KERNEL$SUICIDE("PAGE_LOCKDOWN: UNLOCKING UNLOCKED PAGE");
#endif
		if (__unlikely((p->flags & PAGE_DMALOCKCOUNT) == PAGE_DMALOCKCOUNT)) WQ_WAKE_ALL(&p->wait);
		if (__likely(!((p->flags -= PAGE_DMALOCKCOUNT_1) & PAGE_DMALOCKCOUNT))) WQ_WAKE_ALL(&p->wait);
		return NULL;
	}
}

/*
 * Introduce page p on node "node" to the VM.
 * res == 0: ordinary memory; the page goes to the free list unless
 *           VM_ARCH_CHECK_HW_MEM reports a problem, in which case it is
 *           treated as reserved.
 * res == 1: reserved page (tagged UNALLOCATED_PAGE_RELEASE, counted as
 *           VM_TYPE_RESERVED).
 * res == 2: kernel page already in use (counted as VM_TYPE_WIRED_MAPPED).
 * Any other value is fatal.
 * For a page that already has a release handler only one transition is done:
 * a reserved page re-added with res == 0 that passes the hardware check is
 * freed and its VM_TYPE_RESERVED accounting dropped.
 */
void VM_ADD_PAGE(PAGE *p, int res, __node_id_t node)
{
	int spl;
	p->node = node;
	if (__unlikely(p->release != NULL)) {
		/* page already known */
		if (!res && p->release == UNALLOCATED_PAGE_RELEASE && __likely(!VM_ARCH_CHECK_HW_MEM(p))) {
			KERNEL$FREE_USER_PAGE(p, VM_TYPE_RESERVED);
		}
		return;
	}
	if (__likely(!res)) {
		if (__unlikely(VM_ARCH_CHECK_HW_MEM(p))) goto reserved;
#if __DEBUG >= 2
		/* fill the new page with a recognisable pattern */
		KERNEL$UNMAP_PHYSICAL_BANK(memset(KERNEL$MAP_PHYSICAL_PAGE(p), 0xAA, PAGE_CLUSTER_SIZE));
#endif
		KERNEL$FREE_USER_PAGE(p, N_VM_TYPES);
	} else if (res == 1) {
		reserved:
		p->flags = PAGE_WRITEABLE;
		p->release = UNALLOCATED_PAGE_RELEASE;
		p->lockdown = PAGE_LOCKDOWN;
		WQ_INIT(&p->wait, "KERNEL$RESERVED_PAGE_WAIT");
		INIT_XLIST(&p->mapping);
		spl = KERNEL$SPL;
		RAISE_SPL(SPL_TOP);
		VM_STATS[VM_TYPE_RESERVED]++;
		LOWER_SPLX(spl);
	} else if (res == 2) {
		p->release = KERNEL_PAGE_RELEASE;
		spl = KERNEL$SPL;
		RAISE_SPL(SPL_TOP);
		VM_STATS[VM_TYPE_WIRED_MAPPED]++;
		LOWER_SPLX(spl);
	} else KERNEL$SUICIDE("VM_ADD_PAGE: RES %d, PAGE %p", res, p);
}

/*
 * Withdraw the page-map range [from, to) from the VM.
 * Every page that is not already reserved is taken over with RELEASE_PAGE.
 * On success pages [from, to_reserve) are re-added as reserved pages and the
 * descriptors [to_reserve, to) are zeroed; returns 0.
 * If a page cannot be released, everything done so far is rolled back and the
 * RELEASE_PAGE status is returned.
 * Requires from <= to_reserve <= to (fatal otherwise).
 */
__COLD_ATTR__ int VM_RELEASE_PAGES(unsigned long from, unsigned long to_reserve, unsigned long to)
{
	PAGE *p;
	int r;
	unsigned long i;
	int spl = KERNEL$SPL;
	if (__unlikely(from > to_reserve) || __unlikely(to_reserve > to)) KERNEL$SUICIDE("VM_RELEASE_PAGES: BAD PARAMS %lX, %lX, %lX", from, to_reserve, to);
	for (i = from; i < to; i++) {
		KERNEL$THREAD_MAY_BLOCK();
		p = &KERNEL_PAGE_MAP[i];
		if (p->release == UNALLOCATED_PAGE_RELEASE) {
			/* reserved page: only drop its accounting */
			RAISE_SPL(SPL_TOP);
			VM_STATS[VM_TYPE_RESERVED]--;
			LOWER_SPLX(spl);
			continue;
		}
		if (__unlikely((r = RELEASE_PAGE(p)) != 0)) {
			/* roll back pages i-1 .. from (the loop ends when i wraps to from - 1) */
			for (i--; i != from - 1; i--) {
				if (KERNEL_PAGE_MAP[i].release != UNALLOCATED_PAGE_RELEASE) {
					KERNEL$FREE_USER_PAGE(&KERNEL_PAGE_MAP[i], N_VM_TYPES);
				} else {
					RAISE_SPL(SPL_TOP);
					VM_STATS[VM_TYPE_RESERVED]++;
					LOWER_SPLX(spl);
				}
			}
			return r;
		}
	}
	for (i = from; i < to_reserve; i++) {
		RAISE_SPL(SPL_TOP);
		/* clearing release makes VM_ADD_PAGE treat the page as new */
		KERNEL_PAGE_MAP[i].release = NULL;
		VM_ADD_PAGE(&KERNEL_PAGE_MAP[i], 1, KERNEL_PAGE_MAP[i].node);
		LOWER_SPLX(spl);
	}
	memset(&KERNEL_PAGE_MAP[to_reserve], 0, sizeof(PAGE) * (to - to_reserve));
	return 0;
}

/*
 * Re-evaluate the zone of every free page in physical bank "bank".
 * A free page whose recorded zone (fnode) differs from what
 * VM_GET_PAGE_ZONE now reports is removed from the old zone's free list and
 * freed again, which puts it on the list of the new zone.
 * Wakes KERNEL$FREEMEM_WAIT when done.
 */
__COLD_ATTR__ void VM_RESET_ZONES(int bank)
{
	unsigned long i;
	int spl = KERNEL$SPL;
	for (i = (unsigned long)bank * (PG_BANK / PG_CLUSTER); i < (unsigned long)(bank + 1) * (PG_BANK / PG_CLUSTER); i++) {
		PAGE *p = &KERNEL_PAGE_MAP[i];
		RAISE_SPL(SPL_TOP);
		if (p->release == RELEASE_FREE_PAGE) {
			struct pg_zone *z = VM_GET_PAGE_ZONE(p);
			if (z != p->fnode) {
				if (!((struct pg_zone *)p->fnode)->n_free--)
					KERNEL$SUICIDE("VM_RESET_ZONES: N_FREE UNDERFLOW");
				DEL_FROM_LIST(&p->hash_entry);
				/* must not look free, or KERNEL$FREE_USER_PAGE's double-free check would fire */
				p->release = NO_PAGE_RELEASE;
				KERNEL$FREE_USER_PAGE(p, N_VM_TYPES);
			}
		}
		LOWER_SPLX(spl);
	}
	WQ_WAKE_ALL(&KERNEL$FREEMEM_WAIT);
}

/*
 * Tell whether the system is out of memory for an allocation of VM type
 * "type". Returns non-zero when nothing is free or reclaimable.
 * Cached mapped pages and free kernel-zone pages always count. For unmapped
 * types — and for mapped types while the balancer can still extend the kernel
 * zone or has not yet completed a cycle without progress — cached/user
 * unmapped pages and free IO (and high) zone pages count as well.
 * An unknown type is fatal.
 */
int KERNEL$OOM(int type)
{
	unsigned long some_free;
	int spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);
	/* only zero/non-zero matters, so the counters are OR-ed rather than added */
	some_free = VM_STATS[VM_TYPE_CACHED_MAPPED] | KERNEL_ZONE.n_free;
	if (type == VM_TYPE_WIRED_MAPPED || type == VM_TYPE_CACHED_MAPPED || type == VM_TYPE_USER_MAPPED) {
		if (!BALANCE_CANT_EXTEND || BALANCE_NO_PROGRESS <= 0) {
			goto test_unmapped;
		}
	} else if (type == VM_TYPE_WIRED_UNMAPPED || type == VM_TYPE_CACHED_UNMAPPED || type == VM_TYPE_USER_UNMAPPED) {
		test_unmapped:
		some_free |= VM_STATS[VM_TYPE_CACHED_UNMAPPED] | VM_STATS[VM_TYPE_USER_UNMAPPED] | IO_ZONE.n_free;
#if KERNEL_HIGH_ZONE_COMPILE
		some_free |= HIGH_ZONE.n_free;
#endif
	} else KERNEL$SUICIDE("KERNEL$OOM(%d)", type);
	LOWER_SPLX(spl);
	return !some_free;
}

/*
 * Return how many page clusters short of the reserve the system is
 * (0 when cached + free memory covers the reserve).
 * The reserve is the value of the "@KERNEL$CACHE_RESERVED" environment
 * variable divided by PAGE_CLUSTER_SIZE when it is set and parses, otherwise
 * KERNEL$GET_MEMORY_SIZE(VM_TYPE_USER_UNMAPPED) / (PAGE_CLUSTER_SIZE *
 * CACHE_RESERVE_PCT); in both cases 2 * max_io_free_target + 1 is added and
 * the result is clamped to MAXULONG.
 */
unsigned long KERNEL$VM_OOMKILL(void)
{
	char *e;
	unsigned long freemem;
	__u64 res;
	unsigned long lres;
	int spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);
	e = getenv("@KERNEL$CACHE_RESERVED");
	if (__unlikely(e != NULL) && __likely(!__get_64_number(e, strchr(e, 0), 0, (__s64 *)&res))) res /= PAGE_CLUSTER_SIZE;
	else res = KERNEL$GET_MEMORY_SIZE(VM_TYPE_USER_UNMAPPED) / (PAGE_CLUSTER_SIZE * CACHE_RESERVE_PCT);
	res += (max_io_free_target << 1) + 1;
	if (__unlikely(res > MAXULONG)) lres = MAXULONG;
	else lres = res;
	/* memory that is free or can be reclaimed from the cache */
	freemem = VM_STATS[VM_TYPE_CACHED_MAPPED] + VM_STATS[VM_TYPE_CACHED_UNMAPPED] + KERNEL_ZONE.n_free + IO_ZONE.n_free
#if KERNEL_HIGH_ZONE_COMPILE
		+ HIGH_ZONE.n_free
#endif
		;
	LOWER_SPLX(spl);
	if (freemem < lres) return lres - freemem;
	else return 0;
}

/*
 * Classify a failed allocation of sz bytes.
 * Returns -ERANGE when sz exceeds PAGE_CLUSTER_SIZE, -ENOMEM when KERNEL$OOM
 * reports that wired mapped memory is exhausted, and 0 otherwise.
 */
int KERNEL$OOM_ERR(size_t sz)
{
	if (__likely(!(sz > PAGE_CLUSTER_SIZE))) {
		if (__unlikely(KERNEL$OOM(VM_TYPE_WIRED_MAPPED))) {
			return -ENOMEM;
		}
		return 0;
	}
	return -ERANGE;
}

/*
 * Asynchronously wait for memory after an allocation of sz bytes failed.
 * If KERNEL$OOM_ERR reports an error, rq is completed with that status;
 * otherwise rq is queued on KERNEL$FREEMEM_WAIT with fn as the handler to run
 * on wake-up.
 */
void KERNEL$MEMWAIT(IORQ *rq, IO_STUB *fn, size_t sz)
{
	int err = KERNEL$OOM_ERR(sz);
	if (__likely(!err)) {
		WQ_WAIT(&KERNEL$FREEMEM_WAIT, rq, fn);
		return;
	}
	rq->status = err;
	CALL_AST(rq);
	return;
}

/*
 * Synchronously wait for memory after an allocation of sz bytes failed.
 * Returns 0 when the caller should retry, or a negative error (with errno set
 * to its positive value) when KERNEL$OOM_ERR reports an error or the wait
 * returns non-zero.
 * Returns immediately when the kernel zone has a free page. With the debug
 * allocation-shortage options enabled it never sleeps.
 */
int KERNEL$MEMWAIT_SYNC(size_t sz)
{
	int r;
	KERNEL$THREAD_MAY_BLOCK();
	if (__unlikely(r = KERNEL$OOM_ERR(sz))) goto ret_err;
	if (KERNEL_ZONE.n_free) return 0;
#if __DEBUG_ALLOC_KERNEL_PAGE_SHORTAGE > 0 || __DEBUG_ALLOC_USER_PAGE_SHORTAGE > 0
	r = 0;
#else
	r = WQ_WAIT_SYNC_CANCELABLE(&KERNEL$FREEMEM_WAIT);
#endif
	/* ret_err is entered from above with r already holding the error */
	if (__unlikely(r)) ret_err: errno = -r;
	return r;
}


/*
 * Asynchronous malloc: try malloc(RQ->size); on success store the pointer in
 * RQ->ptr and complete with status 0. On failure hand the request to
 * KERNEL$MEMWAIT, which either completes it with an error or re-runs this
 * handler once memory is freed.
 */
DECL_IOCALL(KERNEL$UNIVERSAL_MALLOC, SPL_MALLOC, MALLOC_REQUEST)
{
	if (__likely((RQ->ptr = malloc(RQ->size)) != NULL)) {
		RQ->status = 0;
		RETURN_AST(RQ);
	}
	KERNEL$MEMWAIT((IORQ *)RQ, KERNEL$UNIVERSAL_MALLOC, RQ->size);
	RETURN;
}

/* shadowed in LIB/KERNEL/UVM.C */

/*
 * Remove all mappings of page p.
 * Returns NULL when the page is unmapped and idle; otherwise a wait queue the
 * caller should wait on: the one returned by VM_ARCH_UNMAP_MAPPING, or
 * &p->wait while the page is busy, DMA-locked or has writers.
 * Per the comment below, SPL is left raised to SPL_VSPACE.
 */
WQ *KERNEL$VM_UNMAP_PAGE(PAGE *p)
{
	WQ *wq;
	if (__unlikely((wq = VM_ARCH_UNMAP_MAPPING(&p->mapping)) != NULL)) return wq;
	/* raises SPL to SPL_VSPACE */
	if (__unlikely(p->flags & (PAGE_BUSY | PAGE_DMALOCKCOUNT | PAGE_WRITECOUNT)))
		return &p->wait;
	return NULL;
}

/*
 * Call VM_ARCH_CHECK_MAPPING(&p->mapping, 1) on page p
 * (presumably this write-protects its mappings — confirm in VM_ARCH).
 * Returns NULL on success; otherwise a wait queue: the pointer returned by
 * VM_ARCH_CHECK_MAPPING when it is a real pointer (values 0 and 1 are not
 * treated as queues), or &p->wait while the page is busy or DMA-locked.
 * Per the comment below, SPL is left raised to SPL_VSPACE.
 */
WQ *KERNEL$VM_UNSET_WRITEABLE(PAGE *p)
{
	WQ *wq;
	if (__unlikely((void *)(wq = VM_ARCH_CHECK_MAPPING(&p->mapping, 1)) > (void *)1)) return wq;
	/* raises SPL to SPL_VSPACE */
	if (__unlikely(p->flags & (PAGE_BUSY | PAGE_DMALOCKCOUNT)))
		return &p->wait;
	return NULL;
}

/*
 * Query the mappings of page p with VM_ARCH_CHECK_MAPPING(..., 0) and
 * collapse the answer to an int: 0 and 1 are passed through, any real
 * pointer is reported as 1. The caller's SPL is restored before returning
 * (VM_ARCH_CHECK_MAPPING raises it to SPL_VSPACE).
 */
int KERNEL$VM_SCAN_PAGE(PAGE *p)
{
	int saved_spl = KERNEL$SPL;
	int verdict;
	WQ *res = VM_ARCH_CHECK_MAPPING(&p->mapping, 0);
	/* raises SPL to SPL_VSPACE */
	if (__unlikely((void *)res > (void *)1)) {
		verdict = 1;
	} else {
		verdict = (int)(unsigned long)res;
	}
	LOWER_SPLX(saved_spl);
	return verdict;
}

/*
 * SPAGE counterpart of KERNEL$VM_UNMAP_PAGE: remove all mappings of sp.
 * Returns NULL when the page is unmapped and idle; otherwise the wait queue
 * returned by VM_ARCH_UNMAP_MAPPING, or &sp->wait while the page is busy,
 * DMA-locked or has writers.
 * Per the comment below, SPL is left raised to SPL_VSPACE.
 */
WQ *KERNEL$VM_UNMAP_SPAGE(SPAGE *sp)
{
	WQ *wq;
	if (__unlikely((wq = VM_ARCH_UNMAP_MAPPING(&sp->mapping)) != NULL)) return wq;
	/* raises SPL to SPL_VSPACE */
	if (__unlikely(sp->flags & (PAGE_BUSY | PAGE_DMALOCKCOUNT | PAGE_WRITECOUNT)))
		return &sp->wait;
	return NULL;
}

/*
 * SPAGE counterpart of KERNEL$VM_SCAN_PAGE: query the mappings of sp with
 * VM_ARCH_CHECK_MAPPING(..., 0) and collapse the answer to an int: 0 and 1
 * are passed through, any real pointer is reported as 1. The caller's SPL is
 * restored before returning.
 */
int KERNEL$VM_SCAN_SPAGE(SPAGE *sp)
{
	int saved_spl = KERNEL$SPL;
	int verdict;
	WQ *res = VM_ARCH_CHECK_MAPPING(&sp->mapping, 0);
	/* raises SPL to SPL_VSPACE */
	if (__unlikely((void *)res > (void *)1)) {
		verdict = 1;
	} else {
		verdict = (int)(unsigned long)res;
	}
	LOWER_SPLX(saved_spl);
	return verdict;
}

/*
 * Initialise the per-page fields needed before page p can be mapped:
 * an empty mapping list, a fresh wait queue, the PAGE_LOCKDOWN handler and
 * flags set to PAGE_WRITEABLE.
 */
void KERNEL$VM_PREPARE_PAGE_FOR_MMAP(PAGE *p)
{
	INIT_XLIST(&p->mapping);
	WQ_INIT(&p->wait, "KERNEL$PAGE_WAIT");
	p->lockdown = PAGE_LOCKDOWN;
	p->flags = PAGE_WRITEABLE;
}

/* Reset zone z to the empty state: both free lists empty, n_free == 0. */
__COLD_ATTR__ static void ZONE_INIT(struct pg_zone *z)
{
	unsigned idx;
	for (idx = 0; idx < 2; idx++) {
		INIT_LIST(&z->freelist[idx]);
	}
	z->n_free = 0;
}

/*
 * Count the pages belonging to each zone.
 * Walks all VM_ARCH->N_PBANKS banks, classifies each bank by its first page
 * and adds the bank's page count to the matching total.
 * Results are stored in *kernel, *io and *high (*high is 0 when no high zone
 * is compiled in).
 */
void VM_COUNT_ZONES(unsigned long *kernel, unsigned long *io, unsigned long *high)
{
	unsigned long i;
	unsigned long kz = 0, iz = 0, hz = 0;
	/* number of pages per bank */
#define step (PG_BANK / PG_CLUSTER)
	for (i = 0; i < VM_ARCH->N_PBANKS * (PG_BANK / PG_CLUSTER); i += step) {
		struct pg_zone *z = VM_GET_PAGE_ZONE(&KERNEL_PAGE_MAP[i]);
		if (z == &KERNEL_ZONE) kz += step;
		else if (z == &IO_ZONE) iz += step;
#if KERNEL_HIGH_ZONE_COMPILE
		else if (z == &HIGH_ZONE) hz += step;
#endif
		/* NOTE(review): the message names VM_SET_TARGETS, not this function */
		else KERNEL$SUICIDE("VM_SET_TARGETS: PAGE %lu HAS UNKNOWN ZONE %p", i, z);
	}
#undef step
	*kernel = kz;
	*io = iz;
	*high = hz;
	return;
}

/*
 * Recompute max_io_free_target and max_kernel_free_target and re-arm the
 * refresh countdown.
 * When VM_FREEMEM_WATERMARK is non-zero it is used for both maxima.
 * Otherwise:
 *   max_io_free_target     = (all pages - wired pages) / FREE_MEMORY_PCT + 1
 *   max_kernel_free_target = (kernel zone pages - wired mapped pages)
 *                            / FREE_MEMORY_PCT + 1,
 *                            capped at max_io_free_target
 * with each difference clamped at zero.
 */
void VM_REFRESH_WATERMARKS(void)
{
	/* same width as VM_FREEMEM_WATERMARK so that large values are not truncated */
	unsigned long wm;
	VM_FREEMEM_WATERMARK_REFRESH = WATERMARK_REFRESH_N;
	if (__likely(!(wm = VM_FREEMEM_WATERMARK))) {
		unsigned long kz, iz, hz;
		unsigned long t;
		VM_COUNT_ZONES(&kz, &iz, &hz);

		t = __likely(kz + iz + hz >= VM_STATS[VM_TYPE_WIRED_MAPPED] + VM_STATS[VM_TYPE_WIRED_UNMAPPED]) ? kz + iz + hz - VM_STATS[VM_TYPE_WIRED_MAPPED] - VM_STATS[VM_TYPE_WIRED_UNMAPPED] : 0;
		t = t / FREE_MEMORY_PCT + 1;
		max_io_free_target = t;

		t = __likely(kz >= VM_STATS[VM_TYPE_WIRED_MAPPED]) ? kz - VM_STATS[VM_TYPE_WIRED_MAPPED] : 0;
		t = t / FREE_MEMORY_PCT + 1;
		if (__unlikely(t > max_io_free_target)) t = max_io_free_target;
		max_kernel_free_target = t;

		/*__debug_printf("kz: %d, iz %d, hz %d\n", kz, iz, hz);
		__debug_printf("wm: %d, wunm: %d\n", VM_STATS[VM_TYPE_WIRED_MAPPED], VM_STATS[VM_TYPE_WIRED_UNMAPPED]);
		__debug_printf("watermarks: %d, %d\n", max_kernel_free_target, max_io_free_target);*/
	} else {
		max_kernel_free_target = wm;
		max_io_free_target = wm;
	}
}

/*
 * Periodic timer (every FREEMEM_JIFFIES) that decays the free targets and
 * wakes all memory-related wait queues so that waiters re-check their
 * conditions.
 * kernel_free_target / io_free_target drop by one when their last increment is
 * at least TARGET_DECREASE_LATENCY old; user_free_target loses about a quarter
 * when its last decay is more than USER_TARGET_DECREASE_LATENCY old.
 * Every WATERMARK_REFRESH_N ticks the maximum targets are recomputed.
 */
static void freemem_timer_fn(TIMER *t)
{
	/* SPL_TOP active here */
	u_jiffies_lo_t j = KERNEL$GET_JIFFIES_LO();
	if (__likely(kernel_free_target != 0) && __unlikely(can_decrease(j, last_kernel_target_incr))) {
		kernel_free_target--;
		/* backdated so that can_increase() holds at once and the next decrease is 120 s away */
		last_kernel_target_incr = j - TARGET_INCREASE_LATENCY;
	}
	if (__likely(io_free_target != 0) && __unlikely(can_decrease(j, last_io_target_incr))) {
		io_free_target--;
		last_io_target_incr = j - TARGET_INCREASE_LATENCY;
	}
	if (j - last_user_target_decr > USER_TARGET_DECREASE_LATENCY) {
		/* subtract a quarter, rounded up, so the target reaches zero */
		user_free_target -= ((user_free_target + 3) >> 2);
		last_user_target_decr = j;
	}
	LOWER_SPL(SPL_TIMER);
	WQ_WAKE_ALL(&KERNEL$FREEMEM_WAIT);
	WQ_WAKE_ALL(&WRITEBACK_WAIT);
	WQ_WAKE_ALL(&FREEMEM_ABOVE);
	WQ_WAKE_ALL(&FREE_ENTITIES);
	if (__unlikely(!--VM_FREEMEM_WATERMARK_REFRESH)) {
		VM_REFRESH_WATERMARKS();
	}
	KERNEL$SET_TIMER(FREEMEM_JIFFIES, &freemem_timer);
}

/*
 * Allocation throttle: decide whether process p may allocate "size" more
 * (added to p->alloc_rate on success).
 * Returns NULL when the allocation may proceed, otherwise the wait queue the
 * caller has to wait on (&FREE_ENTITIES or &FREEMEM_ABOVE).
 * The kernel process is never throttled.
 * The process's allocation rate is converted to a number of pages (freepg),
 * damped against max_io_free_target, and used to
 *   - raise user_free_target to 2 * freepg,
 *   - refuse when freepg + VM_ENTITIES would exceed the hard entity limit,
 *   - refuse when free pages are below freepg + io_free_target.
 */
WQ *KERNEL$MAY_ALLOC(PROC *p, unsigned size)
{
	int spl;
	unsigned long freepg;
	if (__unlikely(p == &KERNEL$PROC_KERNEL)) return NULL;
	spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);
	UNIFY_ALLOC_RATE(p);
	LOWER_SPLX(spl);
	/*if (__unlikely(!p->alloc_rate)) goto ok_to_alloc;*/
	freepg = p->alloc_rate >> (PG_SIZE_BITS + PG_CLUSTER_BITS);
	/* above max/4 only a quarter of the excess counts; the result is capped at max/2 */
	if (__unlikely((freepg << 2) > max_io_free_target)) {
		freepg = (max_io_free_target >> 2) + ((freepg - (max_io_free_target >> 2)) >> 2);
		if (__unlikely((freepg << 1) > max_io_free_target))
			freepg = max_io_free_target >> 1;
	}
	if (__unlikely((freepg << 1) > user_free_target)) {
		user_free_target = freepg << 1;
		RAISE_SPL(SPL_TOP);
		MEMORY_BALANCE();
		LOWER_SPLX(spl);
	}
	if (__unlikely(freepg + VM_ENTITIES > VM_ENTITIES_HARD_LIMIT) && __likely(freepg != 0)) {
		spl = KERNEL$SPL;
		RAISE_SPL(SPL_TOP);
		MEMORY_BALANCE();
		LOWER_SPLX(spl);
		return &FREE_ENTITIES;
	}
#if !KERNEL_HIGH_ZONE_COMPILE
	if (__unlikely(KERNEL_ZONE.n_free + IO_ZONE.n_free < freepg + io_free_target))
#else
	if (__unlikely(KERNEL_ZONE.n_free + IO_ZONE.n_free + HIGH_ZONE.n_free < freepg + io_free_target))
#endif
	{
		spl = KERNEL$SPL;
		RAISE_SPL(SPL_TOP);
		MEMORY_BALANCE();
		LOWER_SPLX(spl);
		/*if (__unlikely(!KERNEL_ZONE.n_free)) return NULL;*/
		return &FREEMEM_ABOVE;
	}

	/*ok_to_alloc:*/
	spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);
	/* saturate instead of overflowing */
	if (__unlikely((p->alloc_rate += size) > MAXLONG)) p->alloc_rate = MAXLONG;
	LOWER_SPLX(spl);
	return NULL;
}


/*
 * Subtract "size" from p->alloc_rate, clamping the result at zero.
 * The update is done at SPL_TOP.
 */
void KERNEL$NOTIFY_FREE(PROC *p, unsigned size)
{
	unsigned long pa;
	int spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);
	pa = p->alloc_rate - size;
	/* branch-free clamp (assuming __LONG_SGN_BIT is the top bit of unsigned long):
	   if the subtraction wrapped, pa / __LONG_SGN_BIT is 1 and the mask is 0,
	   otherwise it is 0 and the mask is all ones */
	pa &= (pa / __LONG_SGN_BIT) - 1;
	p->alloc_rate = pa;
	LOWER_SPLX(spl);
}

/*
 * Add "size" to p->alloc_rate, saturating at MAXLONG.
 * The update is done at SPL_TOP.
 */
void KERNEL$NOTIFY_ALLOC(PROC *p, unsigned size)
{
	int saved_spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);
	if (__unlikely((p->alloc_rate += size) > MAXLONG)) {
		p->alloc_rate = MAXLONG;
	}
	LOWER_SPLX(saved_spl);
}

/*
 * Set by KERNEL$MAY_DIRTY when a caller with nonzero mode finds writeback
 * still outstanding.  KERNEL$RELEASE_WRITEBACK_TAG tests it and, once a
 * process's writeback count has dropped to zero, wakes WRITEBACK_WAIT and
 * clears it again.
 */
static int dirty_wait_until_idle = 0;

/*
 * Check the writeback tag count of process p before it dirties more data.
 *
 * mode == 0: no wait is needed while p->writeback does not exceed
 *            1 + VM_WIRED_STREAM_QUEUE + VM_WIRED_STREAM_QUEUE / 4.
 * mode != 0: no wait is needed only when p->writeback is zero; otherwise
 *            dirty_wait_until_idle is set so that the release path wakes
 *            waiters once a counter reaches zero.
 *
 * Returns NULL when no wait is needed (always the final answer for the
 * kernel process), otherwise &WRITEBACK_WAIT.
 */
WQ *KERNEL$MAY_DIRTY(PROC *p, unsigned mode)
{
	if (__unlikely(mode != 0)) {
		if (!p->writeback)
			return NULL;
		dirty_wait_until_idle = 1;
	} else if (__likely(p->writeback <= 1 + VM_WIRED_STREAM_QUEUE + VM_WIRED_STREAM_QUEUE / 4)) {
		return NULL;
	}

	/* Over the limit, but the kernel process is never made to wait. */
	if (__unlikely(p == &KERNEL$PROC_KERNEL))
		return NULL;

	return &WRITEBACK_WAIT;
}

/* !!! TODO: acquire tag also on all dirty pages of to-be-written fnode */
/*
 * Take one writeback tag for proc: proc->writeback is incremented by 1
 * through __ADDI.  This is the counter that KERNEL$MAY_DIRTY compares
 * against its limits and that KERNEL$RELEASE_WRITEBACK_TAG decrements.
 */
void KERNEL$ACQUIRE_WRITEBACK_TAG(PROC *proc)
{
	__ADDI(&proc->writeback, 1);
}

/*
 * Give back one writeback tag of proc (proc->writeback is decremented by 1
 * through __SUBI) and wake waiters on WRITEBACK_WAIT where appropriate:
 *   - if dirty_wait_until_idle is set and the counter reached zero, all
 *     waiters are woken and the flag is cleared;
 *   - if the counter is exactly (VM_WIRED_STREAM_QUEUE >> 1) + 1, all
 *     waiters are woken.
 * With __DEBUG >= 1 a negative counter is fatal.  When the process has
 * PR_RUNDOWN set and no tags left, DELAYED_SHUTDOWN() is called.
 */
void KERNEL$RELEASE_WRITEBACK_TAG(PROC *proc)
{
	__SUBI(&proc->writeback, 1);

	if (__unlikely(dirty_wait_until_idle) && !proc->writeback) {
		WQ_WAKE_ALL_PL(&WRITEBACK_WAIT);
		dirty_wait_until_idle = 0;
	}

	if (proc->writeback == (VM_WIRED_STREAM_QUEUE >> 1) + 1)
		WQ_WAKE_ALL(&WRITEBACK_WAIT);

#if __DEBUG >= 1
	if (__unlikely(proc->writeback < 0))
		KERNEL$SUICIDE("KERNEL$RELEASE_WRITEBACK_TAG: WRITEBACK COUNTER UNDERFLOW: %d", proc->writeback);
#endif

	/* SMP warning, process may acquire another tag immediately while >= SPL_DEV */
	if (__unlikely(proc->flags & PR_RUNDOWN) && !proc->writeback)
		DELAYED_SHUTDOWN();
}

/*
 * First-stage VM initialisation: verifies the struct PAGE size constants,
 * clears VM_STATS, initialises every compiled-in zone and zeroes the
 * kernel, I/O and user free targets.  VM_FREEMEM_WATERMARK_REFRESH is set
 * to 1.
 */
__COLD_ATTR__ void KERNEL_VM_INIT(void)
{
	/*
	 * sizeof(PAGE) must equal PG_SIZEOF_STRUCT_PAGE, which in turn must
	 * equal 1 << PG_SIZEOF_STRUCT_PAGE_BITS and be a power of two.
	 */
	if (sizeof(PAGE) != PG_SIZEOF_STRUCT_PAGE
	 || PG_SIZEOF_STRUCT_PAGE != (1 << PG_SIZEOF_STRUCT_PAGE_BITS)
	 || (PG_SIZEOF_STRUCT_PAGE & (PG_SIZEOF_STRUCT_PAGE - 1)))
		KERNEL$SUICIDE("KERNEL_VM_INIT: BAD PAGE SIZE: %d, %d, %d", (int)sizeof(PAGE), PG_SIZEOF_STRUCT_PAGE, PG_SIZEOF_STRUCT_PAGE_BITS);

	memset(VM_STATS, 0, sizeof(VM_STATS));

#if KERNEL_HIGH_ZONE_COMPILE
	ZONE_INIT(&HIGH_ZONE);
#endif

	ZONE_INIT(&IO_ZONE);
	io_free_target = 0;

	ZONE_INIT(&KERNEL_ZONE);
	kernel_free_target = 0;

	user_free_target = 0;
	VM_FREEMEM_WATERMARK_REFRESH = 1;
}

/*
 * AST installed as the `fn` of the VM thread requests built in
 * KERNEL_VM_INIT_2.  When it runs, it reports the request's negated status
 * as an error string and halts the kernel.
 */
__COLD_ATTR__ static DECL_AST(THREAD_RETURNED, SPL_TOP, THREAD_RQ)
{
	const char *reason = strerror(-RQ->status);

	__critical_printf("COULD NOT SPAWN VM THREAD: %s", reason);
	HALT_KERNEL();
}

/*
 * Physical address of the zero-filled PAGE_CLUSTER_SIZE area that
 * KERNEL_VM_INIT_2 allocates and clears.
 */
__p_addr ZERO_PAD_PHYS;

__COLD_ATTR__ void KERNEL_VM_INIT_2(void)
{
	static THREAD_RQ cache_free_thread;
	static THREAD_RQ balance_thread;

	void *ptr;

#if __KERNEL_SUPPORT_SMP
	if (!VAL_CPU_ID) {
		__node_id_t n;
		vmnodes = calloc(VAL_NODE_ID_LIMIT, sizeof(struct vmnode));
		if (!vmnodes) {
			__critical_printf("COULD NOT ALLOCATE %lu VM NODES\n", (unsigned long)VAL_NODE_ID_LIMIT);
			HALT_KERNEL();
		}
		for (n = 0; n < VAL_NODE_ID_LIMIT; n++) {
			INIT_LIST(&vmnodes[n].buckets);
			KERNEL$SPINLOCK_INIT(&vmnodes[n].lock);
		}
		REGISTER_SHARED_POINTER(&vmnodes_share);
	}
#endif

	freemem_timer.fn = freemem_timer_fn;
	KERNEL$SET_TIMER(FREEMEM_JIFFIES, &freemem_timer);

	cache_free_thread.fn = THREAD_RETURNED;
	cache_free_thread.thread_main = CACHE_FREE_THREAD;
	cache_free_thread.p = NULL;
	cache_free_thread.error = NULL;
	cache_free_thread.cwd = NULL;
	cache_free_thread.std_in = -1;
	cache_free_thread.std_out = -1;
	cache_free_thread.std_err = -1;
	cache_free_thread.dlrq = NULL;
	cache_free_thread.thread = NULL;
	cache_free_thread.spawned = 0;
	CALL_IORQ(&cache_free_thread, KERNEL$THREAD);

	balance_thread.fn = THREAD_RETURNED;
	balance_thread.thread_main = BALANCE_THREAD;
	balance_thread.p = NULL;
	balance_thread.error = NULL;
	balance_thread.cwd = NULL;
	balance_thread.std_in = -1;
	balance_thread.std_out = -1;
	balance_thread.std_err = -1;
	balance_thread.dlrq = NULL;
	balance_thread.thread = NULL;
	balance_thread.spawned = 0;
	CALL_IORQ(&balance_thread, KERNEL$THREAD);

	ptr = KERNEL$ALLOC_CONTIG_AREA(PAGE_CLUSTER_SIZE, AREA_DATA | AREA_PHYSCONTIG | AREA_PCIDMA | AREA_ALIGN, (unsigned long)PAGE_CLUSTER_SIZE);
	if (__IS_ERR(ptr)) {
		__critical_printf("COULD NOT ALLOCATE ZERO PAGE\n");
		HALT_KERNEL();
	}
	memset(ptr, 0, PAGE_CLUSTER_SIZE);
	ZERO_PAD_PHYS = KERNEL$VIRT_2_PHYS(ptr);
}

/*
 * Snapshot the VM counters into `mem`, which must have room for
 * KQM_N_ENTRIES unsigned longs.
 *
 * The array is zeroed first.  Then, with SPL raised to SPL_TOP, the
 * per-zone free counts are stored at their KQM_*_FREE slots and the first
 * N_VM_TYPES entries of VM_STATS are copied to the start of the array.
 * Without a compiled-in high zone, mem[KQM_HIGH_FREE] is not written here.
 *
 * Always returns 0.
 */
int KERNEL$QUERY_MEMORY(unsigned long *mem)
{
	int saved_spl;

	memset(mem, 0, KQM_N_ENTRIES * sizeof(*mem));

	saved_spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);

	mem[KQM_KERNEL_FREE] = KERNEL_ZONE.n_free;
	mem[KQM_IO_FREE] = IO_ZONE.n_free;
#if KERNEL_HIGH_ZONE_COMPILE
	mem[KQM_HIGH_FREE] = HIGH_ZONE.n_free;
#endif
	memcpy(mem, VM_STATS, N_VM_TYPES * sizeof(VM_STATS[0]));

	LOWER_SPLX(saved_spl);
	return 0;
}

/*
 * Print a summary of memory usage, free targets, VM entity counts and the
 * state of the cache-free and balance machinery via __critical_printf.
 *
 * Network memory is derived from the symbols NET$MEMORY_LIMIT and
 * NET$MEMORY_AVAIL, looked up at run time with dlsym(); if either symbol
 * is missing it is reported as 0.  The figure is shown separately and
 * subtracted from the wired-mapped count.  Nothing is printed when
 * KERNEL$QUERY_MEMORY reports failure.
 */
__COLD_ATTR__ void KERNEL$MEMSTAT_DUMP(void)
{
	unsigned long stats[KQM_N_ENTRIES];
	unsigned long active_entities;
	unsigned long net_mem = 0;
	long *limit_sym, *avail_sym;

	if (__unlikely(KERNEL$QUERY_MEMORY(stats)))
		return;

	limit_sym = dlsym(RTLD_DEFAULT, "NET$MEMORY_LIMIT");
	avail_sym = dlsym(RTLD_DEFAULT, "NET$MEMORY_AVAIL");
	if (__likely(limit_sym != NULL) && __likely(avail_sym != NULL))
		net_mem = *limit_sym - *avail_sym;

	active_entities = CACHE_ACTIVE_ENTITIES();

	__critical_printf(
		"RESERVED: %lu, NET MAPPED: %lu, WIRED MAPPED: %lu, WIRED UNMAPPED: %lu\n"
		"CACHED MAPPED: %lu, CACHED UNMAPPED: %lu, USER MAPPED: %lu, USER UNMAPPED: %lu\n"
		"KERNEL FREE: %lu, IO FREE: %lu, HIGH FREE: %lu\n"
		"KERNEL TARGET: %lu, IO TARGET: %lu, USER TARGET: %lu\n"
		"VM ENTITIES: %lu, ACTIVE %lu, INACTIVE %lu, %s\n"
		"SWAPIO: %d, FREE UNDER %d, ACTIVE %d\n"
		"BALANCE UNDER %d, ACTIVE %d, CAN'T EXTEND %d, NO PROGRESS %d\n",
		/* line 1 */
		stats[VM_TYPE_RESERVED], net_mem, stats[VM_TYPE_WIRED_MAPPED] - net_mem, stats[VM_TYPE_WIRED_UNMAPPED],
		/* line 2 */
		stats[VM_TYPE_CACHED_MAPPED], stats[VM_TYPE_CACHED_UNMAPPED], stats[VM_TYPE_USER_MAPPED], stats[VM_TYPE_USER_UNMAPPED],
		/* line 3 */
		stats[KQM_KERNEL_FREE], stats[KQM_IO_FREE], stats[KQM_HIGH_FREE],
		/* line 4 */
		kernel_free_target, io_free_target, user_free_target,
		/* line 5 */
		VM_ENTITIES, active_entities, VM_ENTITIES - active_entities, CACHE_DUMPQ(),
		/* line 6 */
		IO_IN_PROGRESS, FREE_UNDER_MIN(), WQ_EMPTY(&CACHE_FREE_EVENT.wq),
		/* line 7 */
		BALANCE_UNDER_MIN(), WQ_EMPTY(&BALANCE_EVENT.wq), BALANCE_CANT_EXTEND, BALANCE_NO_PROGRESS);
}

/*
 * Debug aid (compiled to an empty function unless TEST_MEMORY is defined):
 * with SPL raised to SPL_TOP, walk both free lists of every compiled-in
 * zone, map each listed page and verify the three magic words at
 * TEST_OFFSET_1..3.  A mismatch is handed to TEST_PAGE_FAILED().
 */
void VM_CHECK_MAGICS(void)
{
#ifdef TEST_MEMORY
	/* Zones in the order they are checked. */
	static struct pg_zone * const zones[] = {
		&KERNEL_ZONE,
		&IO_ZONE,
#if KERNEL_HIGH_ZONE_COMPILE
		&HIGH_ZONE,
#endif
	};
	unsigned zi;
	int saved_spl;

	saved_spl = KERNEL$SPL;
	RAISE_SPL(SPL_TOP);
	for (zi = 0; zi < sizeof(zones) / sizeof(zones[0]); zi++) {
		struct pg_zone *zone = zones[zi];
		int which;
		PAGE *pg;

		for (which = 0; which < 2; which++) {
			LIST_FOR_EACH(pg, &zone->freelist[which], PAGE, hash_entry) {
				__u32 *words = KERNEL$MAP_PHYSICAL_PAGE(pg);

				if (__unlikely(words[TEST_OFFSET_1] != MAGIC_1) ||
				    __unlikely(words[TEST_OFFSET_2] != MAGIC_2) ||
				    __unlikely(words[TEST_OFFSET_3] != MAGIC_3)) {
					TEST_PAGE_FAILED(pg, words);
				}
				KERNEL$UNMAP_PHYSICAL_BANK(words);
			}
		}
	}
	LOWER_SPLX(saved_spl);
#endif
}
