#include "SWAPPER.H"

/* Counter incremented by WRITEPAGE_ERROR each time a write error ends in
   OUT_OF_PAGED(). SWAPOUT_PAGE stamps the current value into every write
   request it builds, and WRITEPAGE_ERROR compares the stamp with the current
   value when deciding whether to retry. */
static __u64 error_kill_seq = 0;

/* Head of the write request that SWAPOUT_PAGE is still assembling and has not
   submitted yet; NULL when there is none. FLUSH_SWAPRQ_CURRENT submits it. */
static SWAPRQ *swaprq_current = NULL;
/* Swap page index right after the last page placed into the request being
   assembled; SWAP_ALLOC tries this index first. */
static unsigned swaprq_current_pos = 0;

/* AST that runs DELAYED_TIMER; its fn is non-NULL while the AST is pending. */
static AST delayed_timer = { NULL };

static int UNSHARE_PAGES(void);
static unsigned long N_SHARED_PAGES(void);

/*
 * Allocate one page slot in the swap allocation bitmap `swpalloc`.
 *
 * beginning: when non-zero, skip the "continue after the previous slot" and
 *	"find an entirely free word" strategies and go straight to the
 *	first-clear-bit scan that starts at swpalloc_min_bit.
 *
 * Returns the index of the bit that was set. If the bit was found by the
 * first-clear-bit scan, __INT_SGN_BIT is or-ed into the returned value.
 * Returns 0 when that scan reaches the end of the map without finding a
 * clear bit.
 */
static unsigned SWAP_ALLOC(int beginning)
{
	unsigned i;
	unsigned sp;
	swp_map_t m;
	if (__unlikely(beginning)) goto no_wide_space;
	/* Strategy 1: take the slot right after the previous allocation, provided
	   it is inside the swap area and still free. */
	if (swaprq_current_pos != swppages && __likely(!TEST_SWPALLOC_BIT(swaprq_current_pos))) {
		/* When all SWP_RETRY_MASK bits of the position are clear, do not
		   continue sequentially; fall through to a fresh search instead. */
		if (__unlikely(!(swaprq_current_pos & SWP_RETRY_MASK))) goto skip_current;
		SET_SWPALLOC_BIT(swaprq_current_pos);
		return swaprq_current_pos;
	}
	skip_current:
	/* Strategy 2: find the first map word that is entirely zero, starting at
	   swpalloc_min. sp is the number of map words needed for swppages bits. */
	sp = (swppages + SWP_MAP_BITS_MASK) >> SWP_MAP_BITS_L2;
	i = swpalloc_min;
	if (SWP_MAP_BITS == 8) {
		/* strnlen searches for 0x00 byte */
		i += strnlen((char *)swpalloc + i, sp - i);
		if (__unlikely(i == sp)) {
			/* no all-zero word left: remember that and fall back */
			swpalloc_min = sp - 1;
			goto no_wide_space;
		}
	} else while (1) {
		if (__likely(!swpalloc[i])) break;
		i++;
		if (__unlikely(i >= sp)) {
			/* no all-zero word left: remember that and fall back */
			swpalloc_min = sp - 1;
			goto no_wide_space;
		}
	}
	swpalloc_min = i;
	/* Move the allocation point back to just after the last set bit that
	   precedes the free word, so the free tail of the previous word is used. */
	if (sizeof(swp_map_t) <= sizeof(unsigned long)) {
		/* NOTE(review): for i == 0 this indexes swpalloc[i - 1] with a wrapped
		   unsigned index; presumably word 0 is never all-zero (swap_pos 0 is
		   used as "no slot" elsewhere in this file) -- confirm. */
		unsigned long m = swpalloc[i - 1];
		if (__unlikely(!m)) KERNEL$SUICIDE("SWAP_ALLOC: WORD FREE BEFORE SWPALLOC_MIN (AT %u)", i);
		i <<= SWP_MAP_BITS_L2;
		/* presumably __BSR yields the index of the highest set bit -- TODO confirm */
		i -= SWP_MAP_BITS_MASK - __BSR(m);
	} else {
		i <<= SWP_MAP_BITS_L2;
		while (!TEST_SWPALLOC_BIT(i - 1)) i--;
	}
	SET_SWPALLOC_BIT(i);
	return i;
	
	no_wide_space:
	/* Strategy 3: find the first word that is not all ones, starting at
	   swpalloc_min_bit. m receives the complement of that word, i.e. its
	   free bits. */
	sp = (swppages + SWP_MAP_BITS_MASK) >> SWP_MAP_BITS_L2;
	i = swpalloc_min_bit;
	while (__likely(!(m = (swpalloc[i] ^ (swp_map_t)-1)))) {
		i++;
		if (__unlikely(i >= sp)) {
			/* map is completely full */
			swpalloc_min_bit = sp - 1;
			return 0;
		}
	}
	swpalloc_min_bit = i;
	if (sizeof(swp_map_t) <= sizeof(unsigned long)) {
		/* presumably __BSF yields the index of the lowest set bit -- TODO confirm */
		i = (i << SWP_MAP_BITS_L2) + __BSF(m);
	} else {
		i <<= SWP_MAP_BITS_L2;
		/* walk forward to the first clear bit; crossing into the next word
		   contradicts the scan above and is fatal */
		while (TEST_SWPALLOC_BIT(i)) if (__unlikely(!(++i & SWP_MAP_BITS_MASK))) KERNEL$SUICIDE("SWAP_ALLOC: CAN'T ALLOC BIT");
	}
	SET_SWPALLOC_BIT(i);
	/* flag the result so the caller can tell which strategy produced it */
	return i | __INT_SGN_BIT;
}

void SWAPRQ_CTOR(struct __slhead *g, void *o)
{
	SWAPRQ *swaprq = o;
	swaprq->rq.h = swphndl;
	swaprq->desc.v.vspace = &KERNEL$PHYSICAL;
}

static AST_STUB IOPAGE_DONE;
static AST_STUB DELAYED_TIMER;

/*
 * Start reading the swapped-out contents of page node `pn` into a newly
 * allocated page.
 *
 * pn:    page node; pn->swap_pos selects the swap page to read.
 * rq:    request on whose behalf the read is done, or NULL. When a resource
 *        is not available the request is queued on the matching wait queue
 *        (or handed to SWAPOUT) and the function returns without starting
 *        any I/O.
 * owner: process passed to KERNEL$ACQUIRE_IO_TAG.
 * wr:    flag word; only PF_SWAPPER is examined here.
 *
 * On success a one-page BIO_READ request is submitted; its completion is
 * handled by IOPAGE_DONE.
 */
void SWAPIN(PAGENODE *pn, IORQ *rq, PROC *owner, int wr)
{
	PAGE *p;
	SWAPRQ *swaprq;
	WQ *wq;
	/* KERNEL$MAY_ALLOC already in SWAP_CREATE_PAGE */
	if (__unlikely(!(swaprq = __slalloc(&swaprqs)))) {
		/* no request structure available: wait until one is freed */
		if (rq) WQ_WAIT_F(&swaprqs_free, rq);
		return;
	}
	RAISE_SPL(SPL_VSPACE);
	/* a non-NULL result is a wait queue to sleep on instead of a granted tag */
	if (__unlikely((wq = KERNEL$ACQUIRE_IO_TAG(&swaprq->tag, owner)) != NULL)) {
		if (rq) WQ_WAIT_F(wq, rq);
		LOWER_SPL(SPL_FS);
		__slow_slfree(swaprq);
		WQ_WAKE_ALL(&swaprqs_free);
		return;
	}
	LOWER_SPL(SPL_FS);
	p = PAGEZONE_ALLOC(&pagezone, KERNEL$ALLOC_IO_PAGE, EMERGENCY_RESERVE_PAGES);
	if (__unlikely(!p)) {
		/* callers flagged PF_SWAPPER get a second attempt with different
		   allocation parameters */
		if (__unlikely(wr & PF_SWAPPER)) {
			p = PAGEZONE_ALLOC(&pagezone, KERNEL$ALLOC_USER_PAGE, 0);
			if (p) goto got_page;
		}
		/* no page: undo the tag and the request, then try to swap something
		   out on behalf of rq */
		RAISE_SPL(SPL_VSPACE);
		KERNEL$RELEASE_IO_TAG(&swaprq->tag);
		LOWER_SPL(SPL_FS);
		__slow_slfree(swaprq);
		WQ_WAKE_ALL(&swaprqs_free);
		SWAPOUT(rq);
		return;
	}
	got_page:
	SWAP_LOCK(SW_LOCK);
	setup_node_page(p, pn, swaprq->tag.proc);
	RAISE_SPL(SPL_VSPACE);
	/* the page stays busy until IOPAGE_DONE or READPAGE_ERROR clears the flag */
	p->flags |= PAGE_BUSY_1;
	LOWER_SPL(SPL_FS);
	SWAP_LOCK(SW_UNLOCK);
	/* single-element request chain describing exactly one page */
	swaprq->page = p;
	swaprq->next = NULL;
	swaprq->last = swaprq;
	swaprq->desc.v.ptr = KERNEL$PAGE_2_PHYS(p);
	swaprq->desc.v.len = PAGE_CLUSTER_SIZE;
	swaprq->desc.next = NULL;
	swaprq->rq.fn = IOPAGE_DONE;
	swaprq->rq.sec = (__sec_t)pn->swap_pos * SECTORS_PER_PAGE;
	swaprq->rq.nsec = SECTORS_PER_PAGE;
	swaprq->rq.flags = BIO_READ;
	swaprq->rq.desc = &swaprq->desc;
	swaprq->rq.proc = swaprq->tag.proc;
	swaprq->rq.fault_sec = -1;
	/* iorq is NULL or a real request for reads; the value 1 is reserved for
	   writes built by SWAPOUT_PAGE */
	swaprq->iorq = rq;
	if (__unlikely(!rq)) CALL_IORQ(&swaprq->rq, KERNEL$BIO);
	else CALL_IORQ_CANCELABLE(&swaprq->rq, KERNEL$BIO, rq);
}

/*
 * Submit the write request that is currently being assembled, if there is
 * one, and forget it so that the next swapped-out page starts a new request.
 */
static __finline__ void FLUSH_SWAPRQ_CURRENT(void)
{
	if (!swaprq_current)
		return;

	CALL_IORQ(&swaprq_current->rq, KERNEL$BIO);
	swaprq_current = NULL;
}

/*
 * Try to write page `p` to swap (or simply drop it when it already has a
 * swap slot).
 *
 * p:     the page to swap out.
 * rq:    request to queue on a wait queue / re-post, or NULL.
 * owner: process recorded in the write request and passed to
 *        KERNEL$ACQUIRE_WRITEBACK_TAG when a new request is started.
 *
 * Return values produced by this function:
 *	0 --- the page was freed immediately, or (rq == NULL only) the
 *	      out-of-memory path was taken; see the comment at return_retry
 *	1 --- rq (if any) was queued or posted and/or the page was put on the
 *	      wantfree list or had I/O started
 *	2 --- no swap slot and KERNEL$VM_OOMKILL() returned 0; rq (if any)
 *	      waits on KERNEL$FREEMEM_WAIT
 */
/* need_free:
	0 --- try to swap out
	1 --- must swap out (add to wantfree)
	2 --- must swap out & allocate from the beginning
*/
int SWAPOUT_PAGE(PAGE *p, IORQ *rq, int need_free, PROC *owner)
{
	WQ *wq;
	PAGENODE *pn;
	unsigned al;
	SWAPRQ *swaprq;
	retry:
	/* no swap space configured at all */
	if (__unlikely(!swppages)) goto no_swap;
	pn = p->fnode;
	/* the fnode pointer carries the PAGE_FNODE_FREED flag in its low bits */
	if (__unlikely((unsigned)(unsigned long)pn & PAGE_FNODE_FREED)) {
		if (__likely(wantfree_active)) {
			if (rq) WQ_WAIT_F(&wantfree_done, rq);
			return 1;
		}
		/* also reached from the end of the function when need_free != 0:
		   put the page on the wantfree list, waiting on the page itself */
		w_p_wait:
		RAISE_SPL(SPL_VSPACE);
		wq = &p->wait;
		goto wwq;
	}
	/* The node already has a swap slot: no write is issued, the page is only
	   unmapped and freed. */
	if (__unlikely(pn->swap_pos)) {
		/* NOTE(review): the LOWER_SPL(SPL_FS) calls below and after
		   KERNEL$VM_UNSET_WRITEABLE imply that these helpers return with the
		   SPL raised -- confirm against their definitions. */
		if (__unlikely((wq = KERNEL$VM_UNMAP_PAGE(p)) != NULL)) {
			/* common "cannot proceed now" exit: entered with wq set */
			wwq:
			SWAPPAGE_ADD_TO_WANTFREE(p, wq);
			if (rq) WQ_WAIT_F(wq, rq);
			LOWER_SPL(SPL_FS);
			return 1;
		}
		pn->page = NULL;
		LOWER_SPL(SPL_FS);
		free_page(p, &KERNEL$PROC_KERNEL);
		return 0;
	}
	if (__unlikely((wq = KERNEL$VM_UNSET_WRITEABLE(p)) != NULL)) goto wwq;
	if (__unlikely(!(swaprq = __slalloc(&swaprqs)))) {
		wq = &swaprqs_free;
		goto wwq;
	}
	/* the page stays busy until IOPAGE_DONE or WRITEPAGE_ERROR clears the flag */
	p->flags |= PAGE_BUSY_1;
	LOWER_SPL(SPL_FS);
	swaprq->page = p;
	swaprq->next = NULL;
	swaprq->last = swaprq;
	swaprq->desc.v.ptr = KERNEL$PAGE_2_PHYS(p);
	swaprq->desc.v.len = PAGE_CLUSTER_SIZE;
	swaprq->desc.next = NULL;
	/* iorq == 1 marks a write request; IOPAGE_DONE and IO_ERROR test for it */
	swaprq->iorq = (void *)1;
	al = SWAP_ALLOC(need_free == 2);
	if (__unlikely(!al)) {
		/* swap is full: undo the preparation above */
		unsigned long oom;
		RAISE_SPL(SPL_VSPACE);
		p->flags &= ~PAGE_BUSY_1;
		LOWER_SPL(SPL_FS);
		__slow_slfree(swaprq);
		WQ_WAKE_ALL(&swaprqs_free);

		/* also the target when there is no swap space at all */
		no_swap:
		if (__unlikely(wantfree_active)) {
			if (rq) WQ_WAIT_F(&wantfree_done, rq);
			return 1;
		}
		if (__unlikely((oom = KERNEL$VM_OOMKILL()) != 0)) {
			if (TEST_PRUNE_LDCACHE()) {
/* we don't know if we will free the page or not, so rather reinsert it */
				SWAPPAGE_OUT_OF_WANTFREE(p);
				PRUNE_LDCACHE();
				goto return_retry;
			}
			/* release swap slots of pages that are also resident and start
			   over, if enough such pages exist */
			if (swppages && oom < N_SHARED_PAGES() && __likely(UNSHARE_PAGES())) {
				goto retry;
			}
#if __DEBUG_USER_ERRORS
			__critical_printf("OOM: KILLING.\n");
#endif
/* we don't know if we will free the page or not, so rather reinsert it */
			SWAPPAGE_OUT_OF_WANTFREE(p);
			OUT_OF_PAGED(&root->pageq);
			return_retry:
/* This is tricky. The return code has two meanings:
	1. if rq != NULL, then return != 0 means that rq was posted
	2. if rq == NULL (cache upcall or SWP_PROCESS_WANTFREE), return != 0
	   means that the page remains on the list
   This trick should be resolved in a better way when this is rewritten
*/
			if (rq) {
				CALL_IORQ_LSTAT_EXPR(rq, (IO_STUB *)rq->tmp1);
				return 1;
			} else {
				return 0;
			}
		}
		if (rq) WQ_WAIT_F(&KERNEL$FREEMEM_WAIT, rq);
		return 2;
	}
	/* non-zero when SWAP_ALLOC flagged the slot with __INT_SGN_BIT */
	swaprq->allocated_from_beginning = al / __INT_SGN_BIT;
	swaprq->error_kill_seq = error_kill_seq;
	al &= ~__INT_SGN_BIT;
	RAISE_SPL(SPL_VSPACE);
	pn->swap_pos = al;
	LOWER_SPL(SPL_FS);
	/* The slot directly follows the request being assembled and that request
	   still has room: append this page to it instead of starting a new one. */
	if (__likely(al == swaprq_current_pos) && __likely(swaprq_current != NULL) && __likely(swaprq_current->rq.nsec + SECTORS_PER_PAGE <= optimal_sectors)) {
		swaprq_current->rq.nsec += SECTORS_PER_PAGE;
		swaprq_current_pos++;
		swaprq_current->last->next = swaprq;
		swaprq_current->last->desc.next = &swaprq->desc;
		swaprq_current->last = swaprq;
		goto ret;
	}
	/* Otherwise submit whatever was being assembled and make this page the
	   head of a new write request. */
	FLUSH_SWAPRQ_CURRENT();
	KERNEL$ACQUIRE_WRITEBACK_TAG(owner);
	swaprq_current_pos = al + 1;
	swaprq->rq.fn = IOPAGE_DONE;
	swaprq->rq.sec = (__sec_t)al * SECTORS_PER_PAGE;
	swaprq->rq.nsec = SECTORS_PER_PAGE;
	swaprq->rq.flags = BIO_WRITE;
	swaprq->rq.desc = &swaprq->desc;
	swaprq->tag.proc = owner;
	swaprq->rq.proc = owner;
	swaprq->rq.fault_sec = -1;
	swaprq_current = swaprq;
	/* schedule DELAYED_TIMER (once) so the new request gets submitted even if
	   no further page is appended */
	if (__likely(!delayed_timer.fn)) {
		delayed_timer.fn = DELAYED_TIMER;
		CALL_AST(&delayed_timer);
	}
	ret:
	if (!need_free) {
		if (rq) WQ_WAIT_F(&p->wait, rq);
		return 1;
	}
	goto w_p_wait;
}

/*
 * AST handler scheduled by SWAPOUT_PAGE through `delayed_timer`.
 * Clears delayed_timer.fn, so that SWAPOUT_PAGE may schedule it again, and
 * submits the write request that is being assembled (if any).
 */
static DECL_AST(DELAYED_TIMER, SPL_SHELL, AST)
{
	/* swaprq_current is manipulated at SPL_FS elsewhere in this file */
	RAISE_SPL(SPL_FS);
	delayed_timer.fn = NULL;
	FLUSH_SWAPRQ_CURRENT();
	RETURN;
}

static void IO_ERROR(SWAPRQ *swaprq);
static void READPAGE_ERROR(SWAPRQ *swaprq);
static void WRITEPAGE_ERROR(SWAPRQ *swaprq);

/*
 * Completion handler for the block requests issued by SWAPIN (reads) and
 * SWAPOUT_PAGE (writes).
 *
 * The two kinds are told apart by swaprq->iorq: the value 1 marks a write,
 * anything else (NULL or a real request pointer) marks a read.
 * On error the request is passed to IO_ERROR. On success every page of the
 * request chain has PAGE_BUSY_1 cleared and its waiters woken, the
 * originating request (reads only) is re-posted, and the SWAPRQ structures
 * are freed.
 *
 * NOTE(review): `RQ` is not declared in this block; presumably DECL_AST
 * provides it as the completed request -- confirm.
 */
static DECL_AST(IOPAGE_DONE, SPL_FS, BIORQ)
{
	PAGE *p;
	SWAPRQ *swaprq = GET_STRUCT(RQ, SWAPRQ, rq), *s2;
	if ((unsigned long)swaprq->iorq != 1) {
		/* read: detach from the originating request (if there is one) and
		   give back the I/O tag taken in SWAPIN */
		if ((unsigned long)swaprq->iorq > 1) {
			IO_DISABLE_CHAIN_CANCEL(SPL_X(SPL_FS), swaprq->iorq);
		}
		RAISE_SPL(SPL_VSPACE);
		KERNEL$RELEASE_IO_TAG(&swaprq->tag);
		LOWER_SPL(SPL_FS);
	} else {
		/* write: give back the writeback tag taken in SWAPOUT_PAGE */
		KERNEL$RELEASE_WRITEBACK_TAG(swaprq->tag.proc);
	}
	if (__unlikely(swaprq->rq.status < 0)) {
		IO_ERROR(swaprq);
		RETURN;
	}
	/* walk the chain of pages that were merged into this request */
	do {
		p = swaprq->page;
		RAISE_SPL(SPL_VSPACE);
#if __DEBUG >= 1
		if (__unlikely(!(p->flags & PAGE_BUSY_1)))
			KERNEL$SUICIDE("IOPAGE_DONE: IO ON NON-BUSY PAGE, FLAGS %X", p->flags);
#endif
		p->flags &= ~PAGE_BUSY_1;
		WQ_WAKE_ALL(&p->wait);
		LOWER_SPL(SPL_FS);
		if ((unsigned long)swaprq->iorq > 1) {
			/* re-post the request that asked for this page */
			CALL_IORQ_LSTAT_EXPR(swaprq->iorq, (IO_STUB *)swaprq->iorq->tmp1);
#if __DEBUG >= 1
			/* reads are always single-page, so a chain here is a bug */
			if (__unlikely(swaprq->next != NULL))
				KERNEL$SUICIDE("IOPAGE_DONE: MULTI-PAGE IO WITH RQ");
#endif
		}
		/* fetch the successor before the structure is freed */
		s2 = swaprq->next;
		__slfree(swaprq);
		swaprq = s2;
	} while (swaprq);
	WQ_WAKE_ALL(&swaprqs_free);
	RETURN;
}

/*
 * Dispatch a failed swap request to the matching error handler.
 * A request whose iorq field holds the value 1 is a write (that is what
 * SWAPOUT_PAGE stores there); every other value belongs to a read.
 */
static void IO_ERROR(SWAPRQ *swaprq)
{
	if ((unsigned long)swaprq->iorq == 1) {
		WRITEPAGE_ERROR(swaprq);
		return;
	}
	READPAGE_ERROR(swaprq);
}

/*
 * Handle a failed swap-in.
 *
 * The error status is copied to the originating request (when there is one)
 * and that request is completed via CALL_AST. The target page loses its
 * PAGE_BUSY_1 flag; unless its fnode pointer is flagged PAGE_FNODE_FREED it
 * is detached from the page node and freed, otherwise only the waiters on
 * the page are woken. Finally the SWAPRQ is released.
 *
 * A read request never has chained successors; finding one is fatal.
 */
static void READPAGE_ERROR(SWAPRQ *swaprq)
{
	PAGE *page;
	PAGENODE *node;

	if (__unlikely(swaprq->next != NULL)) {
		KERNEL$SUICIDE("READPAGE_ERROR: MULTI-PAGE IO WITH RQ");
	}

	/* propagate the failure to whoever asked for the page */
	if (__likely(swaprq->iorq != NULL)) {
		swaprq->iorq->status = swaprq->rq.status;
		CALL_AST(swaprq->iorq);
	}

	page = swaprq->page;
	node = page->fnode;

	RAISE_SPL(SPL_VSPACE);
	page->flags &= ~PAGE_BUSY_1;
	if (__unlikely((unsigned)(unsigned long)node & PAGE_FNODE_FREED)) {
		/* node already gone: just let the waiters proceed */
		LOWER_SPL(SPL_FS);
		WQ_WAKE_ALL(&page->wait);
	} else {
		/* drop the page that never received valid data */
		node->page = NULL;
		LOWER_SPL(SPL_FS);
		free_page(page, &KERNEL$PROC_KERNEL);
	}

	__slow_slfree(swaprq);
	WQ_WAKE_ALL(&swaprqs_free);
}

/*
 * Handle a failed swap-out request (possibly a chain of merged pages).
 *
 * The pages are retried (with SWAPOUT_PAGE and need_free == 2) when the
 * head request was not flagged allocated_from_beginning, or when
 * error_kill_seq has changed since the request was built. Otherwise
 * KERNEL$VM_OOMKILL() is consulted: on a non-zero result either swap slots
 * are reclaimed with UNSHARE_PAGES() (only for -ENOSPC / -ERANGE) and the
 * pages are retried, or OUT_OF_PAGED() is called and error_kill_seq is
 * incremented.
 *
 * In every case each page has PAGE_BUSY_1 cleared, its waiters woken, its
 * swap slot released (unless the fnode pointer is flagged
 * PAGE_FNODE_FREED), and the SWAPRQ structures are freed.
 */
static void WRITEPAGE_ERROR(SWAPRQ *swaprq)
{
	SWAPRQ *s2;
	PAGE *p;
	PAGENODE *pn;
	/* only the head of the chain is examined for these two fields */
	int retry = !swaprq->allocated_from_beginning || swaprq->error_kill_seq != error_kill_seq;
	if (__unlikely(!retry)) {
		unsigned long oom;
		if (__unlikely((oom = KERNEL$VM_OOMKILL()) != 0)) {
			if ((swaprq->rq.status == -ENOSPC || swaprq->rq.status == -ERANGE) && oom < N_SHARED_PAGES() && __likely(UNSHARE_PAGES())) {
				/* slots were reclaimed, so another attempt may succeed */
				retry = 1;
				goto no_oom;
			}
#if __DEBUG_USER_ERRORS
			__critical_printf("premature OOM due to error %s.\n", strerror(-swaprq->rq.status));
#endif
			OUT_OF_PAGED(&root->pageq);
			/* requests built before this point carry the old value and are
			   therefore retried when they fail */
			error_kill_seq++;
		}
		no_oom:;
	}
	do {
		p = swaprq->page;
		pn = p->fnode;
		RAISE_SPL(SPL_VSPACE);
		if (__unlikely(!(p->flags & PAGE_BUSY_1)))
			KERNEL$SUICIDE("WRITEPAGE_ERROR: IO ON NON-BUSY PAGE, FLAGS %X", p->flags);
		p->flags &= ~PAGE_BUSY_1;
		WQ_WAKE_ALL(&p->wait);
		if (__likely(!((unsigned)(unsigned long)pn & PAGE_FNODE_FREED))) {
			/* the data never reached the slot: give the slot back */
			CLEAR_SWPALLOC_BIT(pn->swap_pos);
			pn->swap_pos = 0;
			LOWER_SPL(SPL_FS);
			if (retry) {
				SWAPOUT_PAGE(p, NULL, 2, &KERNEL$PROC_KERNEL);
			} else {
				SWAPPAGE_OUT_OF_WANTFREE(p);
			}
		}
		/* needed for the PAGE_FNODE_FREED case, where the SPL is still raised;
		   on the other path SPL_FS was already requested above */
		LOWER_SPL(SPL_FS);
		/* fetch the successor before the structure is freed */
		s2 = swaprq->next;
		__slow_slfree(swaprq);
		swaprq = s2;
	} while (swaprq);
	WQ_WAKE_ALL(&swaprqs_free);
}

/* Number of pages SWAPOUT tries to push out per call. */
#define SWAPOUT_CLUSTER	1

/*
 * Pick the page at all_pages.prev, re-insert it into all_pages with
 * ADD_TO_LIST and try to swap it out (need_free == 0).
 *
 * rq: request to hand to SWAPOUT_PAGE, or NULL. When SWAPOUT_PAGE returns 0
 *     and rq is non-NULL, rq is re-posted with CALL_IORQ_LSTAT_EXPR. With a
 *     cluster size other than 1, only the first page is processed on behalf
 *     of rq.
 *
 * An empty all_pages list on entry is fatal.
 */
void SWAPOUT(IORQ *rq)
{
	PAGE *victim;
#if SWAPOUT_CLUSTER != 1
	int round;
#endif

	if (__unlikely(LIST_EMPTY(&all_pages))) {
		KERNEL$SUICIDE("SWAPOUT: NO PAGES");
	}

#if SWAPOUT_CLUSTER != 1
	for (round = 0; round < SWAPOUT_CLUSTER; round++) {
		if (__unlikely(LIST_EMPTY(&all_pages))) break;
#endif
		victim = LIST_STRUCT(all_pages.prev, PAGE, node_entry);
		/* move the chosen page to the insertion end of the list */
		DEL_FROM_LIST(&victim->node_entry);
		ADD_TO_LIST(&all_pages, &victim->node_entry);
		if (!SWAPOUT_PAGE(victim, rq, 0, &KERNEL$PROC_KERNEL) && rq) {
			CALL_IORQ_LSTAT_EXPR(rq, (IO_STUB *)rq->tmp1);
		}
#if SWAPOUT_CLUSTER != 1
		rq = NULL;
	}
#endif
}

/*
 * Release the swap slot of every page on all_pages that is not busy, whose
 * fnode pointer is not flagged PAGE_FNODE_FREED and whose node has a
 * non-zero swap_pos: the slot's bitmap bit is cleared and swap_pos is reset
 * to 0.
 *
 * Returns 1 when at least one slot was released, 0 otherwise.
 */
static int UNSHARE_PAGES(void)
{
	PAGE *page;
	PAGENODE *node;
	int released = 0;

	LIST_FOR_EACH(page, &all_pages, PAGE, node_entry) {
		if (__likely(!(page->flags & PAGE_BUSY))) {
			node = page->fnode;
			if (__likely(!((unsigned)(unsigned long)node & PAGE_FNODE_FREED))) {
				if (__unlikely(node->swap_pos)) {
					CLEAR_SWPALLOC_BIT(node->swap_pos);
					node->swap_pos = 0;
					released = 1;
				}
			}
		}
	}
	return released;
}

/*
 * Count the pages on all_pages that are not busy, whose fnode pointer is
 * not flagged PAGE_FNODE_FREED and whose node has a non-zero swap_pos,
 * i.e. exactly the pages UNSHARE_PAGES would act on.
 */
static unsigned long N_SHARED_PAGES(void)
{
	PAGE *page;
	PAGENODE *node;
	unsigned long count = 0;

	LIST_FOR_EACH(page, &all_pages, PAGE, node_entry) {
		if (__likely(!(page->flags & PAGE_BUSY))) {
			node = page->fnode;
			if (__likely(!((unsigned)(unsigned long)node & PAGE_FNODE_FREED))) {
				if (__unlikely(node->swap_pos)) {
					count++;
				}
			}
		}
	}
	return count;
}
