#include <SPAD/AC.H>
#include <SPAD/SYNC.H>
#include <SPAD/LIST.H>
#include <SPAD/ALLOC.H>
#include <SPAD/VM.H>
#include <SPAD/DEV_KRNL.H>
#include <SPAD/SLAB.H>
#include <SPAD/IOCTL.H>
#include <SPAD/TIMER.H>
#include <ARCH/SETUP.H>
#include <STDLIB.H>
#include <SYS/TIME.H>
#include <SPAD/LIBPROC.H>
#include <SPAD/QUOTA.H>

#include <SPAD/PKT.H>
#include <SPAD/SOCKET.H>
#include <SYS/SOCKET.H>
#include "NET.H"

#define CHILDHASH_SIZE		64	/* must be <= 256 */
#define HASH_MEMORY_SHIFT	10

/*
 * One node of a socket space's namespace tree.  Children are created on
 * demand by SOCKET_SUBNODE_LOOKUP / SOCKET_SUBNODE_CREATE and are named by a
 * job name of up to 8 characters packed into a __u64 (see str2jobname).
 */
struct __socket_node {
	QUOTA q;				/* quota accounting; the q_isroot/q_parent macros map it back to the node */
	SOCKET_NODE *parent;			/* NULL only for the root node embedded in SOCKET_SPACE */
	__u64 jobname;				/* packed name, filled in by str2jobname(); 0 for the root */
	LIST_HEAD sockets;			/* SOCKETs owned by this node, linked through SOCKET.node_entry */
	XLIST_HEAD handles;			/* HANDLEs instantiated on this node, linked through fnode_entry */
	SOCKET_NODE *fork_leader;		/* node on which sockets are actually looked up/created (self by default) */
	int depth;				/* 0 for the root, parent depth + 1 otherwise; bounded by MAX_PROC_DEPTH */
	SOCKET_SPACE *ss;			/* owning socket space */
	char acc[4];				/* 4-byte counter used to name sockets in NET$SOCKET_CREATE_SIBLING */
	NETACL *netacl;				/* network ACL attached to the node, or NULL */
	LIST_ENTRY childhash_entry;		/* link in the parent's childhash bucket */
	XLIST_HEAD childhash[CHILDHASH_SIZE];	/* child nodes, bucketed by the hash returned by str2jobname() */
};

/*
 * One registered protocol family ("socket space").  Created by
 * NET$CREATE_SOCKET_SPACE and torn down by SOCKET_UNLOAD.
 */
struct __socket_space {
	__const__ SOCKET_SPACE_OPERATIONS *op;	/* protocol callbacks supplied by the caller of NET$CREATE_SOCKET_SPACE */
	SOCKET_NODE root;			/* root of the node tree (parent == NULL, depth == 0) */
	void *lnte;				/* devrq.lnte from KERNEL$REGISTER_DEVICE; passed to KERNEL$DEVICE_UNLOAD */
	void *dlrq;				/* KERNEL$TSR_IMAGE() value; handed back through *release on unload */
	struct __slhead socket;			/* slab of SOCKET objects of size op->sizeof_SOCKET */
	LIST_ENTRY list;			/* link in the global socket_spaces list */
	char name[9];				/* "NET$" followed by the 4 bytes of op->af, NUL-terminated */
};

/* All registered socket spaces (linked through SOCKET_SPACE.list). */
static DECL_XLIST(socket_spaces);

/*
 * Global socket hash table shared by all spaces.  Allocated lazily by the
 * first NET$CREATE_SOCKET_SPACE call; socket_hash_mask is (bucket count - 1).
 */
static XLIST_HEAD *socket_hash = NULL;
unsigned socket_hash_mask = 0;

/* Bucket index for socket number n owned by the given (fork-leader) node. */
#define HASH_SOCKET(node, n)	(((unsigned long)(node) + ((unsigned long)(node) << 10) + (n)) & socket_hash_mask)

/*
 * Adapters passed to the QALLOC/QFREE/QZAP quota macros; each takes a QUOTA
 * pointer that is embedded in a SOCKET_NODE as member q.
 *   q_isroot(x)            - true when the node has no parent
 *   q_parent(x)            - QUOTA of the parent node
 *   q_for_all_subnodes     - opens a block iterating v over the QUOTA of every
 *                            direct child of p (walks all childhash buckets)
 *   q_for_all_subnodes_tail- closes the block opened by q_for_all_subnodes
 */
#define q_isroot(x)	(!LIST_STRUCT((x), SOCKET_NODE, q)->parent)
#define q_parent(x)	(&LIST_STRUCT((x), SOCKET_NODE, q)->parent->q)
#define q_for_all_subnodes(v, p)	{ unsigned i; for (i = 0; i < CHILDHASH_SIZE; i++) for (v = &LIST_STRUCT(LIST_STRUCT(p, SOCKET_NODE, q)->childhash[i].next, SOCKET_NODE, childhash_entry)->q; v != &LIST_STRUCT(&KERNEL$LIST_END, SOCKET_NODE, childhash_entry)->q; v = &LIST_STRUCT(LIST_STRUCT(v, SOCKET_NODE, q)->childhash_entry.next, SOCKET_NODE, childhash_entry)->q)
#define q_for_all_subnodes_tail(v, p)	}

/*
 * Constructor for SOCKET_NODE objects (registered with the socket_node slab
 * in SOCKET_INIT and also called directly for a space's root node).
 * Empties all lists, sets every byte of the acc counter to 127 and clears
 * the ACL pointer.  The first argument is unused.
 */
static void socket_node_ctor(void *g, void *sn_)
{
	SOCKET_NODE *node = sn_;
	unsigned bucket;
	unsigned k;

	INIT_LIST(&node->sockets);
	INIT_XLIST(&node->handles);
	for (bucket = 0; bucket < CHILDHASH_SIZE; bucket++) {
		INIT_XLIST(&node->childhash[bucket]);
	}
	for (k = 0; k < sizeof node->acc; k++) {
		node->acc[k] = 127;
	}
	node->netacl = NULL;
}

/*
 * Constructor for SOCKET objects of one socket space (registered with the
 * per-space slab in NET$CREATE_SOCKET_SPACE).  ss_ is the owning
 * SOCKET_SPACE; the protocol's own ctor_SOCKET hook runs last.
 */
static void socket_ctor(void *ss_, void *s_)
{
	SOCKET_SPACE *space = ss_;
	SOCKET *sock = s_;

	sock->space = space;
	INIT_XLIST(&sock->handles);
	space->op->ctor_SOCKET(sock);
}

/* Defined near the end of the file; needed by the quota-failure paths below. */
static int NET_ZAP_FROM_NODE(QUOTA *zap, SOCKET_NODE *current_node, SOCKET *current_socket);

/* Handle operations for socket nodes; wired into SOCKET_NODE_OPERATIONS below. */
static void *SOCKET_NODE_CLONE(HANDLE *hp, HANDLE *h, int open_flags);
static void *SOCKET_NODE_LOOKUP(HANDLE *h, char *str, int open_flags);
static void *SOCKET_NODE_CREATE(HANDLE *h, char *str, int open_flags);
static void *SOCKET_NODE_DELETE(HANDLE *h, IORQ *rq, int open_flags, HANDLE *hp);
static void *SOCKET_NODE_INSTANTIATE(HANDLE *h, IORQ *rq, int open_flags);
static void SOCKET_NODE_DETACH(HANDLE *h);
static void SOCKET_NODE_LEAVE(HANDLE *h);
extern IO_STUB SOCKET_NODE_IOCTL;

/*
 * Operation table installed on every handle that refers to a socket node
 * (see SOCKET_NODE_CLONE and socket_init_root).  The initializer is
 * positional, so entries must stay in HANDLE_OPERATIONS member order.
 * Nodes provide no address-space operations and no data I/O; only the
 * namespace operations and ioctl are implemented.
 */
static __const__ HANDLE_OPERATIONS SOCKET_NODE_OPERATIONS = {
	SPL_X(SPL_NET),
	KERNEL$NO_VSPACE_GET,
	KERNEL$NO_VSPACE_PUT,
	KERNEL$NO_VSPACE_MAP,
	KERNEL$NO_VSPACE_DMALOCK,
	KERNEL$NO_VSPACE_DMA64LOCK,
	KERNEL$NO_VSPACE_PHYSLOCK,
	KERNEL$NO_VSPACE_GET_PAGEIN_RQ,
	KERNEL$NO_VSPACE_GET_PAGE,
	KERNEL$NO_VSPACE_SWAP_OP,
	SOCKET_NODE_CLONE,	/* clone */
	SOCKET_NODE_LOOKUP,	/* lookup */
	SOCKET_NODE_CREATE,	/* create */
	SOCKET_NODE_DELETE,	/* delete */
	NULL,			/* rename */
	NULL,			/* lookup_io */
	SOCKET_NODE_INSTANTIATE,/* instantiate */
	SOCKET_NODE_LEAVE,	/* leave */
	SOCKET_NODE_DETACH,	/* detach */
	NULL,			/* open */
	NULL,			/* close */
	KERNEL$NO_OPERATION,	/* read */
	KERNEL$NO_OPERATION,	/* write */
	KERNEL$NO_OPERATION,	/* aread */
	KERNEL$NO_OPERATION,	/* awrite */
	SOCKET_NODE_IOCTL,	/* ioctl */
	KERNEL$NO_OPERATION,	/* bio */
	KERNEL$NO_OPERATION,	/* pktio */
};

/* Slab of SOCKET_NODE objects; set up in SOCKET_INIT, destroyed in SOCKET_DONE. */
static struct __slhead socket_node;

/*
 * Release ACL n through the free_netacl hook of sn's socket space.
 * A NULL n is ignored.
 */
static __finline__ void FREE_NETACL(SOCKET_NODE *sn, NETACL *n)
{
	if (__likely(n == NULL)) return;
	sn->ss->op->free_netacl(n);
}

/*
 * Like FREE_NETACL, but takes the address of the ACL pointer and resets it
 * to NULL after the ACL has been released.  Does nothing when *n is NULL.
 */
static __finline__ void FREE_NETACL_ZERO(SOCKET_NODE *sn, NETACL **n)
{
	if (__likely(*n == NULL)) return;
	sn->ss->op->free_netacl(*n);
	*n = NULL;
}

/* Per-handle NETACL pointer; stored by reinterpreting the handle's flags2 field (lvalue). */
#define H_NETACL(h)	(*(NETACL **)(void *)&(h)->flags2)

/*
 * Pack a job name of 1..8 characters into *jobname and return its child-hash
 * bucket.
 *
 * The characters of str are stored, in order, into the bytes of *jobname
 * (unused trailing bytes stay zero).  The bucket is the 8-bit sum of the
 * characters masked with CHILDHASH_SIZE - 1.
 *
 * Returns the bucket index (0 .. CHILDHASH_SIZE - 1), or -1 when str is
 * empty or longer than 8 characters.  *jobname is modified even on failure.
 */
static __finline__ int str2jobname(char *str, __u64 *jobname)
{
	char *out = (char *)jobname;
	unsigned char sum = 0;
	unsigned pos;

	*jobname = 0;
	for (pos = 0; pos < sizeof(__u64); pos++) {
		char ch = str[pos];
		if (!ch) {
			/* an empty name is rejected; a short one is complete here */
			if (__unlikely(!pos)) return -1;
			goto done;
		}
		sum += ch;
		out[pos] = ch;
	}
	/* all 8 bytes used: the string must end right here */
	if (__unlikely(str[sizeof(__u64)])) return -1;
	done:
	sum &= CHILDHASH_SIZE - 1;
	return sum;
}

/*
 * Recompute the child-hash bucket of an already packed job name.
 *
 * The result must be exactly the value str2jobname() returned when the node
 * was inserted into its parent's childhash[], i.e. the sum of the eight name
 * bytes masked with CHILDHASH_SIZE - 1; NET$FOR_ALL_SOCKETS uses it to find
 * the bucket it has to continue from.
 *
 * The bytes are summed individually.  Folding the 64-bit value with
 * shift-and-add is NOT equivalent: a carry out of one byte lane leaks into
 * the next one (e.g. the name "PPPPPPPP" folds to 1 while the byte sum
 * gives 0), which would select the wrong bucket.
 */
static __finline__ int jobnamehash(__u64 *jobname)
{
	unsigned char *bytes = (unsigned char *)jobname;
	unsigned sum = 0;
	unsigned k;

	for (k = 0; k < sizeof(__u64); k++) sum += bytes[k];
	return sum & (CHILDHASH_SIZE - 1);
}

static void *SOCKET_SUBNODE_CREATE(HANDLE *h, char *str, int open_flags);

/*
 * Descend from the node in h->fnode into the child named str.
 *
 * On a hit h->fnode is switched to the child and NULL is returned.  On a
 * miss the result depends on open_flags: with _O_DELETE set -ESRCH is
 * returned, with only O_TRUNC set -ENOENT, and otherwise the child is
 * created via SOCKET_SUBNODE_CREATE (whose result is returned).
 * Names that str2jobname rejects yield -ENAMETOOLONG.
 */
static void *SOCKET_SUBNODE_LOOKUP(HANDLE *h, char *str, int open_flags)
{
	SOCKET_NODE *dir = h->fnode;
	SOCKET_NODE *child;
	__u64 packed;
	int bucket;

	bucket = str2jobname(str, &packed);
	if (__unlikely(bucket < 0)) return __ERR_PTR(-ENAMETOOLONG);

	XLIST_FOR_EACH(child, &dir->childhash[bucket], SOCKET_NODE, childhash_entry) {
		if (__likely(child->jobname == packed)) {
			h->fnode = child;
			return NULL;
		}
	}

	if (__likely(open_flags & (_O_DELETE | O_TRUNC))) {
		if (open_flags & _O_DELETE) return __ERR_PTR(-ESRCH);
		return __ERR_PTR(-ENOENT);
	}
	return SOCKET_SUBNODE_CREATE(h, str, O_CREAT);
}

/*
 * clone operation: initialise handle h from parent handle hp.
 *
 * h starts at the root of hp's socket space and is walked down along the
 * components returned by KERNEL$PROC_PATH for h->name_addrspace.  A
 * component flagged PROC_PATH_FORKED whose node holds no sockets inherits
 * the fork leader of its parent.  If the walk passed through a direct child
 * of hp's node and that child's ACL differs from the ACL carried by hp, the
 * ACL is copied onto the child with the copy_netacl hook.
 *
 * Returns NULL on success, otherwise whatever the failing lookup or
 * copy_netacl returned.
 */
static void *SOCKET_NODE_CLONE(HANDLE *hp, HANDLE *h, int open_flags)
{
	SOCKET_SPACE *space = ((SOCKET_NODE *)hp->fnode)->ss;
	SOCKET_NODE *acl_target = NULL;
	unsigned depth = 0;
	char *component;

	h->flags = hp->flags;
	H_NETACL(h) = NULL;
	h->op = &SOCKET_NODE_OPERATIONS;
	h->fnode = &space->root;

	for (;;) {
		SOCKET_NODE *cur;
		void *err;

		component = KERNEL$PROC_PATH(h->name_addrspace, &depth);
		if (!component) break;

		err = SOCKET_SUBNODE_LOOKUP(h, component, open_flags);
		if (__unlikely(err != NULL)) return err;

		cur = h->fnode;
		/* a non-empty socket list here shouldn't happen unless userspace misbehaves */
		if ((depth & PROC_PATH_FORKED) && __likely(LIST_EMPTY(&cur->sockets))) {
			cur->fork_leader = cur->parent->fork_leader;
		}
		if (cur->parent == hp->fnode) acl_target = h->fnode;
	}

	if (h->fnode == hp->fnode) return NULL;
	if (acl_target == NULL) return NULL;
	if (acl_target->netacl == H_NETACL(hp)) return NULL;
	return acl_target->ss->op->copy_netacl(acl_target, &acl_target->netacl, H_NETACL(hp));
}

/*
 * Handle a '^'-prefixed path component on a node handle: the string is
 * passed to the protocol's parse_netacl hook, which receives the address of
 * the handle's ACL slot.  Returns the hook's result.
 */
static void *SOCKET_NODE_SPECIAL(HANDLE *h, char *str)
{
	SOCKET_NODE *node = h->fnode;
	NETACL **slot = &H_NETACL(h);

	return node->ss->op->parse_netacl(node, str, slot);
}

/*
 * lookup operation on a node handle.
 *
 *   "^..."  - special component, forwarded to SOCKET_NODE_SPECIAL
 *   "@xxxx" - socket reference: the four bytes after '@' (each non-zero)
 *             form a little-endian 32-bit socket number that is searched
 *             among the sockets of the node's fork leader
 *   other   - child node name, forwarded to SOCKET_SUBNODE_LOOKUP
 *
 * When a socket is found the handle is retargeted to it (fnode and op) and
 * NULL is returned.  Otherwise -ENOENT is returned if O_CREAT is set and
 * -ECONNABORTED if not.  -EINVAL is returned for a malformed socket number
 * or when the handle carries an ACL.
 */
static void *SOCKET_NODE_LOOKUP(HANDLE *h, char *str, int open_flags)
{
	SOCKET_NODE *leader;
	SOCKET *sock;
	unsigned id = 0;
	unsigned bucket;
	unsigned k;

	if (__unlikely(str[0] == '^')) return SOCKET_NODE_SPECIAL(h, str);
	if (__unlikely(str[0] != '@')) return SOCKET_SUBNODE_LOOKUP(h, str, open_flags);
	/* can't set NETACL on individual sockets */
	if (__unlikely(H_NETACL(h) != NULL)) return __ERR_PTR(-EINVAL);

	leader = ((SOCKET_NODE *)h->fnode)->fork_leader;

	for (k = 0; k < 4; k++) {
		unsigned byte = (unsigned char)str[1 + k];
		if (__unlikely(!byte)) return __ERR_PTR(-EINVAL);
		id |= byte << (8 * k);
	}

	bucket = HASH_SOCKET(leader, id);
	XLIST_FOR_EACH(sock, &socket_hash[bucket], SOCKET, hash_entry) {
		if (__likely(sock->n == id) && __likely(sock->node == leader)) {
			FREE_NETACL(h->fnode, H_NETACL(h));
			h->fnode = sock;
			h->op = sock->op;
			return NULL;
		}
	}

	if (__likely(open_flags & O_CREAT)) return __ERR_PTR(-ENOENT);
	return __ERR_PTR(-ECONNABORTED);
}

/*
 * Iterator step over the ACL chain of a node: returns the ACL of the fork
 * leader of *sn and advances *sn to that leader's parent.
 */
NETACL *NET$GET_NETACL(SOCKET_NODE **sn)
{
	SOCKET_NODE *leader = (*sn)->fork_leader;
	NETACL *acl = leader->netacl;

	*sn = leader->parent;
	return acl;
}

/*
 * Create a child node named str below the node in h->fnode and switch the
 * handle to it.
 *
 * Returns NULL on success.  Otherwise returns either an error pointer
 * (-ENAMETOOLONG for a name str2jobname rejects, -EDQUOT when the depth
 * limit or the quota is hit, or an error from NET$OOM_NODE_SOCKET), a wait
 * queue the caller should wait on (from KERNEL$MAY_ALLOC or
 * NET$OOM_NODE_SOCKET), or (void *)2 when the lockup level forbids another
 * retry pass.  open_flags is unused.
 */
static void *SOCKET_SUBNODE_CREATE(HANDLE *h, char *str, int open_flags)
{
	QUOTA *zap;
	__u64 j;
	WQ *wq;
	SOCKET_NODE *n, *sn = h->fnode;
	int x = str2jobname(str, &j);
	if (__unlikely(x < 0)) return __ERR_PTR(-ENAMETOOLONG);
	if (__unlikely(sn->depth == MAX_PROC_DEPTH - 1)) {
		return __ERR_PTR(-EDQUOT);
	}
	if (__unlikely((wq = KERNEL$MAY_ALLOC(h->name_addrspace, sizeof(SOCKET_NODE))) != NULL)) {
		return wq;
	}
	retry_alloc:
	if (__unlikely(!(n = __slalloc(&socket_node)))) {
		wq = NET$OOM_NODE_SOCKET(sn, NULL);
		if (__unlikely(wq != NULL)) return wq;
		/*
		 * NULL means the OOM killer released some memory: allocate
		 * again.  Falling through here would use the NULL n below.
		 */
		goto retry_alloc;
	}
	/* charge the node to the parent's quota; on failure zap something and retry */
	QALLOC(&sn->q, sizeof(SOCKET_NODE) / QUOTA_DIV, q_isroot, q_parent, Q_NULL_CALL, zap, {
		__slow_slfree(n);
		if (NET_ZAP_FROM_NODE(zap, sn, NULL)) return __ERR_PTR(-EDQUOT);
		if (KERNEL$LOCKUP_LEVEL >= LOCKUP_LEVEL_ONE_PASS) return (void *)2;
		goto retry_alloc;
	});
	QINIT(&n->q);
	QINIT2(&n->q);
	n->jobname = j;
	n->parent = sn;
	n->fork_leader = n;
	n->depth = sn->depth + 1;
	n->ss = sn->ss;
	/* x is the bucket str2jobname computed for this name */
	ADD_TO_XLIST(&sn->childhash[x], &n->childhash_entry);
	h->fnode = n;
	return NULL;
}

/*
 * create operation on a node handle.
 *
 * A name not starting with '@' creates a child node (SOCKET_SUBNODE_CREATE).
 * "@xxxx..." creates a socket on the node's fork leader: the four non-zero
 * bytes after '@' form the little-endian socket number, and the rest of the
 * string (str + 5) is passed to the protocol's init_socket hook.
 *
 * On success the handle is retargeted to the new socket (fnode and op) and
 * NULL is returned.  Otherwise returns an error pointer (-EINVAL for a
 * malformed number, -EDQUOT on quota failure, the init_socket error, or an
 * error from NET$OOM_NODE_SOCKET), a wait queue to wait on, or (void *)2
 * when the lockup level forbids another retry pass.
 */
static void *SOCKET_NODE_CREATE(HANDLE *h, char *str, int open_flags)
{
	QUOTA *zap;
	int r;
	unsigned l, hash;
	unsigned c;
	long size;
	WQ *wq;
	SOCKET_SPACE *ss;
	SOCKET_NODE *sn = h->fnode;
	SOCKET *s;
	if (__unlikely(str[0] != '@')) return SOCKET_SUBNODE_CREATE(h, str, open_flags);
	sn = sn->fork_leader;
	if (__unlikely(!(l = (unsigned char)str[1]))) return __ERR_PTR(-EINVAL);
	if (__unlikely(!(c = (unsigned char)str[2]))) return __ERR_PTR(-EINVAL);
	l |= c << 8;
	if (__unlikely(!(c = (unsigned char)str[3]))) return __ERR_PTR(-EINVAL);
	l |= c << 16;
	if (__unlikely(!(c = (unsigned char)str[4]))) return __ERR_PTR(-EINVAL);
	l |= c << 24;
	ss = sn->ss;
	if (__unlikely((wq = KERNEL$MAY_ALLOC(h->name_addrspace, ss->op->sizeof_SOCKET)) != NULL)) {
		return wq;
	}
	retry_alloc:
	if (__unlikely(!(s = __slalloc(&ss->socket)))) {
		wq = NET$OOM_NODE_SOCKET(sn, NULL);
		if (__unlikely(wq != NULL)) return wq;
		/*
		 * NULL means the OOM killer released some memory: allocate
		 * again.  Falling through here would use the NULL s below.
		 */
		goto retry_alloc;
	}
	/* charge the socket to the leader's quota; on failure zap something and retry */
	size = __slheadsize(&ss->socket) / QUOTA_DIV;
	QALLOC(&sn->q, size, q_isroot, q_parent, Q_NULL_CALL, zap, {
		__slow_slfree(s);
		if (NET_ZAP_FROM_NODE(zap, sn, NULL)) return __ERR_PTR(-EDQUOT);
		if (KERNEL$LOCKUP_LEVEL >= LOCKUP_LEVEL_ONE_PASS) return (void *)2;
		goto retry_alloc;
	});
	s->open_count = 1;
	s->n = l;
	s->node = sn;
	s->sock_flags = 0;
	s->sock_sndbuf = 0;
	s->sock_rcvbuf = 0;
	s->sock_sndlowat = 0;
	s->sock_rcvlowat = 0;
	s->sock_sndtimeo = 0;
	s->sock_rcvtimeo = 0;
	s->sock_lingertime = 0;
	s->sock_type = 0;
	s->sock_error = 0;
	r = ss->op->init_socket(s, str + 5);
	if (__unlikely(r)) {
		/* undo the allocation and the quota charge */
		__slow_slfree(s);
		size = __slheadsize(&ss->socket) / QUOTA_DIV;
		QFREE(&sn->q, size, q_isroot, q_parent, Q_NULL_CALL);
		return __ERR_PTR(r);
	}
	hash = HASH_SOCKET(sn, l);
	ADD_TO_XLIST(&socket_hash[hash], &s->hash_entry);
	ADD_TO_LIST(&sn->sockets, &s->node_entry);
	/* the handle now refers to a socket, so drop any ACL it carried */
	FREE_NETACL(h->fnode, H_NETACL(h));
	h->fnode = s;
	h->op = s->op;
	return NULL;
}

/*
 * Create a new socket on the same node as os, inheriting os's generic
 * socket options (sock_error starts at 0) and letting the protocol copy its
 * private state through the dup_socket hook.
 *
 * The socket number is taken from the node's acc counter; if that number is
 * already in use on the node, the counter is scrambled with rand() and the
 * search starts over.
 *
 * Returns the new socket, or NULL when memory or quota could not be
 * obtained (including the case where freeing memory/quota killed os itself,
 * as reported by NET$OOM_NODE_SOCKET / NET_ZAP_FROM_NODE).
 */
SOCKET *NET$SOCKET_CREATE_SIBLING(SOCKET *os)
{
	QUOTA *zap;
	SOCKET *s;
	SOCKET_NODE *sn = os->node;
	unsigned l;
	unsigned hash;
	long size;
	again:
	/* take the current counter value as the candidate number, then advance the counter */
	l = __32LE2CPU(*(__u32 *)&sn->acc);
	inc_csock(sn->acc);
	hash = HASH_SOCKET(sn, l);
	XLIST_FOR_EACH_UNLIKELY(s, &socket_hash[hash], SOCKET, hash_entry) {
		if (__unlikely(s->n == l) && __unlikely(s->node == sn)) {
			/* number collision: randomise the counter and try again */
			unsigned r = rand();
			sn->acc[0] = fixup_sock_char(sn->acc[0] ^ r);
			sn->acc[1] = fixup_sock_char(sn->acc[1] ^ (r >> 8));
			sn->acc[2] = fixup_sock_char(sn->acc[2] ^ (r >> 16));
			sn->acc[3] = fixup_sock_char(sn->acc[3] ^ (r >> 24));
			goto again;
		}
	}
	/* don't call KERNEL$MAY_ALLOC? ... we don't know process and this is
	   caused by external event (receive of SYN) rather than by process
	   action */
	retry_alloc:
	if (__unlikely(!(s = __slalloc(&sn->ss->socket)))) {
		if (NET$OOM_NODE_SOCKET(NULL, os)) return NULL;
		goto retry_alloc;
	}
	size = __slheadsize(&sn->ss->socket) / QUOTA_DIV;
	QALLOC(&sn->q, size, q_isroot, q_parent, Q_NULL_CALL, zap, {
		__slow_slfree(s);
		if (NET_ZAP_FROM_NODE(zap, NULL, os)) return NULL;
		if (KERNEL$LOCKUP_LEVEL >= LOCKUP_LEVEL_ONE_PASS) return NULL;
		goto retry_alloc;
	});
	s->open_count = 1;
	s->n = l;
	s->node = sn;
	s->sock_flags = os->sock_flags;
	s->sock_sndbuf = os->sock_sndbuf;
	s->sock_rcvbuf = os->sock_rcvbuf;
	s->sock_sndlowat = os->sock_sndlowat;
	s->sock_rcvlowat = os->sock_rcvlowat;
	s->sock_sndtimeo = os->sock_sndtimeo;
	s->sock_rcvtimeo = os->sock_rcvtimeo;
	s->sock_lingertime = os->sock_lingertime;
	s->sock_type = os->sock_type;
	s->sock_error = 0;
	sn->ss->op->dup_socket(s, os);
	ADD_TO_XLIST(&socket_hash[hash], &s->hash_entry);
	ADD_TO_LIST(&sn->sockets, &s->node_entry);
	return s;
}

/*
 * Detach every handle from sn and, unless sn is a root node, unlink it from
 * its parent, return its quota charge to the parent and free it.  For a
 * root node (no parent) only the handles are detached.  Children and
 * sockets are not touched here; see DELETE_NODE_RECURSIVE.
 */
static void DELETE_NODE(SOCKET_NODE *sn)
{
	for (;;) {
		HANDLE *victim;

		if (__likely(XLIST_EMPTY(&sn->handles))) break;
		victim = LIST_STRUCT(sn->handles.next, HANDLE, fnode_entry);
		KERNEL$DETACH_HANDLE(victim);
	}

	if (__unlikely(sn->parent == NULL)) return;

	QDONE(&sn->q, "SOCKET NODE");
	FREE_NETACL_ZERO(sn, &sn->netacl);
	DEL_FROM_LIST(&sn->childhash_entry);
	QFREE(&sn->parent->q, sizeof(SOCKET_NODE) / QUOTA_DIV, q_isroot, q_parent, Q_NULL_CALL);
	__slfree(sn);
}

/*
 * Destroy the whole subtree rooted at sn: every socket is destroyed through
 * the protocol's destroy_socket hook and every node is passed to
 * DELETE_NODE, children before their parent.  The walk is iterative: it
 * keeps descending into the first non-empty child bucket until it reaches a
 * node without children, deletes that node, and resumes at its parent.
 */
static void DELETE_NODE_RECURSIVE(SOCKET_NODE *sn)
{
	unsigned i;
	SOCKET_NODE *snn = sn;
	sub:
	/* descend while the current node still has any child */
	for (i = 0; i < CHILDHASH_SIZE; i++) {
		if (__unlikely(!XLIST_EMPTY(&snn->childhash[i]))) {
			snn = LIST_STRUCT(snn->childhash[i].next, SOCKET_NODE, childhash_entry);
			goto sub;
		}
	}
	/* snn has no children left: destroy its sockets (loop ends when the list is empty) */
	while (__unlikely(!LIST_EMPTY(&snn->sockets))) {
		SOCKET *s;
#if __DEBUG >= 1
		if (__unlikely(snn->fork_leader != snn))
			KERNEL$SUICIDE("DELETE_NODE_RECURSIVE: SOCKETS ON NON-LEADER NODE %016"__64_format"X, LEADER %016"__64_format"X", snn->jobname, snn->fork_leader->jobname);
#endif
		s = LIST_STRUCT(snn->sockets.next, SOCKET, node_entry);
		snn->ss->op->destroy_socket(s);
	}
	if (__unlikely(snn != sn)) {
		/* remember the parent before the node is freed, then continue there */
		SOCKET_NODE *snnn = snn->parent;
		DELETE_NODE(snn);
		snn = snnn;
		goto sub;
	}
	DELETE_NODE(snn);
	return;
}

/*
 * delete operation: remove the node in h->fnode together with its whole
 * subtree.  The root node of a space cannot be deleted (-EBUSY).
 * rq, open_flags and hp are unused.
 */
static void *SOCKET_NODE_DELETE(HANDLE *h, IORQ *rq, int open_flags, HANDLE *hp)
{
	SOCKET_NODE *victim = h->fnode;

	if (__likely(victim->parent != NULL)) {
		DELETE_NODE_RECURSIVE(victim);
		return NULL;
	}
	return __ERR_PTR(-EBUSY);
}

/*
 * instantiate operation: register the handle on its node's handle list so
 * DELETE_NODE can detach it later.  Handles on a root node are not tracked;
 * their list entry is voided instead.  Always returns NULL.
 */
static void *SOCKET_NODE_INSTANTIATE(HANDLE *h, IORQ *rq, int open_flags)
{
	SOCKET_NODE *node = h->fnode;

	if (__unlikely(node->parent == NULL)) {
		VOID_LIST_ENTRY(&h->fnode_entry);
		return NULL;
	}
	ADD_TO_XLIST(&node->handles, &h->fnode_entry);
	return NULL;
}

/*
 * detach operation: unlink an instantiated handle from its node, then do
 * the same cleanup as the leave operation.
 */
static void SOCKET_NODE_DETACH(HANDLE *h)
{
	/* undo the list insertion (or voiding) done by SOCKET_NODE_INSTANTIATE */
	DEL_FROM_LIST(&h->fnode_entry);

	SOCKET_NODE_LEAVE(h);
}

/*
 * leave operation: release the ACL carried by the handle, if any.  In debug
 * builds the node pointer is also cleared.
 */
static void SOCKET_NODE_LEAVE(HANDLE *h)
{
	NETACL *acl = H_NETACL(h);

	FREE_NETACL(h->fnode, acl);
#if __DEBUG >= 1
	/* not needed but catches bugz */
	h->fnode = NULL;
#endif
}

/*
 * Charge len bytes (converted to quota units by dividing by QUOTA_DIV) to
 * the quota of the node owning socket s.
 *
 * When the quota is exhausted, NET_ZAP_FROM_NODE is asked to free something
 * and the charge is retried.
 *
 * Returns 0 on success, -EDQUOT when the zap killed s itself, and 1 when
 * KERNEL$LOCKUP_LEVEL forbids another retry pass.
 */
int NET$QALLOC(SOCKET *s, unsigned len)
{
	QUOTA *zap;
	len /= QUOTA_DIV;
	retry:
	QALLOC(&s->node->q, len, q_isroot, q_parent, Q_NULL_CALL, zap, {
		if (NET_ZAP_FROM_NODE(zap, NULL, s)) return -EDQUOT;
		if (KERNEL$LOCKUP_LEVEL >= LOCKUP_LEVEL_ONE_PASS) return 1;
		goto retry;
	});
	return 0;
}

/*
 * Return len bytes (converted to quota units by dividing by QUOTA_DIV) to
 * the quota of the node owning socket s.  Counterpart of NET$QALLOC.
 */
void NET$QFREE(SOCKET *s, unsigned len)
{
	unsigned units = len / QUOTA_DIV;

	QFREE(&s->node->q, units, q_isroot, q_parent, Q_NULL_CALL);
}

/*
 * Final teardown of a socket: detach all handles that still refer to it,
 * unlink it from the global hash and from its list, give its quota charge
 * back to the owning node and free the object.
 */
void NET$DESTROY_SOCKET(SOCKET *s)
{
	long units;

	for (;;) {
		HANDLE *victim;

		if (__likely(XLIST_EMPTY(&s->handles))) break;
		victim = LIST_STRUCT(s->handles.next, HANDLE, fnode_entry);
		KERNEL$DETACH_HANDLE(victim);
	}

	DEL_FROM_LIST(&s->hash_entry);
	/* may not be linked on node, but on temporary delete-list */
	DEL_FROM_LIST(&s->node_entry);

	units = __slheadsize(&s->space->socket) / QUOTA_DIV;
	QFREE(&s->node->q, units, q_isroot, q_parent, Q_NULL_CALL);
#if __DEBUG >= 1
	s->node = NULL;
#endif
	__slfree(s);
}

/*
 * Generic lookup on a socket handle.  The only recognised component is
 * "^NONBLOCK" (compared case-insensitively), which sets SOCKET_NONBLOCK in
 * the handle flags.  Anything else yields -EBADMOD.  open_flags is unused.
 */
void *NET$SOCKET_LOOKUP(HANDLE *h, char *str, int open_flags)
{
	if (__unlikely(_strcasecmp(str, "^NONBLOCK") != 0)) {
		return __ERR_PTR(-EBADMOD);
	}
	h->flags |= SOCKET_NONBLOCK;
	return NULL;
}

/*
 * Generic instantiate on a socket handle: put the handle on the socket's
 * handle list so NET$DESTROY_SOCKET can detach it.  Always returns NULL.
 */
void *NET$SOCKET_INSTANTIATE(HANDLE *h, IORQ *rq, int open_flags)
{
	SOCKET *sock = h->fnode;

	ADD_TO_XLIST(&sock->handles, &h->fnode_entry);
	return NULL;
}

/*
 * Generic detach on a socket handle: take the handle off the socket's
 * handle list.  In debug builds the socket pointer is also cleared.
 */
void NET$SOCKET_DETACH(HANDLE *h)
{
	/* undo the insertion done by NET$SOCKET_INSTANTIATE */
	DEL_FROM_LIST(&h->fnode_entry);
#if __DEBUG >= 1
	/* not needed but catches bugz */
	h->fnode = NULL;
#endif
}

/*
 * Generic (SOL_SOCKET level) ioctl handling for a socket handle.
 *
 * Handles IOCTL_SETSOCKOPT, IOCTL_GETSOCKOPT and IOCTL_DUP; any other ioctl
 * gets status -ENOOP, and an option not listed in the switches below gets
 * -ENOPROTOOPT.
 *
 * Return value: 0 when rq->status has been filled in, 1 when copying the
 * user structure needs a page-in, in which case DO_PAGEIN_NORET has been
 * issued on the request and rq->status is left untouched.
 */
int NET$SOCKET_IOCTL(IOCTLRQ *rq)
{
	SOCKET *s = rq->handle->fnode;
	if (__likely(rq->ioctl == IOCTL_SETSOCKOPT)) {
		int r;
		struct timeval tv;
		unsigned o;
		u_jiffies_t t;
		int v;
		struct linger lin;
		switch (rq->param) {
			/* timeouts: argument is a struct timeval, stored as jiffies */
			case __SO_MAKEPARAM(SOL_SOCKET, SO_SNDTIMEO):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_RCVTIMEO):
				if (__unlikely(r = KERNEL$GET_IOCTL_STRUCT(rq, &tv, sizeof tv))) {
					if (r == 1) {
						DO_PAGEIN_NORET(rq, &rq->v, PF_READ);
						return 1;
					}
					rq->status = r;
					return 0;
				}
				TV_2_JIFFIES(&tv, t);
				/* clamp values that do not fit into u_jiffies_lo_t */
				if (__unlikely(t & ~(__u64)(u_jiffies_lo_t)-1)) t = (u_jiffies_lo_t)-1;
				if (rq->param == __SO_MAKEPARAM(SOL_SOCKET, SO_SNDTIMEO)) s->sock_sndtimeo = t;
				else s->sock_rcvtimeo = t;
				rq->status = 0;
				return 0;
			/* linger: argument is a struct linger; 0 jiffies means "off" */
			case __SO_MAKEPARAM(SOL_SOCKET, SO_LINGER):
				if (__unlikely(r = KERNEL$GET_IOCTL_STRUCT(rq, &lin, sizeof lin))) {
					if (r == 1) {
						DO_PAGEIN_NORET(rq, &rq->v, PF_READ);
						return 1;
					}
					rq->status = r;
					return 0;
				}
				if (lin.l_onoff) {
					/* saturate instead of overflowing the seconds -> jiffies multiplication */
					if (__unlikely(lin.l_linger > MAXUINT / JIFFIES_PER_SECOND)) s->sock_lingertime = MAXUINT;
					else s->sock_lingertime = lin.l_linger * JIFFIES_PER_SECOND;
				} else {
					s->sock_lingertime = 0;
				}
				rq->status = 0;
				return 0;
			/* options whose argument is a single int */
			case __SO_MAKEPARAM(SOL_SOCKET, SO_REUSEADDR):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_KEEPALIVE):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_DONTROUTE):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_BROADCAST):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_OOBINLINE):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_SNDBUF):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_RCVBUF):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_SNDLOWAT):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_RCVLOWAT):
				if (__unlikely(r = KERNEL$GET_IOCTL_STRUCT(rq, &v, sizeof v))) {
					if (r == 1) {
						DO_PAGEIN_NORET(rq, &rq->v, PF_READ);
						return 1;
					}
					rq->status = r;
					return 0;
				}
				switch (rq->param) {
					/* boolean options: one bit in sock_flags, at position __SO_OPT(param) */
					case __SO_MAKEPARAM(SOL_SOCKET, SO_REUSEADDR):
					case __SO_MAKEPARAM(SOL_SOCKET, SO_KEEPALIVE):
					case __SO_MAKEPARAM(SOL_SOCKET, SO_DONTROUTE):
					case __SO_MAKEPARAM(SOL_SOCKET, SO_BROADCAST):
					case __SO_MAKEPARAM(SOL_SOCKET, SO_OOBINLINE):
						o = __SO_OPT(rq->param);
						s->sock_flags = (s->sock_flags & ~(1 << o)) | (v ? 1 << o : 0);
						break;
					case __SO_MAKEPARAM(SOL_SOCKET, SO_SNDBUF):
						s->sock_sndbuf = v;
						break;
					case __SO_MAKEPARAM(SOL_SOCKET, SO_RCVBUF):
						s->sock_rcvbuf = v;
						break;
					case __SO_MAKEPARAM(SOL_SOCKET, SO_SNDLOWAT):
						s->sock_sndlowat = v;
						break;
					case __SO_MAKEPARAM(SOL_SOCKET, SO_RCVLOWAT):
						s->sock_rcvlowat = v;
						break;
				}
				rq->status = 0;
				return 0;
			default:
				rq->status = -ENOPROTOOPT;
				return 0;
		}
	} else if (__likely(rq->ioctl == IOCTL_GETSOCKOPT)) {
		/* ret/retl describe the value to copy out; they default to the int v */
		int v;
		void *ret = &v;
		int retl = sizeof v;
		unsigned long iol;
		int r;
		struct timeval tv;
		struct linger lin;
		switch (rq->param) {
			case __SO_MAKEPARAM(SOL_SOCKET, SO_DEBUG):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_ACCEPTCONN):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_REUSEADDR):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_KEEPALIVE):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_DONTROUTE):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_BROADCAST):
			case __SO_MAKEPARAM(SOL_SOCKET, SO_OOBINLINE):
				v = (s->sock_flags >> __SO_OPT(rq->param)) & 1;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_SNDBUF):
				v = s->sock_sndbuf;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_RCVBUF):
				v = s->sock_rcvbuf;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_SNDLOWAT):
				v = s->sock_sndlowat;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_RCVLOWAT):
				v = s->sock_rcvlowat;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_SNDTIMEO):
				memset(&tv, 0, sizeof tv);
				JIFFIES_2_TV(s->sock_sndtimeo, &tv);
				ret = &tv;
				retl = sizeof tv;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_RCVTIMEO):
				memset(&tv, 0, sizeof tv);
				JIFFIES_2_TV(s->sock_rcvtimeo, &tv);
				ret = &tv;
				retl = sizeof tv;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_LINGER):
				memset(&lin, 0, sizeof lin);
				if (s->sock_lingertime) {
					lin.l_onoff = 1;
					lin.l_linger = s->sock_lingertime / JIFFIES_PER_SECOND;
				} else {
					lin.l_onoff = 0;
					lin.l_linger = 0;
				}
				ret = &lin;
				retl = sizeof lin;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_TYPE):
				v = s->sock_type;
				break;
			case __SO_MAKEPARAM(SOL_SOCKET, SO_ERROR):
				/* sock_error is negated before being reported */
				v = -s->sock_error;
				break;
			default:
				rq->status = -ENOPROTOOPT;
				return 0;
		}
		/* temporarily cap the request length at the value size; restored right after the copy */
		iol = rq->v.len;
		if (__unlikely(rq->v.len > retl)) rq->v.len = retl;
		r = KERNEL$PUT_IOCTL_STRUCT(rq, ret, retl);
		rq->v.len = iol;
		if (__unlikely(r == 1)) {
			DO_PAGEIN_NORET(rq, &rq->v, PF_WRITE);
			return 1;
		}
		/* a successful SO_ERROR read clears the error when SOCK_RESETERROR is set */
		if (__unlikely(s->sock_flags & SOCK_RESETERROR) && __unlikely(rq->param == __SO_MAKEPARAM(SOL_SOCKET, SO_ERROR)) && __likely(!r)) s->sock_error = 0;
		/* on success the status is the size of the returned value */
		rq->status = __unlikely(r) ? r : retl;
		return 0;
	} else if (__likely(rq->ioctl == IOCTL_DUP)) {
		/* one more reference that NET$SOCKET_CLOSE has to see before closing */
		s->open_count++;
		rq->status = 0;
		return 0;
	}
	rq->status = -ENOOP;
	return 0;
}

/*
 * Generic close on a socket handle.  Drops one open reference; when the
 * count reaches zero the protocol's close_socket hook is called and its
 * result returned.  A positive result from the hook restores the count to 1
 * (the close did not complete yet).  Returns 0 while other references
 * remain.
 */
int NET$SOCKET_CLOSE(HANDLE *h, IORQ *rq)
{
	SOCKET *sock = h->fnode;
	int r;

	if (__unlikely(--sock->open_count > 0)) return 0;

	sock->open_count = 0;
	r = sock->space->op->close_socket(sock, rq);
	if (__unlikely(r > 0)) sock->open_count = 1;
	return r;
}

/*
 * ioctl entry point for node handles.
 *
 * If the handle no longer carries the node operation table, the request is
 * handed back via RETURN_IORQ_LSTAT with KERNEL$WAKE_IOCTL.
 *
 * IOCTL_SOCKET_LINGER calls the protocol's linger_socket hook once for each
 * socket of the node; the walk stops (leaving the request pending) as soon
 * as a hook returns a positive value.  Every other ioctl is passed to the
 * protocol's ioctl hook, or fails with -EOPNOTSUPP when there is none.
 */
DECL_IOCALL(SOCKET_NODE_IOCTL, SPL_NET, IOCTLRQ)
{
	HANDLE *h = RQ->handle;
	SOCKET_NODE *sn;
	if (__unlikely(h->op != &SOCKET_NODE_OPERATIONS)) RETURN_IORQ_LSTAT(RQ, KERNEL$WAKE_IOCTL);
	RQ->tmp1 = (unsigned long)KERNEL$WAKE_IOCTL;
	TEST_LOCKUP_ENTRY(RQ, RETURN);
	SWITCH_PROC_ACCOUNT(h->name_addrspace, SPL_X(SPL_NET));
	sn = h->fnode;
	switch (RQ->ioctl) {
		case IOCTL_SOCKET_LINGER: {
			int (*linger_socket)(SOCKET *s, IORQ *rq);
			SOCKET *s, *s1;
			if (__likely(LIST_EMPTY(&sn->sockets))) {
				RQ->status = 0;
				RETURN_AST(RQ);
			}
			linger_socket = sn->ss->op->linger_socket;
			/* s1 is the current last socket: the rotation below ends once it has been handled */
			s1 = LIST_STRUCT(sn->sockets.prev, SOCKET, node_entry);
			do {
				/* rotate the head to the tail before calling the hook */
				s = LIST_STRUCT(sn->sockets.next, SOCKET, node_entry);
				DEL_FROM_LIST(&s->node_entry);
				ADD_TO_LIST_END(&sn->sockets, &s->node_entry);
				if (__unlikely(linger_socket(s, (IORQ *)RQ) > 0)) {
					RETURN;
				}
			} while (s != s1);
			RQ->status = 0;
			RETURN_AST(RQ);
		}
		default: {
			int r;
			if (__unlikely(!sn->ss->op->ioctl)) {
				RQ->status = -EOPNOTSUPP;
				RETURN_AST(RQ);
			}
			r = sn->ss->op->ioctl(RQ);
			/* positive result: request stays pending, no completion here */
			if (__unlikely(r > 0)) RETURN;
			RQ->status = r;
			RETURN_AST(RQ);
		}
	}
}

/*
 * Call fn on every socket in the subtree rooted at sn.
 *
 * The tree is walked iteratively, without any auxiliary stack: a node's
 * children are visited before the node's own sockets.  After a node has
 * been handled, the walk moves on to the next node in the same child-hash
 * bucket; at the end of a bucket it returns to the parent and continues
 * with the parent's next bucket.  Finding "the next bucket" relies on
 * jobnamehash() yielding the same bucket index that str2jobname() returned
 * when the node was inserted.
 *
 * fn must not modify the tree or the socket lists being walked
 * (NOTE(review): not enforced here - confirm against callers).
 */
void NET$FOR_ALL_SOCKETS(SOCKET_NODE *sn, void (*fn)(SOCKET *s))
{
	unsigned i;
	SOCKET_NODE *snn = sn;
	SOCKET *s;
	sub:
	/* entering a node for the first time: scan its buckets from the start */
	i = 0;
	next_hash_list:
	for (; i < CHILDHASH_SIZE; i++) {
		if (__unlikely(!XLIST_EMPTY(&snn->childhash[i]))) {
			snn = LIST_STRUCT(snn->childhash[i].next, SOCKET_NODE, childhash_entry);
			goto sub;
		}
	}
	/* all children of snn are done: now its own sockets */
	LIST_FOR_EACH(s, &snn->sockets, SOCKET, node_entry) {
#if __DEBUG >= 1
		if (__unlikely(snn->fork_leader != snn))
			KERNEL$SUICIDE("NET$FOR_ALL_SOCKETS: SOCKETS ON NON-LEADER NODE %016"__64_format"X, LEADER %016"__64_format"X", snn->jobname, snn->fork_leader->jobname);
#endif
		fn(s);
	}
	if (__unlikely(snn != sn)) {
		/* next sibling in the same bucket, if any */
		if (__unlikely(snn->childhash_entry.next != &KERNEL$LIST_END)) {
			snn = LIST_STRUCT(snn->childhash_entry.next, SOCKET_NODE, childhash_entry);
			goto sub;
		}
		/* bucket exhausted: go back to the parent and resume after this node's bucket */
		i = jobnamehash(&snn->jobname);
#if __DEBUG >= 2
		{
			SOCKET_NODE *snnn;
			XLIST_FOR_EACH(snnn, &snn->parent->childhash[i], SOCKET_NODE, childhash_entry) if (snnn == snn) goto found;
			KERNEL$SUICIDE("NET$FOR_ALL_SOCKETS: BAD HASH OF JOBNAME %016"__64_format"X: %u", snn->jobname, i);
			found:;
		}
#endif
		snn = snn->parent;
		i++;
		goto next_hash_list;
	}
	return;
}

extern AST_STUB DELAYED_OOM_AST_FN;
static int NET_OOM_KILL(SOCKET_NODE *sn, SOCKET *s);

/*
 * State of the deferred OOM kill:
 *   0 - no AST posted
 *   1 - AST posted by NET$DELAYED_OOM, kill still wanted
 *   2 - AST posted, but NET$OOM_NODE_SOCKET has already killed something in
 *       the meantime, so the AST should do nothing
 */
static int DELAYED_OOM_POSTED = 0;
static AST DELAYED_OOM_AST = { DELAYED_OOM_AST_FN, NULL };

/*
 * Request an OOM kill to be performed later from an AST.  Nothing happens
 * unless network memory is exhausted (NET$MEMORY_AVAIL <= 0) or the kernel
 * reports OOM for wired mapped memory.  The AST is posted only when
 * DELAYED_OOM_POSTED could be switched from 0 to 1, so at most one is
 * outstanding.
 */
void NET$DELAYED_OOM(void)
{
	if (__likely(NET$MEMORY_AVAIL > 0) && __likely(!KERNEL$OOM(VM_TYPE_WIRED_MAPPED))) {
		return;
	}
	if (__likely(!__CMPXCHGI(&DELAYED_OOM_POSTED, 0, 1))) {
		CALL_AST(&DELAYED_OOM_AST);
	}
}

/*
 * AST posted by NET$DELAYED_OOM.  Resets DELAYED_OOM_POSTED to 0 and, if the
 * flag was still 1 (nobody killed anything since the AST was posted),
 * performs a global OOM kill.  A flag value of 2 means NET$OOM_NODE_SOCKET
 * already did the work, so the kill is skipped.  Any other value is a bug
 * (checked in debug builds).
 */
DECL_AST(DELAYED_OOM_AST_FN, SPL_NET, AST)
{
	int p = DELAYED_OOM_POSTED;
#if __DEBUG >= 1
	if ((unsigned)(p - 1) > 1) KERNEL$SUICIDE("DELAYED_OOM_AST_FN: FLAG %d", DELAYED_OOM_POSTED);
#endif
	DELAYED_OOM_POSTED = 0;
	__barrier();
	if (__likely(p == 1)) NET_OOM_KILL(NULL, NULL);
	RETURN;
}

/*
 * Out-of-memory handling without a "current" node or socket; see
 * NET$OOM_NODE_SOCKET for the meaning of the result.
 */
WQ *NET$OOM(void)
{
	WQ *outcome = NET$OOM_NODE_SOCKET(NULL, NULL);

	return outcome;
}

/*
 * React to a failed memory allocation made on behalf of node sn and/or
 * socket s (either may be NULL).  Must be called at SPL_NET (checked in
 * debug builds).
 *
 * If neither network memory is exhausted nor the kernel reports OOM, the
 * caller is told to wait: &KERNEL$FREEMEM_WAIT is returned.  Otherwise
 * NET_OOM_KILL is run and the result is
 *   NULL               - something was freed, the caller may retry
 *   __ERR_PTR(-ENOMEM) - the given node or socket itself was freed
 *   __ERR_PTR(r)       - NET_OOM_KILL failed with negative code r
 * After a successful kill a pending delayed-OOM AST is neutralised by
 * switching DELAYED_OOM_POSTED from 1 to 2.
 */
WQ *NET$OOM_NODE_SOCKET(SOCKET_NODE *sn, SOCKET *s)
{
	int killed;
#if __DEBUG >= 1
	if (__unlikely(KERNEL$SPL != SPL_X(SPL_NET)))
		KERNEL$SUICIDE("NET$OOM_NODE_SOCKET AT SPL %08X", KERNEL$SPL);
#endif
	if (__likely(NET$MEMORY_AVAIL > 0) && __likely(!KERNEL$OOM(VM_TYPE_WIRED_MAPPED))) {
		return &KERNEL$FREEMEM_WAIT;
	}

	killed = NET_OOM_KILL(sn, s);
	if (__unlikely(killed < 0)) return __ERR_PTR(killed);

	__CMPXCHGI(&DELAYED_OOM_POSTED, 1, 2);
	if (__unlikely(killed > 0)) return __ERR_PTR(-ENOMEM);
	return NULL;
}

/* AST plus result slot used by NET$MEMWAIT_SYNC to run NET$OOM() in AST context. */
typedef struct {
	AST_HEAD;
	WQ *result;	/* value returned by NET$OOM(), filled in by MEMWAIT_FN */
	MTX wait;	/* held while the AST is pending; unlocked by MEMWAIT_FN */
} MEMWAIT_STRUCT;

/*
 * AST body for NET$MEMWAIT_SYNC: runs NET$OOM() at SPL_NET, stores the
 * result and releases the mutex the waiting caller is blocked on.
 */
DECL_AST(MEMWAIT_FN, SPL_NET, MEMWAIT_STRUCT)
{
	RQ->result = NET$OOM();
	MTX_UNLOCK(&RQ->wait);
	RETURN;
}

int NET$MEMWAIT_SYNC(void)
{
	MEMWAIT_STRUCT str;
	str.fn = MEMWAIT_FN;
	MTX_INIT(&str.wait, "NET$MEMWAIT_SYNC");
	if (__unlikely(MTX_TRY_LOCK(&str.wait))) KERNEL$SUICIDE("NET$MEMWAIT_SYNC: CAN'T TRYLOCK NEWLY-CREATED LOCK");
	CALL_AST(&str);
	MTX_LOCK_SYNC(&str.wait);
	MTX_UNLOCK(&str.wait);
	if (!str.result) return 0;
	if (__IS_ERR(str.result)) return __PTR_ERR(str.result);
	return WQ_WAIT_SYNC_CANCELABLE(str.result);
}

/*
 * Free memory by zapping from the socket space with the largest quota
 * subtree usage.
 *
 * Returns -ENOMEM when there is nothing to free (no space has a positive
 * usage), otherwise the result of NET_ZAP_FROM_NODE: 0 when something was
 * freed, 1 when the specified node or socket itself was freed.
 */
static int NET_OOM_KILL(SOCKET_NODE *sn, SOCKET *s)
{
	SOCKET_SPACE *cand;
	SOCKET_SPACE *biggest = NULL;
	long biggest_usage = 0;

	XLIST_FOR_EACH(cand, &socket_spaces, SOCKET_SPACE, list) {
		if (__likely(cand->root.q.q_subtree_usage > biggest_usage)) {
			biggest_usage = cand->root.q.q_subtree_usage;
			biggest = cand;
		}
	}

	if (__unlikely(biggest == NULL)) return -ENOMEM;
	return NET_ZAP_FROM_NODE(&biggest->root.q, sn, s);
}

/*
 * Free resources below the quota node zap.
 *
 * QZAP picks a victim node (target_zap) in zap's subtree.  If the victim
 * holds no sockets, the victim's whole subtree is deleted.  Otherwise the
 * protocol's delete_offensive_sockets hook is asked to remove sockets from
 * the victim's socket list.
 *
 * current_node / current_socket (either may be NULL) identify the object on
 * whose behalf the caller is working.
 *
 * return: 0 --- something was killed, 1 --- specified node or socket was killed
 *   - node case: 1 when current_node, or the node owning current_socket, is
 *     the victim or one of its descendants
 *   - socket case: 1 when current_socket was on the victim's list before the
 *     hook ran and is no longer on it afterwards
 */
static int NET_ZAP_FROM_NODE(QUOTA *zap, SOCKET_NODE *current_node, SOCKET *current_socket)
{
	int ret;
	QUOTA *target_zap;
	SOCKET_NODE *n;
	SOCKET *s;
	QZAP(zap, q_for_all_subnodes, q_for_all_subnodes_tail, target_zap, 1);
	n = LIST_STRUCT(target_zap, SOCKET_NODE, q);
	ret = 0;
	if (__unlikely(LIST_EMPTY(&n->sockets))) {
		test_node_again:
		/* walk from current_node up to the root looking for the victim */
		while (current_node) {
			if (current_node == n) {
				ret = 1;
				goto r1;
			}
			current_node = current_node->parent;
		}
		/* second pass: repeat the walk starting from the socket's node */
		if (current_socket) {
			current_node = current_socket->node;
			current_socket = NULL;
			goto test_node_again;
		}
		r1:
		DELETE_NODE_RECURSIVE(n);
		return ret;
	}
	/* note whether current_socket is among the candidates before the hook runs */
	if (current_socket) {
		LIST_FOR_EACH(s, &n->sockets, SOCKET, node_entry) if (s == current_socket) {
			ret = 1;
			break;
		}
	}
		/* n->sockets must be nonempty */
	n->ss->op->delete_offensive_sockets(&n->sockets);
	/* if it is still on the list, it survived: report 0 */
	if (ret) {
		LIST_FOR_EACH(s, &n->sockets, SOCKET, node_entry) if (s == current_socket) {
			ret = 0;
			break;
		}
	}
	return ret;
}

/*
 * init_root_handle callback registered with the device: points the root
 * handle at the root node of the socket space passed in data, with no
 * flags and no ACL.
 */
static void socket_init_root(HANDLE *h, void *data)
{
	SOCKET_SPACE *space = data;

	h->op = &SOCKET_NODE_OPERATIONS;
	h->fnode = &space->root;
	h->flags = 0;
	H_NETACL(h) = NULL;
}

static int SOCKET_UNLOAD(void *p, void **release, char *argv[]);

/*
 * Create and register a socket space for one protocol family.
 *
 * The device is named "NET$" followed by the 4 bytes of op->af.  The global
 * socket hash table is allocated on the first call; its size is derived
 * from the amount of memory (shifted by HASH_MEMORY_SHIFT), at least one
 * page cluster and at most PG_SIZE * PG_BANK bytes.  Then the space object
 * with its root node and SOCKET slab is set up, the protocol's init hook is
 * run at SPL_NET, and the device is registered.
 *
 * argc is unused; argv is passed to op->init.
 *
 * Returns 0 on success or a negative error code.  On failure a message is
 * left in KERNEL$ERROR_MSG() (except for -EINTR) and everything set up so
 * far, apart from the shared hash table, is released again.
 */
int NET$CREATE_SOCKET_SPACE(int argc, char *argv[], __const__ SOCKET_SPACE_OPERATIONS *op, char *driver_name)
{
	int r;
	char name[9];
	SOCKET_SPACE *ss;
	DEVICE_REQUEST devrq;
	MALLOC_REQUEST mrq;
	CONTIG_AREA_REQUEST car;
	memcpy(name, "NET$", 4);
	memcpy(name + 4, &op->af, 4);
	name[8] = 0;
	if (!socket_hash) {
		unsigned i;
		unsigned hash_size;
		__u64 mem = KERNEL$GET_MEMORY_SIZE(VM_TYPE_WIRED_UNMAPPED);
		mem >>= HASH_MEMORY_SHIFT;
		/* smallest power-of-two multiple of the cluster size that covers mem, capped */
		for (hash_size = __PAGE_CLUSTER_SIZE; hash_size < mem && hash_size < PG_SIZE * PG_BANK; hash_size <<= 1) ;
		socket_hash_mask = hash_size / sizeof(XLIST_HEAD) - 1;
		car.nclusters = hash_size >> __PAGE_CLUSTER_BITS;
		car.flags = CARF_DATA;
		car.align = 0;
		SYNC_IO_CANCELABLE(&car, KERNEL$VM_GRAB_CONTIG_AREA);
		if (__unlikely(car.status < 0)) {
			if (car.status != -EINTR) _snprintf(KERNEL$ERROR_MSG(), __MAX_STR_LEN, "COULD NOT ALLOCATE SOCKET HASH: %s", strerror(-car.status));
			/* report the failure of this request; mrq has not been used yet */
			r = car.status;
			goto ret0;
		}
		socket_hash = car.ptr;
		for (i = 0; i <= socket_hash_mask; i++) INIT_XLIST(&socket_hash[i]);
	}
	mrq.size = sizeof(SOCKET_SPACE);
	SYNC_IO_CANCELABLE(&mrq, KERNEL$UNIVERSAL_MALLOC);
	if (mrq.status < 0) {
		if (mrq.status != -EINTR) _snprintf(KERNEL$ERROR_MSG(), __MAX_STR_LEN, "%s: COULD NOT ALLOCATE SOCKET SPACE: %s", name, strerror(-mrq.status));
		r = mrq.status;
		goto ret0;
	}
	ss = mrq.ptr;
	strcpy(ss->name, name);
	/* the root node lives inside the space object, so it is constructed by hand */
	socket_node_ctor(NULL, &ss->root);
	QINIT(&ss->root.q);
	QINIT2(&ss->root.q);
	ss->root.depth = 0;
	ss->root.parent = NULL;
	ss->root.jobname = 0;
	ss->root.fork_leader = &ss->root;
	ss->root.childhash_entry.next = ss->root.childhash_entry.prev = NULL;
	ss->root.ss = ss;
	ss->root.netacl = NETACL_END;

	ss->op = op;
	KERNEL$SLAB_INIT(&ss->socket, op->sizeof_SOCKET, __CPU_CACHELINE_ALIGN, VM_TYPE_WIRED_MAPPED, socket_ctor, ss, &NET$MEMORY_AVAIL, ss->name);

	RAISE_SPL(SPL_NET);
	if ((r = op->init(argv))) {
		LOWER_SPL(SPL_ZERO);
		/* keep a message the protocol may have set itself */
		if (!*KERNEL$ERROR_MSG()) if (r != -EINTR) _snprintf(KERNEL$ERROR_MSG(), __MAX_STR_LEN, "ERROR INITIALIZING PROTOCOL: %s", strerror(-r));
		goto ret1;
	}
	ADD_TO_XLIST(&socket_spaces, &ss->list);
	LOWER_SPL(SPL_ZERO);

	devrq.name = name;
	devrq.driver_name = driver_name;
	devrq.flags = LNTE_PUBLIC;
	devrq.init_root_handle = socket_init_root;
	devrq.dev_ptr = ss;
	devrq.dcall = NULL;
	devrq.dcall_type = NULL;
	devrq.dctl = op->dctl;
	devrq.unload = SOCKET_UNLOAD;
	SYNC_IO_CANCELABLE(&devrq, KERNEL$REGISTER_DEVICE);
	if (devrq.status < 0) {
		if (devrq.status != -EINTR) _snprintf(KERNEL$ERROR_MSG(), __MAX_STR_LEN, "%s: COULD NOT REGISTER DEVICE: %s", name, strerror(-devrq.status));
		r = devrq.status;
		goto ret2;
	}

	/* on success the message buffer carries the device name */
	strlcpy(KERNEL$ERROR_MSG(), name, __MAX_STR_LEN);

	ss->lnte = devrq.lnte;
	ss->dlrq = KERNEL$TSR_IMAGE();
	return 0;

	/* error unwinding, in reverse order of setup */
	ret2:
	RAISE_SPL(SPL_NET);
	DELETE_NODE_RECURSIVE(&ss->root);
	QDONE(&ss->root.q, "SOCKET NODE ROOT");
	DEL_FROM_LIST(&ss->list);
	op->done();
	LOWER_SPL(SPL_ZERO);
	ret1:
	KERNEL$SLAB_DESTROY(&ss->socket);
	KERNEL$UNIVERSAL_FREE(ss);
	ret0:
	return r;
}

/*
 * Device unload callback for a socket space (p is the SOCKET_SPACE).
 *
 * First asks the kernel to unload the device; a non-zero result aborts the
 * unload and is returned unchanged.  Then, at SPL_NET, the whole node tree
 * is destroyed, the space is unlinked and the protocol's done hook is run.
 * Finally the SOCKET slab and the space itself are freed.  *release
 * receives the value saved from KERNEL$TSR_IMAGE() at creation time.
 */
static int SOCKET_UNLOAD(void *p, void **release, char *argv[])
{
	SOCKET_SPACE *space = p;
	int err;

	err = KERNEL$DEVICE_UNLOAD(space->lnte, argv);
	if (err) return err;

	RAISE_SPL(SPL_NET);
	DELETE_NODE_RECURSIVE(&space->root);
	QDONE(&space->root.q, "SOCKET NODE ROOT");
	DEL_FROM_LIST(&space->list);
	space->op->done();
	LOWER_SPL(SPL_ZERO);

	KERNEL$SLAB_DESTROY(&space->socket);
	/* read dlrq before the space object is freed */
	*release = space->dlrq;
	KERNEL$UNIVERSAL_FREE(space);
	return 0;
}

/*
 * Module initialisation: set up the slab from which SOCKET_NODE objects are
 * allocated, with socket_node_ctor as constructor and NET$MEMORY_AVAIL
 * passed as its memory counter.
 */
void SOCKET_INIT(void)
{
	KERNEL$SLAB_INIT(
		&socket_node,
		sizeof(SOCKET_NODE),
		0,
		VM_TYPE_WIRED_MAPPED,
		socket_node_ctor,
		NULL,
		&NET$MEMORY_AVAIL,
		"NET$SOCKET_NODE");
}

/*
 * Module shutdown: destroy the SOCKET_NODE slab and, if the global socket
 * hash table was ever allocated, release its page clusters.
 */
void SOCKET_DONE(void)
{
	KERNEL$SLAB_DESTROY(&socket_node);
	if (!socket_hash) return;
	/* cluster count is recomputed from the mask: (buckets * bucket size) / cluster size */
	KERNEL$VM_RELEASE_CONTIG_AREA(socket_hash, ((socket_hash_mask + 1) * sizeof(XLIST_HEAD)) >> __PAGE_CLUSTER_BITS);
}
