#include <ARCH/IPCHECKS.H>
#include <ARCH/MOV.H>
#include <SPAD/SYSLOG.H>
#include <STDLIB.H>

#include "TCPIP.H"

/* #define __TCPIP_DEBUG */

/* Forward declarations of file-local routines that are referenced before
   their definitions appear (timer callbacks take the TIMER that fired). */
static void TCP_SYN_RTT(TIMER *t);
static void TCP_RTT(TIMER *t);
static void TCP_SEND_RESET(PACKET *op);
static void TCP_SEND_SYNACK(TCPIP_SOCKET *s);
static void TCP_SYN_RECEIVED(TCPIP_SOCKET *s, PACKET *p);
static void TCP_SYNACK_RTT(TIMER *t);

/*
 * Round-trip-time cache shared by all sockets. INIT_SRTT and SAVE_SRTT index
 * it with a 16-bit hash of the host-order remote address. hiaddr keeps the
 * upper 16 bits of that address so a lookup can reject an entry written for
 * a different address. srtt and rttvar hold the socket values scaled down by
 * RTT_CACHE_TIME_SHIFT; INIT_SRTT ignores an entry when either is zero.
 */
static struct {
	__u16 hiaddr;
	__u8 srtt;
	__u8 rttvar;
} RTT_CACHE[65536];

/* Shift applied to jiffies values so that they fit the 8-bit cache fields:
   __BSR_CONST(JIFFIES_PER_SECOND) - 5, clamped so it never goes negative. */
#define RTT_CACHE_TIME_SHIFT	(__BSR_CONST(JIFFIES_PER_SECOND) < 5 ? 0 : __BSR_CONST(JIFFIES_PER_SECOND) - 5)

/*
 * Look the remote endpoint of the socket up in the network ACL.
 * Returns 0 and marks the socket with SOCK_NETACL_ACCEPT_IN when the lookup
 * result is non-negative; otherwise records -EACCES in s->conn_error and
 * returns -1.
 */
static int CHECK_NETACL_FN(TCPIP_SOCKET *s)
{
	if (__likely(NETACL_SEARCH(s->node, s->remote_addr, ntohs(s->remote_port)) >= 0)) {
		s->flags |= SOCK_NETACL_ACCEPT_IN;
		return 0;
	}
	s->conn_error = -EACCES;
	return -1;
}

/*
 * Statement macro; expects a TCPIP_SOCKET pointer named `s' in the enclosing
 * scope. When the socket does not yet carry SOCK_NETACL_ACCEPT_IN,
 * CHECK_NETACL_FN(s) is run and the `fail' statement is executed if it
 * returns non-zero. Sockets that already carry the flag skip the lookup.
 */
#define CHECK_NETACL(fail)						\
{									\
	if (__unlikely(!(s->flags & SOCK_NETACL_ACCEPT_IN))) {		\
		if (__unlikely(CHECK_NETACL_FN(s))) {			\
			fail;						\
		}							\
	}								\
}

/* Plain forwarder to TCP_NULL_PACKET; it adds no behaviour of its own. */
void TCP_NULL_PACKET_1(TCPIP_SOCKET *s, PACKET *p)
{
	TCP_NULL_PACKET(s, p);
}

/*
 * Handle a segment that no connection state wants.
 * A segment whose source address is rejected by INAPPROPRIATE_TCP_ADDRESS is
 * reported through INVALID_TCP_PACKET and not answered. Any other segment
 * that does not itself carry RST is answered with TCP_SEND_RESET. In every
 * case the packet is then completed with status 0.
 */
void TCP_NULL_PACKET(TCPIP_SOCKET *s, PACKET *p)
{
#ifdef __TCPIP_DEBUG
	__debug_printf("n(sp:%04x,dp:%04x,flags:%02x,length:%d,seq:%08x,ack:%08x)", ntohs(tcp(p)->th_sport), ntohs(tcp(p)->th_dport), tcp(p)->th_flags, p->data_length, ntohl(tcp(p)->th_seq), ntohl(tcp(p)->th_ack));
#endif
	if (__unlikely(INAPPROPRIATE_TCP_ADDRESS(ip(p)->ip_src.s_addr))) {
		INVALID_TCP_PACKET(p);
	} else if (__likely(!(tcp(p)->th_flags & TH_RST))) {
		/* never answer a reset with a reset */
		TCP_SEND_RESET(p);
	}
	p->status = 0;
	CALL_PKT(p);
}

/*
 * Exponential back-off of the retransmission timeout: double s->rto, cap it
 * at TCP_MAX_RTT and, when the socket has a send timeout configured, at that
 * timeout as well. The result is written back to s->rto.
 */
static __finline__ void INCREASE_RTO(TCPIP_SOCKET *s)
{
	u_jiffies_lo_t new = s->rto << 1;
	if (__unlikely(new > TCP_MAX_RTT)) new = TCP_MAX_RTT;
	/* A zero sock_sndtimeo is handled as "no timeout configured" by
	   TCP_SYN_RTT; clamping against it would force the timeout to 0 and
	   make the retransmit timer fire immediately. */
	if (__unlikely(s->sock_sndtimeo != 0) && __unlikely(new > s->sock_sndtimeo)) new = s->sock_sndtimeo;
	/* Without this store the computed back-off is discarded. */
	s->rto = new;
}

/*
 * Recompute the retransmission timeout from the current estimators:
 * rto = srtt + 4 * rttvar, then limited to [TCP_MIN_RTT, TCP_MAX_RTT].
 */
static __finline__ void TCP_SET_RTO(TCPIP_SOCKET *s)
{
	s->rto = s->srtt + (s->rttvar << 2);
	/* The original author questions whether a lower bound of one second
	   is sane, noting that it is bad even for terminal access. */
	if (s->rto < TCP_MIN_RTT) {
		s->rto = TCP_MIN_RTT;
	}
	if (__unlikely(s->rto > TCP_MAX_RTT)) {
		s->rto = TCP_MAX_RTT;
	}
}

/*
 * Seed srtt / rttvar / rto of a socket.
 * The host-order remote address is hashed into RTT_CACHE. If the slot was
 * written for the same upper 16 address bits and both cached values are
 * non-zero, they are scaled back up and the timeout is derived from them;
 * otherwise the socket starts from TCP_INIT_RTT with no variance.
 */
static __finline__ void INIT_SRTT(TCPIP_SOCKET *s)
{
	unsigned addr = ntohl(s->remote_addr);
	unsigned slot = (addr + (addr >> 16)) & 0xffff;
	if (RTT_CACHE[slot].hiaddr != (addr >> 16) || __unlikely(!RTT_CACHE[slot].srtt) || __unlikely(!RTT_CACHE[slot].rttvar)) {
		/* no usable cache entry */
		s->srtt = TCP_INIT_RTT;
		s->rttvar = 0;
		s->rto = TCP_INIT_RTT;
		return;
	}
	s->srtt = RTT_CACHE[slot].srtt << RTT_CACHE_TIME_SHIFT;
	s->rttvar = RTT_CACHE[slot].rttvar << RTT_CACHE_TIME_SHIFT;
	TCP_SET_RTO(s);
}

/*
 * Remember the socket's RTT estimators in RTT_CACHE, keyed by a hash of the
 * remote address. Nothing is stored while s->rttvar is still zero.
 * Values are scaled down by RTT_CACHE_TIME_SHIFT to fit 8 bits; a value too
 * large for the field is stored as 0, which INIT_SRTT treats as "no entry".
 */
static __finline__ void SAVE_SRTT(TCPIP_SOCKET *s)
{
	if (__likely(s->rttvar != 0)) {
		unsigned a = ntohl(s->remote_addr);
		unsigned aa = (a + (a >> 16)) & 0xffff;
		RTT_CACHE[aa].hiaddr = a >> 16;
		if (__unlikely(s->srtt >= 255 << RTT_CACHE_TIME_SHIFT)) {
			RTT_CACHE[aa].srtt = 0;
		} else {
			/* round up when scaling down */
			RTT_CACHE[aa].srtt = ((s->srtt + (1 << RTT_CACHE_TIME_SHIFT) - 1) >> RTT_CACHE_TIME_SHIFT);
			/* 0 means "invalid" to INIT_SRTT, so a measured value
			   that scales to 0 is stored as the smallest valid
			   value instead (the previous code re-assigned 0,
			   which had no effect). */
			if (__unlikely(!RTT_CACHE[aa].srtt)) RTT_CACHE[aa].srtt = 1;
		}
		if (__unlikely(s->rttvar >= 255 << RTT_CACHE_TIME_SHIFT)) {
			RTT_CACHE[aa].rttvar = 0;
		} else {
			/* adding a full unit before the shift keeps the stored
			   variance at least 1 */
			RTT_CACHE[aa].rttvar = ((s->rttvar + (1 << RTT_CACHE_TIME_SHIFT)) >> RTT_CACHE_TIME_SHIFT);
		}
	}
}

/*
 * Produce the window bits to OR into the 32-bit offset/flags/window word.
 * The offered window is clamped to 0..0xffff; when `scale' is non-zero and
 * the socket has SOCK_WINSCALE set, it is first shifted right by
 * TCP_WINDOW_SCALE. The result is returned through htonl().
 */
static __finline__ unsigned TCP_GET_WINDOW(TCPIP_SOCKET *s, int scale)
{
	int win = s->offered_window;
	if (__unlikely(win < 0)) {
		win = 0;
	}
	if (scale) {
		if (s->flags & SOCK_WINSCALE) win >>= TCP_WINDOW_SCALE;
	}
	if (__unlikely(win > 0xffff)) {
		win = 0xffff;
	}
	return htonl(win);
}

/*
 * Fill the connection-invariant parts of the IP and TCP headers of `p' from
 * socket `s': version/length byte, TOS, the 32-bit word starting at ip_ttl,
 * both addresses, both ports, and the current seq / ack numbers.
 * When `ecn' is zero the IPTOS_ECT bits are stripped from the TOS byte.
 */
static __finline__ void FILL_TCPIP_HEADER(TCPIP_SOCKET *s, PACKET *p, int ecn)
{
	ip(p)->ip_vhl = IP_VHL;
	if (ecn) {
		ip(p)->ip_tos = s->ip_tos;
	} else {
		ip(p)->ip_tos = s->ip_tos & ~IPTOS_ECT;
	}
	/* one 32-bit store covers the word that begins at ip_ttl */
	*(__u32 *)&ip(p)->ip_ttl = IP_TTL_P_SUM(IPPROTO_TCP);
	ip(p)->ip_src.s_addr = s->local_addr;
	ip(p)->ip_dst.s_addr = s->remote_addr;
	/* both ports are written with a single 32-bit store */
	*(__u32 *)&tcp(p)->th_sport = TCP_SET_PORTS(s->local_port, s->remote_port);
	tcp(p)->th_seq = htonl(s->seq);
	tcp(p)->th_ack = htonl(s->ack);
}

/* Header refresh used for retransmission (see TCP_RETRANSMIT): only the
   acknowledgment number is rewritten from the socket; every other header
   field already in the packet is left as it was. */
static __finline__ void FILL_TCPIP_HEADER_RTT(TCPIP_SOCKET *s, PACKET *p)
{
	tcp(p)->th_ack = htonl(s->ack);
}

/*
 * Write the 32-bit word starting at th_off4 (data offset, flags, window).
 * ACK and PUSH are always set, `head_size' is the header length passed to
 * TCP_SET_DATA, and the window comes from TCP_GET_WINDOW with scaling.
 * A pending SOCK_CWR is sent once (the socket flag is cleared here);
 * SOCK_ECE is echoed for as long as the socket flag stays set.
 */
static __finline__ void FILL_TCPIP_FLAGS(TCPIP_SOCKET *s, PACKET *p, unsigned head_size)
{
	unsigned ctl = TH32_ACK | TH32_PUSH | TCP_SET_DATA(head_size) | TCP_GET_WINDOW(s, 1);
	if (__unlikely(s->flags & SOCK_CWR)) {
		ctl |= TH32_CWR;
		s->flags &= ~SOCK_CWR;
	}
	if (__unlikely(s->flags & SOCK_ECE)) {
		ctl |= TH32_ECE;
	}
	*(__u32 *)&tcp(p)->th_off4 = ctl;
}

/*
 * Prepare checksum state of an outgoing TCP packet `p' whose TCP part
 * (header plus payload) is `total_size' bytes long.
 * The word at th_sum is seeded with
 * TCP_SUM(TCPUDP_MAGIC_CHECKSUM(IPPROTO_TCP, total_size)), p->checksum.u is
 * set with MKCHECKSUM and the packet is flagged PKT_OUTPUT_CHECKSUM |
 * PKT_OUTPUT_CHECKSUM_TCP | PKT_TCPUDP_CHECKSUM_OK.
 * When sizeof(long) == 4 the seed is written with __MOVNTI32 and
 * __MOVNTI_FLUSH() is issued at the end; otherwise a plain store is used.
 * NOTE(review): presumably the flags ask a lower layer to finish the
 * checksum on output - confirm against the packet layer.
 */
#define FILL_TCPIP_SUM(p, total_size)					\
do {									\
	/* save one cache line */					\
	if (sizeof(long) == 4) {					\
		__MOVNTI32((__u32 *)&tcp(p)->th_sum, TCP_SUM(TCPUDP_MAGIC_CHECKSUM(IPPROTO_TCP, total_size)));						\
	} else {							\
		*(__u32 *)&tcp(p)->th_sum = TCP_SUM(TCPUDP_MAGIC_CHECKSUM(IPPROTO_TCP, total_size));							\
	}								\
	p->checksum.u = MKCHECKSUM(sizeof(struct ip) - 2 * sizeof(in_addr_t), sizeof(struct ip) + 16);							\
	p->flags |= PKT_OUTPUT_CHECKSUM | PKT_OUTPUT_CHECKSUM_TCP | PKT_TCPUDP_CHECKSUM_OK;								\
	if (sizeof(long) == 4) {					\
		__MOVNTI_FLUSH();					\
	}								\
} while (0)

/*
 * Send RST|ACK segments for socket `s' on our own initiative.
 * Because the reset is not a reply to a received packet, it is repeated
 * TCP_RST_REPEATS times in case some copies get lost. Sending stops early
 * (after signalling delayed OOM) if a packet cannot be allocated.
 */
void TCP_SEND_SOCKET_RESET(TCPIP_SOCKET *s)
{
	int attempt = 0;
	while (attempt < TCP_RST_REPEATS) {
		PACKET *rst;
		ALLOC_PACKET(rst, sizeof(struct ip) + sizeof(struct tcphdr), &NET$PKTPOOL, SPL_NET, NET$DELAYED_OOM(); return);
		rst->fn = NET$FREE_PACKET;
		rst->data_length = sizeof(struct ip) + sizeof(struct tcphdr);
		FILL_TCPIP_HEADER(s, rst, 0);
		/* override the TOS chosen by FILL_TCPIP_HEADER */
		ip(rst)->ip_tos = IPTOS_PREC_PRIORITY | IPTOS_RELIABILITY;
		*(__u32 *)&tcp(rst)->th_off4 = TH32_RST | TH32_ACK | TCP_SET_DATA(5);
		FILL_TCPIP_SUM(rst, sizeof(struct tcphdr));
#ifdef __TCPIP_DEBUG
		__debug_printf("socket reset(flags:%02x,length:%d,seq:%08x,ack:%08x)", tcp(rst)->th_flags, rst->data_length, ntohl(tcp(rst)->th_seq), ntohl(tcp(rst)->th_ack));
#endif
		IP_SEND_PACKET(rst);
		attempt++;
	}
}

/*
 * Answer the received packet `op' with a header-only segment built from it:
 * addresses and ports are swapped, the sequence number is op's ack number
 * when op carried ACK (0 otherwise), and the ack number covers op's payload
 * plus one if op had SYN or FIN. The reply carries ACK, and RST unless op
 * had FIN set. Nothing is sent if no packet can be allocated.
 */
static void TCP_SEND_RESET(PACKET *op)
{
	PACKET *reply;
	unsigned ctl;
	ALLOC_PACKET(reply, sizeof(struct ip) + sizeof(struct tcphdr), &NET$PKTPOOL, SPL_NET, NET$DELAYED_OOM(); return);
	reply->fn = NET$FREE_PACKET;
	reply->data_length = sizeof(struct ip) + sizeof(struct tcphdr);

	/* IP header: mirror of the offending packet */
	ip(reply)->ip_vhl = IP_VHL;
	ip(reply)->ip_tos = IPTOS_PREC_PRIORITY | IPTOS_RELIABILITY;
	*(__u32 *)&ip(reply)->ip_ttl = IP_TTL_P_SUM(IPPROTO_TCP);
	ip(reply)->ip_src.s_addr = ip(op)->ip_dst.s_addr;
	ip(reply)->ip_dst.s_addr = ip(op)->ip_src.s_addr;

	/* TCP header */
	tcp(reply)->th_sport = tcp(op)->th_dport;
	tcp(reply)->th_dport = tcp(op)->th_sport;
	if (tcp(op)->th_flags & TH_ACK) {
		tcp(reply)->th_seq = tcp(op)->th_ack;
	} else {
		tcp(reply)->th_seq = htonl(0);
	}
	tcp(reply)->th_ack = htonl(ntohl(tcp(op)->th_seq) + (tcp(op)->th_flags & (TH_SYN | TH_FIN) ? 1 : 0) + op->data_length - sizeof(struct ip) - TCP_HLEN(tcp(op)->th_off4));
	ctl = TH32_ACK | TCP_SET_DATA(5);
	if (!(tcp(op)->th_flags & TH_FIN)) {
		ctl |= TH32_RST;
	}
	*(__u32 *)&tcp(reply)->th_off4 = ctl;
	FILL_TCPIP_SUM(reply, sizeof(struct tcphdr));
#ifdef __TCPIP_DEBUG
	__debug_printf("reset(flags:%02x,length:%d,seq:%08x,ack:%08x)", tcp(reply)->th_flags, reply->data_length, ntohl(tcp(reply)->th_seq), ntohl(tcp(reply)->th_ack));
#endif
	IP_SEND_PACKET(reply);
}

/*
 * Format `len' option bytes starting at `opt' as upper-case hex pairs
 * separated by ':' into `str' (capacity __MAX_STR_LEN) and return `str'.
 * With len == 0 the result is the empty string.
 */
char *PRINT_OPTIONS(char str[__MAX_STR_LEN], __u8 *opt, int len)
{
	char *cursor = str;
	char *const limit = str + __MAX_STR_LEN;
	*cursor = 0;
	while (len--) {
		/* no separator after the last byte */
		const char *sep = len ? ":" : "";
		_snprintf(cursor, limit - cursor, "%02X%s", *opt++, sep);
		cursor += strlen(cursor);
	}
	return str;
}

/*
 * Log a received segment that is larger than the socket's MSS allows.
 * Nothing is logged unless errorlevel is at least 2. The address buffers
 * are static, so the formatted text is only valid until the next call.
 */
static void BIG_PACKET(TCPIP_SOCKET *s, PACKET *p)
{
	static char local_txt[16];
	static char remote_txt[16];
	if (errorlevel < 2) return;
	KERNEL$SYSLOG(__SYSLOG_NET_ERROR, net_name, "TOO BIG TCP PACKET: LOCAL %s:%u, REMOTE %s:%u, FLAGS %02X, SIZE %u, MSS %u", PRINT_IP(local_txt, ip(p)->ip_dst.s_addr), ntohs(tcp(p)->th_dport), PRINT_IP(remote_txt, ip(p)->ip_src.s_addr), ntohs(tcp(p)->th_sport), tcp(p)->th_flags, p->data_length, s->mss);
}

/*
 * Log a segment whose TCP option area could not be parsed, including a hex
 * dump of the options. Nothing is logged unless errorlevel is at least 2.
 * All text buffers are static.
 */
static void INVALID_TCP_OPTIONS(PACKET *p)
{
	static char local_txt[16];
	static char remote_txt[16];
	static char options_txt[__MAX_STR_LEN];
	if (errorlevel < 2) return;
	KERNEL$SYSLOG(__SYSLOG_NET_ERROR, net_name, "INVALID TCP OPTIONS: LOCAL %s:%u, REMOTE %s:%u, FLAGS %02X, OPTIONS %s", PRINT_IP(local_txt, ip(p)->ip_dst.s_addr), ntohs(tcp(p)->th_dport), PRINT_IP(remote_txt, ip(p)->ip_src.s_addr), ntohs(tcp(p)->th_sport), tcp(p)->th_flags, PRINT_OPTIONS(options_txt, (__u8 *)(tcp(p) + 1), TCP_HLEN(tcp(p)->th_off4) - sizeof(struct tcphdr)));
}

/*
 * Walk the TCP options of packet `p' and apply the ones that are
 * understood to socket `s':
 *   TCPOPT_MAXSEG         - lowers s->mss (never below IP_MIN_MTU minus the
 *                           IP header, and rounded down to a multiple of 4)
 *   TCPOPT_WINDOW         - records the peer's shift in s->wnd_scale and
 *                           sets SOCK_WINSCALE
 *   TCPOPT_SACK_PERMITTED - sets SOCK_SACK
 * Other options are skipped using their length byte. Parsing stops at
 * TCPOPT_EOL. On a malformed option INVALID_TCP_OPTIONS is called and
 * parsing stops; options applied before that point stay in effect.
 */
static void PARSE_TCP_OPTIONS(TCPIP_SOCKET *s, PACKET *p)
{
	__u8 *hdr = (__u8 *)tcp(p);
	unsigned opos = sizeof(struct tcphdr);
	unsigned olen = TCP_HLEN(tcp(p)->th_off4);
	while (opos < olen) {
		unsigned opt = hdr[opos];
		unsigned opt_l;
		if (__unlikely(opt == TCPOPT_NOP)) {
			opos++;
			continue;
		}
		if (opt == TCPOPT_EOL) break;
		/* every remaining option needs a length byte */
		if (__unlikely(opos + 1 == olen)) goto invl_opt;
		opt_l = hdr[opos + 1];
		/* The length counts the kind and length bytes themselves, so
		   anything below 2 is malformed. Without this test a length
		   of 0 on an unrecognised option would never advance opos
		   and the loop would not terminate. */
		if (__unlikely(opt_l < 2)) goto invl_opt;
		if (__unlikely(opos + opt_l > olen)) goto invl_opt;
		if (opt == TCPOPT_MAXSEG) {
			unsigned mss;
			if (__unlikely(opt_l != TCPOLEN_MAXSEG)) goto invl_opt;
			mss = (hdr[opos + 2] << 8) + hdr[opos + 3];
			if (__unlikely(mss < IP_MIN_MTU - sizeof(struct ip)))
				mss = IP_MIN_MTU - sizeof(struct ip);
			if (__unlikely(mss < s->mss)) s->mss = mss & ~3;
		} else if (opt == TCPOPT_WINDOW) {
			unsigned ws;
			if (__unlikely(opt_l != TCPOLEN_WINDOW)) goto invl_opt;
			ws = hdr[opos + 2];
			if (__unlikely(ws >= 32)) goto invl_opt;
			s->wnd_scale = ws;
			s->flags |= SOCK_WINSCALE;
		} else if (__likely(opt == TCPOPT_SACK_PERMITTED)) {
			if (__unlikely(opt_l != TCPOLEN_SACK_PERMITTED)) goto invl_opt;
			s->flags |= SOCK_SACK;
		}
		opos += opt_l;
	}
	return;

	invl_opt:
	INVALID_TCP_OPTIONS(p);
}

/*
 * Put the TCP part of a socket into its initial state: RTT estimators
 * (from the cache when possible), a random initial sequence number, zeroed
 * counters, the initial offered window, and rate-measurement timestamps set
 * to the current jiffies. Finishes with SET_TIMEOUT.
 */
void INIT_TCP_SOCKET(TCPIP_SOCKET *s)
{
	INIT_SRTT(s);
	s->seq = RANDOM_TCP_SEQ(s);

	/* nothing received, sent or acknowledged yet */
	s->ack = 0;
	s->sent_queue_length = 0;
	s->wnd_scale = 0;
	s->bytes_acked = 0;
	s->offered_window = TCP_INIT_WINDOW;

	/* throughput accounting starts now */
	s->read_rate = 0;
	s->write_rate = 0;
	s->write_rate_time = KERNEL$GET_JIFFIES_LO();
	s->read_rate_time = s->write_rate_time;

	SET_TIMEOUT(s);
}

/*
 * Set the initial congestion window following RFC 3390:
 *   mss <= 1095        -> 4 * mss
 *   1095 < mss < 2190  -> 4380
 *   mss >= 2190        -> 2 * mss
 * ssthresh is set to -1.
 */
static void TCP_INIT_CWND(TCPIP_SOCKET *s)
{
	unsigned mss = s->mss;
	unsigned initial;
	if (__unlikely(mss <= 1095)) {
		initial = mss << 2;
	} else if (__likely(mss < 2190)) {
		initial = 4380;
	} else {
		initial = mss << 1;
	}
	s->cwnd = initial;
	s->ssthresh = -1;
}

/*
 * syn options: MSS(4), SACK-permitted(2), Window scale(3), End(1)
 */

/*
 * Send (or re-send) the initial SYN for socket `s' and arm the SYN
 * retransmit timer.
 * Returns without doing anything when the network ACL check fails.
 * The segment carries SYN|ECE|CWR, an unscaled window and 12 bytes of
 * options (MSS, SACK-permitted, window scale, end-of-list). s->mss is set
 * from IP_FIND_MTU for the remote address minus both headers, rounded down
 * to a multiple of 4.
 * If IP_SEND_PACKET reports an error it is stored in s->sock_error and the
 * socket is ended. If no packet could be allocated the send is skipped but
 * the timer is still armed, so TCP_SYN_RTT will try again later.
 */
void TCP_SEND_SYN(TCPIP_SOCKET *s)
{
	int r;
	int mss;
	PACKET *p;
	CHECK_NETACL(return);
#ifdef __TCPIP_DEBUG
	__debug_printf("send syn.\n");
#endif
	ALLOC_PACKET(p, sizeof(struct ip) + sizeof(struct tcphdr) + 12, &NET$PKTPOOL, SPL_NET, NET$DELAYED_OOM(); goto skip_packet);
	p->fn = NET$FREE_PACKET;
	p->data_length = sizeof(struct ip) + sizeof(struct tcphdr) + 12;
	FILL_TCPIP_HEADER(s, p, 0);
	/* data offset 8 words = 20-byte header + 12 bytes of options */
	*(__u32 *)&tcp(p)->th_off4 = TH32_SYN | TH32_ECE | TH32_CWR | TCP_SET_DATA(8) | TCP_GET_WINDOW(s, 0);
	FILL_TCPIP_SUM(p, sizeof(struct tcphdr) + 12);
	mss = (IP_FIND_MTU(s->remote_addr) - sizeof(struct ip) - sizeof(struct tcphdr)) & ~3;
	s->mss = mss;
	/* option words: [MSS kind,len,value16] [SACK-perm kind,len | WSCALE kind,len] [shift, EOL, 0, 0] */
	((__u32 *)tcpopt(p))[0] = htonl((TCPOPT_MAXSEG << 24) | (TCPOLEN_MAXSEG << 16) | (mss));
	((__u32 *)tcpopt(p))[1] = htonl((TCPOPT_SACK_PERMITTED << 24) | (TCPOLEN_SACK_PERMITTED << 16) | (TCPOPT_WINDOW << 8) | (TCPOLEN_WINDOW));
	((__u32 *)tcpopt(p))[2] = htonl((TCP_WINDOW_SCALE << 24) | (TCPOPT_EOL << 16));
	if (__unlikely(r = IP_SEND_PACKET(p))) {
		s->sock_error = r;
		TCP_END_SOCKET(s);
		return;
	}
	skip_packet:
	/* from now on incoming segments for this socket go to TCP_SYN_SENT */
	s->packet = TCP_SYN_SENT;
	s->timer.fn = TCP_SYN_RTT;
	KERNEL$SET_TIMER(s->rto, &s->timer);
}

/*
 * Packet handler for a socket that has sent its SYN.
 *  - SYN|ACK (without RST/FIN) acknowledging our SYN: the connection is
 *    completed. The received packet is reused to send the final ACK, TCP
 *    options are parsed, the congestion window is initialised, writers are
 *    woken and the handler becomes TCP_ESTABLISHED. ECN is enabled when the
 *    peer answered with ECE set and CWR clear.
 *  - RST acknowledging our SYN: the socket ends with -ECONNREFUSED.
 *  - SYN|ACK or RST with a wrong ack number: the packet is dropped.
 *  - anything else is passed to TCP_NULL_PACKET.
 */
void TCP_SYN_SENT(TCPIP_SOCKET *s, PACKET *p)
{
	__u8 flags = tcp(p)->th_flags;
	/* no need to check NETACL because this is called only when packet was accepted */
#ifdef __TCPIP_DEBUG
	__debug_printf("rcvd synack: %02X.", flags);
#endif
	if (__likely((flags & (TH_ACK | TH_RST | TH_SYN | TH_FIN)) == (TH_ACK | TH_SYN))) {
#ifdef __TCPIP_DEBUG
		__debug_printf("valid.\n");
#endif
		CLONE_PACKET_FOR_RESEND(p, NET$DELAYED_OOM(); goto drop);
		p->flags &= PKT_REUSE_FLAGS;
		/* the peer must acknowledge exactly our SYN */
		if (__unlikely(ntohl(tcp(p)->th_ack) != (tcp_seq)(s->seq + 1))) goto drop;
		if ((flags & (TH_ECE | TH_CWR)) == TH_ECE) s->ip_tos |= IPTOS_ECT, s->flags |= SOCK_ECN;
		RESET_TIMEOUT(s);
		/* our SYN consumed one sequence number, and so did the peer's */
		s->seq = (tcp_seq)(s->seq + 1);
		s->seq_acked = s->seq;
		s->read_seq = s->ack = (tcp_seq)(ntohl(tcp(p)->th_seq) + 1);
		p->data_length = sizeof(struct ip) + sizeof(struct tcphdr);
		/* the window in this segment is taken unscaled */
		s->wnd = ntohs(tcp(p)->th_win);

		PARSE_TCP_OPTIONS(s, p);

		TCP_INIT_CWND(s);

		/* turn the packet into the ACK that completes the handshake */
		FILL_TCPIP_HEADER(s, p, 0);
		*(__u32 *)&tcp(p)->th_off4 = TH32_PUSH | TH32_ACK | TCP_SET_DATA(5) | TCP_GET_WINDOW(s, 1);
		FILL_TCPIP_SUM(p, sizeof(struct tcphdr));
		IP_SEND_PACKET(p);

		WQ_WAKE_ALL_PL(&s->write_wait);
		TCP_SET_RTO(s);
		s->packet = TCP_ESTABLISHED;
		return;
	}
	if (__unlikely(flags & TH_RST)) {
		/* ignore resets that do not refer to our SYN */
		if (__unlikely(ntohl(tcp(p)->th_ack) != (tcp_seq)(s->seq + 1))) goto drop;
		RESET_TIMEOUT(s);
		s->sock_error = -ECONNREFUSED;
		TCP_END_SOCKET(s);
		goto drop;
	}
	TCP_NULL_PACKET(s, p);
	return;
	drop:
	p->status = 0;
	CALL_PKT(p);
}

/*
 * Timer callback: the SYN was not answered within s->rto.
 * While the connect deadline (the socket's send timeout if one is set,
 * TCP_CONNECT_TIMEOUT otherwise) has not passed, the timeout is backed off
 * and the SYN is sent again. Once the deadline has passed, a socket whose
 * rto is no larger than TCP_INIT_RTT is ended with its conn_error; otherwise
 * rto is reset to TCP_INIT_RTT and one more SYN is sent.
 */
static void TCP_SYN_RTT(TIMER *t)
{
	TCPIP_SOCKET *s;
	LOWER_SPL(SPL_NET);
	s = GET_STRUCT(t, TCPIP_SOCKET, timer);
	/* the timer is no longer pending */
	s->timer.fn = NULL;
	if (__likely(!((KERNEL$GET_JIFFIES_LO() - s->last_time) >= (__unlikely(s->sock_sndtimeo != 0) ? s->sock_sndtimeo : TCP_CONNECT_TIMEOUT)))) {
		INCREASE_RTO(s);
	} else {
		if (s->rto <= TCP_INIT_RTT) {
			s->sock_error = s->conn_error;
			TCP_END_SOCKET(s);
			return;
		}
		s->rto = TCP_INIT_RTT;
	}
	TCP_SEND_SYN(s);
}

/*
 * React to congestion signalled by the network: ssthresh becomes half of
 * the congestion window but not less than two segments. On an ECN-enabled
 * socket SOCK_CWR is set so that the next segment announces the reduction.
 * The congestion window itself is left for the caller to adjust.
 */
static __finline__ void TCP_CONGESTION(TCPIP_SOCKET *s)
{
	s->ssthresh = s->cwnd >> 1;
	if (__unlikely(s->ssthresh < (s->mss << 1))) {
		s->ssthresh = s->mss << 1;
	}
	if (s->flags & SOCK_ECN) {
		s->flags |= SOCK_CWR;
	}
}

/*
 * React to congestion reported locally for a sent packet: both ssthresh and
 * cwnd are reduced to 7/8 of the current congestion window
 * (1/2 + 1/4 + 1/8, each term truncated separately).
 */
static __finline__ void TCP_LOCAL_CONGESTION(TCPIP_SOCKET *s)
{
	s->cwnd = s->ssthresh = (s->cwnd >> 1) + (s->cwnd >> 2) + (s->cwnd >> 3);
}

/*
 * Remove packet `p' from whatever socket queue it is on and give its quota
 * back to socket `s'. Must be called at SPL_NET; anything else is fatal.
 * A packet that is still outstanding cannot be freed yet: it is moved to
 * LOST_PACKETS with sender_data cleared, and TCP_SENT_PACKET_RETURNED frees
 * it when it comes back.
 */
__finline__ void TCPIP_DELETE_PACKET(TCPIP_SOCKET *s, PACKET *p)
{
	if (__unlikely(KERNEL$SPL != SPL_X(SPL_NET)))
		KERNEL$SUICIDE("DELETE_PACKET AT SPL %08X", KERNEL$SPL);
	NET$QFREE((SOCKET *)s, sizeof(PACKET) + p->length);
	DEL_FROM_LIST(&p->list);
	if (__unlikely(p->flags & PKT_OUTSTANDING)) {
		/* still in flight below us: park it until it returns */
		ADD_TO_LIST(&LOST_PACKETS, &p->list);
		p->sender_data = NULL;
	} else {
		FREE_PACKET(p, &NET$PKTPOOL, SPL_NET);
	}
}

/*
 * Completion AST for data packets handed to IP_SEND_PACKET by TCP_SEND_MORE
 * and TCP_RETRANSMIT (they install it as p->fn).
 * A packet whose sender_data is NULL was already deleted by
 * TCPIP_DELETE_PACKET while it was outstanding; it is unlinked and freed
 * here. Otherwise the packet stays on its queue: a local congestion mark
 * (PKT_LOCAL_ECN_CE) is applied to the owning socket, and the outstanding
 * and congestion flags are cleared so the packet may be retransmitted or
 * deleted later.
 */
DECL_AST(TCP_SENT_PACKET_RETURNED, SPL_NET, PACKET)
{
	PKT_AST_ENTER(RQ);
	if (__unlikely(!RQ->sender_data)) {
		DEL_FROM_LIST(&RQ->list);
		FREE_PACKET(RQ, NULL, SPL_NET);
	} else {
		if (__unlikely(RQ->flags & PKT_LOCAL_ECN_CE)) TCP_LOCAL_CONGESTION(RQ->sender_data);
		RQ->flags &= ~(PKT_OUTSTANDING | PKT_LOCAL_ECN_CE);
#if __DEBUG >= 1
		/* debug builds clear the callback while the packet is idle */
		RQ->fn = NULL;
#endif
	}
	RETURN;
}

/*
 * Transmit as many queued data packets as the peer's window and the
 * congestion window allow.
 * Returns 1 if at least one packet was handed to IP_SEND_PACKET, 0
 * otherwise (including when the network ACL check fails).
 * Packets come from s->out_queue; when that is empty the partially filled
 * s->prepared_packet is sent only if the socket flags allow it (see below).
 * Every packet sent is appended to s->sent_queue, and the retransmit timer
 * (TCP_RTT) is started when that queue was empty.
 */
int TCP_SEND_MORE(TCPIP_SOCKET *s)
{
	PACKET *p;
	unsigned wnd;
	unsigned l;
	int sent = 0;
	CHECK_NETACL(goto ret);
	s->flags &= ~SOCK_SHOULD_SEND_MORE;
	/* both windows are treated as at least one segment */
	if (__unlikely(s->cwnd < s->mss)) {
		s->cwnd = s->mss;
	}
	if (__unlikely((wnd = s->wnd) < s->mss)) {
		wnd = s->mss;
	}
#ifdef __TCPIP_DEBUG
	__debug_printf("w: %d\n", wnd);
#endif
	while (__likely(s->sent_queue_length < wnd) && s->sent_queue_length < s->cwnd) {
		if (__unlikely(LIST_EMPTY(&s->out_queue))) {
			if (!s->prepared_packet) return sent;
			/* Without SHUTDOWN_WRITE / NONAGLE / CORK the partial
			   packet waits while unacknowledged data is in
			   flight; with CORK alone (no shutdown) it always
			   waits. */
			if (__likely(!(s->flags & (SOCK_SHUTDOWN_WRITE | SOCK_NONAGLE | SOCK_CORK)))) {
				if (!LIST_EMPTY(&s->sent_queue)) return sent;
			} else if ((s->flags & (SOCK_SHUTDOWN_WRITE | SOCK_CORK)) == SOCK_CORK) return sent;
			p = s->prepared_packet;
			s->prepared_packet = NULL;
		} else {
			p = LIST_STRUCT(s->out_queue.next, PACKET, list);
			DEL_FROM_LIST(&p->list);
		}
		if (__unlikely(LIST_EMPTY(&s->sent_queue))) {
			/* first packet in flight: start the retransmit timer */
			TCP_SET_RTO(s);
			SET_TIMEOUT(s);
			s->timer.fn = TCP_RTT;
			KERNEL$SET_TIMER(s->rto, &s->timer);
		}
		ADD_TO_LIST_END(&s->sent_queue, &p->list);
		p->fn = TCP_SENT_PACKET_RETURNED;
		p->sent_time = KERNEL$GET_JIFFIES_LO();
		p->flags |= PKT_OUTSTANDING | PKT_DONT_CHANGE | PKT_LOCAL_ECN;
		p->sender_data = s;
		FILL_TCPIP_HEADER(s, p, 1);
		FILL_TCPIP_FLAGS(s, p, 5);
		/* a FIN packet is accounted as one sequence number */
		if (__unlikely(p->flags & PKT_FIN)) *(__u32 *)&tcp(p)->th_off4 |= TH32_FIN, l = 1;
		else l = p->data_length - sizeof(struct ip) - sizeof(struct tcphdr);
		s->sent_queue_length += l;
		/* remember packets sent beyond the window the peer advertised */
		if (__unlikely(s->sent_queue_length > s->wnd)) p->flags |= PKT_OVER_WINDOW;
		s->out_queue_length -= l;
		s->seq += l;
		FILL_TCPIP_SUM(p, p->data_length - sizeof(struct ip));
#ifdef __TCPIP_DEBUG
		__debug_printf("send(flags:%02x,length:%d,seq:%08x,ack:%08x)", tcp(p)->th_flags, p->data_length, ntohl(tcp(p)->th_seq), ntohl(tcp(p)->th_ack));
#endif
		IP_SEND_PACKET(p);
		sent = 1;
	}
	ret:
	return sent;
}

/*
 * Send packet `p' from the sent queue of socket `s' once more.
 * Nothing happens when the network ACL check fails or when the packet is
 * still outstanding from a previous transmission. The packet is marked
 * PKT_RETRANSMITTED, only its ack number, flags word and checksum state are
 * refreshed, and it is handed to IP_SEND_PACKET.
 */
static void TCP_RETRANSMIT(TCPIP_SOCKET *s, PACKET *p)
{
	CHECK_NETACL(return);
	if (__unlikely(p->flags & PKT_OUTSTANDING)) {
		/* the previous copy has not come back yet */
		return;
	}
	p->fn = TCP_SENT_PACKET_RETURNED;
	p->sent_time = KERNEL$GET_JIFFIES_LO();
	p->flags &= ~PKT_LOCAL_ECN_CE;
	p->flags |= PKT_RETRANSMITTED | PKT_OUTSTANDING;
	p->sender_data = s;
	FILL_TCPIP_HEADER_RTT(s, p);
	FILL_TCPIP_FLAGS(s, p, 5);
	if (__unlikely(p->flags & PKT_FIN)) {
		*(__u32 *)&tcp(p)->th_off4 |= TH32_FIN;
	}
	FILL_TCPIP_SUM(p, p->data_length - sizeof(struct ip));
#ifdef __TCPIP_DEBUG
	__debug_printf("retransmit(flags:%02x,length:%d,seq:%08x,ack:%08x)", tcp(p)->th_flags, p->data_length, ntohl(tcp(p)->th_seq), ntohl(tcp(p)->th_ack));
#endif
	IP_SEND_PACKET(p);
}

/*
static void INVALID_ACK(TCPIP_SOCKET *s, PACKET *p)
{
	static char a1[16];
	static char a2[16];
	if (errorlevel >= 2)
		KERNEL$SYSLOG(__SYSLOG_NET_ERROR, net_name, "INVALID ACK, LOCAL %s:%u, REMOTE %s:%u, ACK %X, OUT SEQ %X, OUTSTANDING %X", PRINT_IP(a1, s->local_addr), ntohs(s->local_port), PRINT_IP(a2, s->remote_addr), ntohs(s->remote_port), ntohl(tcp(p)->th_ack), s->seq, s->sent_queue_length);
}
*/

/*
static void INVALID_SEQ(TCPIP_SOCKET *s, PACKET *p)
{
	static char a1[16];
	static char a2[16];
	if (errorlevel >= 2)
		KERNEL$SYSLOG(__SYSLOG_NET_ERROR, net_name, "INVALID SEQ, LOCAL %s:%u, REMOTE %s:%u, SEQ %X, LENGTH %X, ACKED %X, WINDOW %X", PRINT_IP(a1, s->local_addr), ntohs(s->local_port), PRINT_IP(a2, s->remote_addr), ntohs(s->remote_port), ntohl(tcp(p)->th_seq), p->data_length, s->ack, s->offered_window);
}

static void OVER_WINDOW(TCPIP_SOCKET *s, PACKET *p)
{
	static char a1[16];
	static char a2[16];
	if (errorlevel >= 2)
		KERNEL$SYSLOG(__SYSLOG_NET_ERROR, net_name, "SEQ OVER WINDOW, LOCAL %s:%u, REMOTE %s:%u, SEQ %X, LENGTH %X, ACKED %X, WINDOW %X", PRINT_IP(a1, s->local_addr), ntohs(s->local_port), PRINT_IP(a2, s->remote_addr), ntohs(s->remote_port), ntohl(tcp(p)->th_seq), p->data_length, s->ack, s->offered_window);
}
*/

/*
 * Classify a received data packet against the socket's receive position.
 * Returns -1 if the packet must be dropped, 0 if it overlaps the next
 * expected byte (s->ack) and can be queued in line, and 1 if it lies
 * further ahead and belongs on the out-of-order queue.
 */
static __finline__ int ENQUEUE_OOO(TCPIP_SOCKET *s, PACKET *p)
{
	unsigned payload = p->data_length - sizeof(struct ip) - sizeof(struct tcphdr);
	/* distance of the packet's last byte from s->ack, modulo 2^32 */
	unsigned last = (tcp_seq)(ntohl(tcp(p)->th_seq) + payload - s->ack - 1);
	if (__unlikely(last > s->offered_window + payload - 2)) {
		/*INVALID_SEQ(s, p);*/
		return -1;
	}
	return __unlikely(last < payload) ? 0 : 1;
}

/*
 * Mark packets on the sent queue that are fully covered by a SACK block.
 * `sacks' holds four (left, right) pairs in host byte order:
 * sacks[0..1], sacks[2..3], sacks[4..5], sacks[6..7].
 * Returns 1 if at least one packet was newly marked PKT_SACKED, else 0.
 *
 * Sequence numbers may wrap, so each comparison distinguishes whether the
 * packet range [seq, seq2] wraps and whether the SACK block wraps:
 *  - neither wraps: plain containment test;
 *  - only the block wraps: the packet lies in either half of the block;
 *  - the packet wraps: only a wrapping block can contain it.
 */
static __finline__ int PROCESS_SACKS(TCPIP_SOCKET *s, unsigned *sacks)
{
	int new_sacked = 0;
	PACKET *p;
	LIST_FOR_EACH(p, &s->sent_queue, PACKET, list) {
		unsigned seq = ntohl(tcp(p)->th_seq);
		unsigned seq2 = (tcp_seq)(seq + p->data_length - sizeof(struct ip) - sizeof(struct tcphdr));
		if (__likely(seq <= seq2)) {
			/* packet range does not wrap */
			if (__likely(sacks[0] <= sacks[1])) {
				if (seq >= sacks[0] && seq2 <= sacks[1]) goto sacked;
			} else {
				if (seq >= sacks[0] || seq2 <= sacks[1]) goto sacked;
			}
			if (__likely(sacks[2] <= sacks[3])) {
				if (seq >= sacks[2] && seq2 <= sacks[3]) goto sacked;
			} else {
				if (seq >= sacks[2] || seq2 <= sacks[3]) goto sacked;
			}
			if (__likely(sacks[4] <= sacks[5])) {
				if (seq >= sacks[4] && seq2 <= sacks[5]) goto sacked;
			} else {
				if (seq >= sacks[4] || seq2 <= sacks[5]) goto sacked;
			}
			if (__likely(sacks[6] <= sacks[7])) {
				if (seq >= sacks[6] && seq2 <= sacks[7]) goto sacked;
			} else {
				if (seq >= sacks[6] || seq2 <= sacks[7]) goto sacked;
			}
		} else {
			/* packet range wraps around 2^32 */
			if (__unlikely(sacks[0] > sacks[1])) {
				if (seq >= sacks[0] && seq2 <= sacks[1]) goto sacked;
			}
			if (__unlikely(sacks[2] > sacks[3])) {
				if (seq >= sacks[2] && seq2 <= sacks[3]) goto sacked;
			}
			if (__unlikely(sacks[4] > sacks[5])) {
				if (seq >= sacks[4] && seq2 <= sacks[5]) goto sacked;
			}
			if (__unlikely(sacks[6] > sacks[7])) {
				if (seq >= sacks[6] && seq2 <= sacks[7]) goto sacked;
			}
		}
		continue;
		sacked:
		/* only a transition to SACKED counts as new information */
		if (!(p->flags & PKT_SACKED)) {
			p->flags |= PKT_SACKED;
			new_sacked = 1;
		}
	}
	return new_sacked;
}

/*
 * Extract the SACK blocks from the options of packet `p' and apply them to
 * the sent queue of socket `s' through PROCESS_SACKS.
 * Returns PROCESS_SACKS's result (1 if a packet was newly marked as
 * selectively acknowledged), or 0 when the socket has not negotiated SACK
 * or the options are malformed (which is also logged).
 * The caller must make sure the packet carries options at all: the first
 * option word is read unconditionally.
 *
 * Two layouts are handled: the common NOP, NOP, SACK prefix, which allows
 * aligned 32-bit reads, and a generic option walk with byte-wise reads.
 * Blocks not present in the packet are left as zero.
 */
static int GET_SACKS(TCPIP_SOCKET *s, PACKET *p)
{
	unsigned i;
	__u32 sack;
	unsigned sacks[8];
	if (!(s->flags & SOCK_SACK)) return 0;
	memset(sacks, 0, sizeof sacks);
	sack = ntohl(*(__u32 *)(tcp(p) + 1));
	if (__likely((sack & 0xFFFFFF00U) == (TCPOPT_NOP << 24) + (TCPOPT_NOP << 16) + (TCPOPT_SACK << 8))) {
		/* low byte of the first option word is the SACK option length */
		sack &= 0xFF;
		if (sizeof(struct tcphdr) + 2 + sack > TCP_HLEN(tcp(p)->th_off4)) goto inv_opt;
		/* Strip kind and length bytes. A length below 2 wraps to a
		   value that is not a multiple of 8 and is rejected below. */
		sack -= 2;
		if (__unlikely(sack & 7)) goto inv_opt;
		/* never copy more than the local array can hold */
		if (__unlikely(sack > sizeof sacks)) goto inv_opt;
		for (i = 0; i < sack; i += 8) {
			*(__u32 *)((__u8 *)sacks + i) = ntohl(*(__u32 *)((__u8 *)tcp(p) + sizeof(struct tcphdr) + 4 + i));
			*(__u32 *)((__u8 *)sacks + i + 4) = ntohl(*(__u32 *)((__u8 *)tcp(p) + sizeof(struct tcphdr) + 4 + 4 + i));
		}
	} else {
		unsigned opos = sizeof(struct tcphdr);
		unsigned olen = TCP_HLEN(tcp(p)->th_off4);
		while (opos < olen) {
			unsigned opt = *((__u8 *)tcp(p) + opos);
			unsigned opt_l;
			if (__unlikely(opt == TCPOPT_NOP)) {
				opos++;
				continue;
			}
			if (opt == TCPOPT_EOL) break;
			if (__unlikely(opos + 1 == olen)) goto inv_opt;
			opt_l = *((__u8 *)tcp(p) + opos + 1);
			/* The length includes the kind and length bytes, so a
			   value below 2 is malformed. Without this test a
			   zero length on a non-SACK option would never
			   advance opos and the loop would not terminate. */
			if (__unlikely(opt_l < 2)) goto inv_opt;
			if (__unlikely(opos + opt_l > olen)) goto inv_opt;
			if (opt == TCPOPT_SACK) {
				opt_l -= 2;
				if (__unlikely(opt_l & 7)) goto inv_opt;
				/* never copy more than the local array can hold */
				if (__unlikely(opt_l > sizeof sacks)) goto inv_opt;
				for (i = 0; i < opt_l; i += 8) {
					*(__u32 *)((__u8 *)sacks + i) = (*((__u8 *)tcp(p) + opos + 2 + i) << 24) + (*((__u8 *)tcp(p) + opos + 3 + i) << 16) + (*((__u8 *)tcp(p) + opos + 4 + i) << 8) + *((__u8 *)tcp(p) + opos + 5 + i);
					*(__u32 *)((__u8 *)sacks + i + 4) = (*((__u8 *)tcp(p) + opos + 6 + i) << 24) + (*((__u8 *)tcp(p) + opos + 7 + i) << 16) + (*((__u8 *)tcp(p) + opos + 8 + i) << 8) + *((__u8 *)tcp(p) + opos + 9 + i);
				}
				/* only the first SACK option is used */
				break;
			}
			opos += opt_l;
		}
	}
	return PROCESS_SACKS(s, sacks);

	inv_opt:
	INVALID_TCP_OPTIONS(p);
	return 0;
}

/*
 * Acknowledge received data.
 * If the socket is flagged SOCK_SHOULD_SEND_MORE and TCP_SEND_MORE manages
 * to send a data packet, that packet already carries the acknowledgment
 * and nothing else is sent. Otherwise a header-only ACK is built and sent,
 * unless the network ACL check fails or no packet can be allocated.
 */
static void TCP_SEND_ACK(TCPIP_SOCKET *s)
{
	PACKET *ack_pkt;
	if (__unlikely(s->flags & SOCK_SHOULD_SEND_MORE)) {
		/* piggy-back the ACK on outgoing data when possible */
		if (__likely(TCP_SEND_MORE(s))) return;
	}
	CHECK_NETACL(return);
	ALLOC_PACKET(ack_pkt, sizeof(struct ip) + sizeof(struct tcphdr), &NET$PKTPOOL, SPL_NET, NET$DELAYED_OOM(); return);
	ack_pkt->fn = NET$FREE_PACKET;
	ack_pkt->data_length = sizeof(struct ip) + sizeof(struct tcphdr);
	FILL_TCPIP_HEADER(s, ack_pkt, 0);
	FILL_TCPIP_FLAGS(s, ack_pkt, 5);
	FILL_TCPIP_SUM(ack_pkt, sizeof(struct tcphdr));
#ifdef __TCPIP_DEBUG
	__debug_printf("ack(flags:%02x,length:%d,seq:%08x,ack:%08x)", tcp(ack_pkt)->th_flags, ack_pkt->data_length, ntohl(tcp(ack_pkt)->th_seq), ntohl(tcp(ack_pkt)->th_ack));
#endif
	IP_SEND_PACKET(ack_pkt);
}

/*
 * Acknowledge a segment with an unexpected seq/ack number, most of the
 * time. Answering every such segment could make two hosts bounce ACKs
 * back and forth forever, so the ACK is skipped whenever the low six bits
 * of random() are all zero.
 */
static void TCP_MAYBE_SEND_ACK(TCPIP_SOCKET *s)
{
	if (__unlikely(s->flags & SOCK_SHOULD_SEND_MORE)) {
		if (__likely(TCP_SEND_MORE(s))) return;
	}
	if (__likely(random() & 63)) {
		TCP_SEND_ACK(s);
	}
}

void TCP_ESTABLISHED(TCPIP_SOCKET *s, PACKET *p)
{
	unsigned ack;
	__u8 flags = tcp(p)->th_flags;
#ifdef __TCPIP_DEBUG
	__debug_printf("recv(flags:%02x,length:%d,seq:%08x,ack:%08x)", flags, p->data_length, ntohl(tcp(p)->th_seq), ntohl(tcp(p)->th_ack));
#endif
	if (__likely((flags & (TH_SYN | TH_RST | TH_ACK)) == TH_ACK)) {
		tcp_seq a;
		ack = ntohl(tcp(p)->th_ack);
		if (__likely((a = (tcp_seq)(ack - s->seq_acked)) <= s->sent_queue_length)) {
			u_jiffies_lo_t j;
			unsigned ownd;
			s->write_rate += a;
			j = KERNEL$GET_JIFFIES_LO();
			if (__unlikely(j - s->write_rate_time > TCP_WRITE_BUFFER_TIME)) {
				s->write_rate >>= 1;
				s->write_rate_time = s->write_rate_time + ((j - s->write_rate_time) >> 1);
			}
			ownd = s->wnd;
			s->wnd = ntohs(tcp(p)->th_win) << s->wnd_scale;
			if (s->flags & SOCK_ECN) {
				if (__unlikely(ip(p)->ip_tos & IPTOS_CE)) s->flags |= SOCK_ECE;
				if (__unlikely(flags & TH_CWR)) s->flags &= ~SOCK_ECE;
				if (__unlikely(s->flags & SOCK_ECE_IGNORE)) {
					if (__unlikely(j >= s->cwr_time) || !(flags & TH_ECE)) s->flags &= ~SOCK_ECE_IGNORE;
				} else {
					if (__unlikely(flags & TH_ECE)) {
						TCP_CONGESTION(s);
						s->cwnd = s->ssthresh;
						s->flags |= SOCK_ECE_IGNORE;
						s->cwr_time = j + s->srtt + (s->rttvar << 1);
					}
				}
			}
			if (ack != s->seq_acked) {
				int do_rttcalc = 1;
				while (__likely(!LIST_EMPTY(&s->sent_queue))) {
					PACKET *pp = LIST_STRUCT(s->sent_queue.next, PACKET, list);
					if ((ack - (ntohl(tcp(pp)->th_seq) + pp->data_length - sizeof(struct ip) - sizeof(struct tcphdr) + !!(pp->flags & PKT_FIN))) < 0x80000000U) {
						unsigned lim;
						if (do_rttcalc) {
							if (__likely(!(pp->flags & PKT_RETRANSMITTED))) {
								u_jiffies_lo_t jj = j - pp->sent_time;
								if (__unlikely(!s->rttvar)) {
									s->srtt = jj | 1;
									s->rttvar = (jj >> 1) | 1;
								} else {
									u_jiffies_lo_t a = jj - s->srtt;
									if ((jiffies_lo_t)a < 0) a = -a;
									s->rttvar = ((s->rttvar >> 1) + (s->rttvar >> 2) + (a >> 2)) | 1;
									s->srtt = ((s->srtt >> 1) + (s->srtt >> 2) + (s->srtt >> 3) + (jj >> 3)) | 1;
								}
							} else if (__likely(!(pp->flags & PKT_OVER_WINDOW))) {
				/* RFC does not specify this, but in case rto
				   gets too low so that packets get
				   retransmitted prematuraly, TCP has no
				   chance to recover (every packet gets ack
				   after retransmit, no time is sampled, srtt
				   doesn't grow). This allows slow recoverry
				   in such cases */
								if (__likely(s->srtt < TCP_MAX_RTT)) s->srtt += (s->srtt >> 5) + 1;
							}
							do_rttcalc = 0;
						}
						if (__unlikely(pp->flags & PKT_FIN)) {
							s->sent_queue_length--;
							s->flags |= SOCK_FIN_ACKED;
							WQ_WAKE_ALL(&s->linger_wait);
						}
						s->sent_queue_length -= pp->data_length - sizeof(struct ip) - sizeof(struct tcphdr);
						lim = TCP_WRITE_BUFFER(s);
						if (__likely(s->sock_sndlowat <= 1)) {
							if (s->sent_queue_length + s->out_queue_length < (lim >> 1) + (lim >> 2)) WQ_WAKE_ALL(&s->write_wait);
						} else {
							if (s->sent_queue_length + s->out_queue_length + s->sock_sndlowat <= lim) WQ_WAKE_ALL_PL(&s->write_wait);
						}
						TCPIP_DELETE_PACKET(s, pp);
						if (s->timer.fn) RESET_TIMEOUT(s);
					} else {
						if (__likely(!s->timer.fn)) {
							jiffies_lo_t jj;
							TCP_SET_RTO(s);
							jj = pp->sent_time + s->rto - j;
							if (__unlikely(jj < 0)) jj = 0;
							SET_TIMEOUT(s);
							s->timer.fn = TCP_RTT;
							KERNEL$SET_TIMER(jj, &s->timer);
						}
						break;
					}
				}
				SAVE_SRTT(s);
				if (__unlikely((s->flags & (SOCK_FIN_ACKED | SOCK_SHUTDOWN_READ)) == (SOCK_FIN_ACKED | SOCK_SHUTDOWN_READ))) {
					if (tcp(p)->th_flags & TH_FIN) {
						s->ack++;
						TCP_SEND_ACK(s);
					}
					TCP_END_SOCKET(s);
					goto drop;
				}
				if (__unlikely(s->flags & (SOCK_DUP_ACK | SOCK_DUP_ACK_2 | SOCK_FAST_RECOVERY))) {
					s->flags &= ~(SOCK_DUP_ACK | SOCK_DUP_ACK_2 | SOCK_FAST_RECOVERY);
					if (__unlikely(!(s->flags & SOCK_FAST_RECOVERY))) goto x1;
					s->cwnd = s->ssthresh;
				} else x1: if (__unlikely(s->cwnd <= s->ssthresh)) {
					/* slow start */
					/* RFC 3465 */
					unsigned d = (tcp_seq)(ack - s->seq_acked);
					unsigned l = s->mss << 1;
					if (s->flags & SOCK_SS_AFTER_RTO) l = s->mss;
					if (__unlikely(d > l)) d = l;
					s->cwnd += d;
					/*s->cwnd += s->mss;*/
				} else {
					/* congestion avoidance */
					/* RFC 3465 */
					s->flags &= ~SOCK_SS_AFTER_RTO;
					if (__unlikely((s->bytes_acked += (tcp_seq)(ack - s->seq_acked)) >= s->cwnd)) {
						s->bytes_acked -= s->cwnd;
						s->cwnd += s->mss;
					}
					/*unsigned inc = s->mss * s->mss / s->cwnd;
					if (__unlikely(!inc)) inc = 1;
					s->cwnd += inc;*/
				}
				s->seq_acked = ack;
				/*TCP_SEND_MORE(s);*/
				s->flags |= SOCK_SHOULD_SEND_MORE;
				if (__unlikely(TCP_HLEN(tcp(p)->th_off4) != sizeof(struct tcphdr))) GET_SACKS(s, p);
			} else {
				if (sizeof(struct ip) + TCP_HLEN(tcp(p)->th_off4) == p->data_length && __likely(!(tcp(p)->th_flags & TH_FIN))) {
					int new_sacked;
					if (__unlikely(!s->wnd)) {
						TCP_SEND_MORE(s);
						goto just_an_ack;
					}
					if (__unlikely(LIST_EMPTY(&s->sent_queue))) goto just_an_ack;
					new_sacked = 0;
					if (__likely(TCP_HLEN(tcp(p)->th_off4) != sizeof(struct tcphdr))) new_sacked = GET_SACKS(s, p);
					/* RFC2001: FAST RECOVERY */
					if (__unlikely(!(s->flags & SOCK_DUP_ACK))) {
						s->flags |= SOCK_DUP_ACK;
						goto limited_transmit;
					}
					if (__unlikely(!(s->flags & SOCK_DUP_ACK_2))) {
						unsigned oc;
						int sent;
						s->flags |= SOCK_DUP_ACK_2;

						limited_transmit:
						/* RFC3042: LIMITED TRANSMIT */
						/*if (!(s->flags & SOCK_SACK) || new_sacked) */
						if (__likely(!((s->flags & (new_sacked - 1) & SOCK_SACK)))) {
/* I don't know the exact reason why we can't do it without new_sacked. RFC3042
   talks something about protecting from misbehaving clients however, but it is
   not an issue, because attacker can turn off SACK anyway. But it's preceded
   with MUST NOT, so I implement it ... it costs only 64 bytes of code */
							oc = s->cwnd;
							s->cwnd += s->mss << 1;
							sent = TCP_SEND_MORE(s);
							s->cwnd = oc;
							if (!sent) goto just_an_ack;
							goto drop;
						}
						goto just_an_ack;
					}
					if (__unlikely(!(s->flags & SOCK_FAST_RECOVERY))) {
						if (!(s->flags & SOCK_SACK)) {
							rt1:
							TCP_RETRANSMIT(s, LIST_STRUCT(s->sent_queue.next, PACKET, list));
						} else {
							int rt = 0;
							PACKET *pp, *ps = NULL;
							LIST_FOR_EACH(pp, &s->sent_queue, PACKET, list) {
								if (!(pp->flags & PKT_SACKED)) {
									if (!ps) ps = pp;
								} else {
									if (__unlikely(ps != NULL)) {
										do {
											TCP_RETRANSMIT(s, ps);
											rt = 1;
										} while ((ps = LIST_STRUCT(ps->list.next, PACKET, list)) != pp);
										ps = NULL;
									}
								}
							}
							if (__unlikely(!rt)) goto rt1;
						}
						TCP_CONGESTION(s);
						s->flags |= SOCK_FAST_RECOVERY;
						s->cwnd = s->ssthresh + 3 * s->mss;
					} else s->cwnd += s->mss;
					if (!TCP_SEND_MORE(s)) goto just_an_ack;
					goto drop;
				}
			}
		} else if (a < -(tcp_seq)TCP_ACK_BACKWARD) {
	/* allow ack backward for the case packet get reordered */
			/*INVALID_ACK(s, p);*/
			TCP_MAYBE_SEND_ACK(s);
			goto drop;
		}
		if (__unlikely(TCP_HLEN(tcp(p)->th_off4) != sizeof(struct tcphdr))) {
			if (__likely(sizeof(struct ip) + TCP_HLEN(tcp(p)->th_off4) == p->data_length) && __likely(!(tcp(p)->th_flags & TH_FIN))) {
				test_more_drop:
				if (s->flags & SOCK_SHOULD_SEND_MORE) TCP_SEND_MORE(s);
				goto drop;
			}
			CLONE_PACKET(p, NET$DELAYED_OOM(); goto test_more_drop);
			memmove(tcp(p) + 1, (char *)tcp(p) + TCP_HLEN(tcp(p)->th_off4), p->data_length - (sizeof(struct ip) + TCP_HLEN(tcp(p)->th_off4)));
			p->data_length -= TCP_HLEN(tcp(p)->th_off4) - sizeof(struct tcphdr);
		}
		if (p->data_length == sizeof(struct ip) + sizeof(struct tcphdr) && __likely(!(tcp(p)->th_flags & TH_FIN))) {
			just_an_ack:
			if (__unlikely(ntohl(tcp(p)->th_seq) != s->ack)) {
				TCP_MAYBE_SEND_ACK(s);
			} else goto test_more_drop;
			goto drop;
		}
		if (__unlikely(p->data_length > s->mss + sizeof(struct ip) + sizeof(struct tcphdr) + 3)) {
			BIG_PACKET(s, p);
			goto drop;
		}
		if (__likely(ntohl(tcp(p)->th_seq) == s->ack)) {
			unsigned d;
			if (__unlikely(s->flags & SOCK_SHUTDOWN_READ)) {
				TCP_SEND_RESET(p);
				TCP_RESET_SOCKET(s);
				goto drop;
			}
			enq:
			if (__unlikely(!s->offered_window)) {
				/*OVER_WINDOW(s, p);*/	/* this is actually
					not error --- zero window probe */
				TCP_SEND_ACK(s);
				goto drop;
			}
			DUP_PACKET(p, NULL, NET$DELAYED_OOM(); goto test_more_drop);
			if (__unlikely(NET$QALLOC((SOCKET *)s, sizeof(PACKET) + p->length))) goto drop;
			ADD_TO_LIST_END(&s->in_queue, &p->list);
			if (__likely((tcp_seq)(s->ack - s->read_seq) >= s->sock_rcvlowat))
				WQ_WAKE_ALL_PL(&s->read_wait);
			d = p->data_length - sizeof(struct ip) - sizeof(struct tcphdr) + (tcp_seq)(s->ack - ntohl(tcp(p)->th_seq));
			s->offered_window -= d;
			s->ack = (tcp_seq)(ntohl(tcp(p)->th_seq) + p->data_length - sizeof(struct ip) - sizeof(struct tcphdr));
			if (__unlikely(tcp(p)->th_flags & TH_FIN)) {
				fin_recv:
				s->ack++;
				s->flags |= SOCK_SHUTDOWN_READ;
				WQ_WAKE_ALL_PL(&s->read_wait);
				if (s->flags & SOCK_FIN_ACKED) {
					TCP_SEND_ACK(s);
		/* we should have 2*MSL timeout here, but who account the socket to ? */
					TCP_END_SOCKET(s);
					return;
				}
			} else while (__unlikely(!LIST_EMPTY(&s->in_ooo_queue))) {
				int r;
				PACKET *pp = LIST_STRUCT(s->in_ooo_queue.next, PACKET, list);
				if (__likely(ntohl(tcp(pp)->th_seq) == s->ack)) {
					enq_ooo:
					DEL_FROM_LIST(&pp->list);
					ADD_TO_LIST_END(&s->in_queue, &pp->list);
					d = pp->data_length - sizeof(struct ip) - sizeof(struct tcphdr) + (tcp_seq)(s->ack - ntohl(tcp(pp)->th_seq));
					s->offered_window -= d;
					s->ack = (tcp_seq)(ntohl(tcp(pp)->th_seq) + pp->data_length - sizeof(struct ip) - sizeof(struct tcphdr));
					if (__unlikely(tcp(pp)->th_flags & TH_FIN)) goto fin_recv;
					continue;
				}
				r = ENQUEUE_OOO(s, pp);
				if (__unlikely(r < 0)) {
					DEL_FROM_LIST(&pp->list);
					NET$QFREE((SOCKET *)s, sizeof(PACKET) + pp->length);
					FREE_PACKET(pp, NULL, SPL_NET);
				} else if (__unlikely(r == 0)) goto enq_ooo;
				break;
			}
			/* ? delay ack ? */
			TCP_SEND_ACK(s);
			return;
		} else {
			int r;
			PACKET *pp, *ap, *xp;
			unsigned nsack;
			r = ENQUEUE_OOO(s, p);
			if (r < 0) {
				TCP_SEND_ACK(s);
				goto drop;
			}
			if (__unlikely(s->flags & SOCK_SHUTDOWN_READ)) {
				TCP_SEND_RESET(p);
				TCP_RESET_SOCKET(s);
				goto drop;
			}
			if (__unlikely(r == 0)) {
				goto enq;
			}
			if (s->flags & SOCK_SHOULD_SEND_MORE) TCP_SEND_MORE(s);
			LIST_FOR_EACH_BACK(pp, &s->in_ooo_queue, PACKET, list) {
				if (__unlikely((tcp_seq)(ntohl(tcp(pp)->th_seq) - ntohl(tcp(p)->th_seq) - (p->data_length - sizeof(struct ip) - sizeof(struct tcphdr))) < 0x80000000U)) continue;
				if (__unlikely((tcp_seq)(ntohl(tcp(p)->th_seq) - ntohl(tcp(pp)->th_seq) - (pp->data_length - sizeof(struct ip) - sizeof(struct tcphdr))) < 0x80000000U)) break;
				p->status = 0;
				CALL_PKT(p);
				p = pp;
				goto sak;
			}
			DUP_PACKET(p, NULL, NET$DELAYED_OOM(); goto drop);
			if (__unlikely(NET$QALLOC((SOCKET *)s, sizeof(PACKET) + p->length))) goto drop;
			ADD_TO_LIST_AFTER(&pp->list, &p->list);

			sak:
			if (!(s->flags & SOCK_SACK)) {
				TCP_SEND_ACK(s);
				return;
			}
			ALLOC_PACKET(ap, sizeof(struct ip) + sizeof(struct tcphdr) + 4 + 32, &NET$PKTPOOL, SPL_NET, NET$DELAYED_OOM(); return);
			ap->fn = NET$FREE_PACKET;

			while (__unlikely(p->list.next != (LIST_ENTRY *)&s->in_ooo_queue)) {
				pp = LIST_STRUCT(p->list.next, PACKET, list);
				if (__unlikely(ntohl(tcp(pp)->th_seq) != (tcp_seq)(ntohl(tcp(p)->th_seq) + p->data_length - sizeof(struct ip) - sizeof(struct tcphdr)))) break;
				p = pp;
			}
#define sack(p)	((__u32 *)((__u8 *)tcpopt(p) + 4))
			xp = p;
			nsack = 0;
			do {
				sack(ap)[nsack + 1] = htonl(ntohl(tcp(p)->th_seq) + p->data_length - sizeof(struct ip) - sizeof(struct tcphdr) + (__unlikely(tcp(p)->th_flags & TH_FIN) ? 1 : 0));
				while (__likely(p->list.prev != (LIST_ENTRY *)&s->in_ooo_queue)) {
					pp = LIST_STRUCT(p->list.prev, PACKET, list);
					if (__unlikely(ntohl(tcp(p)->th_seq) != (tcp_seq)(ntohl(tcp(pp)->th_seq) + pp->data_length - sizeof(struct ip) - sizeof(struct tcphdr)))) break;
					p = pp;
				}
				sack(ap)[nsack] = tcp(p)->th_seq;
				nsack += 2;
				p = LIST_STRUCT(p->list.prev, PACKET, list);
				if (__unlikely(p == LIST_STRUCT((LIST_ENTRY *)&s->in_ooo_queue, PACKET, list))) p = LIST_STRUCT(p->list.prev, PACKET, list);
			} while (p != xp && nsack < 8);
#undef sack
			FILL_TCPIP_HEADER(s, ap, 0);
			FILL_TCPIP_FLAGS(s, ap, 6 + nsack);
			nsack <<= 2;
			ap->data_length = sizeof(struct ip) + sizeof(struct tcphdr) + 4 + nsack;
			FILL_TCPIP_SUM(ap, sizeof(struct tcphdr) + 4 + nsack);
			((__u32 *)tcpopt(ap))[0] = htonl((TCPOPT_NOP << 24) + (TCPOPT_NOP << 16) + (TCPOPT_SACK << 8) + 2 + nsack);
#ifdef __TCPIP_DEBUG
		__debug_printf("sack(flags:%02x,length:%d,seq:%08x,ack:%08x)", tcp(ap)->th_flags, ap->data_length, ntohl(tcp(ap)->th_seq), ntohl(tcp(ap)->th_ack));
#endif
			IP_SEND_PACKET(ap);
			return;
		}
	}
	if (__unlikely((flags & (TH_FIN | TH_SYN | TH_RST | TH_ACK)) == (TH_SYN | TH_ACK))) {
		if (__unlikely(ntohl(tcp(p)->th_ack) == s->seq_acked)) {
			TCP_SEND_ACK(s);
		}
		goto drop;
	}
		/* !!! TODO: react to SYN according to RFC793 or draft-ietf-tcpm-tcpsecure-00.txt */
	if (flags & TH_RST) {
		if (__likely((tcp_seq)(ntohl(tcp(p)->th_seq) - s->ack) <= s->offered_window)) {
			if (__unlikely((s->flags & (SOCK_SHUTDOWN_READ | SOCK_SHUTDOWN_WRITE)) != (SOCK_SHUTDOWN_READ | SOCK_SHUTDOWN_WRITE))) {
				s->sock_error = -ECONNRESET;
			} else {
				s->flags |= SOCK_FIN_ACKED;
			}
			TCP_END_SOCKET(s);
		}
		goto drop;
	}
	drop:
	p->status = 0;
	CALL_PKT(p);
}

/*
 * Retransmission timer callback for a socket (installed as s->timer.fn).
 *
 * If the time since s->last_time has reached the send timeout
 * (s->sock_sndtimeo when non-zero, otherwise TCP_WRITE_TIMEOUT) and s->wnd is
 * non-zero, the socket is either ended with s->conn_error (when the RTO is
 * already at or below TCP_INIT_RTT) or its RTO is set to TCP_INIT_RTT.
 * Otherwise the RTO is passed to INCREASE_RTO().
 *
 * The first packet on sent_queue is then retransmitted and the timer re-armed
 * with the current RTO. An empty sent_queue at this point is treated as fatal.
 */
static void TCP_RTT(TIMER *t)
{
	TCPIP_SOCKET *s;
	PACKET *pkt;
	int timed_out;

	LOWER_SPL(SPL_NET);
	s = GET_STRUCT(t, TCPIP_SOCKET, timer);

	timed_out = (KERNEL$GET_JIFFIES_LO() - s->last_time) >= (__unlikely(s->sock_sndtimeo != 0) ? s->sock_sndtimeo : TCP_WRITE_TIMEOUT);
	if (__unlikely(timed_out) && s->wnd) {
		if (s->rto <= TCP_INIT_RTT) {
			/* Give up on the connection; the timer is not re-armed. */
			s->timer.fn = NULL;
			s->sock_error = s->conn_error;
			TCP_END_SOCKET(s);
			return;
		}
		s->rto = TCP_INIT_RTT;
	} else {
		INCREASE_RTO(s);
	}

	if (__unlikely(LIST_EMPTY(&s->sent_queue))) {
		KERNEL$SUICIDE("TCP_RTT: RETRANSMIT TIMEOUT ON EMPTY QUEUE");
	}
	pkt = LIST_STRUCT(s->sent_queue.next, PACKET, list);
	TCP_RETRANSMIT(s, pkt);

	if (s->wnd) {
		TCP_CONGESTION(s);
		/* Restart from a congestion window of one segment. */
		s->cwnd = s->mss;
		if (s->flags & SOCK_SACK) {
			/* Clear the PKT_SACKED mark on every sent packet. */
			LIST_FOR_EACH(pkt, &s->sent_queue, PACKET, list) {
				pkt->flags &= ~PKT_SACKED;
			}
		}
		s->flags |= SOCK_SS_AFTER_RTO;
	}
	KERNEL$SET_TIMER(s->rto, &s->timer);
}

/*
 * Packet handler for a listening socket.
 *
 * Packets from an inappropriate source address are dropped. A segment whose
 * ACK/RST/SYN/FIN bits are exactly SYN creates a sibling socket, fills in its
 * addressing and sequence state from the packet, links it onto the listener's
 * backlog_preparing list and sends a SYN+ACK. Every other segment is handed
 * to TCP_NULL_PACKET().
 */
void TCP_LISTEN(TCPIP_SOCKET *s, PACKET *p)
{
	TCPIP_SOCKET *ns;
	__u8 flags = tcp(p)->th_flags;
#ifdef __TCPIP_DEBUG
	__debug_printf("listen rcv.");
#endif
	if (__unlikely(INAPPROPRIATE_TCP_ADDRESS(ip(p)->ip_src.s_addr))) goto drop;

	/* Anything but a plain SYN is not a connection request. */
	if (__unlikely((flags & (TH_ACK | TH_RST | TH_SYN | TH_FIN)) != TH_SYN)) {
		TCP_NULL_PACKET(s, p);
		return;
	}
#ifdef __TCPIP_DEBUG
	__debug_printf("valid.\n");
#endif
	ns = (TCPIP_SOCKET *)NET$SOCKET_CREATE_SIBLING((SOCKET *)s);
	if (__unlikely(!ns)) goto drop;

	/* Unless the listener has SOCK_TOS_SET, take the TOS from the SYN (ECN bits masked off). */
	if (__likely(!(s->flags & SOCK_TOS_SET))) {
		unsigned tos = ip(p)->ip_tos & ~(IPTOS_CE | IPTOS_ECT);
		if (__likely(TOS_VALID(tos))) ns->ip_tos = tos;
	}

	ns->remote_addr = ip(p)->ip_src.s_addr;
	ns->remote_port = tcp(p)->th_sport;
	ns->local_addr = ip(p)->ip_dst.s_addr;
	INSERT_INTO_HASH(ns);

	ns->listener = s;
	ADD_TO_LIST_END(&s->backlog_preparing, &ns->backlog_list);

	INIT_TCP_SOCKET(ns);
	ns->seq_acked = (tcp_seq)(ns->seq + 1);
	/* The peer's SYN occupies one sequence number. */
	ns->read_seq = ns->ack = (tcp_seq)(ntohl(tcp(p)->th_seq) + 1);
	ns->mss = (IP_FIND_MTU(ns->remote_addr) - sizeof(struct ip) - sizeof(struct tcphdr)) & ~3;

	/* A SYN carrying both ECE and CWR enables ECN on the new socket. */
	if ((tcp(p)->th_flags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) {
		ns->ip_tos |= IPTOS_ECT;
		ns->flags |= SOCK_ECN;
	}

	PARSE_TCP_OPTIONS(ns, p);
	TCP_SEND_SYNACK(ns);

	drop:
	p->status = 0;
	CALL_PKT(p);
}

/*
 * Build and send a SYN+ACK for socket s, then switch the socket's packet
 * handler to TCP_SYN_RECEIVED and arm the TCP_SYNACK_RTT timer with s->rto.
 *
 * The segment always carries an MSS option; SACK-permitted and window-scale
 * options are appended according to SOCK_SACK / SOCK_WINSCALE. If the packet
 * cannot be allocated, nothing is sent but the handler and timer are still
 * set up.
 *
 * NOTE(review): when CHECK_NETACL fails the function returns before the
 * handler and timer are set up -- confirm the socket is cleaned up elsewhere.
 * NOTE(review): the MSS option advertises s->mss + sizeof(struct tcphdr);
 * confirm this matches how s->mss is defined for the rest of the stack.
 */
static void TCP_SEND_SYNACK(TCPIP_SOCKET *s)
{
	unsigned n_extra;	/* 32-bit option words following the MSS word */
	int want_sack, want_wscale;
	__u32 *opt_words;
	PACKET *p;
	CHECK_NETACL(return);
#ifdef __TCPIP_DEBUG
	__debug_printf("send synack.\n");
#endif
	ALLOC_PACKET(p, sizeof(struct ip) + sizeof(struct tcphdr) + 12, &NET$PKTPOOL, SPL_NET, NET$DELAYED_OOM(); goto skip_packet);
	p->fn = NET$FREE_PACKET;
	FILL_TCPIP_HEADER(s, p, 0);

	opt_words = (__u32 *)tcpopt(p);
	opt_words[0] = htonl((TCPOPT_MAXSEG << 24) | (TCPOLEN_MAXSEG << 16) | (s->mss + sizeof(struct tcphdr)));

	want_sack = (s->flags & SOCK_SACK) != 0;
	want_wscale = (s->flags & SOCK_WINSCALE) != 0;
	if (want_sack && want_wscale) {
		/* SACK-permitted (2 bytes) + window scale (3 bytes) + EOL, spread over two words. */
		opt_words[1] = htonl((TCPOPT_SACK_PERMITTED << 24) | (TCPOLEN_SACK_PERMITTED << 16) | (TCPOPT_WINDOW << 8) | (TCPOLEN_WINDOW));
		opt_words[2] = htonl((TCP_WINDOW_SCALE << 24) | (TCPOPT_EOL << 16));
		n_extra = 2;
	} else if (want_sack) {
		opt_words[1] = htonl((TCPOPT_SACK_PERMITTED << 24) | (TCPOLEN_SACK_PERMITTED << 16) | (TCPOPT_EOL << 8));
		n_extra = 1;
	} else if (want_wscale) {
		opt_words[1] = htonl(((TCPOPT_WINDOW) << 24) | (TCPOLEN_WINDOW << 16) | (TCP_WINDOW_SCALE << 8) | TCPOPT_EOL);
		n_extra = 1;
	} else {
		n_extra = 0;
	}

	/* Header is 5 words + 1 MSS word + n_extra option words. */
	p->data_length = sizeof(struct ip) + sizeof(struct tcphdr) + 4 + (n_extra << 2);
	*(__u32 *)&tcp(p)->th_off4 = TH32_SYN | TH32_ACK | (s->flags & SOCK_ECN ? TH32_ECE : 0) | TCP_SET_DATA(6 + n_extra) | TCP_GET_WINDOW(s, 0);
	FILL_TCPIP_SUM(p, sizeof(struct tcphdr) + 4 + (n_extra << 2));
	IP_SEND_PACKET(p);

	skip_packet:
	s->packet = TCP_SYN_RECEIVED;
	s->timer.fn = TCP_SYNACK_RTT;
	KERNEL$SET_TIMER(s->rto, &s->timer);
}

/*
 * Timer callback for a socket waiting for the ACK of its SYN+ACK.
 *
 * While the connect timeout (s->sock_sndtimeo when non-zero, otherwise
 * TCP_CONNECT_TIMEOUT) has not been reached since s->last_time, the RTO is
 * passed to INCREASE_RTO() and the SYN+ACK is sent again. Once the timeout is
 * reached, the socket is destroyed if its RTO is already at or below
 * TCP_INIT_RTT; otherwise the RTO is set to TCP_INIT_RTT and the SYN+ACK is
 * sent again.
 */
static void TCP_SYNACK_RTT(TIMER *t)
{
	TCPIP_SOCKET *s;

	LOWER_SPL(SPL_NET);
	s = GET_STRUCT(t, TCPIP_SOCKET, timer);
	/* The timer has fired; TCP_SEND_SYNACK() installs it again. */
	s->timer.fn = NULL;

	if (__likely((KERNEL$GET_JIFFIES_LO() - s->last_time) < (__unlikely(s->sock_sndtimeo) ? s->sock_sndtimeo : TCP_CONNECT_TIMEOUT))) {
		INCREASE_RTO(s);
	} else if (s->rto <= TCP_INIT_RTT) {
		TCPIP_DESTROY_SOCKET((SOCKET *)s);
		return;
	} else {
		s->rto = TCP_INIT_RTT;
	}
	TCP_SEND_SYNACK(s);
}

/*
 * Packet handler for a passively opened socket that has sent its SYN+ACK and
 * is waiting for the final ACK of the handshake.
 *
 * - A segment with ACK set and RST/SYN clear whose acknowledgement number is
 *   s->seq + 1 completes the handshake: the socket moves from the listener's
 *   preparing backlog to backlog_connected, waiters are woken, the handler
 *   becomes TCP_ESTABLISHED and the same packet is passed on to it.
 *   An ACK with any other acknowledgement number is dropped.
 * - A plain SYN that repeats the peer's original SYN triggers an immediate
 *   run of the SYN+ACK timer handler, which sends the SYN+ACK again.
 * - Everything else goes to TCP_NULL_PACKET().
 */
static void TCP_SYN_RECEIVED(TCPIP_SOCKET *s, PACKET *p)
{
	__u8 flags = tcp(p)->th_flags;
#ifdef __TCPIP_DEBUG
	__debug_printf("ack rcv.");
#endif
	if (__likely((flags & (TH_ACK | TH_RST | TH_SYN)) == TH_ACK)) {
#ifdef __TCPIP_DEBUG
		__debug_printf("valid.\n");
#endif
		/* The ACK must cover exactly our SYN. */
		if (__unlikely(ntohl(tcp(p)->th_ack) != (tcp_seq)(s->seq + 1))) goto drop;
		RESET_TIMEOUT(s);
		s->seq = (tcp_seq)(s->seq + 1);
		s->seq_acked = s->seq;
		TCP_SET_RTO(s);
		DEL_FROM_LIST(&s->backlog_list);
		ADD_TO_LIST_END(&s->listener->backlog_connected, &s->backlog_list);
		WQ_WAKE_ALL_PL(&s->listener->read_wait);
		WQ_WAKE_ALL(&s->write_wait);
		s->packet = TCP_ESTABLISHED;
		/* Let the established-state handler process this same packet. */
		TCP_ESTABLISHED(s, p);
		return;
	}
	if (__likely((flags & (TH_ACK | TH_RST | TH_SYN)) == TH_SYN)) {
		/*
		 * A retransmitted SYN carries the peer's initial sequence
		 * number. TCP_LISTEN stored that number plus one in s->ack, so
		 * compare against s->ack - 1. (s->seq_acked is in our own
		 * sequence space and must not be used here.)
		 */
		if (ntohl(tcp(p)->th_seq) == (tcp_seq)(s->ack - 1)) {
			p->status = 0;
			CALL_PKT(p);
			/* Fire the SYN+ACK timer handler now instead of waiting for it. */
			KERNEL$DEL_TIMER(&s->timer);
			TCP_SYNACK_RTT(&s->timer);
			return;
		}
	}
	TCP_NULL_PACKET(s, p);
	return;
	drop:
	p->status = 0;
	CALL_PKT(p);
}

/*
 * IPTCP_RW.I is included twice: first with `sio` defined (as an empty macro)
 * and then again with `sio` undefined.
 * NOTE(review): presumably the included file tests `sio` to generate two
 * variants of the same code -- confirm in IPTCP_RW.I.
 */
#define sio
#include "IPTCP_RW.I"

#undef sio
#include "IPTCP_RW.I"

/*
 * Linger timer callback (installed as s->linger_timer.fn by TCP_CLOSE).
 *
 * Once at least s->sock_lingertime jiffies have passed since
 * s->linger_start, the timer is marked idle and everything waiting on
 * s->linger_wait is woken. If the callback runs earlier than that, the timer
 * is re-armed for one more jiffy.
 */
static void SOCK_LINGER_WAKE(TIMER *t)
{
	TCPIP_SOCKET *s;

	LOWER_SPL(SPL_NET);
	s = LIST_STRUCT(t, TCPIP_SOCKET, linger_timer);

	if (__likely(KERNEL$GET_JIFFIES_LO() - s->linger_start >= s->sock_lingertime)) {
		s->linger_timer.fn = NULL;
		WQ_WAKE_ALL(&s->linger_wait);
		return;
	}
	/* Linger period not over yet: check again on the next jiffy. */
	KERNEL$SET_TIMER(1, &s->linger_timer);
}

/*
 * Shut down one or both directions of TCP socket s on behalf of request rq.
 *
 * `type` is incremented on entry; after that, bit 0 (value 1) selects the
 * read-side shutdown and bit 1 (value 2) selects the write-side shutdown.
 * NOTE(review): presumably callers pass SHUT_RD/SHUT_WR/SHUT_RDWR (0/1/2) --
 * confirm against the callers.
 *
 * Returns 0 when the function ran to completion or had nothing to do.
 * Returns 1 from every path that goes through the `retry` label, i.e. after
 * rq has been passed to CALL_IORQ_LSTAT_EXPR, completed through CALL_AST with
 * an error status, or queued on a wait queue with WQ_WAIT_F.
 */
int TCP_CLOSE(TCPIP_SOCKET *s, int type, IORQ *rq)
{
	int r;
	PACKET *p;
	/* A socket that already carries an error is left alone. */
	if (__unlikely(s->sock_error)) return 0;
	type++;
	if (__unlikely(s->packet != TCP_ESTABLISHED)) {
		/* Not established: only a listener with the read bit set changes
		   state -- its packet handler becomes TCP_NULL_PACKET. */
		if (__unlikely(s->packet == TCP_LISTEN)) {
			if (type & 1) s->packet = TCP_NULL_PACKET;
		}
		return 0;
	}
	/* ---- write side ---- */
	if (__likely(type & 2)) {
		u_jiffies_lo_t j = KERNEL$GET_JIFFIES_LO();
		if (__likely(!(s->flags & SOCK_SHUTDOWN_WRITE))) {
			/* Move a pending prepared packet onto the output queue so it
			   precedes the FIN packet queued below. */
			if (__unlikely(s->prepared_packet != NULL)) {
				ADD_TO_LIST_END(&s->out_queue, &s->prepared_packet->list);
				s->prepared_packet = NULL;
			}
			/* Allocate a header-only packet for the FIN. The block passed
			   as the last argument is the allocation-failure path; it
			   also defines the labels `call_again` and `retry`, which the
			   code further down jumps back into. */
			ALLOC_PACKET(p, sizeof(struct ip) + sizeof(struct tcphdr), NULL, SPL_NET, {
				WQ *wq = NET$OOM();
				if (__unlikely(!wq)) {
					call_again:
					CALL_IORQ_LSTAT_EXPR(rq, (unsigned long)rq->tmp1);
				} else if (__unlikely(__IS_ERR(wq))) {
					rq->status = __PTR_ERR(wq);
					CALL_AST(rq);
				} else {
					WQ_WAIT_F(wq, rq);
				}
				retry:
				return 1;
			});
			/* Charge the packet to the socket's quota; on failure free it
			   and either hand the request to `call_again` (r == 1) or
			   complete it with r as the status. */
			if (__unlikely(r = NET$QALLOC((SOCKET *)s, sizeof(PACKET) + p->length))) {
				FREE_PACKET(p, NULL, SPL_NET);
				if (r == 1) goto call_again;
				rq->status = r;
				CALL_AST(rq);
				goto retry;
			}
			s->flags |= SOCK_SHUTDOWN_WRITE;
			WQ_WAKE_ALL(&s->write_wait);
			/* Queue the FIN packet and try to transmit. */
			p->data_length = sizeof(struct ip) + sizeof(struct tcphdr);
			p->flags |= PKT_FIN;
			ADD_TO_LIST_END(&s->out_queue, &p->list);
			s->out_queue_length++;
			TCP_SEND_MORE(s);
			/* Start the linger period; the timer is armed only when a
			   linger time is configured. */
			s->linger_start = j;
			if (__unlikely(s->sock_lingertime != 0)) {
				s->linger_timer.fn = SOCK_LINGER_WAKE;
				KERNEL$SET_TIMER(s->sock_lingertime, &s->linger_timer);
			}
		}
		/* FIN not acknowledged yet and the linger period (measured with the
		   timestamp j taken above) has not elapsed: wait on linger_wait
		   and return 1. */
		if (__likely(!(s->flags & SOCK_FIN_ACKED)) && __unlikely(j - s->linger_start < s->sock_lingertime)) {
			WQ_WAIT_F(&s->linger_wait, rq);
			goto retry;
		}
	}
	/* ---- read side ---- */
	if (__likely(type & 1)) {
		s->flags |= SOCK_SHUTDOWN_READ;
		WQ_WAKE_ALL(&s->read_wait);
		/* Packets still sitting in in_queue: reset the connection and end
		   the socket. */
		if (__unlikely(!LIST_EMPTY(&s->in_queue))) {
			TCP_SEND_SOCKET_RESET(s);
			TCP_END_SOCKET(s);
			return 0;
		}
		/* Discard every out-of-order packet, returning its quota. */
		while (__unlikely(!LIST_EMPTY(&s->in_ooo_queue))) {
			p = LIST_STRUCT(s->in_ooo_queue.next, PACKET, list);
			DEL_FROM_LIST(&p->list);
			NET$QFREE((SOCKET *)s, sizeof(PACKET) + p->length);
			FREE_PACKET(p, NULL, SPL_NET);
		}
		/* If SOCK_FIN_ACKED is already set, end the socket now. */
		if (s->flags & SOCK_FIN_ACKED) TCP_END_SOCKET(s);
	}
	return 0;
}

