/*
 * IN: %EAX - INIT VAL	( / DEST POINTER)
 *     %EDX - POINTER
 *     %ECX - LEN
 * OUT: %EAX - UNFOLDED CHECKSUM
 */

	.ALIGN	64
/*
 * CHECKSUM_AMD
 *
 * Sums the %ECX bytes at %EDX, one 32-bit word at a time, with an ADCL
 * carry chain and returns the 32-bit (unfolded) sum in %EAX.
 *
 * Without COPY: %EAX holds the initial value the data is added to.
 * With COPY:    %EAX holds the destination pointer; it is moved to %EDI,
 *               the sum starts from zero and every piece of data read is
 *               also stored to the destination.
 *
 * %EBX (and %EDI in the COPY build) are saved on the stack and restored
 * before returning. %ECX and %EDX are modified.
 *
 * Layout: 0-3 trailing bytes (len & 3) are handled first by the code at
 * labels 2/3/4, then the leading (len & 12) bytes are handled by jumping
 * into the middle of the unrolled 16-byte loop, then len / 16 full
 * iterations follow.
 */
	.GLOBAL	CHECKSUM_AMD
CHECKSUM_AMD:
#ifdef COPY
	/* %EDI = destination, %EAX = running sum starting at zero */
	PUSHL	%EDI
	MOVL	%EAX, %EDI
	XORL	%EAX, %EAX
#endif
	/* Is the length a multiple of 4?  PUSHL leaves the flags alone. */
	TESTL	$3, %ECX
	PUSHL	%EBX
	JNZ	2f
	/*
	 * %ECX = number of full 16-byte iterations.
	 * %EBX = len & 12 = size of the leading partial group (0, 4, 8, 12).
	 * The pointer is biased by (partial - 16) so that the partial group
	 * occupies the END of the first pass through the unrolled loop.
	 */
6:	MOVL	%ECX, %EBX
	SHRL	$4, %ECX
	ANDL	$12, %EBX
	LEAL	-16(%EBX, %EDX), %EDX
#ifndef COPY
	/*
	 * %EBX = 3 - (number of leading dwords).  The entry point is
	 * 0f + 3 * %EBX, i.e. one ADCL is skipped per missing dword; this
	 * depends on each "ADCL disp8(%EDX), %EAX" being 3 bytes long.
	 * XORL also clears CF, so the ADCL chain starts without a carry.
	 */
	SHRL	$2, %EBX
	XORL	$3, %EBX
	LEAL	0f(%EBX, %EBX, 2), %EBX
#else
	/*
	 * Bias the destination like the source.  %EBX = 12 - 4 * dwords and
	 * the entry point is 0f + 2 * %EBX, i.e. 8 bytes are skipped per
	 * missing dword; this depends on each MOVL/ADCL/MOVL triple below
	 * being 8 bytes long.  XORL also clears CF for the ADCL chain.
	 */
	LEAL	-16(%EBX, %EDI), %EDI
	XORL	$12, %EBX
	LEAL	0f(%EBX, %EBX), %EBX
#endif
	/*
	 * Exported label placed right behind the LEAL that embeds the
	 * absolute address of 0f.
	 * NOTE(review): presumably consumed by external code that relocates
	 * this routine and must patch that address - confirm.
	 */
	.GLOBAL	CHECKSUM_AMD_FIXUP_PLUS
CHECKSUM_AMD_FIXUP_PLUS:
	JMP	*%EBX
	.ALIGN	16
	/*
	 * Unrolled loop, 16 bytes per iteration.  Label 1 is the loop head,
	 * label 0 is the base for the computed entry above.  Only
	 * instructions that preserve CF (MOVL, LEAL, DECL) are used between
	 * the ADCLs, so the carry propagates across the whole buffer.
	 */
#ifndef COPY
1:	ADCL	(%EDX), %EAX
0:	ADCL	4(%EDX), %EAX
	ADCL	8(%EDX), %EAX
	ADCL	12(%EDX), %EAX
	LEAL	16(%EDX), %EDX
#else
1:	MOVL	(%EDX), %EBX
	ADCL	%EBX, %EAX
	MOVL	%EBX, (%EDI)
0:	MOVL	4(%EDX), %EBX
	ADCL	%EBX, %EAX
	MOVL	%EBX, 4(%EDI)
	MOVL	8(%EDX), %EBX
	ADCL	%EBX, %EAX
	MOVL	%EBX, 8(%EDI)
	MOVL	12(%EDX), %EBX
	ADCL	%EBX, %EAX
	MOVL	%EBX, 12(%EDI)
	LEAL	16(%EDX), %EDX
	LEAL	16(%EDI), %EDI
#endif
	/* Loop while the iteration counter has not gone negative. */
	DECL	%ECX
	JNS	1b
	POPL	%EBX
	/* Add the last pending carry back into the sum. */
	ADCL	$0, %EAX
#ifdef COPY
	POPL	%EDI
#endif
	RET

	/*
	 * Length is not a multiple of 4.  Bit 0 clear means exactly two
	 * trailing bytes: handle the 16-bit word at offset len - 2.
	 */
	.ALIGN	__CPU_BRANCH_ALIGN
2:	TESTL	$1, %ECX
	JNZ	3f
#ifndef COPY
	/* Add the word into the low half of the sum, then add the carry. */
	ADDW	-2(%EDX, %ECX), %AX
	ADCL	$0, %EAX
#else
	/* The sum is still zero here, so the word can be loaded directly. */
	MOVZWL	-2(%EDX, %ECX), %EAX
	MOVW	%AX, -2(%EDI, %ECX)
#endif
	/* The code at 6 ignores the low two bits of the length. */
	JMP	6b
	/*
	 * Length is odd.  Bit 1 set means three trailing bytes: a 16-bit
	 * word at offset len - 3 followed by the last byte; otherwise only
	 * the last byte remains.
	 */
	.ALIGN	__CPU_BRANCH_ALIGN
3:	TESTL	$2, %ECX
	JZ	4f
#ifndef COPY
	ADDW	-3(%EDX, %ECX), %AX
#else
	MOVZWL	-3(%EDX, %ECX), %EAX
	MOVW	%AX, -3(%EDI, %ECX)
#endif
	/* Last byte, zero-extended, so it counts as the low byte of a word. */
4:	MOVZBL	-1(%EDX, %ECX), %EBX
#ifndef COPY
	/*
	 * CF is either the carry out of the ADDW above or zero (TESTL
	 * clears CF when the branch to 4 was taken directly).
	 */
	ADCL	%EBX, %EAX
	ADCL	$0, %EAX
#else
	ADDL	%EBX, %EAX
	MOVB	%BL, -1(%EDI, %ECX)
#endif
	JMP	6b
	/* Exported label marking the end of the routine. */
	.GLOBAL	CHECKSUM_AMD_END
CHECKSUM_AMD_END:


	.ALIGN	64
/*
 * CHECKSUM_INTEL
 *
 * Sums the %ECX bytes at %EDX, one zero-extended 16-bit word at a time,
 * with plain ADDL into the 32-bit accumulator and returns the (unfolded)
 * sum in %EAX.  No carry out of the 32-bit accumulator is handled.
 *
 * Without COPY: %EAX holds the initial value the data is added to.
 * With COPY:    %EAX holds the destination pointer; the sum starts from
 *               zero and every word/byte read is also stored to the
 *               destination (through %EDI).
 *
 * %EBX (and %EDI in the COPY build) are saved on the stack and restored
 * before returning. %ECX and %EDX are modified.
 *
 * Layout: the leading (len & 14) bytes are handled by jumping into the
 * middle of the unrolled 16-byte loop, then len / 16 full iterations
 * follow, then one trailing byte if the length is odd.
 */
	.GLOBAL	CHECKSUM_INTEL
CHECKSUM_INTEL:
#ifdef COPY
	PUSHL	%EDI
	/*
	 * Secondary entry point: CHECKSUM_SSE (COPY build) jumps here after
	 * it has already pushed %EDI itself.
	 */
CHECKSUM_COPY_INTEL_NOPUSH:
#endif
	PUSHL	%EBX
	/*
	 * %EBX = len & 14 = size of the leading partial group in bytes.
	 * The pointer is biased by (partial - 16) so that the partial group
	 * occupies the END of the first pass through the unrolled loop.
	 */
	MOVL	%ECX, %EBX
	ANDL	$14, %EBX
	LEAL	-16(%EBX, %EDX), %EDX
#ifndef COPY
	/*
	 * %EBX = 14 - 2 * words.  The entry point is 0f + 3 * %EBX, i.e.
	 * 6 bytes are skipped per missing word; this depends on each
	 * MOVZWL/ADDL pair below being 6 bytes long.
	 */
	XORL	$14, %EBX
	LEAL	0f(%EBX, %EBX, 2), %EBX
#else
	/*
	 * %EDI = destination biased like the source; the sum then starts at
	 * zero.  The entry point is 0f + 5 * %EBX, i.e. 10 bytes are skipped
	 * per missing word; this depends on each MOVZWL/MOVW/ADDL triple
	 * below being 10 bytes long.
	 */
	LEAL	-16(%EBX, %EAX), %EDI
	XORL	$14, %EBX
	XORL	%EAX, %EAX
	LEAL	0f(%EBX, %EBX, 4), %EBX
#endif
	/*
	 * Exported label placed right behind the LEAL that embeds the
	 * absolute address of 0f.
	 * NOTE(review): presumably consumed by external code that relocates
	 * this routine and must patch that address - confirm.
	 */
	.GLOBAL	CHECKSUM_INTEL_FIXUP_PLUS
CHECKSUM_INTEL_FIXUP_PLUS:
	JMP	*%EBX
	.ALIGN	16
	/*
	 * Unrolled loop, eight 16-bit words per iteration.  Label 1 is the
	 * loop head, label 0 is the base for the computed entry above.
	 */
1:	MOVZWL	(%EDX), %EBX
#ifdef COPY
	MOVW	%BX, (%EDI)
#endif
	ADDL	%EBX, %EAX
0:	MOVZWL	2(%EDX), %EBX
#ifdef COPY
	MOVW	%BX, 2(%EDI)
#endif
	ADDL	%EBX, %EAX
	MOVZWL	4(%EDX), %EBX
#ifdef COPY
	MOVW	%BX, 4(%EDI)
#endif
	ADDL	%EBX, %EAX
	MOVZWL	6(%EDX), %EBX
#ifdef COPY
	MOVW	%BX, 6(%EDI)
#endif
	ADDL	%EBX, %EAX
	MOVZWL	8(%EDX), %EBX
#ifdef COPY
	MOVW	%BX, 8(%EDI)
#endif
	ADDL	%EBX, %EAX
	MOVZWL	10(%EDX), %EBX
#ifdef COPY
	MOVW	%BX, 10(%EDI)
#endif
	ADDL	%EBX, %EAX
	MOVZWL	12(%EDX), %EBX
#ifdef COPY
	MOVW	%BX, 12(%EDI)
#endif
	ADDL	%EBX, %EAX
	MOVZWL	14(%EDX), %EBX
#ifdef COPY
	MOVW	%BX, 14(%EDI)
#endif
	ADDL	%EBX, %EAX
	ADDL	$16, %EDX
#ifdef COPY
	ADDL	$16, %EDI
#endif
	/*
	 * %ECX still holds the byte length; count it down by 16 per pass and
	 * loop while it stays non-negative.  That yields the partial first
	 * pass plus len / 16 full passes.
	 */
	SUBL	$16, %ECX
	JNS	1b
	POPL	%EBX
	/* Subtracting 16 never changes bit 0, so this tests for odd length. */
	TESTL   $1, %ECX
	JNZ	2f
#ifdef COPY
	POPL	%EDI
#endif
	RET
	/* Odd length: add (and copy) the final byte, zero-extended. */
	.ALIGN	__CPU_BRANCH_ALIGN
2:	MOVZBL	(%EDX), %ECX
	ADDL	%ECX, %EAX
#ifdef COPY
	MOVB	%CL, (%EDI)
	POPL	%EDI
#endif
	RET
	/* Exported label marking the end of the routine. */
	.GLOBAL	CHECKSUM_INTEL_END
CHECKSUM_INTEL_END:


#ifndef __NO_SSE
/*
 * CHECKSUM_SSE
 *
 * Sums the %ECX bytes at %EDX and returns the 32-bit (unfolded) sum in
 * %EAX.  The bulk of the data is processed 8 bytes at a time: four 16-bit
 * words are widened to four 32-bit lanes and accumulated in %XMM2.
 *
 * Without COPY: %EAX holds the initial value the data is added to.
 * With COPY:    %EAX holds the destination pointer and stays the
 *               destination pointer throughout; the sum is accumulated
 *               in %EDI (starting from zero) and every piece of data
 *               read is also stored to the destination.  If the
 *               destination is not 8-byte aligned the routine hands over
 *               to CHECKSUM_COPY_INTEL_NOPUSH instead.
 *
 * %XMM0-%XMM2 are saved to the memory locations xm0, xm1, xm2 and
 * restored before returning.
 * NOTE(review): xm0/xm1/xm2 are defined outside this section; MOVDQA
 * needs them 16-byte aligned, and a fixed save area is presumably not
 * safe against concurrent use - confirm against the definition/callers.
 *
 * Layout: 0-3 trailing bytes (labels 2/3/4), then a trailing 4-byte
 * group if bit 2 of the length is set (label 6), then the remaining
 * multiple of 8 bytes in the SSE loop.
 */
	.ALIGN	64
	.GLOBAL	CHECKSUM_SSE
CHECKSUM_SSE:
#ifdef COPY
	TESTL	$7, %EAX
/* MOVQ performs very badly with an unaligned destination. Unfortunately this
   happens on send with the RTL driver. An unaligned load is slower too, but
   not that bad. PUSHL leaves the flags from the TESTL above untouched. */
	PUSHL	%EDI
	JNZ	CHECKSUM_COPY_INTEL_NOPUSH
	/*
	 * Exported label placed right behind the jump into CHECKSUM_INTEL.
	 * NOTE(review): presumably consumed by external code that relocates
	 * this routine and must patch that jump - confirm.
	 */
	.GLOBAL	CHECKSUM_COPY_SSE_FIXUP_MINUS_2
CHECKSUM_COPY_SSE_FIXUP_MINUS_2:
	/* %EDI = running sum, starting at zero */
	XORL	%EDI, %EDI
#endif
	/* Is the length a multiple of 4?  PUSHL leaves the flags alone. */
	TESTL	$3, %ECX
	PUSHL	%EBX
	JNZ	2f
	/*
	 * %ECX is a multiple of 4 here (the tail code masks it with ~3).
	 * If bit 2 is set, add the two 16-bit words of the last 4 bytes.
	 */
6:	TESTL	$4, %ECX
	JZ	7f
#ifndef COPY
	MOVZWL	-4(%EDX, %ECX), %EBX
	ADDL	%EBX, %EAX
	MOVZWL	-2(%EDX, %ECX), %EBX
	ADDL	%EBX, %EAX
#else
	MOVZWL	-4(%EDX, %ECX), %EBX
	ADDL	%EBX, %EDI
	MOVW	%BX, -4(%EAX, %ECX)
	MOVZWL	-2(%EDX, %ECX), %EBX
	ADDL	%EBX, %EDI
	MOVW	%BX, -2(%EAX, %ECX)
#endif
	/*
	 * %ECX = number of bytes left for the 8-byte loop.  POPL does not
	 * change the flags, so JZ tests the ANDL result: nothing left means
	 * we are done.
	 */
7:	ANDL	$~7, %ECX
	POPL	%EBX
	JZ	8f
	CMPL	$0, KERNEL$FPU_ENABLED
/* FPU fault would enable it anyway, but it is very slow */
	JZ	10f
	/*
	 * Save %XMM0-%XMM2, zero %XMM1 (used to zero-extend words) and seed
	 * lane 0 of %XMM2 with the sum accumulated so far.
	 */
11:	MOVDQA	%XMM1, xm1
	PXOR	%XMM1, %XMM1
	MOVDQA	%XMM2, xm2
#ifndef COPY
	MOVD	%EAX, %XMM2
	MOVDQA	%XMM0, xm0
#else
	MOVD	%EDI, %XMM2
	MOVDQA	%XMM0, xm0
	/* The sum now lives in %XMM2, so the saved %EDI can be restored. */
	POPL	%EDI
#endif
	.ALIGN	16
	/*
	 * Main loop: load 8 bytes, (COPY: store them,) widen the four words
	 * to dwords by interleaving with zero and add them lane-wise.
	 */
1:	MOVQ	(%EDX), %XMM0
#ifdef COPY
	MOVQ	%XMM0, (%EAX)
#endif
	PUNPCKLWD %XMM1, %XMM0
	PADDD	%XMM0, %XMM2
	ADDL	$8, %EDX
#ifdef COPY
	ADDL	$8, %EAX
#endif
	SUBL	$8, %ECX
	JNZ	1b
	/*
	 * Horizontal add of the four lanes of %XMM2 into lane 0:
	 * first lanes 2,3 onto lanes 0,1, then lane 1 onto lane 0.
	 * The saved registers are restored in between.
	 */
	PSHUFD	$0xEE, %XMM2, %XMM0
	MOVDQA	xm1, %XMM1
	PADDD	%XMM0, %XMM2
	PSHUFD	$0x55, %XMM2, %XMM0
	PADDD	%XMM0, %XMM2
	MOVDQA	xm0, %XMM0
	MOVD	%XMM2, %EAX
	MOVDQA	xm2, %XMM2
#ifndef COPY
8:	RET
#else
	RET
	/* No 8-byte groups: return the sum from %EDI and restore %EDI. */
	.ALIGN	__CPU_BRANCH_ALIGN
8:	MOVL	%EDI, %EAX
	POPL	%EDI
	RET
#endif
	/*
	 * Length is not a multiple of 4.  Bit 0 clear means exactly two
	 * trailing bytes: handle the 16-bit word at offset len - 2, then
	 * round the length down to a multiple of 4.
	 */
	.ALIGN	__CPU_BRANCH_ALIGN
2:	TESTL	$1, %ECX
	JNZ	3f
#ifndef COPY
	MOVZWL	-2(%EDX, %ECX), %EBX
	ANDL	$~3, %ECX
	ADDL	%EBX, %EAX
#else
	/* The sum in %EDI is still zero, so the word is loaded directly. */
	MOVZWL	-2(%EDX, %ECX), %EDI
	MOVW	%DI, -2(%EAX, %ECX)
	ANDL	$~3, %ECX
#endif
	JMP	6b
	/*
	 * Length is odd.  Bit 1 set means three trailing bytes: a 16-bit
	 * word at offset len - 3 followed by the last byte; otherwise only
	 * the last byte remains.
	 */
	.ALIGN	__CPU_BRANCH_ALIGN
3:	TESTL	$2, %ECX
	JZ	4f
#ifndef COPY
	MOVZWL	-3(%EDX, %ECX), %EBX
	ADDL	%EBX, %EAX
#else
	MOVZWL	-3(%EDX, %ECX), %EDI
	MOVW	%DI, -3(%EAX, %ECX)
#endif
	/* Last byte, zero-extended; then round the length down to 4. */
4:	MOVZBL	-1(%EDX, %ECX), %EBX
#ifndef COPY
	ANDL	$~3, %ECX
	ADDL	%EBX, %EAX
#else
	ADDL	%EBX, %EDI
	MOVB	%BL, -1(%EAX, %ECX)
	ANDL	$~3, %ECX
#endif
	JMP	6b
	/*
	 * KERNEL$FPU_ENABLED was zero: call KERNEL$ENABLE_FPU with
	 * %EAX/%ECX/%EDX saved around the call, then resume at 11.
	 */
	.ALIGN	__CPU_BRANCH_ALIGN
10:	PUSHL	%EAX	/* pushes are not needed ... but ABI is ABI */
	PUSHL	%ECX
	PUSHL	%EDX
	CALL	KERNEL$ENABLE_FPU
	/*
	 * Exported label placed right behind the CALL.
	 * NOTE(review): presumably consumed by external code that relocates
	 * this routine and must patch the call displacement - confirm.
	 */
	.GLOBAL	CHECKSUM_SSE_FIXUP_MINUS
CHECKSUM_SSE_FIXUP_MINUS:
	POPL	%EDX
	POPL	%ECX
	POPL	%EAX
	JMP	11b
	/* Exported label marking the end of the routine. */
	.GLOBAL	CHECKSUM_SSE_END
CHECKSUM_SSE_END:
#endif


