Update dependencies

2024-11-01 17:33:34 +00:00
parent 033ac0b400
commit 5cdfab398d
3596 changed files with 1033483 additions and 259 deletions
--- a/vendor/github.com/tailscale/wireguard-go/tun/checksum_generated_amd64.s
+++ b/vendor/github.com/tailscale/wireguard-go/tun/checksum_generated_amd64.s
@@ -0,0 +1,851 @@
+// Code generated by command: go run generate_amd64.go -out checksum_generated_amd64.s -stubs checksum_generated_amd64.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA xmmLoadMasks<>+0(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" // entry 0: keeps only the last 2 bytes of a 16-byte load
+DATA xmmLoadMasks<>+16(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff" // entry 1: keeps the last 4 bytes
+DATA xmmLoadMasks<>+32(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff" // entry 2: keeps the last 6 bytes
+DATA xmmLoadMasks<>+48(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff" // entry 3: keeps the last 8 bytes
+DATA xmmLoadMasks<>+64(SB)/16, $"\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" // entry 4: keeps the last 10 bytes
+DATA xmmLoadMasks<>+80(SB)/16, $"\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" // entry 5: keeps the last 12 bytes
+DATA xmmLoadMasks<>+96(SB)/16, $"\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff" // entry 6: keeps the last 14 bytes
+GLOBL xmmLoadMasks<>(SB), RODATA|NOPTR, $112 // 7 entries x 16 bytes, file-local and read-only; the SIMD tail code indexes it as -16 + 8*remaining for remaining = 2, 4, ..., 14
+
+// func checksumAVX2(b []byte, initial uint16) uint16
+// Requires: AVX, AVX2, BMI2 (BMI2 is what provides the RORX instructions used in foldAndReturn)
+TEXT ·checksumAVX2(SB), NOSPLIT|NOFRAME, $0-34 // sums b as 16-bit words on top of initial and folds the total to 16 bits with end-around carry; the result is not complemented here
+	MOVWQZX initial+24(FP), AX // AX = initial, zero-extended; AX is the running 64-bit sum from here on
+	XCHGB   AH, AL // byte-swap initial; the matching XCHGB before the final store undoes it, so the accumulation runs on byte-swapped values
+	MOVQ    b_base+0(FP), DX // DX = read cursor into b
+	MOVQ    b_len+8(FP), BX // BX = number of bytes still to be summed
+
+	// handle odd length buffers; they are difficult to handle in general, so the last byte is added up front
+	TESTQ   $0x00000001, BX
+	JZ      lengthIsEven
+	MOVBQZX -1(DX)(BX*1), CX // CX = last byte of b, zero-extended
+	DECQ    BX // the remaining length is even from here on
+	ADDQ    CX, AX // cannot carry: AX <= 0xffff and CX <= 0xff
+
+lengthIsEven:
+	// handle tiny buffers (<=31 bytes) specially
+	CMPQ BX, $0x1f
+	JGT  bufferIsNotTiny
+	XORQ CX, CX // CX, SI and DI receive the optional 2-, 4- and 8-byte pieces; they stay zero when the piece is absent
+	XORQ SI, SI
+	XORQ DI, DI
+
+	// shift twice to start because length is guaranteed to be even
+	// n = n >> 2; CF = originalN & 2
+	SHRQ $0x02, BX
+	JNC  handleTiny4
+
+	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
+	MOVWQZX (DX), CX
+	ADDQ    $0x02, DX
+
+handleTiny4:
+	// n = n >> 1; CF = originalN & 4
+	SHRQ $0x01, BX
+	JNC  handleTiny8
+
+	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
+	MOVLQZX (DX), SI
+	ADDQ    $0x04, DX
+
+handleTiny8:
+	// n = n >> 1; CF = originalN & 8
+	SHRQ $0x01, BX
+	JNC  handleTiny16
+
+	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
+	MOVQ (DX), DI
+	ADDQ $0x08, DX
+
+handleTiny16:
+	// n = n >> 1; CF = originalN & 16
+	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
+	SHRQ $0x01, BX
+	JNC  handleTinyFinish
+	ADDQ (DX), AX // add the 16-byte piece as two 64-bit words
+	ADCQ 8(DX), AX // CF from this add is deliberately left live for the ADCQ chain below
+
+handleTinyFinish:
+	// CF should be included from the previous add, so we use ADCQ.
+	// If we arrived via the JNC above, then CF=0 due to the branch condition,
+	// so ADCQ will still produce the correct result.
+	ADCQ CX, AX
+	ADCQ SI, AX
+	ADCQ DI, AX
+	JMP  foldAndReturn // JMP does not touch flags; foldAndReturn consumes the CF of the last ADCQ
+
+bufferIsNotTiny:
+	// skip all SIMD for small buffers
+	CMPQ BX, $0x00000100
+	JGE  startSIMD // 256 bytes or more take the vector path
+
+	// Accumulate carries in this register. It is never expected to overflow.
+	XORQ SI, SI
+
+	// We will perform an overlapped read for buffers with length not a multiple of 8.
+	// Overlapped in this context means some memory will be read twice, but a shift will
+	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
+	// preserve any alignment that may exist for the start of the buffer.
+	MOVQ BX, CX
+	SHRQ $0x03, BX // BX = number of whole 8-byte words
+	ANDQ $0x07, CX // CX = leftover bytes (0, 2, 4 or 6, since the length is even here)
+	JZ   handleRemaining8
+	LEAQ (DX)(BX*8), DI // DI = address just past the last whole 8-byte word
+	MOVQ -8(DI)(CX*1), DI // DI = the final 8 bytes of the buffer
+
+	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
+	SHLQ $0x03, CX
+	NEGQ CX
+	ADDQ $0x40, CX
+	SHRQ CL, DI // only the leftover bytes survive, moved down to the low end of DI
+	ADDQ DI, AX
+	ADCQ $0x00, SI // remember the carry in SI instead of threading CF through the shifts below
+
+handleRemaining8:
+	SHRQ $0x01, BX // each SHRQ in this ladder moves the next bit of the word count into CF: 8, 16, 32, 64, then 128 bytes
+	JNC  handleRemaining16
+	ADDQ (DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x08, DX
+
+handleRemaining16:
+	SHRQ $0x01, BX
+	JNC  handleRemaining32
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x10, DX
+
+handleRemaining32:
+	SHRQ $0x01, BX
+	JNC  handleRemaining64
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x20, DX
+
+handleRemaining64:
+	SHRQ $0x01, BX
+	JNC  handleRemaining128
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ 32(DX), AX
+	ADCQ 40(DX), AX
+	ADCQ 48(DX), AX
+	ADCQ 56(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x40, DX
+
+handleRemaining128:
+	SHRQ $0x01, BX
+	JNC  handleRemainingComplete
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ 32(DX), AX
+	ADCQ 40(DX), AX
+	ADCQ 48(DX), AX
+	ADCQ 56(DX), AX
+	ADCQ 64(DX), AX
+	ADCQ 72(DX), AX
+	ADCQ 80(DX), AX
+	ADCQ 88(DX), AX
+	ADCQ 96(DX), AX
+	ADCQ 104(DX), AX
+	ADCQ 112(DX), AX
+	ADCQ 120(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x80, DX
+
+handleRemainingComplete:
+	ADDQ SI, AX // fold the saved carries back in; the CF of this add is consumed by foldAndReturn
+	JMP  foldAndReturn
+
+startSIMD:
+	VPXOR Y0, Y0, Y0 // Y0-Y3 are four independent accumulators, each holding eight 32-bit lanes
+	VPXOR Y1, Y1, Y1
+	VPXOR Y2, Y2, Y2
+	VPXOR Y3, Y3, Y3
+	MOVQ  BX, CX
+
+	// Update number of bytes remaining after the loop completes
+	ANDQ $0xff, BX
+
+	// Number of 256 byte iterations
+	SHRQ $0x08, CX
+	JZ   smallLoop // not taken in practice: the only jump to startSIMD requires BX >= 256
+
+bigLoop:
+	VPMOVZXWD (DX), Y4 // load 16 bytes and zero-extend the eight 16-bit words to eight 32-bit lanes
+	VPADDD    Y4, Y0, Y0 // lane-wise add; the sixteen loads of one iteration rotate through Y0, Y1, Y2, Y3
+	VPMOVZXWD 16(DX), Y4
+	VPADDD    Y4, Y1, Y1
+	VPMOVZXWD 32(DX), Y4
+	VPADDD    Y4, Y2, Y2
+	VPMOVZXWD 48(DX), Y4
+	VPADDD    Y4, Y3, Y3
+	VPMOVZXWD 64(DX), Y4
+	VPADDD    Y4, Y0, Y0
+	VPMOVZXWD 80(DX), Y4
+	VPADDD    Y4, Y1, Y1
+	VPMOVZXWD 96(DX), Y4
+	VPADDD    Y4, Y2, Y2
+	VPMOVZXWD 112(DX), Y4
+	VPADDD    Y4, Y3, Y3
+	VPMOVZXWD 128(DX), Y4
+	VPADDD    Y4, Y0, Y0
+	VPMOVZXWD 144(DX), Y4
+	VPADDD    Y4, Y1, Y1
+	VPMOVZXWD 160(DX), Y4
+	VPADDD    Y4, Y2, Y2
+	VPMOVZXWD 176(DX), Y4
+	VPADDD    Y4, Y3, Y3
+	VPMOVZXWD 192(DX), Y4
+	VPADDD    Y4, Y0, Y0
+	VPMOVZXWD 208(DX), Y4
+	VPADDD    Y4, Y1, Y1
+	VPMOVZXWD 224(DX), Y4
+	VPADDD    Y4, Y2, Y2
+	VPMOVZXWD 240(DX), Y4
+	VPADDD    Y4, Y3, Y3
+	ADDQ      $0x00000100, DX
+	DECQ      CX
+	JNZ       bigLoop
+	CMPQ      BX, $0x10
+	JLT       doneSmallLoop // fewer than 16 bytes left: skip the 16-byte loop
+
+	// now read a single 16 byte unit of data at a time
+smallLoop:
+	VPMOVZXWD (DX), Y4
+	VPADDD    Y4, Y0, Y0
+	ADDQ      $0x10, DX
+	SUBQ      $0x10, BX
+	CMPQ      BX, $0x10
+	JGE       smallLoop
+
+doneSmallLoop:
+	CMPQ BX, $0x00
+	JE   doneSIMD
+
+	// There are between 1 and 15 bytes remaining. Perform an overlapped read.
+	LEAQ      xmmLoadMasks<>+0(SB), CX
+	VMOVDQU   -16(DX)(BX*1), X4 // the 16 bytes that end exactly at the end of the buffer
+	VPAND     -16(CX)(BX*8), X4, X4 // mask entry at offset 8*BX - 16 clears the bytes that were already summed
+	VPMOVZXWD X4, Y4
+	VPADDD    Y4, Y0, Y0
+
+doneSIMD:
+	// Multi-chain loop is done, combine the accumulators
+	VPADDD Y1, Y0, Y0
+	VPADDD Y2, Y0, Y0
+	VPADDD Y3, Y0, Y0
+
+	// extract the YMM into a pair of XMM and sum them
+	VEXTRACTI128 $0x01, Y0, X1 // X1 = upper 128 bits of Y0
+	VPADDD       X0, X1, X0
+
+	// extract the XMM into GP64
+	VPEXTRQ $0x00, X0, CX // CX = low 64 bits (two 32-bit lane sums)
+	VPEXTRQ $0x01, X0, DX // DX = high 64 bits; the buffer cursor in DX is no longer needed
+
+	// no more AVX code, clear upper registers to avoid SSE slowdowns
+	VZEROUPPER
+	ADDQ CX, AX
+	ADCQ DX, AX // CF from this add falls through into foldAndReturn
+
+foldAndReturn:
+	// add CF and fold the 64-bit sum down to 16 bits; every entry path arrives with a meaningful CF
+	RORXQ $0x20, AX, CX // CX = AX rotated by 32, so its low half is the high half of AX; RORX leaves the flags alone
+	ADCL  CX, AX // 32-bit: low half + high half + incoming CF
+	RORXL $0x10, AX, CX // low 16 bits of CX = high 16 bits of the 32-bit sum; flags still untouched
+	ADCW  CX, AX // 16-bit: low half + high half + carry of the ADCL
+	ADCW  $0x00, AX // end-around carry of the 16-bit add
+	XCHGB AH, AL // undo the byte swap applied to initial on entry
+	MOVW  AX, ret+32(FP)
+	RET
+
+// func checksumSSE2(b []byte, initial uint16) uint16
+// Requires: SSE2
+TEXT ·checksumSSE2(SB), NOSPLIT|NOFRAME, $0-34 // sums b as 16-bit words on top of initial and folds the total to 16 bits with end-around carry; the result is not complemented here
+	MOVWQZX initial+24(FP), AX // AX = initial, zero-extended; AX is the running 64-bit sum from here on
+	XCHGB   AH, AL // byte-swap initial; the matching XCHGB before the final store undoes it, so the accumulation runs on byte-swapped values
+	MOVQ    b_base+0(FP), DX // DX = read cursor into b
+	MOVQ    b_len+8(FP), BX // BX = number of bytes still to be summed
+
+	// handle odd length buffers; they are difficult to handle in general, so the last byte is added up front
+	TESTQ   $0x00000001, BX
+	JZ      lengthIsEven
+	MOVBQZX -1(DX)(BX*1), CX // CX = last byte of b, zero-extended
+	DECQ    BX // the remaining length is even from here on
+	ADDQ    CX, AX // cannot carry: AX <= 0xffff and CX <= 0xff
+
+lengthIsEven:
+	// handle tiny buffers (<=31 bytes) specially
+	CMPQ BX, $0x1f
+	JGT  bufferIsNotTiny
+	XORQ CX, CX // CX, SI and DI receive the optional 2-, 4- and 8-byte pieces; they stay zero when the piece is absent
+	XORQ SI, SI
+	XORQ DI, DI
+
+	// shift twice to start because length is guaranteed to be even
+	// n = n >> 2; CF = originalN & 2
+	SHRQ $0x02, BX
+	JNC  handleTiny4
+
+	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
+	MOVWQZX (DX), CX
+	ADDQ    $0x02, DX
+
+handleTiny4:
+	// n = n >> 1; CF = originalN & 4
+	SHRQ $0x01, BX
+	JNC  handleTiny8
+
+	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
+	MOVLQZX (DX), SI
+	ADDQ    $0x04, DX
+
+handleTiny8:
+	// n = n >> 1; CF = originalN & 8
+	SHRQ $0x01, BX
+	JNC  handleTiny16
+
+	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
+	MOVQ (DX), DI
+	ADDQ $0x08, DX
+
+handleTiny16:
+	// n = n >> 1; CF = originalN & 16
+	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
+	SHRQ $0x01, BX
+	JNC  handleTinyFinish
+	ADDQ (DX), AX // add the 16-byte piece as two 64-bit words
+	ADCQ 8(DX), AX // CF from this add is deliberately left live for the ADCQ chain below
+
+handleTinyFinish:
+	// CF should be included from the previous add, so we use ADCQ.
+	// If we arrived via the JNC above, then CF=0 due to the branch condition,
+	// so ADCQ will still produce the correct result.
+	ADCQ CX, AX
+	ADCQ SI, AX
+	ADCQ DI, AX
+	JMP  foldAndReturn // JMP does not touch flags; foldAndReturn consumes the CF of the last ADCQ
+
+bufferIsNotTiny:
+	// skip all SIMD for small buffers
+	CMPQ BX, $0x00000100
+	JGE  startSIMD // 256 bytes or more take the vector path
+
+	// Accumulate carries in this register. It is never expected to overflow.
+	XORQ SI, SI
+
+	// We will perform an overlapped read for buffers with length not a multiple of 8.
+	// Overlapped in this context means some memory will be read twice, but a shift will
+	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
+	// preserve any alignment that may exist for the start of the buffer.
+	MOVQ BX, CX
+	SHRQ $0x03, BX // BX = number of whole 8-byte words
+	ANDQ $0x07, CX // CX = leftover bytes (0, 2, 4 or 6, since the length is even here)
+	JZ   handleRemaining8
+	LEAQ (DX)(BX*8), DI // DI = address just past the last whole 8-byte word
+	MOVQ -8(DI)(CX*1), DI // DI = the final 8 bytes of the buffer
+
+	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
+	SHLQ $0x03, CX
+	NEGQ CX
+	ADDQ $0x40, CX
+	SHRQ CL, DI // only the leftover bytes survive, moved down to the low end of DI
+	ADDQ DI, AX
+	ADCQ $0x00, SI // remember the carry in SI instead of threading CF through the shifts below
+
+handleRemaining8:
+	SHRQ $0x01, BX // each SHRQ in this ladder moves the next bit of the word count into CF: 8, 16, 32, 64, then 128 bytes
+	JNC  handleRemaining16
+	ADDQ (DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x08, DX
+
+handleRemaining16:
+	SHRQ $0x01, BX
+	JNC  handleRemaining32
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x10, DX
+
+handleRemaining32:
+	SHRQ $0x01, BX
+	JNC  handleRemaining64
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x20, DX
+
+handleRemaining64:
+	SHRQ $0x01, BX
+	JNC  handleRemaining128
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ 32(DX), AX
+	ADCQ 40(DX), AX
+	ADCQ 48(DX), AX
+	ADCQ 56(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x40, DX
+
+handleRemaining128:
+	SHRQ $0x01, BX
+	JNC  handleRemainingComplete
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ 32(DX), AX
+	ADCQ 40(DX), AX
+	ADCQ 48(DX), AX
+	ADCQ 56(DX), AX
+	ADCQ 64(DX), AX
+	ADCQ 72(DX), AX
+	ADCQ 80(DX), AX
+	ADCQ 88(DX), AX
+	ADCQ 96(DX), AX
+	ADCQ 104(DX), AX
+	ADCQ 112(DX), AX
+	ADCQ 120(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x80, DX
+
+handleRemainingComplete:
+	ADDQ SI, AX // fold the saved carries back in; the CF of this add is consumed by foldAndReturn
+	JMP  foldAndReturn
+
+startSIMD:
+	PXOR X0, X0 // X0-X3 are four independent accumulators, each holding four 32-bit lanes
+	PXOR X1, X1
+	PXOR X2, X2
+	PXOR X3, X3
+	PXOR X4, X4 // X4 stays all-zero; it supplies the upper half of every widened lane in the unpacks below
+	MOVQ BX, CX
+
+	// Update number of bytes remaining after the loop completes
+	ANDQ $0xff, BX
+
+	// Number of 256 byte iterations
+	SHRQ $0x08, CX
+	JZ   smallLoop // not taken in practice: the only jump to startSIMD requires BX >= 256
+
+bigLoop:
+	MOVOU     (DX), X5 // unaligned load of 16 bytes (eight 16-bit words)
+	MOVOA     X5, X6 // second copy, because each unpack overwrites its destination
+	PUNPCKHWL X4, X5 // X5 = upper four words interleaved with zero words, i.e. zero-extended to 32 bits
+	PUNPCKLWL X4, X6 // X6 = lower four words zero-extended to 32 bits
+	PADDD     X5, X0 // the two halves go to different accumulators; the pairing rotates from group to group
+	PADDD     X6, X2
+	MOVOU     16(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X1
+	PADDD     X6, X3
+	MOVOU     32(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X2
+	PADDD     X6, X0
+	MOVOU     48(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X3
+	PADDD     X6, X1
+	MOVOU     64(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X0
+	PADDD     X6, X2
+	MOVOU     80(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X1
+	PADDD     X6, X3
+	MOVOU     96(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X2
+	PADDD     X6, X0
+	MOVOU     112(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X3
+	PADDD     X6, X1
+	MOVOU     128(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X0
+	PADDD     X6, X2
+	MOVOU     144(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X1
+	PADDD     X6, X3
+	MOVOU     160(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X2
+	PADDD     X6, X0
+	MOVOU     176(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X3
+	PADDD     X6, X1
+	MOVOU     192(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X0
+	PADDD     X6, X2
+	MOVOU     208(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X1
+	PADDD     X6, X3
+	MOVOU     224(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X2
+	PADDD     X6, X0
+	MOVOU     240(DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X3
+	PADDD     X6, X1
+	ADDQ      $0x00000100, DX
+	DECQ      CX
+	JNZ       bigLoop
+	CMPQ      BX, $0x10
+	JLT       doneSmallLoop // fewer than 16 bytes left: skip the 16-byte loop
+
+	// now read a single 16 byte unit of data at a time
+smallLoop:
+	MOVOU     (DX), X5
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X0
+	PADDD     X6, X1
+	ADDQ      $0x10, DX
+	SUBQ      $0x10, BX
+	CMPQ      BX, $0x10
+	JGE       smallLoop
+
+doneSmallLoop:
+	CMPQ BX, $0x00
+	JE   doneSIMD
+
+	// There are between 1 and 15 bytes remaining. Perform an overlapped read.
+	LEAQ      xmmLoadMasks<>+0(SB), CX
+	MOVOU     -16(DX)(BX*1), X5 // the 16 bytes that end exactly at the end of the buffer
+	PAND      -16(CX)(BX*8), X5 // mask entry at offset 8*BX - 16 clears the bytes that were already summed
+	MOVOA     X5, X6
+	PUNPCKHWL X4, X5
+	PUNPCKLWL X4, X6
+	PADDD     X5, X0
+	PADDD     X6, X1
+
+doneSIMD:
+	// Multi-chain loop is done, combine the accumulators
+	PADDD X1, X0
+	PADDD X2, X0
+	PADDD X3, X0
+
+	// extract the XMM into GP64
+	MOVQ   X0, CX // CX = low 64 bits (two 32-bit lane sums)
+	PSRLDQ $0x08, X0 // byte-shift the high 64 bits down into the low half
+	MOVQ   X0, DX // DX = former high 64 bits; the buffer cursor in DX is no longer needed
+	ADDQ   CX, AX
+	ADCQ   DX, AX // CF from this add falls through into foldAndReturn
+
+foldAndReturn:
+	// add CF and fold the 64-bit sum down to 16 bits; every entry path arrives with a meaningful CF
+	MOVL    AX, CX // CX = low 32 bits of the sum, zero-extended; MOV leaves CF intact
+	ADCQ    $0x00, CX // absorb the incoming CF
+	SHRQ    $0x20, AX // AX = high 32 bits of the sum
+	ADDQ    CX, AX // 64-bit add of the two halves, so no carry is lost
+	MOVWQZX AX, CX // CX = low 16 bits
+	SHRQ    $0x10, AX // AX = everything above the low 16 bits
+	ADDQ    CX, AX
+	MOVW    AX, CX // CX (16-bit) = low 16 bits of the partial fold
+	SHRQ    $0x10, AX // AX = the few bits that spilled above 16
+	ADDW    CX, AX
+	ADCW    $0x00, AX // end-around carry of the 16-bit add
+	XCHGB   AH, AL // undo the byte swap applied to initial on entry
+	MOVW    AX, ret+32(FP)
+	RET
+
+// func checksumAMD64(b []byte, initial uint16) uint16
+TEXT ·checksumAMD64(SB), NOSPLIT|NOFRAME, $0-34 // general-purpose-register variant: sums b as 16-bit words on top of initial and folds the total to 16 bits with end-around carry; the result is not complemented here
+	MOVWQZX initial+24(FP), AX // AX = initial, zero-extended; AX is the running 64-bit sum from here on
+	XCHGB   AH, AL // byte-swap initial; the matching XCHGB before the final store undoes it, so the accumulation runs on byte-swapped values
+	MOVQ    b_base+0(FP), DX // DX = read cursor into b
+	MOVQ    b_len+8(FP), BX // BX = number of bytes still to be summed
+
+	// handle odd length buffers; they are difficult to handle in general, so the last byte is added up front
+	TESTQ   $0x00000001, BX
+	JZ      lengthIsEven
+	MOVBQZX -1(DX)(BX*1), CX // CX = last byte of b, zero-extended
+	DECQ    BX // the remaining length is even from here on
+	ADDQ    CX, AX // cannot carry: AX <= 0xffff and CX <= 0xff
+
+lengthIsEven:
+	// handle tiny buffers (<=31 bytes) specially
+	CMPQ BX, $0x1f
+	JGT  bufferIsNotTiny
+	XORQ CX, CX // CX, SI and DI receive the optional 2-, 4- and 8-byte pieces; they stay zero when the piece is absent
+	XORQ SI, SI
+	XORQ DI, DI
+
+	// shift twice to start because length is guaranteed to be even
+	// n = n >> 2; CF = originalN & 2
+	SHRQ $0x02, BX
+	JNC  handleTiny4
+
+	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
+	MOVWQZX (DX), CX
+	ADDQ    $0x02, DX
+
+handleTiny4:
+	// n = n >> 1; CF = originalN & 4
+	SHRQ $0x01, BX
+	JNC  handleTiny8
+
+	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
+	MOVLQZX (DX), SI
+	ADDQ    $0x04, DX
+
+handleTiny8:
+	// n = n >> 1; CF = originalN & 8
+	SHRQ $0x01, BX
+	JNC  handleTiny16
+
+	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
+	MOVQ (DX), DI
+	ADDQ $0x08, DX
+
+handleTiny16:
+	// n = n >> 1; CF = originalN & 16
+	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
+	SHRQ $0x01, BX
+	JNC  handleTinyFinish
+	ADDQ (DX), AX // add the 16-byte piece as two 64-bit words
+	ADCQ 8(DX), AX // CF from this add is deliberately left live for the ADCQ chain below
+
+handleTinyFinish:
+	// CF should be included from the previous add, so we use ADCQ.
+	// If we arrived via the JNC above, then CF=0 due to the branch condition,
+	// so ADCQ will still produce the correct result.
+	ADCQ CX, AX
+	ADCQ SI, AX
+	ADCQ DI, AX
+	JMP  foldAndReturn // JMP does not touch flags; foldAndReturn consumes the CF of the last ADCQ
+
+bufferIsNotTiny:
+	// Number of 256 byte iterations into loop counter
+	MOVQ BX, CX
+
+	// Update number of bytes remaining after the loop completes
+	ANDQ $0xff, BX
+	SHRQ $0x08, CX
+	JZ   startCleanup // fewer than 256 bytes: nothing for the unrolled loop to do
+	CLC
+	XORQ SI, SI // SI counts carries out of the AX chain
+	XORQ DI, DI // DI is the second sum, with its carries counted in R8
+	XORQ R8, R8
+	XORQ R9, R9 // R9 is the third sum, with its carries counted in R10
+	XORQ R10, R10
+	XORQ R11, R11 // R11 is the fourth sum, with its carries counted in R12
+	XORQ R12, R12
+
+bigLoop:
+	ADDQ (DX), AX // each group of five instructions adds 32 bytes to one sum and banks the final carry in its counter
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ $0x00, SI
+	ADDQ 32(DX), DI // ADDQ (not ADCQ) starts a fresh chain, so the four sums stay independent of each other
+	ADCQ 40(DX), DI
+	ADCQ 48(DX), DI
+	ADCQ 56(DX), DI
+	ADCQ $0x00, R8
+	ADDQ 64(DX), R9
+	ADCQ 72(DX), R9
+	ADCQ 80(DX), R9
+	ADCQ 88(DX), R9
+	ADCQ $0x00, R10
+	ADDQ 96(DX), R11
+	ADCQ 104(DX), R11
+	ADCQ 112(DX), R11
+	ADCQ 120(DX), R11
+	ADCQ $0x00, R12
+	ADDQ 128(DX), AX
+	ADCQ 136(DX), AX
+	ADCQ 144(DX), AX
+	ADCQ 152(DX), AX
+	ADCQ $0x00, SI
+	ADDQ 160(DX), DI
+	ADCQ 168(DX), DI
+	ADCQ 176(DX), DI
+	ADCQ 184(DX), DI
+	ADCQ $0x00, R8
+	ADDQ 192(DX), R9
+	ADCQ 200(DX), R9
+	ADCQ 208(DX), R9
+	ADCQ 216(DX), R9
+	ADCQ $0x00, R10
+	ADDQ 224(DX), R11
+	ADCQ 232(DX), R11
+	ADCQ 240(DX), R11
+	ADCQ 248(DX), R11
+	ADCQ $0x00, R12
+	ADDQ $0x00000100, DX
+	SUBQ $0x01, CX
+	JNZ  bigLoop
+	ADDQ SI, AX // loop finished: merge the three other sums and all four carry counters into AX
+	ADCQ DI, AX
+	ADCQ R8, AX
+	ADCQ R9, AX
+	ADCQ R10, AX
+	ADCQ R11, AX
+	ADCQ R12, AX
+
+	// accumulate CF (twice, in case the first time overflows)
+	ADCQ $0x00, AX
+	ADCQ $0x00, AX
+
+startCleanup:
+	// Accumulate carries in this register. It is never expected to overflow.
+	XORQ SI, SI
+
+	// We will perform an overlapped read for buffers with length not a multiple of 8.
+	// Overlapped in this context means some memory will be read twice, but a shift will
+	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
+	// preserve any alignment that may exist for the start of the buffer.
+	MOVQ BX, CX
+	SHRQ $0x03, BX // BX = number of whole 8-byte words
+	ANDQ $0x07, CX // CX = leftover bytes (0, 2, 4 or 6, since the length is even here)
+	JZ   handleRemaining8
+	LEAQ (DX)(BX*8), DI // DI = address just past the last whole 8-byte word
+	MOVQ -8(DI)(CX*1), DI // DI = the final 8 bytes of the buffer
+
+	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
+	SHLQ $0x03, CX
+	NEGQ CX
+	ADDQ $0x40, CX
+	SHRQ CL, DI // only the leftover bytes survive, moved down to the low end of DI
+	ADDQ DI, AX
+	ADCQ $0x00, SI // remember the carry in SI instead of threading CF through the shifts below
+
+handleRemaining8:
+	SHRQ $0x01, BX // each SHRQ in this ladder moves the next bit of the word count into CF: 8, 16, 32, 64, then 128 bytes
+	JNC  handleRemaining16
+	ADDQ (DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x08, DX
+
+handleRemaining16:
+	SHRQ $0x01, BX
+	JNC  handleRemaining32
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x10, DX
+
+handleRemaining32:
+	SHRQ $0x01, BX
+	JNC  handleRemaining64
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x20, DX
+
+handleRemaining64:
+	SHRQ $0x01, BX
+	JNC  handleRemaining128
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ 32(DX), AX
+	ADCQ 40(DX), AX
+	ADCQ 48(DX), AX
+	ADCQ 56(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x40, DX
+
+handleRemaining128:
+	SHRQ $0x01, BX
+	JNC  handleRemainingComplete
+	ADDQ (DX), AX
+	ADCQ 8(DX), AX
+	ADCQ 16(DX), AX
+	ADCQ 24(DX), AX
+	ADCQ 32(DX), AX
+	ADCQ 40(DX), AX
+	ADCQ 48(DX), AX
+	ADCQ 56(DX), AX
+	ADCQ 64(DX), AX
+	ADCQ 72(DX), AX
+	ADCQ 80(DX), AX
+	ADCQ 88(DX), AX
+	ADCQ 96(DX), AX
+	ADCQ 104(DX), AX
+	ADCQ 112(DX), AX
+	ADCQ 120(DX), AX
+	ADCQ $0x00, SI
+	ADDQ $0x80, DX
+
+handleRemainingComplete:
+	ADDQ SI, AX // fold the saved carries back in; the CF of this add falls through into foldAndReturn
+
+foldAndReturn:
+	// add CF and fold the 64-bit sum down to 16 bits; every entry path arrives with a meaningful CF
+	MOVL    AX, CX // CX = low 32 bits of the sum, zero-extended; MOV leaves CF intact
+	ADCQ    $0x00, CX // absorb the incoming CF
+	SHRQ    $0x20, AX // AX = high 32 bits of the sum
+	ADDQ    CX, AX // 64-bit add of the two halves, so no carry is lost
+	MOVWQZX AX, CX // CX = low 16 bits
+	SHRQ    $0x10, AX // AX = everything above the low 16 bits
+	ADDQ    CX, AX
+	MOVW    AX, CX // CX (16-bit) = low 16 bits of the partial fold
+	SHRQ    $0x10, AX // AX = the few bits that spilled above 16
+	ADDW    CX, AX
+	ADCW    $0x00, AX // end-around carry of the 16-bit add
+	XCHGB   AH, AL // undo the byte swap applied to initial on entry
+	MOVW    AX, ret+32(FP)
+	RET