Update dependencies
This commit is contained in:
851
vendor/github.com/tailscale/wireguard-go/tun/checksum_generated_amd64.s
generated
vendored
Normal file
851
vendor/github.com/tailscale/wireguard-go/tun/checksum_generated_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,851 @@
|
||||
// Code generated by command: go run generate_amd64.go -out checksum_generated_amd64.s -stubs checksum_generated_amd64.go. DO NOT EDIT.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// xmmLoadMasks is a read-only table of seven 16-byte masks (112 bytes in total).
// The mask at byte offset 16*k has its last 2*(k+1) bytes set to 0xff and all
// earlier bytes zero. The checksum routines in this file address it as
// -16(base)(BX*8) with an even remainder BX of 2..14 bytes, so that ANDing it
// with a 16-byte load ending at the end of the buffer keeps only the last BX bytes.
DATA xmmLoadMasks<>+0(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff"
|
||||
DATA xmmLoadMasks<>+16(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+32(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+48(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+64(SB)/16, $"\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+80(SB)/16, $"\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+96(SB)/16, $"\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
GLOBL xmmLoadMasks<>(SB), RODATA|NOPTR, $112
|
||||
|
||||
// func checksumAVX2(b []byte, initial uint16) uint16
|
||||
// Requires: AVX, AVX2, BMI2
|
||||
//
// Adds the bytes of b, read as little-endian data, into a 64-bit running sum in AX
// that starts from the byte-swapped initial value. The sum is then folded to 16 bits
// with carries added back in, byte-swapped again, and stored to ret.
// NOTE(review): presumably an RFC 1071 style ones'-complement sum; no final
// complement is applied here - confirm how the Go caller uses the result.
//
// Three paths are selected by length (after an odd trailing byte is peeled off):
//   <= 31 bytes   : handleTiny*      (scalar, at most one piece of each size)
//   32..255 bytes : handleRemaining* (scalar ADDQ/ADCQ chains)
//   >= 256 bytes  : startSIMD        (AVX2 widening adds)
//
// Register use: AX = running sum, DX = read cursor, BX = bytes left,
// CX/SI/DI = scratch, Y0-Y3 = vector accumulators, Y4 = vector scratch.
TEXT ·checksumAVX2(SB), NOSPLIT|NOFRAME, $0-34
|
||||
// AX = initial with its two bytes exchanged; the result is exchanged back before the store.
MOVWQZX initial+24(FP), AX
|
||||
XCHGB AH, AL
|
||||
MOVQ b_base+0(FP), DX
|
||||
MOVQ b_len+8(FP), BX
|
||||
|
||||
// handle odd length buffers; they are difficult to handle in general
|
||||
// The last byte is zero-extended and added on its own, leaving an even length in BX.
TESTQ $0x00000001, BX
|
||||
JZ lengthIsEven
|
||||
MOVBQZX -1(DX)(BX*1), CX
|
||||
DECQ BX
|
||||
ADDQ CX, AX
|
||||
|
||||
lengthIsEven:
|
||||
// handle tiny buffers (<=31 bytes) specially
|
||||
CMPQ BX, $0x1f
|
||||
JGT bufferIsNotTiny
|
||||
// CX, SI and DI receive the optional 2-, 4- and 8-byte pieces; they stay zero when
// the corresponding length bit is clear.
XORQ CX, CX
|
||||
XORQ SI, SI
|
||||
XORQ DI, DI
|
||||
|
||||
// shift twice to start because length is guaranteed to be even
|
||||
// n = n >> 2; CF = originalN & 2
|
||||
SHRQ $0x02, BX
|
||||
JNC handleTiny4
|
||||
|
||||
// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
|
||||
MOVWQZX (DX), CX
|
||||
ADDQ $0x02, DX
|
||||
|
||||
handleTiny4:
|
||||
// n = n >> 1; CF = originalN & 4
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny8
|
||||
|
||||
// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
|
||||
MOVLQZX (DX), SI
|
||||
ADDQ $0x04, DX
|
||||
|
||||
handleTiny8:
|
||||
// n = n >> 1; CF = originalN & 8
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny16
|
||||
|
||||
// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
|
||||
MOVQ (DX), DI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleTiny16:
|
||||
// n = n >> 1; CF = originalN & 16
|
||||
// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTinyFinish
|
||||
// The 16-byte piece is added directly into AX; nothing between here and
// handleTinyFinish touches flags, so its carry feeds the ADCQ chain below.
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
|
||||
handleTinyFinish:
|
||||
// CF should be included from the previous add, so we use ADCQ.
|
||||
// If we arrived via the JNC above, then CF=0 due to the branch condition,
|
||||
// so ADCQ will still produce the correct result.
|
||||
ADCQ CX, AX
|
||||
ADCQ SI, AX
|
||||
ADCQ DI, AX
|
||||
// CF from the last ADCQ is still live; foldAndReturn consumes it.
JMP foldAndReturn
|
||||
|
||||
bufferIsNotTiny:
|
||||
// skip all SIMD for small buffers
|
||||
// Lengths of 0x100 bytes or more go to startSIMD; 32..255 bytes continue below.
CMPQ BX, $0x00000100
|
||||
JGE startSIMD
|
||||
|
||||
// Accumulate carries in this register. It is never expected to overflow.
|
||||
XORQ SI, SI
|
||||
|
||||
// We will perform an overlapped read for buffers with length not a multiple of 8.
|
||||
// Overlapped in this context means some memory will be read twice, but a shift will
|
||||
// eliminate the duplicated data. This extra read is performed at the end of the buffer to
|
||||
// preserve any alignment that may exist for the start of the buffer.
|
||||
// After these three instructions BX = number of whole 8-byte words and
// CX = leftover byte count (length mod 8).
MOVQ BX, CX
|
||||
SHRQ $0x03, BX
|
||||
ANDQ $0x07, CX
|
||||
JZ handleRemaining8
|
||||
// DI = address just past the last whole 8-byte word; the load reads the 8 bytes
// that end at the end of the buffer.
LEAQ (DX)(BX*8), DI
|
||||
MOVQ -8(DI)(CX*1), DI
|
||||
|
||||
// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
|
||||
SHLQ $0x03, CX
|
||||
NEGQ CX
|
||||
ADDQ $0x40, CX
|
||||
SHRQ CL, DI
|
||||
ADDQ DI, AX
|
||||
ADCQ $0x00, SI
|
||||
|
||||
// handleRemaining8..128: each step shifts the next bit of the 8-byte word count
// into CF and, when it is set, adds that many bytes (8, 16, 32, 64, 128) to AX,
// banking the chain's final carry in SI before the ADDQ that advances DX
// overwrites CF.
handleRemaining8:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining16
|
||||
ADDQ (DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleRemaining16:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining32
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x10, DX
|
||||
|
||||
handleRemaining32:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining64
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x20, DX
|
||||
|
||||
handleRemaining64:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining128
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x40, DX
|
||||
|
||||
handleRemaining128:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemainingComplete
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ 64(DX), AX
|
||||
ADCQ 72(DX), AX
|
||||
ADCQ 80(DX), AX
|
||||
ADCQ 88(DX), AX
|
||||
ADCQ 96(DX), AX
|
||||
ADCQ 104(DX), AX
|
||||
ADCQ 112(DX), AX
|
||||
ADCQ 120(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x80, DX
|
||||
|
||||
handleRemainingComplete:
|
||||
// Add the banked carries; the CF this produces is consumed by foldAndReturn.
ADDQ SI, AX
|
||||
JMP foldAndReturn
|
||||
|
||||
startSIMD:
|
||||
// Y0-Y3 are four independent accumulators of eight 32-bit lanes each.
VPXOR Y0, Y0, Y0
|
||||
VPXOR Y1, Y1, Y1
|
||||
VPXOR Y2, Y2, Y2
|
||||
VPXOR Y3, Y3, Y3
|
||||
MOVQ BX, CX
|
||||
|
||||
// Update number of bytes remaining after the loop completes
|
||||
ANDQ $0xff, BX
|
||||
|
||||
// Number of 256 byte iterations
|
||||
// startSIMD is only reached with BX >= 0x100 (see bufferIsNotTiny), so CX >= 1 here.
SHRQ $0x08, CX
|
||||
JZ smallLoop
|
||||
|
||||
// One iteration consumes 256 bytes: each VPMOVZXWD zero-extends 16 bytes
// (eight 16-bit words) to eight 32-bit lanes, and the VPADDDs rotate over Y0..Y3.
bigLoop:
|
||||
VPMOVZXWD (DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
VPMOVZXWD 16(DX), Y4
|
||||
VPADDD Y4, Y1, Y1
|
||||
VPMOVZXWD 32(DX), Y4
|
||||
VPADDD Y4, Y2, Y2
|
||||
VPMOVZXWD 48(DX), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
VPMOVZXWD 64(DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
VPMOVZXWD 80(DX), Y4
|
||||
VPADDD Y4, Y1, Y1
|
||||
VPMOVZXWD 96(DX), Y4
|
||||
VPADDD Y4, Y2, Y2
|
||||
VPMOVZXWD 112(DX), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
VPMOVZXWD 128(DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
VPMOVZXWD 144(DX), Y4
|
||||
VPADDD Y4, Y1, Y1
|
||||
VPMOVZXWD 160(DX), Y4
|
||||
VPADDD Y4, Y2, Y2
|
||||
VPMOVZXWD 176(DX), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
VPMOVZXWD 192(DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
VPMOVZXWD 208(DX), Y4
|
||||
VPADDD Y4, Y1, Y1
|
||||
VPMOVZXWD 224(DX), Y4
|
||||
VPADDD Y4, Y2, Y2
|
||||
VPMOVZXWD 240(DX), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
ADDQ $0x00000100, DX
|
||||
DECQ CX
|
||||
JNZ bigLoop
|
||||
// Fewer than 16 bytes left: skip the 16-byte loop and go to the masked tail.
CMPQ BX, $0x10
|
||||
JLT doneSmallLoop
|
||||
|
||||
// now read a single 16 byte unit of data at a time
|
||||
smallLoop:
|
||||
VPMOVZXWD (DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
ADDQ $0x10, DX
|
||||
SUBQ $0x10, BX
|
||||
CMPQ BX, $0x10
|
||||
JGE smallLoop
|
||||
|
||||
doneSmallLoop:
|
||||
CMPQ BX, $0x00
|
||||
JE doneSIMD
|
||||
|
||||
// There are between 1 and 15 bytes remaining. Perform an overlapped read.
|
||||
// BX is even here (the odd byte was peeled off at entry), so -16(CX)(BX*8) selects a
// whole xmmLoadMasks entry; the mask keeps only the last BX bytes of the 16-byte
// load that ends at the end of the buffer.
LEAQ xmmLoadMasks<>+0(SB), CX
|
||||
VMOVDQU -16(DX)(BX*1), X4
|
||||
VPAND -16(CX)(BX*8), X4, X4
|
||||
VPMOVZXWD X4, Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
|
||||
doneSIMD:
|
||||
// Multi-chain loop is done, combine the accumulators
|
||||
VPADDD Y1, Y0, Y0
|
||||
VPADDD Y2, Y0, Y0
|
||||
VPADDD Y3, Y0, Y0
|
||||
|
||||
// extract the YMM into a pair of XMM and sum them
|
||||
VEXTRACTI128 $0x01, Y0, X1
|
||||
VPADDD X0, X1, X0
|
||||
|
||||
// extract the XMM into GP64
|
||||
// Each extracted quadword carries two 32-bit lane sums side by side; they are
// added to AX as plain 64-bit values and reduced by the fold below.
VPEXTRQ $0x00, X0, CX
|
||||
VPEXTRQ $0x01, X0, DX
|
||||
|
||||
// no more AVX code, clear upper registers to avoid SSE slowdowns
|
||||
VZEROUPPER
|
||||
ADDQ CX, AX
|
||||
ADCQ DX, AX
|
||||
|
||||
foldAndReturn:
|
||||
// add CF and fold
|
||||
// Every path arrives with a possibly pending CF from its last add. RORX leaves the
// flags untouched, so ADCL/ADCW pick that carry up while folding 64 -> 32 -> 16 bits;
// the final ADCW $0 adds back the carry out of the 16-bit fold.
RORXQ $0x20, AX, CX
|
||||
ADCL CX, AX
|
||||
RORXL $0x10, AX, CX
|
||||
ADCW CX, AX
|
||||
ADCW $0x00, AX
|
||||
// Undo the byte exchange applied to the sum's byte order at entry and store the result.
XCHGB AH, AL
|
||||
MOVW AX, ret+32(FP)
|
||||
RET
|
||||
|
||||
// func checksumSSE2(b []byte, initial uint16) uint16
|
||||
// Requires: SSE2
|
||||
//
// Adds the bytes of b, read as little-endian data, into a 64-bit running sum in AX
// that starts from the byte-swapped initial value. The sum is then folded to 16 bits
// with carries added back in, byte-swapped again, and stored to ret.
// NOTE(review): presumably an RFC 1071 style ones'-complement sum; no final
// complement is applied here - confirm how the Go caller uses the result.
//
// Three paths are selected by length (after an odd trailing byte is peeled off):
//   <= 31 bytes   : handleTiny*      (scalar, at most one piece of each size)
//   32..255 bytes : handleRemaining* (scalar ADDQ/ADCQ chains)
//   >= 256 bytes  : startSIMD        (SSE2 unpack-and-add)
//
// Register use: AX = running sum, DX = read cursor, BX = bytes left,
// CX/SI/DI = scratch, X0-X3 = vector accumulators, X4 = all-zero, X5/X6 = vector scratch.
TEXT ·checksumSSE2(SB), NOSPLIT|NOFRAME, $0-34
|
||||
// AX = initial with its two bytes exchanged; the result is exchanged back before the store.
MOVWQZX initial+24(FP), AX
|
||||
XCHGB AH, AL
|
||||
MOVQ b_base+0(FP), DX
|
||||
MOVQ b_len+8(FP), BX
|
||||
|
||||
// handle odd length buffers; they are difficult to handle in general
|
||||
// The last byte is zero-extended and added on its own, leaving an even length in BX.
TESTQ $0x00000001, BX
|
||||
JZ lengthIsEven
|
||||
MOVBQZX -1(DX)(BX*1), CX
|
||||
DECQ BX
|
||||
ADDQ CX, AX
|
||||
|
||||
lengthIsEven:
|
||||
// handle tiny buffers (<=31 bytes) specially
|
||||
CMPQ BX, $0x1f
|
||||
JGT bufferIsNotTiny
|
||||
// CX, SI and DI receive the optional 2-, 4- and 8-byte pieces; they stay zero when
// the corresponding length bit is clear.
XORQ CX, CX
|
||||
XORQ SI, SI
|
||||
XORQ DI, DI
|
||||
|
||||
// shift twice to start because length is guaranteed to be even
|
||||
// n = n >> 2; CF = originalN & 2
|
||||
SHRQ $0x02, BX
|
||||
JNC handleTiny4
|
||||
|
||||
// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
|
||||
MOVWQZX (DX), CX
|
||||
ADDQ $0x02, DX
|
||||
|
||||
handleTiny4:
|
||||
// n = n >> 1; CF = originalN & 4
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny8
|
||||
|
||||
// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
|
||||
MOVLQZX (DX), SI
|
||||
ADDQ $0x04, DX
|
||||
|
||||
handleTiny8:
|
||||
// n = n >> 1; CF = originalN & 8
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny16
|
||||
|
||||
// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
|
||||
MOVQ (DX), DI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleTiny16:
|
||||
// n = n >> 1; CF = originalN & 16
|
||||
// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTinyFinish
|
||||
// The 16-byte piece is added directly into AX; nothing between here and
// handleTinyFinish touches flags, so its carry feeds the ADCQ chain below.
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
|
||||
handleTinyFinish:
|
||||
// CF should be included from the previous add, so we use ADCQ.
|
||||
// If we arrived via the JNC above, then CF=0 due to the branch condition,
|
||||
// so ADCQ will still produce the correct result.
|
||||
ADCQ CX, AX
|
||||
ADCQ SI, AX
|
||||
ADCQ DI, AX
|
||||
// CF from the last ADCQ is still live; foldAndReturn consumes it.
JMP foldAndReturn
|
||||
|
||||
bufferIsNotTiny:
|
||||
// skip all SIMD for small buffers
|
||||
// Lengths of 0x100 bytes or more go to startSIMD; 32..255 bytes continue below.
CMPQ BX, $0x00000100
|
||||
JGE startSIMD
|
||||
|
||||
// Accumulate carries in this register. It is never expected to overflow.
|
||||
XORQ SI, SI
|
||||
|
||||
// We will perform an overlapped read for buffers with length not a multiple of 8.
|
||||
// Overlapped in this context means some memory will be read twice, but a shift will
|
||||
// eliminate the duplicated data. This extra read is performed at the end of the buffer to
|
||||
// preserve any alignment that may exist for the start of the buffer.
|
||||
// After these three instructions BX = number of whole 8-byte words and
// CX = leftover byte count (length mod 8).
MOVQ BX, CX
|
||||
SHRQ $0x03, BX
|
||||
ANDQ $0x07, CX
|
||||
JZ handleRemaining8
|
||||
// DI = address just past the last whole 8-byte word; the load reads the 8 bytes
// that end at the end of the buffer.
LEAQ (DX)(BX*8), DI
|
||||
MOVQ -8(DI)(CX*1), DI
|
||||
|
||||
// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
|
||||
SHLQ $0x03, CX
|
||||
NEGQ CX
|
||||
ADDQ $0x40, CX
|
||||
SHRQ CL, DI
|
||||
ADDQ DI, AX
|
||||
ADCQ $0x00, SI
|
||||
|
||||
// handleRemaining8..128: each step shifts the next bit of the 8-byte word count
// into CF and, when it is set, adds that many bytes (8, 16, 32, 64, 128) to AX,
// banking the chain's final carry in SI before the ADDQ that advances DX
// overwrites CF.
handleRemaining8:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining16
|
||||
ADDQ (DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleRemaining16:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining32
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x10, DX
|
||||
|
||||
handleRemaining32:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining64
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x20, DX
|
||||
|
||||
handleRemaining64:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining128
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x40, DX
|
||||
|
||||
handleRemaining128:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemainingComplete
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ 64(DX), AX
|
||||
ADCQ 72(DX), AX
|
||||
ADCQ 80(DX), AX
|
||||
ADCQ 88(DX), AX
|
||||
ADCQ 96(DX), AX
|
||||
ADCQ 104(DX), AX
|
||||
ADCQ 112(DX), AX
|
||||
ADCQ 120(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x80, DX
|
||||
|
||||
handleRemainingComplete:
|
||||
// Add the banked carries; the CF this produces is consumed by foldAndReturn.
ADDQ SI, AX
|
||||
JMP foldAndReturn
|
||||
|
||||
startSIMD:
|
||||
// X0-X3 are four accumulators of four 32-bit lanes each. X4 stays all-zero and is
// the interleave partner that turns 16-bit words into zero-extended 32-bit lanes.
PXOR X0, X0
|
||||
PXOR X1, X1
|
||||
PXOR X2, X2
|
||||
PXOR X3, X3
|
||||
PXOR X4, X4
|
||||
MOVQ BX, CX
|
||||
|
||||
// Update number of bytes remaining after the loop completes
|
||||
ANDQ $0xff, BX
|
||||
|
||||
// Number of 256 byte iterations
|
||||
// startSIMD is only reached with BX >= 0x100 (see bufferIsNotTiny), so CX >= 1 here.
SHRQ $0x08, CX
|
||||
JZ smallLoop
|
||||
|
||||
// One iteration consumes 256 bytes in sixteen 16-byte steps. Each step loads into
// X5, copies to X6, interleaves the high four words (X5) and low four words (X6)
// with the zeros in X4, and adds the two halves to two different accumulators.
bigLoop:
|
||||
MOVOU (DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X2
|
||||
MOVOU 16(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X1
|
||||
PADDD X6, X3
|
||||
MOVOU 32(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X2
|
||||
PADDD X6, X0
|
||||
MOVOU 48(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X3
|
||||
PADDD X6, X1
|
||||
MOVOU 64(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X2
|
||||
MOVOU 80(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X1
|
||||
PADDD X6, X3
|
||||
MOVOU 96(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X2
|
||||
PADDD X6, X0
|
||||
MOVOU 112(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X3
|
||||
PADDD X6, X1
|
||||
MOVOU 128(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X2
|
||||
MOVOU 144(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X1
|
||||
PADDD X6, X3
|
||||
MOVOU 160(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X2
|
||||
PADDD X6, X0
|
||||
MOVOU 176(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X3
|
||||
PADDD X6, X1
|
||||
MOVOU 192(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X2
|
||||
MOVOU 208(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X1
|
||||
PADDD X6, X3
|
||||
MOVOU 224(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X2
|
||||
PADDD X6, X0
|
||||
MOVOU 240(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X3
|
||||
PADDD X6, X1
|
||||
ADDQ $0x00000100, DX
|
||||
DECQ CX
|
||||
JNZ bigLoop
|
||||
// Fewer than 16 bytes left: skip the 16-byte loop and go to the masked tail.
CMPQ BX, $0x10
|
||||
JLT doneSmallLoop
|
||||
|
||||
// now read a single 16 byte unit of data at a time
|
||||
smallLoop:
|
||||
MOVOU (DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X1
|
||||
ADDQ $0x10, DX
|
||||
SUBQ $0x10, BX
|
||||
CMPQ BX, $0x10
|
||||
JGE smallLoop
|
||||
|
||||
doneSmallLoop:
|
||||
CMPQ BX, $0x00
|
||||
JE doneSIMD
|
||||
|
||||
// There are between 1 and 15 bytes remaining. Perform an overlapped read.
|
||||
// BX is even here (the odd byte was peeled off at entry), so -16(CX)(BX*8) selects a
// whole xmmLoadMasks entry; the mask keeps only the last BX bytes of the 16-byte
// load that ends at the end of the buffer.
LEAQ xmmLoadMasks<>+0(SB), CX
|
||||
MOVOU -16(DX)(BX*1), X5
|
||||
PAND -16(CX)(BX*8), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X1
|
||||
|
||||
doneSIMD:
|
||||
// Multi-chain loop is done, combine the accumulators
|
||||
PADDD X1, X0
|
||||
PADDD X2, X0
|
||||
PADDD X3, X0
|
||||
|
||||
// extract the XMM into GP64
|
||||
// CX takes the low quadword; PSRLDQ shifts the high quadword down so DX can take it.
// Each quadword carries two 32-bit lane sums side by side; they are added to AX as
// plain 64-bit values and reduced by the fold below.
MOVQ X0, CX
|
||||
PSRLDQ $0x08, X0
|
||||
MOVQ X0, DX
|
||||
ADDQ CX, AX
|
||||
ADCQ DX, AX
|
||||
|
||||
foldAndReturn:
|
||||
// add CF and fold
|
||||
// Every path arrives with a possibly pending CF from its last add. MOVL copies the
// low 32 bits of AX into CX (zero-extended) without changing flags, so ADCQ $0 adds
// that carry. The sum is then folded 64 -> 32 -> 16 bits by repeatedly adding the
// high part to the low part; the final ADCW $0 adds back the 16-bit carry.
MOVL AX, CX
|
||||
ADCQ $0x00, CX
|
||||
SHRQ $0x20, AX
|
||||
ADDQ CX, AX
|
||||
MOVWQZX AX, CX
|
||||
SHRQ $0x10, AX
|
||||
ADDQ CX, AX
|
||||
MOVW AX, CX
|
||||
SHRQ $0x10, AX
|
||||
ADDW CX, AX
|
||||
ADCW $0x00, AX
|
||||
// Undo the byte exchange applied to the sum's byte order at entry and store the result.
XCHGB AH, AL
|
||||
MOVW AX, ret+32(FP)
|
||||
RET
|
||||
|
||||
// func checksumAMD64(b []byte, initial uint16) uint16
|
||||
//
// Scalar-only variant: uses general-purpose registers and ADDQ/ADCQ chains, with no
// vector instructions. Adds the bytes of b, read as little-endian data, into a 64-bit
// running sum in AX that starts from the byte-swapped initial value. The sum is then
// folded to 16 bits with carries added back in, byte-swapped again, and stored to ret.
// NOTE(review): presumably an RFC 1071 style ones'-complement sum; no final
// complement is applied here - confirm how the Go caller uses the result.
//
// Paths (after an odd trailing byte is peeled off):
//   <= 31 bytes : handleTiny*
//   otherwise   : bigLoop for each whole 256-byte chunk, then startCleanup /
//                 handleRemaining* for the remaining 0..255 bytes.
//
// Register use: AX = running sum, DX = read cursor, BX = bytes left, CX = scratch /
// loop counter; inside bigLoop the sum/carry pairs are AX/SI, DI/R8, R9/R10, R11/R12.
TEXT ·checksumAMD64(SB), NOSPLIT|NOFRAME, $0-34
|
||||
// AX = initial with its two bytes exchanged; the result is exchanged back before the store.
MOVWQZX initial+24(FP), AX
|
||||
XCHGB AH, AL
|
||||
MOVQ b_base+0(FP), DX
|
||||
MOVQ b_len+8(FP), BX
|
||||
|
||||
// handle odd length buffers; they are difficult to handle in general
|
||||
// The last byte is zero-extended and added on its own, leaving an even length in BX.
TESTQ $0x00000001, BX
|
||||
JZ lengthIsEven
|
||||
MOVBQZX -1(DX)(BX*1), CX
|
||||
DECQ BX
|
||||
ADDQ CX, AX
|
||||
|
||||
lengthIsEven:
|
||||
// handle tiny buffers (<=31 bytes) specially
|
||||
CMPQ BX, $0x1f
|
||||
JGT bufferIsNotTiny
|
||||
// CX, SI and DI receive the optional 2-, 4- and 8-byte pieces; they stay zero when
// the corresponding length bit is clear.
XORQ CX, CX
|
||||
XORQ SI, SI
|
||||
XORQ DI, DI
|
||||
|
||||
// shift twice to start because length is guaranteed to be even
|
||||
// n = n >> 2; CF = originalN & 2
|
||||
SHRQ $0x02, BX
|
||||
JNC handleTiny4
|
||||
|
||||
// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
|
||||
MOVWQZX (DX), CX
|
||||
ADDQ $0x02, DX
|
||||
|
||||
handleTiny4:
|
||||
// n = n >> 1; CF = originalN & 4
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny8
|
||||
|
||||
// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
|
||||
MOVLQZX (DX), SI
|
||||
ADDQ $0x04, DX
|
||||
|
||||
handleTiny8:
|
||||
// n = n >> 1; CF = originalN & 8
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny16
|
||||
|
||||
// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
|
||||
MOVQ (DX), DI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleTiny16:
|
||||
// n = n >> 1; CF = originalN & 16
|
||||
// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTinyFinish
|
||||
// The 16-byte piece is added directly into AX; nothing between here and
// handleTinyFinish touches flags, so its carry feeds the ADCQ chain below.
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
|
||||
handleTinyFinish:
|
||||
// CF should be included from the previous add, so we use ADCQ.
|
||||
// If we arrived via the JNC above, then CF=0 due to the branch condition,
|
||||
// so ADCQ will still produce the correct result.
|
||||
ADCQ CX, AX
|
||||
ADCQ SI, AX
|
||||
ADCQ DI, AX
|
||||
// CF from the last ADCQ is still live; foldAndReturn consumes it.
JMP foldAndReturn
|
||||
|
||||
bufferIsNotTiny:
|
||||
// Number of 256 byte iterations into loop counter
|
||||
MOVQ BX, CX
|
||||
|
||||
// Update number of bytes remaining after the loop completes
|
||||
ANDQ $0xff, BX
|
||||
SHRQ $0x08, CX
|
||||
// Fewer than 256 bytes: no bigLoop iterations, go straight to the remainder handling.
JZ startCleanup
|
||||
// Zero the extra accumulators (DI, R9, R11) and the per-chain carry counters
// (SI, R8, R10, R12). AX keeps the running sum and serves as the first accumulator.
CLC
|
||||
XORQ SI, SI
|
||||
XORQ DI, DI
|
||||
XORQ R8, R8
|
||||
XORQ R9, R9
|
||||
XORQ R10, R10
|
||||
XORQ R11, R11
|
||||
XORQ R12, R12
|
||||
|
||||
// One iteration consumes 256 bytes as eight 32-byte ADDQ/ADCQ chains, two per
// accumulator. Each chain ends by banking its carry-out in that accumulator's
// carry counter, so the next chain can start with a plain ADDQ.
bigLoop:
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ 32(DX), DI
|
||||
ADCQ 40(DX), DI
|
||||
ADCQ 48(DX), DI
|
||||
ADCQ 56(DX), DI
|
||||
ADCQ $0x00, R8
|
||||
ADDQ 64(DX), R9
|
||||
ADCQ 72(DX), R9
|
||||
ADCQ 80(DX), R9
|
||||
ADCQ 88(DX), R9
|
||||
ADCQ $0x00, R10
|
||||
ADDQ 96(DX), R11
|
||||
ADCQ 104(DX), R11
|
||||
ADCQ 112(DX), R11
|
||||
ADCQ 120(DX), R11
|
||||
ADCQ $0x00, R12
|
||||
ADDQ 128(DX), AX
|
||||
ADCQ 136(DX), AX
|
||||
ADCQ 144(DX), AX
|
||||
ADCQ 152(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ 160(DX), DI
|
||||
ADCQ 168(DX), DI
|
||||
ADCQ 176(DX), DI
|
||||
ADCQ 184(DX), DI
|
||||
ADCQ $0x00, R8
|
||||
ADDQ 192(DX), R9
|
||||
ADCQ 200(DX), R9
|
||||
ADCQ 208(DX), R9
|
||||
ADCQ 216(DX), R9
|
||||
ADCQ $0x00, R10
|
||||
ADDQ 224(DX), R11
|
||||
ADCQ 232(DX), R11
|
||||
ADCQ 240(DX), R11
|
||||
ADCQ 248(DX), R11
|
||||
ADCQ $0x00, R12
|
||||
ADDQ $0x00000100, DX
|
||||
SUBQ $0x01, CX
|
||||
JNZ bigLoop
|
||||
// Merge all accumulators and carry counters into AX with one carry chain.
ADDQ SI, AX
|
||||
ADCQ DI, AX
|
||||
ADCQ R8, AX
|
||||
ADCQ R9, AX
|
||||
ADCQ R10, AX
|
||||
ADCQ R11, AX
|
||||
ADCQ R12, AX
|
||||
|
||||
// accumulate CF (twice, in case the first time overflows)
|
||||
ADCQ $0x00, AX
|
||||
ADCQ $0x00, AX
|
||||
|
||||
startCleanup:
|
||||
// Accumulate carries in this register. It is never expected to overflow.
|
||||
XORQ SI, SI
|
||||
|
||||
// We will perform an overlapped read for buffers with length not a multiple of 8.
|
||||
// Overlapped in this context means some memory will be read twice, but a shift will
|
||||
// eliminate the duplicated data. This extra read is performed at the end of the buffer to
|
||||
// preserve any alignment that may exist for the start of the buffer.
|
||||
// After these three instructions BX = number of whole 8-byte words still to add and
// CX = leftover byte count (remaining length mod 8).
MOVQ BX, CX
|
||||
SHRQ $0x03, BX
|
||||
ANDQ $0x07, CX
|
||||
JZ handleRemaining8
|
||||
// DI = address just past the last whole 8-byte word; the load reads the 8 bytes
// that end at the end of the buffer.
LEAQ (DX)(BX*8), DI
|
||||
MOVQ -8(DI)(CX*1), DI
|
||||
|
||||
// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
|
||||
SHLQ $0x03, CX
|
||||
NEGQ CX
|
||||
ADDQ $0x40, CX
|
||||
SHRQ CL, DI
|
||||
ADDQ DI, AX
|
||||
ADCQ $0x00, SI
|
||||
|
||||
// handleRemaining8..128: each step shifts the next bit of the 8-byte word count
// into CF and, when it is set, adds that many bytes (8, 16, 32, 64, 128) to AX,
// banking the chain's final carry in SI before the ADDQ that advances DX
// overwrites CF.
handleRemaining8:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining16
|
||||
ADDQ (DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleRemaining16:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining32
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x10, DX
|
||||
|
||||
handleRemaining32:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining64
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x20, DX
|
||||
|
||||
handleRemaining64:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining128
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x40, DX
|
||||
|
||||
handleRemaining128:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemainingComplete
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ 64(DX), AX
|
||||
ADCQ 72(DX), AX
|
||||
ADCQ 80(DX), AX
|
||||
ADCQ 88(DX), AX
|
||||
ADCQ 96(DX), AX
|
||||
ADCQ 104(DX), AX
|
||||
ADCQ 112(DX), AX
|
||||
ADCQ 120(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x80, DX
|
||||
|
||||
handleRemainingComplete:
|
||||
// Add the banked carries; execution falls through into foldAndReturn, which
// consumes the CF produced here.
ADDQ SI, AX
|
||||
|
||||
foldAndReturn:
|
||||
// add CF and fold
|
||||
// Every path arrives with a possibly pending CF from its last add. MOVL copies the
// low 32 bits of AX into CX (zero-extended) without changing flags, so ADCQ $0 adds
// that carry. The sum is then folded 64 -> 32 -> 16 bits by repeatedly adding the
// high part to the low part; the final ADCW $0 adds back the 16-bit carry.
MOVL AX, CX
|
||||
ADCQ $0x00, CX
|
||||
SHRQ $0x20, AX
|
||||
ADDQ CX, AX
|
||||
MOVWQZX AX, CX
|
||||
SHRQ $0x10, AX
|
||||
ADDQ CX, AX
|
||||
MOVW AX, CX
|
||||
SHRQ $0x10, AX
|
||||
ADDW CX, AX
|
||||
ADCW $0x00, AX
|
||||
// Undo the byte exchange applied to the sum's byte order at entry and store the result.
XCHGB AH, AL
|
||||
MOVW AX, ret+32(FP)
|
||||
RET
|
||||
Reference in New Issue
Block a user