Update dependencies

bluepython508
2024-11-01 17:33:34 +00:00
parent 033ac0b400
commit 5cdfab398d
3596 changed files with 1033483 additions and 259 deletions


@@ -0,0 +1,481 @@
package lz4block
import (
"encoding/binary"
"math/bits"
"sync"
"github.com/pierrec/lz4/v4/internal/lz4errors"
)
const (
// The following constants are used to setup the compression algorithm.
minMatch = 4 // the minimum length of a match sequence (4 bytes)
winSizeLog = 16 // LZ4 64Kb window size limit
winSize = 1 << winSizeLog
winMask = winSize - 1 // 64Kb window of previous data for dependent blocks
// hashLog determines the size of the hash table used to quickly find a previous match position.
// Its value influences the compression speed and memory usage, the lower the faster,
// but at the expense of the compression ratio.
// 16 seems to be the best compromise for fast compression.
hashLog = 16
htSize = 1 << hashLog
mfLimit = 10 + minMatch // The last match cannot start within the last 14 bytes.
)
func recoverBlock(e *error) {
if r := recover(); r != nil && *e == nil {
*e = lz4errors.ErrInvalidSourceShortBuffer
}
}
// blockHash hashes the lower 6 bytes into a value < htSize.
func blockHash(x uint64) uint32 {
const prime6bytes = 227718039650203
return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
}
func CompressBlockBound(n int) int {
return n + n/255 + 16
}
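A quick sanity check of the bound: each full run of 255 literal-length bytes costs one extra byte, plus 16 bytes of constant slack, so for a 64 KiB input:

dst := make([]byte, CompressBlockBound(64<<10))
// len(dst) == 65536 + 65536/255 + 16 == 65536 + 257 + 16 == 65809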
func UncompressBlock(src, dst, dict []byte) (int, error) {
if len(src) == 0 {
return 0, nil
}
if di := decodeBlock(dst, src, dict); di >= 0 {
return di, nil
}
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
type Compressor struct {
// Offsets are at most 64kiB, so we can store only the lower 16 bits of
// match positions: effectively, an offset from some 64kiB block boundary.
//
// When we retrieve such an offset, we interpret it as relative to the last
// block boundary si &^ 0xffff, or the one before, (si &^ 0xffff) - 0x10000,
// depending on which of these is inside the current window. If a table
// entry was generated more than 64kiB back in the input, we find out by
// inspecting the input stream.
table [htSize]uint16
// Bitmap indicating which positions in the table are in use.
// This allows us to quickly reset the table for reuse,
// without having to zero everything.
inUse [htSize / 32]uint32
}
// Get returns the position of a presumptive match for the hash h.
// The match may be a false positive due to a hash collision or an old entry.
// If si < winSize, the return value may be negative.
func (c *Compressor) get(h uint32, si int) int {
h &= htSize - 1
i := 0
if c.inUse[h/32]&(1<<(h%32)) != 0 {
i = int(c.table[h])
}
i += si &^ winMask
if i >= si {
// Try previous 64kiB block (negative when in first block).
i -= winSize
}
return i
}
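A minimal standalone sketch of the reconstruction described above; reconstruct and the example values are illustrative, not names from this package:

package main

import "fmt"

const (
	winSize = 1 << 16
	winMask = winSize - 1
)

// reconstruct mirrors Compressor.get for an entry that is in use:
// it rebuilds an absolute position near si from a 16-bit table entry.
func reconstruct(entry uint16, si int) int {
	i := int(entry) + si&^winMask
	if i >= si {
		i -= winSize // the entry must be from the previous 64kiB block
	}
	return i
}

func main() {
	fmt.Printf("%#x\n", reconstruct(0xF000, 0x12345)) // 0xf000 (previous block)
	fmt.Printf("%#x\n", reconstruct(0x2000, 0x12345)) // 0x12000 (current block)
}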
func (c *Compressor) put(h uint32, si int) {
h &= htSize - 1
c.table[h] = uint16(si)
c.inUse[h/32] |= 1 << (h % 32)
}
func (c *Compressor) reset() { c.inUse = [htSize / 32]uint32{} }
var compressorPool = sync.Pool{New: func() interface{} { return new(Compressor) }}
func CompressBlock(src, dst []byte) (int, error) {
c := compressorPool.Get().(*Compressor)
n, err := c.CompressBlock(src, dst)
compressorPool.Put(c)
return n, err
}
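A minimal round-trip sketch of the API above, assuming it runs inside this package with bytes and strings imported; roundTripExample is an illustrative name:

func roundTripExample() bool {
	src := []byte(strings.Repeat("lz4 block ", 100))
	dst := make([]byte, CompressBlockBound(len(src)))
	n, err := CompressBlock(src, dst)
	if err != nil || n == 0 {
		// n == 0 with a nil error means incompressible input:
		// callers store src verbatim instead.
		return false
	}
	out := make([]byte, len(src))
	m, err := UncompressBlock(dst[:n], out, nil)
	return err == nil && m == len(src) && bytes.Equal(out, src)
}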
func (c *Compressor) CompressBlock(src, dst []byte) (int, error) {
// Zero out reused table to avoid non-deterministic output (issue #65).
c.reset()
// A return of (0, nil) signals incompressible input; it can only
// happen when the destination buffer is smaller than CompressBlockBound.
isNotCompressible := len(dst) < CompressBlockBound(len(src))
// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
// This significantly speeds up incompressible data and usually has very small impact on compression.
// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
const adaptSkipLog = 7
// si: Current position of the search.
// anchor: Position of the current literals.
var si, di, anchor int
sn := len(src) - mfLimit
if sn <= 0 {
goto lastLiterals
}
// Fast scan strategy: the hash table only stores the most recent position for each hashed sequence.
for si < sn {
// Hash the next 6 bytes (sequence)...
match := binary.LittleEndian.Uint64(src[si:])
h := blockHash(match)
h2 := blockHash(match >> 8)
// We check for a match at si, si+1 and si+2 and pick the first one we get.
// Checking all 3 only requires loading the source once.
ref := c.get(h, si)
ref2 := c.get(h2, si+1)
c.put(h, si)
c.put(h2, si+1)
offset := si - ref
if offset <= 0 || offset >= winSize || uint32(match) != binary.LittleEndian.Uint32(src[ref:]) {
// No match. Start calculating another hash.
// The processor can usually do this out-of-order.
h = blockHash(match >> 16)
ref3 := c.get(h, si+2)
// Check the second match at si+1
si += 1
offset = si - ref2
if offset <= 0 || offset >= winSize || uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
// No match. Check the third match at si+2
si += 1
offset = si - ref3
c.put(h, si)
if offset <= 0 || offset >= winSize || uint32(match>>16) != binary.LittleEndian.Uint32(src[ref3:]) {
// Skip one extra byte (at si+3) before we check 3 matches again.
si += 2 + (si-anchor)>>adaptSkipLog
continue
}
}
}
// Match found.
lLen := si - anchor // Literal length.
// We already matched 4 bytes.
mLen := 4
// Extend backwards if we can, reducing literals.
tOff := si - offset - 1
for lLen > 0 && tOff >= 0 && src[si-1] == src[tOff] {
si--
tOff--
lLen--
mLen++
}
// Add the match length, so we continue search at the end.
// Use mLen to store the offset base.
si, mLen = si+mLen, si+minMatch
// Find the longest match by comparing batches of 8 bytes.
for si+8 <= sn {
x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:])
if x == 0 {
si += 8
} else {
// Stop at the first non-zero byte.
si += bits.TrailingZeros64(x) >> 3
break
}
}
mLen = si - mLen
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
if mLen < 0xF {
dst[di] = byte(mLen)
} else {
dst[di] = 0xF
}
// Encode literals length.
if lLen < 0xF {
dst[di] |= byte(lLen << 4)
} else {
dst[di] |= 0xF0
di++
l := lLen - 0xF
for ; l >= 0xFF && di < len(dst); l -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di] = byte(l)
}
di++
// Literals.
if di+lLen > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
copy(dst[di:di+lLen], src[anchor:anchor+lLen])
di += lLen + 2
anchor = si
// Encode offset.
if di > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
// Encode match length part 2.
if mLen >= 0xF {
for mLen -= 0xF; mLen >= 0xFF && di < len(dst); mLen -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di] = byte(mLen)
di++
}
// Check if we can load next values.
if si >= sn {
break
}
// Hash the bytes at si-2 (two bytes before the match end).
h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
c.put(h, si-2)
}
lastLiterals:
if isNotCompressible && anchor == 0 {
// Incompressible.
return 0, nil
}
// Last literals.
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
lLen := len(src) - anchor
if lLen < 0xF {
dst[di] = byte(lLen << 4)
} else {
dst[di] = 0xF0
di++
for lLen -= 0xF; lLen >= 0xFF && di < len(dst); lLen -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di] = byte(lLen)
}
di++
// Write the last literals.
if isNotCompressible && di >= anchor {
// Incompressible.
return 0, nil
}
if di+len(src)-anchor > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
di += copy(dst[di:di+len(src)-anchor], src[anchor:])
return di, nil
}
// blockHashHC hashes 4 bytes into a value < winSize.
func blockHashHC(x uint32) uint32 {
const hasher uint32 = 2654435761 // Knuth multiplicative hash.
return x * hasher >> (32 - winSizeLog)
}
type CompressorHC struct {
// hashTable: stores the last position found for a given hash
// chainTable: stores previous positions for a given hash
hashTable, chainTable [htSize]int
needsReset bool
}
var compressorHCPool = sync.Pool{New: func() interface{} { return new(CompressorHC) }}
func CompressBlockHC(src, dst []byte, depth CompressionLevel) (int, error) {
c := compressorHCPool.Get().(*CompressorHC)
n, err := c.CompressBlock(src, dst, depth)
compressorHCPool.Put(c)
return n, err
}
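The HC entry point is used the same way, plus a search depth; a sketch under the same assumptions (compressHCExample and the depth value are illustrative):

func compressHCExample(src []byte) ([]byte, error) {
	dst := make([]byte, CompressBlockBound(len(src)))
	// A depth of 8 tries at most 8 chain candidates per position;
	// Fast (0) is promoted to a full-window search below.
	n, err := CompressBlockHC(src, dst, CompressionLevel(8))
	if err != nil || n == 0 {
		return nil, err // n == 0: incompressible
	}
	return dst[:n], nil
}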
func (c *CompressorHC) CompressBlock(src, dst []byte, depth CompressionLevel) (_ int, err error) {
if c.needsReset {
// Zero out reused table to avoid non-deterministic output (issue #65).
c.hashTable = [htSize]int{}
c.chainTable = [htSize]int{}
}
c.needsReset = true // Only false on first call.
defer recoverBlock(&err)
// A return of (0, nil) signals incompressible input; it can only
// happen when the destination buffer is smaller than CompressBlockBound.
isNotCompressible := len(dst) < CompressBlockBound(len(src))
// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
// This significantly speeds up incompressible data and usually has very small impact on compression.
// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
const adaptSkipLog = 7
var si, di, anchor int
sn := len(src) - mfLimit
if sn <= 0 {
goto lastLiterals
}
if depth == 0 {
depth = winSize
}
for si < sn {
// Hash the next 4 bytes (sequence).
match := binary.LittleEndian.Uint32(src[si:])
h := blockHashHC(match)
// Follow the chain until out of window and keep the longest match.
mLen := 0
offset := 0
for next, try := c.hashTable[h], depth; try > 0 && next > 0 && si-next < winSize; next, try = c.chainTable[next&winMask], try-1 {
// The byte at the current match length (the first byte when mLen==0)
// must match for this candidate to improve on the match length.
if src[next+mLen] != src[si+mLen] {
continue
}
ml := 0
// Compare the current position with a previous one that has the same hash.
for ml < sn-si {
x := binary.LittleEndian.Uint64(src[next+ml:]) ^ binary.LittleEndian.Uint64(src[si+ml:])
if x == 0 {
ml += 8
} else {
// Stop at the first non-zero byte.
ml += bits.TrailingZeros64(x) >> 3
break
}
}
if ml < minMatch || ml <= mLen {
// Match too small (<minMatch) or not longer than the current match.
continue
}
// Found a longer match, keep its position and length.
mLen = ml
offset = si - next
// Try another previous position with the same hash.
}
c.chainTable[si&winMask] = c.hashTable[h]
c.hashTable[h] = si
// No match found.
if mLen == 0 {
si += 1 + (si-anchor)>>adaptSkipLog
continue
}
// Match found.
// Update hash/chain tables with overlapping bytes:
// si already hashed, add everything from si+1 up to the match length.
winStart := si + 1
if ws := si + mLen - winSize; ws > winStart {
winStart = ws
}
for si, ml := winStart, si+mLen; si < ml; {
match >>= 8
match |= uint32(src[si+3]) << 24
h := blockHashHC(match)
c.chainTable[si&winMask] = c.hashTable[h]
c.hashTable[h] = si
si++
}
lLen := si - anchor
si += mLen
mLen -= minMatch // Match length does not include minMatch.
if mLen < 0xF {
dst[di] = byte(mLen)
} else {
dst[di] = 0xF
}
// Encode literals length.
if lLen < 0xF {
dst[di] |= byte(lLen << 4)
} else {
dst[di] |= 0xF0
di++
l := lLen - 0xF
for ; l >= 0xFF; l -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(l)
}
di++
// Literals.
copy(dst[di:di+lLen], src[anchor:anchor+lLen])
di += lLen
anchor = si
// Encode offset.
di += 2
dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
// Encode match length part 2.
if mLen >= 0xF {
for mLen -= 0xF; mLen >= 0xFF; mLen -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(mLen)
di++
}
}
if isNotCompressible && anchor == 0 {
// Incompressible.
return 0, nil
}
// Last literals.
lastLiterals:
lLen := len(src) - anchor
if lLen < 0xF {
dst[di] = byte(lLen << 4)
} else {
dst[di] = 0xF0
di++
lLen -= 0xF
for ; lLen >= 0xFF; lLen -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(lLen)
}
di++
// Write the last literals.
if isNotCompressible && di >= anchor {
// Incompressible.
return 0, nil
}
di += copy(dst[di:di+len(src)-anchor], src[anchor:])
return di, nil
}


@@ -0,0 +1,87 @@
// Package lz4block provides LZ4 BlockSize types and pools of buffers.
package lz4block
import "sync"
const (
Block64Kb uint32 = 1 << (16 + iota*2)
Block256Kb
Block1Mb
Block4Mb
Block8Mb = 2 * Block4Mb
)
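Spelled out, the iota pattern steps in powers of four from 64 KiB (the values follow directly from the shifts):

// Block64Kb  = 1<<16 =   65536
// Block256Kb = 1<<18 =  262144
// Block1Mb   = 1<<20 = 1048576
// Block4Mb   = 1<<22 = 4194304
// Block8Mb   = 1<<23 = 8388608 (legacy frames only)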
var (
BlockPool64K = sync.Pool{New: func() interface{} { return make([]byte, Block64Kb) }}
BlockPool256K = sync.Pool{New: func() interface{} { return make([]byte, Block256Kb) }}
BlockPool1M = sync.Pool{New: func() interface{} { return make([]byte, Block1Mb) }}
BlockPool4M = sync.Pool{New: func() interface{} { return make([]byte, Block4Mb) }}
BlockPool8M = sync.Pool{New: func() interface{} { return make([]byte, Block8Mb) }}
)
func Index(b uint32) BlockSizeIndex {
switch b {
case Block64Kb:
return 4
case Block256Kb:
return 5
case Block1Mb:
return 6
case Block4Mb:
return 7
case Block8Mb: // only valid in legacy mode
return 3
}
return 0
}
func IsValid(b uint32) bool {
return Index(b) > 0
}
type BlockSizeIndex uint8
func (b BlockSizeIndex) IsValid() bool {
switch b {
case 4, 5, 6, 7:
return true
}
return false
}
func (b BlockSizeIndex) Get() []byte {
var buf interface{}
switch b {
case 4:
buf = BlockPool64K.Get()
case 5:
buf = BlockPool256K.Get()
case 6:
buf = BlockPool1M.Get()
case 7:
buf = BlockPool4M.Get()
case 3:
buf = BlockPool8M.Get()
}
return buf.([]byte)
}
func Put(buf []byte) {
// Safeguard: do not allow invalid buffers.
switch c := cap(buf); uint32(c) {
case Block64Kb:
BlockPool64K.Put(buf[:c])
case Block256Kb:
BlockPool256K.Put(buf[:c])
case Block1Mb:
BlockPool1M.Put(buf[:c])
case Block4Mb:
BlockPool4M.Put(buf[:c])
case Block8Mb:
BlockPool8M.Put(buf[:c])
}
}
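A usage sketch for the pools above, assuming it runs inside this package; withBlockBuffer is an illustrative name:

func withBlockBuffer(size uint32, f func([]byte)) {
	idx := Index(size) // e.g. Index(Block256Kb) == 5
	if !idx.IsValid() {
		return // unknown or legacy-only size
	}
	buf := idx.Get() // e.g. from BlockPool256K when idx == 5
	defer Put(buf)   // Put dispatches on cap(buf), so a reslice is fine
	f(buf)
}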
type CompressionLevel uint32
const Fast CompressionLevel = 0


@@ -0,0 +1,448 @@
// +build !appengine
// +build gc
// +build !noasm
#include "go_asm.h"
#include "textflag.h"
// AX scratch
// BX scratch
// CX literal and match lengths
// DX token, match offset
//
// DI &dst
// SI &src
// R8 &dst + len(dst)
// R9 &src + len(src)
// R11 &dst
// R12 short output end
// R13 short input end
// R14 &dict
// R15 len(dict)
// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOSPLIT, $48-80
MOVQ dst_base+0(FP), DI
MOVQ DI, R11
MOVQ dst_len+8(FP), R8
ADDQ DI, R8
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R9
CMPQ R9, $0
JE err_corrupt
ADDQ SI, R9
MOVQ dict_base+48(FP), R14
MOVQ dict_len+56(FP), R15
// shortcut ends
// short output end
MOVQ R8, R12
SUBQ $32, R12
// short input end
MOVQ R9, R13
SUBQ $16, R13
XORL CX, CX
loop:
// token := uint32(src[si])
MOVBLZX (SI), DX
INCQ SI
// lit_len = token >> 4
// if lit_len > 0
// CX = lit_len
MOVL DX, CX
SHRL $4, CX
// if lit_len != 0xF
CMPL CX, $0xF
JEQ lit_len_loop
CMPQ DI, R12
JAE copy_literal
CMPQ SI, R13
JAE copy_literal
// copy shortcut
// A two-stage shortcut for the most common case:
// 1) If the literal length is 0..14, and there is enough space,
// enter the shortcut and copy 16 bytes on behalf of the literals
// (in the fast mode, only 8 bytes can be safely copied this way).
// 2) Further if the match length is 4..18, copy 18 bytes in a similar
// manner; but we ensure that there's enough space in the output for
// those 18 bytes earlier, upon entering the shortcut (in other words,
// there is a combined check for both stages).
// copy literal
MOVOU (SI), X0
MOVOU X0, (DI)
ADDQ CX, DI
ADDQ CX, SI
MOVL DX, CX
ANDL $0xF, CX
// The second stage: prepare for match copying, decode full info.
// If it doesn't work out, the info won't be wasted.
// offset := uint16(src[si : si+2])
MOVWLZX (SI), DX
TESTL DX, DX
JE err_corrupt
ADDQ $2, SI
JC err_short_buf
MOVQ DI, AX
SUBQ DX, AX
JC err_corrupt
CMPQ AX, DI
JA err_short_buf
// If we can't do the second stage, jump straight to reading the
// match length; we already have the offset.
CMPL CX, $0xF
JEQ match_len_loop_pre
CMPL DX, $8
JLT match_len_loop_pre
CMPQ AX, R11
JB match_len_loop_pre
// memcpy(op + 0, match + 0, 8);
MOVQ (AX), BX
MOVQ BX, (DI)
// memcpy(op + 8, match + 8, 8);
MOVQ 8(AX), BX
MOVQ BX, 8(DI)
// memcpy(op +16, match +16, 2);
MOVW 16(AX), BX
MOVW BX, 16(DI)
LEAQ const_minMatch(DI)(CX*1), DI
// shortcut complete, load next token
JMP loopcheck
// Read the rest of the literal length:
// do { BX = src[si++]; lit_len += BX } while (BX == 0xFF).
lit_len_loop:
CMPQ SI, R9
JAE err_short_buf
MOVBLZX (SI), BX
INCQ SI
ADDQ BX, CX
CMPB BX, $0xFF
JE lit_len_loop
copy_literal:
// bounds check src and dst
MOVQ SI, AX
ADDQ CX, AX
JC err_short_buf
CMPQ AX, R9
JA err_short_buf
MOVQ DI, BX
ADDQ CX, BX
JC err_short_buf
CMPQ BX, R8
JA err_short_buf
// Copy literals of <=48 bytes through the XMM registers.
CMPQ CX, $48
JGT memmove_lit
// if len(dst[di:]) < 48
MOVQ R8, AX
SUBQ DI, AX
CMPQ AX, $48
JLT memmove_lit
// if len(src[si:]) < 48
MOVQ R9, BX
SUBQ SI, BX
CMPQ BX, $48
JLT memmove_lit
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
ADDQ CX, SI
ADDQ CX, DI
JMP finish_lit_copy
memmove_lit:
// memmove(to, from, len)
MOVQ DI, 0(SP)
MOVQ SI, 8(SP)
MOVQ CX, 16(SP)
// Spill registers. Increment SI, DI now so we don't need to save CX.
ADDQ CX, DI
ADDQ CX, SI
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
MOVL DX, 40(SP)
CALL runtime·memmove(SB)
// restore registers
MOVQ 24(SP), DI
MOVQ 32(SP), SI
MOVL 40(SP), DX
// recalc initial values
MOVQ dst_base+0(FP), R8
MOVQ R8, R11
ADDQ dst_len+8(FP), R8
MOVQ src_base+24(FP), R9
ADDQ src_len+32(FP), R9
MOVQ dict_base+48(FP), R14
MOVQ dict_len+56(FP), R15
MOVQ R8, R12
SUBQ $32, R12
MOVQ R9, R13
SUBQ $16, R13
finish_lit_copy:
// CX := mLen
// free up DX to use for offset
MOVL DX, CX
ANDL $0xF, CX
CMPQ SI, R9
JAE end
// offset
// si += 2
// DX := int(src[si-2]) | int(src[si-1])<<8
ADDQ $2, SI
JC err_short_buf
CMPQ SI, R9
JA err_short_buf
MOVWQZX -2(SI), DX
// 0 offset is invalid
TESTL DX, DX
JEQ err_corrupt
match_len_loop_pre:
// if mlen != 0xF
CMPB CX, $0xF
JNE copy_match
// do { BX = src[si++]; mlen += BX } while (BX == 0xFF).
match_len_loop:
CMPQ SI, R9
JAE err_short_buf
MOVBLZX (SI), BX
INCQ SI
ADDQ BX, CX
CMPB BX, $0xFF
JE match_len_loop
copy_match:
ADDQ $const_minMatch, CX
// check we have match_len bytes left in dst
// di+match_len < len(dst)
MOVQ DI, AX
ADDQ CX, AX
JC err_short_buf
CMPQ AX, R8
JA err_short_buf
// DX = offset
// CX = match_len
// BX = &dst + (di - offset)
MOVQ DI, BX
SUBQ DX, BX
// check BX is within dst
// if BX < &dst
JC copy_match_from_dict
CMPQ BX, R11
JBE copy_match_from_dict
// if offset + match_len < di
LEAQ (BX)(CX*1), AX
CMPQ DI, AX
JA copy_interior_match
// AX := len(dst[:di])
// MOVQ DI, AX
// SUBQ R11, AX
// copy 16 bytes at a time
// if di-offset < 16 copy 16-(di-offset) bytes to di
// then do the remaining
copy_match_loop:
// for match_len >= 0
// dst[di] = dst[i]
// di++
// i++
MOVB (BX), AX
MOVB AX, (DI)
INCQ DI
INCQ BX
DECQ CX
JNZ copy_match_loop
JMP loopcheck
copy_interior_match:
CMPQ CX, $16
JGT memmove_match
// if len(dst[di:]) < 16
MOVQ R8, AX
SUBQ DI, AX
CMPQ AX, $16
JLT memmove_match
MOVOU (BX), X0
MOVOU X0, (DI)
ADDQ CX, DI
XORL CX, CX
JMP loopcheck
copy_match_from_dict:
// CX = match_len
// BX = &dst + (di - offset)
// AX = offset - di = dict_bytes_available => count of bytes potentially covered by the dictionary
MOVQ R11, AX
SUBQ BX, AX
// BX = len(dict) - dict_bytes_available
MOVQ R15, BX
SUBQ AX, BX
JS err_short_dict
ADDQ R14, BX
// if match_len < dict_bytes_available, the match fits entirely within the external dictionary: just copy
CMPQ CX, AX
JLT memmove_match
// The match stretches over the dictionary and our block
// 1) copy what comes from the dictionary
// AX = dict_bytes_available = copy_size
// BX = &dict_end - copy_size
// CX = match_len
// memmove(to, from, len)
MOVQ DI, 0(SP)
MOVQ BX, 8(SP)
MOVQ AX, 16(SP)
// store extra stuff we want to recover
// spill
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
MOVQ CX, 40(SP)
CALL runtime·memmove(SB)
// restore registers
MOVQ 16(SP), AX // copy_size
MOVQ 24(SP), DI
MOVQ 32(SP), SI
MOVQ 40(SP), CX // match_len
// recalc initial values
MOVQ dst_base+0(FP), R8
MOVQ R8, R11 // TODO: make these sensible numbers
ADDQ dst_len+8(FP), R8
MOVQ src_base+24(FP), R9
ADDQ src_len+32(FP), R9
MOVQ dict_base+48(FP), R14
MOVQ dict_len+56(FP), R15
MOVQ R8, R12
SUBQ $32, R12
MOVQ R9, R13
SUBQ $16, R13
// di+=copy_size
ADDQ AX, DI
// 2) copy the rest from the current block
// CX = match_len - copy_size = rest_size
SUBQ AX, CX
MOVQ R11, BX
// check if we have a copy overlap
// AX = &dst + rest_size
MOVQ CX, AX
ADDQ BX, AX
// if &dst + rest_size > di, copy byte by byte
CMPQ AX, DI
JA copy_match_loop
memmove_match:
// memmove(to, from, len)
MOVQ DI, 0(SP)
MOVQ BX, 8(SP)
MOVQ CX, 16(SP)
// Spill registers. Increment DI now so we don't need to save CX.
ADDQ CX, DI
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
CALL runtime·memmove(SB)
// restore registers
MOVQ 24(SP), DI
MOVQ 32(SP), SI
// recalc initial values
MOVQ dst_base+0(FP), R8
MOVQ R8, R11 // TODO: make these sensible numbers
ADDQ dst_len+8(FP), R8
MOVQ src_base+24(FP), R9
ADDQ src_len+32(FP), R9
MOVQ R8, R12
SUBQ $32, R12
MOVQ R9, R13
SUBQ $16, R13
MOVQ dict_base+48(FP), R14
MOVQ dict_len+56(FP), R15
XORL CX, CX
loopcheck:
// for si < len(src)
CMPQ SI, R9
JB loop
end:
// Remaining length must be zero.
TESTQ CX, CX
JNE err_corrupt
SUBQ R11, DI
MOVQ DI, ret+72(FP)
RET
err_corrupt:
MOVQ $-1, ret+72(FP)
RET
err_short_buf:
MOVQ $-2, ret+72(FP)
RET
err_short_dict:
MOVQ $-3, ret+72(FP)
RET


@@ -0,0 +1,231 @@
// +build gc
// +build !noasm
#include "go_asm.h"
#include "textflag.h"
// Register allocation.
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define srcend R4
#define match R5 // Match address.
#define dictend R6
#define token R7
#define len R8 // Literal and match lengths.
#define offset R7 // Match offset; overlaps with token.
#define tmp1 R9
#define tmp2 R11
#define tmp3 R12
// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $-4-40
MOVW dst_base +0(FP), dst
MOVW dst_len +4(FP), dstend
MOVW src_base +12(FP), src
MOVW src_len +16(FP), srcend
CMP $0, srcend
BEQ shortSrc
ADD dst, dstend
ADD src, srcend
MOVW dst, dstorig
loop:
// Read token. Extract literal length.
MOVBU.P 1(src), token
MOVW token >> 4, len
CMP $15, len
BNE readLitlenDone
readLitlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADD.S tmp1, len
BVS shortDst
CMP $255, tmp1
BEQ readLitlenLoop
readLitlenDone:
CMP $0, len
BEQ copyLiteralDone
// Bounds check dst+len and src+len.
ADD.S dst, len, tmp1
ADD.CC.S src, len, tmp2
BCS shortSrc
CMP dstend, tmp1
//BHI shortDst // Uncomment for distinct error codes.
CMP.LS srcend, tmp2
BHI shortSrc
// Copy literal.
CMP $4, len
BLO copyLiteralFinish
// Copy 0-3 bytes until src is aligned.
TST $1, src
MOVBU.NE.P 1(src), tmp1
MOVB.NE.P tmp1, 1(dst)
SUB.NE $1, len
TST $2, src
MOVHU.NE.P 2(src), tmp2
MOVB.NE.P tmp2, 1(dst)
MOVW.NE tmp2 >> 8, tmp1
MOVB.NE.P tmp1, 1(dst)
SUB.NE $2, len
B copyLiteralLoopCond
copyLiteralLoop:
// Aligned load, unaligned write.
MOVW.P 4(src), tmp1
MOVW tmp1 >> 8, tmp2
MOVB tmp2, 1(dst)
MOVW tmp1 >> 16, tmp3
MOVB tmp3, 2(dst)
MOVW tmp1 >> 24, tmp2
MOVB tmp2, 3(dst)
MOVB.P tmp1, 4(dst)
copyLiteralLoopCond:
// Loop until len-4 < 0.
SUB.S $4, len
BPL copyLiteralLoop
copyLiteralFinish:
// Copy remaining 0-3 bytes.
// At this point, len may be < 0, but len&3 is still accurate.
TST $1, len
MOVB.NE.P 1(src), tmp3
MOVB.NE.P tmp3, 1(dst)
TST $2, len
MOVB.NE.P 2(src), tmp1
MOVB.NE.P tmp1, 2(dst)
MOVB.NE -1(src), tmp2
MOVB.NE tmp2, -1(dst)
copyLiteralDone:
// Initial part of match length.
// This frees up the token register for reuse as offset.
AND $15, token, len
CMP src, srcend
BEQ end
// Read offset.
ADD.S $2, src
BCS shortSrc
CMP srcend, src
BHI shortSrc
MOVBU -2(src), offset
MOVBU -1(src), tmp1
ORR.S tmp1 << 8, offset
BEQ corrupt
// Read rest of match length.
CMP $15, len
BNE readMatchlenDone
readMatchlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADD.S tmp1, len
BVS shortDst
CMP $255, tmp1
BEQ readMatchlenLoop
readMatchlenDone:
// Bounds check dst+len+minMatch.
ADD.S dst, len, tmp1
ADD.CC.S $const_minMatch, tmp1
BCS shortDst
CMP dstend, tmp1
BHI shortDst
RSB dst, offset, match
CMP dstorig, match
BGE copyMatch4
// match < dstorig means the match starts in the dictionary,
// at len(dict) - offset + (dst - dstorig).
MOVW dict_base+24(FP), match
MOVW dict_len +28(FP), dictend
ADD $const_minMatch, len
RSB dst, dstorig, tmp1
RSB dictend, offset, tmp2
ADD.S tmp2, tmp1
BMI shortDict
ADD match, dictend
ADD tmp1, match
copyDict:
MOVBU.P 1(match), tmp1
MOVB.P tmp1, 1(dst)
SUB.S $1, len
CMP.NE match, dictend
BNE copyDict
// If the match extends beyond the dictionary, the rest is at dstorig.
CMP $0, len
BEQ copyMatchDone
MOVW dstorig, match
B copyMatch
// Copy a regular match.
// Since len+minMatch is at least four, we can do a 4× unrolled
// byte copy loop. Using MOVW instead of four byte loads is faster,
// but to remain portable we'd have to align match first, which is
// too expensive. By alternating loads and stores, we also handle
// the case offset < 4.
copyMatch4:
SUB.S $4, len
MOVBU.P 4(match), tmp1
MOVB.P tmp1, 4(dst)
MOVBU -3(match), tmp2
MOVB tmp2, -3(dst)
MOVBU -2(match), tmp3
MOVB tmp3, -2(dst)
MOVBU -1(match), tmp1
MOVB tmp1, -1(dst)
BPL copyMatch4
// Restore len, which is now negative.
ADD.S $4, len
BEQ copyMatchDone
copyMatch:
// Finish with a byte-at-a-time copy.
SUB.S $1, len
MOVBU.P 1(match), tmp2
MOVB.P tmp2, 1(dst)
BNE copyMatch
copyMatchDone:
CMP src, srcend
BNE loop
end:
CMP $0, len
BNE corrupt
SUB dstorig, dst, tmp1
MOVW tmp1, ret+36(FP)
RET
// The error cases have distinct labels so we can put different
// return codes here when debugging, or if the error returns need to
// be changed.
shortDict:
shortDst:
shortSrc:
corrupt:
MOVW $-1, tmp1
MOVW tmp1, ret+36(FP)
RET


@@ -0,0 +1,241 @@
// +build gc
// +build !noasm
// This implementation assumes that strict alignment checking is turned off.
// The Go compiler makes the same assumption.
#include "go_asm.h"
#include "textflag.h"
// Register allocation.
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define dstend16 R4 // dstend - 16
#define srcend R5
#define srcend16 R6 // srcend - 16
#define match R7 // Match address.
#define dict R8
#define dictlen R9
#define dictend R10
#define token R11
#define len R12 // Literal and match lengths.
#define lenRem R13
#define offset R14 // Match offset.
#define tmp1 R15
#define tmp2 R16
#define tmp3 R17
#define tmp4 R19
// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
LDP dst_base+0(FP), (dst, dstend)
ADD dst, dstend
MOVD dst, dstorig
LDP src_base+24(FP), (src, srcend)
CBZ srcend, shortSrc
ADD src, srcend
// dstend16 = max(dstend-16, 0) and similarly for srcend16.
SUBS $16, dstend, dstend16
CSEL LO, ZR, dstend16, dstend16
SUBS $16, srcend, srcend16
CSEL LO, ZR, srcend16, srcend16
LDP dict_base+48(FP), (dict, dictlen)
ADD dict, dictlen, dictend
loop:
// Read token. Extract literal length.
MOVBU.P 1(src), token
LSR $4, token, len
CMP $15, len
BNE readLitlenDone
readLitlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADDS tmp1, len
BVS shortDst
CMP $255, tmp1
BEQ readLitlenLoop
readLitlenDone:
CBZ len, copyLiteralDone
// Bounds check dst+len and src+len.
ADDS dst, len, tmp1
BCS shortSrc
ADDS src, len, tmp2
BCS shortSrc
CMP dstend, tmp1
BHI shortDst
CMP srcend, tmp2
BHI shortSrc
// Copy literal.
SUBS $16, len
BLO copyLiteralShort
copyLiteralLoop:
LDP.P 16(src), (tmp1, tmp2)
STP.P (tmp1, tmp2), 16(dst)
SUBS $16, len
BPL copyLiteralLoop
// Copy (final part of) literal of length 0-15.
// If we have >=16 bytes left in src and dst, just copy 16 bytes.
copyLiteralShort:
CMP dstend16, dst
CCMP LO, src, srcend16, $0b0010 // 0010 = preserve carry (LO).
BHS copyLiteralShortEnd
AND $15, len
LDP (src), (tmp1, tmp2)
ADD len, src
STP (tmp1, tmp2), (dst)
ADD len, dst
B copyLiteralDone
// Safe but slow copy near the end of src, dst.
copyLiteralShortEnd:
TBZ $3, len, 3(PC)
MOVD.P 8(src), tmp1
MOVD.P tmp1, 8(dst)
TBZ $2, len, 3(PC)
MOVW.P 4(src), tmp2
MOVW.P tmp2, 4(dst)
TBZ $1, len, 3(PC)
MOVH.P 2(src), tmp3
MOVH.P tmp3, 2(dst)
TBZ $0, len, 3(PC)
MOVBU.P 1(src), tmp4
MOVB.P tmp4, 1(dst)
copyLiteralDone:
// Initial part of match length.
AND $15, token, len
CMP src, srcend
BEQ end
// Read offset.
ADDS $2, src
BCS shortSrc
CMP srcend, src
BHI shortSrc
MOVHU -2(src), offset
CBZ offset, corrupt
// Read rest of match length.
CMP $15, len
BNE readMatchlenDone
readMatchlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADDS tmp1, len
BVS shortDst
CMP $255, tmp1
BEQ readMatchlenLoop
readMatchlenDone:
ADD $const_minMatch, len
// Bounds check dst+len.
ADDS dst, len, tmp2
BCS shortDst
CMP dstend, tmp2
BHI shortDst
SUB offset, dst, match
CMP dstorig, match
BHS copyMatchTry8
// match < dstorig means the match starts in the dictionary,
// at len(dict) - offset + (dst - dstorig).
SUB dstorig, dst, tmp1
SUB offset, dictlen, tmp2
ADDS tmp2, tmp1
BMI shortDict
ADD dict, tmp1, match
copyDict:
MOVBU.P 1(match), tmp3
MOVB.P tmp3, 1(dst)
SUBS $1, len
CCMP NE, dictend, match, $0b0100 // 0100 sets the Z (EQ) flag.
BNE copyDict
CBZ len, copyMatchDone
// If the match extends beyond the dictionary, the rest is at dstorig.
// Recompute the offset for the next check.
MOVD dstorig, match
SUB dstorig, dst, offset
copyMatchTry8:
// Copy doublewords if both len and offset are at least eight.
// A 16-at-a-time loop doesn't provide a further speedup.
CMP $8, len
CCMP HS, offset, $8, $0
BLO copyMatchTry4
AND $7, len, lenRem
SUB $8, len
copyMatchLoop8:
MOVD.P 8(match), tmp1
MOVD.P tmp1, 8(dst)
SUBS $8, len
BPL copyMatchLoop8
MOVD (match)(len), tmp2 // match+len == match+lenRem-8.
ADD lenRem, dst
MOVD $0, len
MOVD tmp2, -8(dst)
B copyMatchDone
copyMatchTry4:
// Copy words if both len and offset are at least four.
CMP $4, len
CCMP HS, offset, $4, $0
BLO copyMatchLoop1
MOVWU.P 4(match), tmp2
MOVWU.P tmp2, 4(dst)
SUBS $4, len
BEQ copyMatchDone
copyMatchLoop1:
// Byte-at-a-time copy for small offsets <= 3.
MOVBU.P 1(match), tmp2
MOVB.P tmp2, 1(dst)
SUBS $1, len
BNE copyMatchLoop1
copyMatchDone:
CMP src, srcend
BNE loop
end:
CBNZ len, corrupt
SUB dstorig, dst, tmp1
MOVD tmp1, ret+72(FP)
RET
// The error cases have distinct labels so we can put different
// return codes here when debugging, or if the error returns need to
// be changed.
shortDict:
shortDst:
shortSrc:
corrupt:
MOVD $-1, tmp1
MOVD tmp1, ret+72(FP)
RET


@@ -0,0 +1,10 @@
//go:build (amd64 || arm || arm64) && !appengine && gc && !noasm
// +build amd64 arm arm64
// +build !appengine
// +build gc
// +build !noasm
package lz4block
//go:noescape
func decodeBlock(dst, src, dict []byte) int


@@ -0,0 +1,139 @@
//go:build (!amd64 && !arm && !arm64) || appengine || !gc || noasm
// +build !amd64,!arm,!arm64 appengine !gc noasm
package lz4block
import (
"encoding/binary"
)
func decodeBlock(dst, src, dict []byte) (ret int) {
// Restrict capacities so we don't read or write out of bounds.
dst = dst[:len(dst):len(dst)]
src = src[:len(src):len(src)]
const hasError = -2
if len(src) == 0 {
return hasError
}
defer func() {
if recover() != nil {
ret = hasError
}
}()
var si, di uint
for si < uint(len(src)) {
// Literals and match lengths (token).
b := uint(src[si])
si++
// Literals.
if lLen := b >> 4; lLen > 0 {
switch {
case lLen < 0xF && si+16 < uint(len(src)):
// Shortcut 1
// if we have enough room in src and dst, and the literals length
// is small enough (0..14) then copy all 16 bytes, even if not all
// are part of the literals.
copy(dst[di:], src[si:si+16])
si += lLen
di += lLen
if mLen := b & 0xF; mLen < 0xF {
// Shortcut 2
// if the match length (4..18) fits within the literals, then copy
// all 18 bytes, even if not all are part of the literals.
mLen += 4
if offset := u16(src[si:]); mLen <= offset && offset < di {
i := di - offset
// The remaining buffer may not hold 18 bytes.
// See https://github.com/pierrec/lz4/issues/51.
if end := i + 18; end <= uint(len(dst)) {
copy(dst[di:], dst[i:end])
si += 2
di += mLen
continue
}
}
}
case lLen == 0xF:
for {
x := uint(src[si])
if lLen += x; int(lLen) < 0 {
return hasError
}
si++
if x != 0xFF {
break
}
}
fallthrough
default:
copy(dst[di:di+lLen], src[si:si+lLen])
si += lLen
di += lLen
}
}
mLen := b & 0xF
if si == uint(len(src)) && mLen == 0 {
break
} else if si >= uint(len(src)) {
return hasError
}
offset := u16(src[si:])
if offset == 0 {
return hasError
}
si += 2
// Match.
mLen += minMatch
if mLen == minMatch+0xF {
for {
x := uint(src[si])
if mLen += x; int(mLen) < 0 {
return hasError
}
si++
if x != 0xFF {
break
}
}
}
// Copy the match.
if di < offset {
// The match is beyond our block, meaning the first part
// is in the dictionary.
fromDict := dict[uint(len(dict))+di-offset:]
n := uint(copy(dst[di:di+mLen], fromDict))
di += n
if mLen -= n; mLen == 0 {
continue
}
// We copied n = offset-di bytes from the dictionary,
// then set di = di+n = offset, so the following code
// copies from dst[di-offset:] = dst[0:].
}
expanded := dst[di-offset:]
if mLen > offset {
// Efficiently copy the match dst[di-offset:di] into the dst slice.
bytesToCopy := offset * (mLen / offset)
for n := offset; n <= bytesToCopy+offset; n *= 2 {
copy(expanded[n:], expanded[:n])
}
di += bytesToCopy
mLen -= bytesToCopy
}
di += uint(copy(dst[di:di+mLen], expanded[:mLen]))
}
return int(di)
}
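The overlapping-match branch above expands the output by doubling instead of copying byte by byte. A standalone trace with illustrative values (offset 2, match length 10, two bytes already decoded):

package main

import "fmt"

func main() {
	// Decoded so far: "ab" (di == 2); the match overlaps its own output.
	dst := make([]byte, 16)
	copy(dst, "ab")
	di, offset, mLen := uint(2), uint(2), uint(10)
	expanded := dst[di-offset:]
	bytesToCopy := offset * (mLen / offset) // 10: whole multiples of the offset
	for n := offset; n <= bytesToCopy+offset; n *= 2 {
		// Each pass doubles the valid prefix: ab -> abab -> abababab ...
		// It may write a little past di+mLen, but stays inside dst.
		copy(expanded[n:], expanded[:n])
	}
	di += bytesToCopy
	mLen -= bytesToCopy // remainder when mLen is not a multiple of offset
	di += uint(copy(dst[di:di+mLen], expanded[:mLen]))
	fmt.Println(string(dst[:di])) // abababababab
}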
func u16(p []byte) uint { return uint(binary.LittleEndian.Uint16(p)) }