Update dependencies

bluepython508
2024-11-01 17:33:34 +00:00
parent 033ac0b400
commit 5cdfab398d
3596 changed files with 1033483 additions and 259 deletions


@@ -0,0 +1,481 @@
package lz4block
import (
"encoding/binary"
"math/bits"
"sync"
"github.com/pierrec/lz4/v4/internal/lz4errors"
)
const (
// The following constants are used to setup the compression algorithm.
minMatch = 4 // the minimum length of a match sequence (4 bytes)
winSizeLog = 16 // LZ4 64Kb window size limit
winSize = 1 << winSizeLog
winMask = winSize - 1 // 64Kb window of previous data for dependent blocks
// hashLog determines the size of the hash table used to quickly find a previous match position.
// Its value influences the compression speed and memory usage, the lower the faster,
// but at the expense of the compression ratio.
// 16 seems to be the best compromise for fast compression.
hashLog = 16
htSize = 1 << hashLog
mfLimit = 10 + minMatch // The last match cannot start within the last 14 bytes.
)
func recoverBlock(e *error) {
if r := recover(); r != nil && *e == nil {
*e = lz4errors.ErrInvalidSourceShortBuffer
}
}
// blockHash hashes the lower 6 bytes into a value < htSize.
func blockHash(x uint64) uint32 {
const prime6bytes = 227718039650203
return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
}
func CompressBlockBound(n int) int {
return n + n/255 + 16
}
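A quick sanity check of the bound: each full run of 255 literal-length bytes costs one extra byte, plus 16 bytes of constant slack, so for a 64 KiB input:

dst := make([]byte, CompressBlockBound(64<<10))
// len(dst) == 65536 + 65536/255 + 16 == 65536 + 257 + 16 == 65809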
func UncompressBlock(src, dst, dict []byte) (int, error) {
if len(src) == 0 {
return 0, nil
}
if di := decodeBlock(dst, src, dict); di >= 0 {
return di, nil
}
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
type Compressor struct {
// Offsets are at most 64kiB, so we can store only the lower 16 bits of
// match positions: effectively, an offset from some 64kiB block boundary.
//
// When we retrieve such an offset, we interpret it as relative to the last
// block boundary si &^ 0xffff, or the one before, (si &^ 0xffff) - 0x10000,
// depending on which of these is inside the current window. If a table
// entry was generated more than 64kiB back in the input, we find out by
// inspecting the input stream.
table [htSize]uint16
// Bitmap indicating which positions in the table are in use.
// This allows us to quickly reset the table for reuse,
// without having to zero everything.
inUse [htSize / 32]uint32
}
// Get returns the position of a presumptive match for the hash h.
// The match may be a false positive due to a hash collision or an old entry.
// If si < winSize, the return value may be negative.
func (c *Compressor) get(h uint32, si int) int {
h &= htSize - 1
i := 0
if c.inUse[h/32]&(1<<(h%32)) != 0 {
i = int(c.table[h])
}
i += si &^ winMask
if i >= si {
// Try previous 64kiB block (negative when in first block).
i -= winSize
}
return i
}
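A minimal standalone sketch of the reconstruction described above; reconstruct and the example values are illustrative, not names from this package:

package main

import "fmt"

const (
	winSize = 1 << 16
	winMask = winSize - 1
)

// reconstruct mirrors Compressor.get for an entry that is in use:
// it rebuilds an absolute position near si from a 16-bit table entry.
func reconstruct(entry uint16, si int) int {
	i := int(entry) + si&^winMask
	if i >= si {
		i -= winSize // the entry must be from the previous 64kiB block
	}
	return i
}

func main() {
	fmt.Printf("%#x\n", reconstruct(0xF000, 0x12345)) // 0xf000 (previous block)
	fmt.Printf("%#x\n", reconstruct(0x2000, 0x12345)) // 0x12000 (current block)
}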
func (c *Compressor) put(h uint32, si int) {
h &= htSize - 1
c.table[h] = uint16(si)
c.inUse[h/32] |= 1 << (h % 32)
}
func (c *Compressor) reset() { c.inUse = [htSize / 32]uint32{} }
var compressorPool = sync.Pool{New: func() interface{} { return new(Compressor) }}
func CompressBlock(src, dst []byte) (int, error) {
c := compressorPool.Get().(*Compressor)
n, err := c.CompressBlock(src, dst)
compressorPool.Put(c)
return n, err
}
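A minimal round-trip sketch of the API above, assuming it runs inside this package with bytes and strings imported; roundTripExample is an illustrative name:

func roundTripExample() bool {
	src := []byte(strings.Repeat("lz4 block ", 100))
	dst := make([]byte, CompressBlockBound(len(src)))
	n, err := CompressBlock(src, dst)
	if err != nil || n == 0 {
		// n == 0 with a nil error means incompressible input:
		// callers store src verbatim instead.
		return false
	}
	out := make([]byte, len(src))
	m, err := UncompressBlock(dst[:n], out, nil)
	return err == nil && m == len(src) && bytes.Equal(out, src)
}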
func (c *Compressor) CompressBlock(src, dst []byte) (int, error) {
// Zero out reused table to avoid non-deterministic output (issue #65).
c.reset()
// A return of (0, nil) signals incompressible input; it can only
// happen when the destination buffer is smaller than CompressBlockBound.
isNotCompressible := len(dst) < CompressBlockBound(len(src))
// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
// This significantly speeds up incompressible data and usually has very small impact on compression.
// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
const adaptSkipLog = 7
// si: Current position of the search.
// anchor: Position of the current literals.
var si, di, anchor int
sn := len(src) - mfLimit
if sn <= 0 {
goto lastLiterals
}
// Fast scan strategy: the hash table only stores the most recent position for each hashed sequence.
for si < sn {
// Hash the next 6 bytes (sequence)...
match := binary.LittleEndian.Uint64(src[si:])
h := blockHash(match)
h2 := blockHash(match >> 8)
// We check for a match at si, si+1 and si+2 and pick the first one we get.
// Checking all 3 only requires loading the source once.
ref := c.get(h, si)
ref2 := c.get(h2, si+1)
c.put(h, si)
c.put(h2, si+1)
offset := si - ref
if offset <= 0 || offset >= winSize || uint32(match) != binary.LittleEndian.Uint32(src[ref:]) {
// No match. Start calculating another hash.
// The processor can usually do this out-of-order.
h = blockHash(match >> 16)
ref3 := c.get(h, si+2)
// Check the second match at si+1
si += 1
offset = si - ref2
if offset <= 0 || offset >= winSize || uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
// No match. Check the third match at si+2
si += 1
offset = si - ref3
c.put(h, si)
if offset <= 0 || offset >= winSize || uint32(match>>16) != binary.LittleEndian.Uint32(src[ref3:]) {
// Skip one extra byte (at si+3) before we check 3 matches again.
si += 2 + (si-anchor)>>adaptSkipLog
continue
}
}
}
// Match found.
lLen := si - anchor // Literal length.
// We already matched 4 bytes.
mLen := 4
// Extend backwards if we can, reducing literals.
tOff := si - offset - 1
for lLen > 0 && tOff >= 0 && src[si-1] == src[tOff] {
si--
tOff--
lLen--
mLen++
}
// Add the match length, so we continue search at the end.
// Use mLen to store the offset base.
si, mLen = si+mLen, si+minMatch
// Find the longest match by comparing batches of 8 bytes.
for si+8 <= sn {
x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:])
if x == 0 {
si += 8
} else {
// Stop at the first non-zero byte.
si += bits.TrailingZeros64(x) >> 3
break
}
}
mLen = si - mLen
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
if mLen < 0xF {
dst[di] = byte(mLen)
} else {
dst[di] = 0xF
}
// Encode literals length.
if lLen < 0xF {
dst[di] |= byte(lLen << 4)
} else {
dst[di] |= 0xF0
di++
l := lLen - 0xF
for ; l >= 0xFF && di < len(dst); l -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di] = byte(l)
}
di++
// Literals.
if di+lLen > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
copy(dst[di:di+lLen], src[anchor:anchor+lLen])
di += lLen + 2
anchor = si
// Encode offset.
if di > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
// Encode match length part 2.
if mLen >= 0xF {
for mLen -= 0xF; mLen >= 0xFF && di < len(dst); mLen -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di] = byte(mLen)
di++
}
// Check if we can load next values.
if si >= sn {
break
}
// Hash the bytes at si-2 (two bytes before the match end).
h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
c.put(h, si-2)
}
lastLiterals:
if isNotCompressible && anchor == 0 {
// Incompressible.
return 0, nil
}
// Last literals.
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
lLen := len(src) - anchor
if lLen < 0xF {
dst[di] = byte(lLen << 4)
} else {
dst[di] = 0xF0
di++
for lLen -= 0xF; lLen >= 0xFF && di < len(dst); lLen -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di] = byte(lLen)
}
di++
// Write the last literals.
if isNotCompressible && di >= anchor {
// Incompressible.
return 0, nil
}
if di+len(src)-anchor > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
di += copy(dst[di:di+len(src)-anchor], src[anchor:])
return di, nil
}
// blockHashHC hashes 4 bytes into a value < winSize.
func blockHashHC(x uint32) uint32 {
const hasher uint32 = 2654435761 // Knuth multiplicative hash.
return x * hasher >> (32 - winSizeLog)
}
type CompressorHC struct {
// hashTable: stores the last position found for a given hash
// chainTable: stores previous positions for a given hash
hashTable, chainTable [htSize]int
needsReset bool
}
var compressorHCPool = sync.Pool{New: func() interface{} { return new(CompressorHC) }}
func CompressBlockHC(src, dst []byte, depth CompressionLevel) (int, error) {
c := compressorHCPool.Get().(*CompressorHC)
n, err := c.CompressBlock(src, dst, depth)
compressorHCPool.Put(c)
return n, err
}
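The HC entry point is used the same way, plus a search depth; a sketch under the same assumptions (compressHCExample and the depth value are illustrative):

func compressHCExample(src []byte) ([]byte, error) {
	dst := make([]byte, CompressBlockBound(len(src)))
	// A depth of 8 tries at most 8 chain candidates per position;
	// Fast (0) is promoted to a full-window search below.
	n, err := CompressBlockHC(src, dst, CompressionLevel(8))
	if err != nil || n == 0 {
		return nil, err // n == 0: incompressible
	}
	return dst[:n], nil
}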
func (c *CompressorHC) CompressBlock(src, dst []byte, depth CompressionLevel) (_ int, err error) {
if c.needsReset {
// Zero out reused table to avoid non-deterministic output (issue #65).
c.hashTable = [htSize]int{}
c.chainTable = [htSize]int{}
}
c.needsReset = true // Only false on first call.
defer recoverBlock(&err)
// A return of (0, nil) signals incompressible input; it can only
// happen when the destination buffer is smaller than CompressBlockBound.
isNotCompressible := len(dst) < CompressBlockBound(len(src))
// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
// This significantly speeds up incompressible data and usually has very small impact on compression.
// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
const adaptSkipLog = 7
var si, di, anchor int
sn := len(src) - mfLimit
if sn <= 0 {
goto lastLiterals
}
if depth == 0 {
depth = winSize
}
for si < sn {
// Hash the next 4 bytes (sequence).
match := binary.LittleEndian.Uint32(src[si:])
h := blockHashHC(match)
// Follow the chain until out of window and keep the longest match.
mLen := 0
offset := 0
for next, try := c.hashTable[h], depth; try > 0 && next > 0 && si-next < winSize; next, try = c.chainTable[next&winMask], try-1 {
// The byte at the current match length (the first byte when mLen==0)
// must match for this candidate to improve on the match length.
if src[next+mLen] != src[si+mLen] {
continue
}
ml := 0
// Compare the current position with a previous one that has the same hash.
for ml < sn-si {
x := binary.LittleEndian.Uint64(src[next+ml:]) ^ binary.LittleEndian.Uint64(src[si+ml:])
if x == 0 {
ml += 8
} else {
// Stop at the first non-zero byte.
ml += bits.TrailingZeros64(x) >> 3
break
}
}
if ml < minMatch || ml <= mLen {
// Match too small (<minMatch) or not longer than the current match.
continue
}
// Found a longer match, keep its position and length.
mLen = ml
offset = si - next
// Try another previous position with the same hash.
}
c.chainTable[si&winMask] = c.hashTable[h]
c.hashTable[h] = si
// No match found.
if mLen == 0 {
si += 1 + (si-anchor)>>adaptSkipLog
continue
}
// Match found.
// Update hash/chain tables with overlapping bytes:
// si already hashed, add everything from si+1 up to the match length.
winStart := si + 1
if ws := si + mLen - winSize; ws > winStart {
winStart = ws
}
for si, ml := winStart, si+mLen; si < ml; {
match >>= 8
match |= uint32(src[si+3]) << 24
h := blockHashHC(match)
c.chainTable[si&winMask] = c.hashTable[h]
c.hashTable[h] = si
si++
}
lLen := si - anchor
si += mLen
mLen -= minMatch // Match length does not include minMatch.
if mLen < 0xF {
dst[di] = byte(mLen)
} else {
dst[di] = 0xF
}
// Encode literals length.
if lLen < 0xF {
dst[di] |= byte(lLen << 4)
} else {
dst[di] |= 0xF0
di++
l := lLen - 0xF
for ; l >= 0xFF; l -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(l)
}
di++
// Literals.
copy(dst[di:di+lLen], src[anchor:anchor+lLen])
di += lLen
anchor = si
// Encode offset.
di += 2
dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
// Encode match length part 2.
if mLen >= 0xF {
for mLen -= 0xF; mLen >= 0xFF; mLen -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(mLen)
di++
}
}
if isNotCompressible && anchor == 0 {
// Incompressible.
return 0, nil
}
// Last literals.
lastLiterals:
lLen := len(src) - anchor
if lLen < 0xF {
dst[di] = byte(lLen << 4)
} else {
dst[di] = 0xF0
di++
lLen -= 0xF
for ; lLen >= 0xFF; lLen -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(lLen)
}
di++
// Write the last literals.
if isNotCompressible && di >= anchor {
// Incompressible.
return 0, nil
}
di += copy(dst[di:di+len(src)-anchor], src[anchor:])
return di, nil
}


@@ -0,0 +1,87 @@
// Package lz4block provides LZ4 BlockSize types and pools of buffers.
package lz4block
import "sync"
const (
Block64Kb uint32 = 1 << (16 + iota*2)
Block256Kb
Block1Mb
Block4Mb
Block8Mb = 2 * Block4Mb
)
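Spelled out, the iota pattern steps in powers of four from 64 KiB (the values follow directly from the shifts):

// Block64Kb  = 1<<16 =   65536
// Block256Kb = 1<<18 =  262144
// Block1Mb   = 1<<20 = 1048576
// Block4Mb   = 1<<22 = 4194304
// Block8Mb   = 1<<23 = 8388608 (legacy frames only)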
var (
BlockPool64K = sync.Pool{New: func() interface{} { return make([]byte, Block64Kb) }}
BlockPool256K = sync.Pool{New: func() interface{} { return make([]byte, Block256Kb) }}
BlockPool1M = sync.Pool{New: func() interface{} { return make([]byte, Block1Mb) }}
BlockPool4M = sync.Pool{New: func() interface{} { return make([]byte, Block4Mb) }}
BlockPool8M = sync.Pool{New: func() interface{} { return make([]byte, Block8Mb) }}
)
func Index(b uint32) BlockSizeIndex {
switch b {
case Block64Kb:
return 4
case Block256Kb:
return 5
case Block1Mb:
return 6
case Block4Mb:
return 7
case Block8Mb: // only valid in legacy mode
return 3
}
return 0
}
func IsValid(b uint32) bool {
return Index(b) > 0
}
type BlockSizeIndex uint8
func (b BlockSizeIndex) IsValid() bool {
switch b {
case 4, 5, 6, 7:
return true
}
return false
}
func (b BlockSizeIndex) Get() []byte {
var buf interface{}
switch b {
case 4:
buf = BlockPool64K.Get()
case 5:
buf = BlockPool256K.Get()
case 6:
buf = BlockPool1M.Get()
case 7:
buf = BlockPool4M.Get()
case 3:
buf = BlockPool8M.Get()
}
return buf.([]byte)
}
func Put(buf []byte) {
// Safeguard: do not allow invalid buffers.
switch c := cap(buf); uint32(c) {
case Block64Kb:
BlockPool64K.Put(buf[:c])
case Block256Kb:
BlockPool256K.Put(buf[:c])
case Block1Mb:
BlockPool1M.Put(buf[:c])
case Block4Mb:
BlockPool4M.Put(buf[:c])
case Block8Mb:
BlockPool8M.Put(buf[:c])
}
}
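A usage sketch for the pools above, assuming it runs inside this package; withBlockBuffer is an illustrative name:

func withBlockBuffer(size uint32, f func([]byte)) {
	idx := Index(size) // e.g. Index(Block256Kb) == 5
	if !idx.IsValid() {
		return // unknown or legacy-only size
	}
	buf := idx.Get() // e.g. from BlockPool256K when idx == 5
	defer Put(buf)   // Put dispatches on cap(buf), so a reslice is fine
	f(buf)
}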
type CompressionLevel uint32
const Fast CompressionLevel = 0


@@ -0,0 +1,448 @@
// +build !appengine
// +build gc
// +build !noasm
#include "go_asm.h"
#include "textflag.h"
// AX scratch
// BX scratch
// CX literal and match lengths
// DX token, match offset
//
// DI &dst
// SI &src
// R8 &dst + len(dst)
// R9 &src + len(src)
// R11 &dst
// R12 short output end
// R13 short input end
// R14 &dict
// R15 len(dict)
// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOSPLIT, $48-80
MOVQ dst_base+0(FP), DI
MOVQ DI, R11
MOVQ dst_len+8(FP), R8
ADDQ DI, R8
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R9
CMPQ R9, $0
JE err_corrupt
ADDQ SI, R9
MOVQ dict_base+48(FP), R14
MOVQ dict_len+56(FP), R15
// shortcut ends
// short output end
MOVQ R8, R12
SUBQ $32, R12
// short input end
MOVQ R9, R13
SUBQ $16, R13
XORL CX, CX
loop:
// token := uint32(src[si])
MOVBLZX (SI), DX
INCQ SI
// lit_len = token >> 4
// if lit_len > 0
// CX = lit_len
MOVL DX, CX
SHRL $4, CX
// if lit_len != 0xF
CMPL CX, $0xF
JEQ lit_len_loop
CMPQ DI, R12
JAE copy_literal
CMPQ SI, R13
JAE copy_literal
// copy shortcut
// A two-stage shortcut for the most common case:
// 1) If the literal length is 0..14, and there is enough space,
// enter the shortcut and copy 16 bytes on behalf of the literals
// (in the fast mode, only 8 bytes can be safely copied this way).
// 2) Further if the match length is 4..18, copy 18 bytes in a similar
// manner; but we ensure that there's enough space in the output for
// those 18 bytes earlier, upon entering the shortcut (in other words,
// there is a combined check for both stages).
// copy literal
MOVOU (SI), X0
MOVOU X0, (DI)
ADDQ CX, DI
ADDQ CX, SI
MOVL DX, CX
ANDL $0xF, CX
// The second stage: prepare for match copying, decode full info.
// If it doesn't work out, the info won't be wasted.
// offset := uint16(src[si : si+2])
MOVWLZX (SI), DX
TESTL DX, DX
JE err_corrupt
ADDQ $2, SI
JC err_short_buf
MOVQ DI, AX
SUBQ DX, AX
JC err_corrupt
CMPQ AX, DI
JA err_short_buf
// If we can't do the second stage, jump straight to reading the
// match length; we already have the offset.
CMPL CX, $0xF
JEQ match_len_loop_pre
CMPL DX, $8
JLT match_len_loop_pre
CMPQ AX, R11
JB match_len_loop_pre
// memcpy(op + 0, match + 0, 8);
MOVQ (AX), BX
MOVQ BX, (DI)
// memcpy(op + 8, match + 8, 8);
MOVQ 8(AX), BX
MOVQ BX, 8(DI)
// memcpy(op +16, match +16, 2);
MOVW 16(AX), BX
MOVW BX, 16(DI)
LEAQ const_minMatch(DI)(CX*1), DI
// shortcut complete, load next token
JMP loopcheck
// Read the rest of the literal length:
// do { BX = src[si++]; lit_len += BX } while (BX == 0xFF).
lit_len_loop:
CMPQ SI, R9
JAE err_short_buf
MOVBLZX (SI), BX
INCQ SI
ADDQ BX, CX
CMPB BX, $0xFF
JE lit_len_loop
copy_literal:
// bounds check src and dst
MOVQ SI, AX
ADDQ CX, AX
JC err_short_buf
CMPQ AX, R9
JA err_short_buf
MOVQ DI, BX
ADDQ CX, BX
JC err_short_buf
CMPQ BX, R8
JA err_short_buf
// Copy literals of <=48 bytes through the XMM registers.
CMPQ CX, $48
JGT memmove_lit
// if len(dst[di:]) < 48
MOVQ R8, AX
SUBQ DI, AX
CMPQ AX, $48
JLT memmove_lit
// if len(src[si:]) < 48
MOVQ R9, BX
SUBQ SI, BX
CMPQ BX, $48
JLT memmove_lit
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
ADDQ CX, SI
ADDQ CX, DI
JMP finish_lit_copy
memmove_lit:
// memmove(to, from, len)
MOVQ DI, 0(SP)
MOVQ SI, 8(SP)
MOVQ CX, 16(SP)
// Spill registers. Increment SI, DI now so we don't need to save CX.
ADDQ CX, DI
ADDQ CX, SI
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
MOVL DX, 40(SP)
CALL runtime·memmove(SB)
// restore registers
MOVQ 24(SP), DI
MOVQ 32(SP), SI
MOVL 40(SP), DX
// recalc initial values
MOVQ dst_base+0(FP), R8
MOVQ R8, R11
ADDQ dst_len+8(FP), R8
MOVQ src_base+24(FP), R9
ADDQ src_len+32(FP), R9
MOVQ dict_base+48(FP), R14
MOVQ dict_len+56(FP), R15
MOVQ R8, R12
SUBQ $32, R12
MOVQ R9, R13
SUBQ $16, R13
finish_lit_copy:
// CX := mLen
// free up DX to use for offset
MOVL DX, CX
ANDL $0xF, CX
CMPQ SI, R9
JAE end
// offset
// si += 2
// DX := int(src[si-2]) | int(src[si-1])<<8
ADDQ $2, SI
JC err_short_buf
CMPQ SI, R9
JA err_short_buf
MOVWQZX -2(SI), DX
// 0 offset is invalid
TESTL DX, DX
JEQ err_corrupt
match_len_loop_pre:
// if mlen != 0xF
CMPB CX, $0xF
JNE copy_match
// do { BX = src[si++]; mlen += BX } while (BX == 0xFF).
match_len_loop:
CMPQ SI, R9
JAE err_short_buf
MOVBLZX (SI), BX
INCQ SI
ADDQ BX, CX
CMPB BX, $0xFF
JE match_len_loop
copy_match:
ADDQ $const_minMatch, CX
// check we have match_len bytes left in dst
// di+match_len < len(dst)
MOVQ DI, AX
ADDQ CX, AX
JC err_short_buf
CMPQ AX, R8
JA err_short_buf
// DX = offset
// CX = match_len
// BX = &dst + (di - offset)
MOVQ DI, BX
SUBQ DX, BX
// check BX is within dst
// if BX < &dst
JC copy_match_from_dict
CMPQ BX, R11
JBE copy_match_from_dict
// if offset + match_len < di
LEAQ (BX)(CX*1), AX
CMPQ DI, AX
JA copy_interior_match
// AX := len(dst[:di])
// MOVQ DI, AX
// SUBQ R11, AX
// copy 16 bytes at a time
// if di-offset < 16 copy 16-(di-offset) bytes to di
// then do the remaining
copy_match_loop:
// for match_len >= 0
// dst[di] = dst[i]
// di++
// i++
MOVB (BX), AX
MOVB AX, (DI)
INCQ DI
INCQ BX
DECQ CX
JNZ copy_match_loop
JMP loopcheck
copy_interior_match:
CMPQ CX, $16
JGT memmove_match
// if len(dst[di:]) < 16
MOVQ R8, AX
SUBQ DI, AX
CMPQ AX, $16
JLT memmove_match
MOVOU (BX), X0
MOVOU X0, (DI)
ADDQ CX, DI
XORL CX, CX
JMP loopcheck
copy_match_from_dict:
// CX = match_len
// BX = &dst + (di - offset)
// AX = offset - di = dict_bytes_available => count of bytes potentially covered by the dictionary
MOVQ R11, AX
SUBQ BX, AX
// BX = len(dict) - dict_bytes_available
MOVQ R15, BX
SUBQ AX, BX
JS err_short_dict
ADDQ R14, BX
// if match_len < dict_bytes_available, the match fits entirely within the external dictionary: just copy
CMPQ CX, AX
JLT memmove_match
// The match stretches over the dictionary and our block
// 1) copy what comes from the dictionary
// AX = dict_bytes_available = copy_size
// BX = &dict_end - copy_size
// CX = match_len
// memmove(to, from, len)
MOVQ DI, 0(SP)
MOVQ BX, 8(SP)
MOVQ AX, 16(SP)
// store extra stuff we want to recover
// spill
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
MOVQ CX, 40(SP)
CALL runtime·memmove(SB)
// restore registers
MOVQ 16(SP), AX // copy_size
MOVQ 24(SP), DI
MOVQ 32(SP), SI
MOVQ 40(SP), CX // match_len
// recalc initial values
MOVQ dst_base+0(FP), R8
MOVQ R8, R11 // TODO: make these sensible numbers
ADDQ dst_len+8(FP), R8
MOVQ src_base+24(FP), R9
ADDQ src_len+32(FP), R9
MOVQ dict_base+48(FP), R14
MOVQ dict_len+56(FP), R15
MOVQ R8, R12
SUBQ $32, R12
MOVQ R9, R13
SUBQ $16, R13
// di+=copy_size
ADDQ AX, DI
// 2) copy the rest from the current block
// CX = match_len - copy_size = rest_size
SUBQ AX, CX
MOVQ R11, BX
// check if we have a copy overlap
// AX = &dst + rest_size
MOVQ CX, AX
ADDQ BX, AX
// if &dst + rest_size > di, copy byte by byte
CMPQ AX, DI
JA copy_match_loop
memmove_match:
// memmove(to, from, len)
MOVQ DI, 0(SP)
MOVQ BX, 8(SP)
MOVQ CX, 16(SP)
// Spill registers. Increment DI now so we don't need to save CX.
ADDQ CX, DI
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
CALL runtime·memmove(SB)
// restore registers
MOVQ 24(SP), DI
MOVQ 32(SP), SI
// recalc initial values
MOVQ dst_base+0(FP), R8
MOVQ R8, R11 // TODO: make these sensible numbers
ADDQ dst_len+8(FP), R8
MOVQ src_base+24(FP), R9
ADDQ src_len+32(FP), R9
MOVQ R8, R12
SUBQ $32, R12
MOVQ R9, R13
SUBQ $16, R13
MOVQ dict_base+48(FP), R14
MOVQ dict_len+56(FP), R15
XORL CX, CX
loopcheck:
// for si < len(src)
CMPQ SI, R9
JB loop
end:
// Remaining length must be zero.
TESTQ CX, CX
JNE err_corrupt
SUBQ R11, DI
MOVQ DI, ret+72(FP)
RET
err_corrupt:
MOVQ $-1, ret+72(FP)
RET
err_short_buf:
MOVQ $-2, ret+72(FP)
RET
err_short_dict:
MOVQ $-3, ret+72(FP)
RET


@@ -0,0 +1,231 @@
// +build gc
// +build !noasm
#include "go_asm.h"
#include "textflag.h"
// Register allocation.
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define srcend R4
#define match R5 // Match address.
#define dictend R6
#define token R7
#define len R8 // Literal and match lengths.
#define offset R7 // Match offset; overlaps with token.
#define tmp1 R9
#define tmp2 R11
#define tmp3 R12
// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $-4-40
MOVW dst_base +0(FP), dst
MOVW dst_len +4(FP), dstend
MOVW src_base +12(FP), src
MOVW src_len +16(FP), srcend
CMP $0, srcend
BEQ shortSrc
ADD dst, dstend
ADD src, srcend
MOVW dst, dstorig
loop:
// Read token. Extract literal length.
MOVBU.P 1(src), token
MOVW token >> 4, len
CMP $15, len
BNE readLitlenDone
readLitlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADD.S tmp1, len
BVS shortDst
CMP $255, tmp1
BEQ readLitlenLoop
readLitlenDone:
CMP $0, len
BEQ copyLiteralDone
// Bounds check dst+len and src+len.
ADD.S dst, len, tmp1
ADD.CC.S src, len, tmp2
BCS shortSrc
CMP dstend, tmp1
//BHI shortDst // Uncomment for distinct error codes.
CMP.LS srcend, tmp2
BHI shortSrc
// Copy literal.
CMP $4, len
BLO copyLiteralFinish
// Copy 0-3 bytes until src is aligned.
TST $1, src
MOVBU.NE.P 1(src), tmp1
MOVB.NE.P tmp1, 1(dst)
SUB.NE $1, len
TST $2, src
MOVHU.NE.P 2(src), tmp2
MOVB.NE.P tmp2, 1(dst)
MOVW.NE tmp2 >> 8, tmp1
MOVB.NE.P tmp1, 1(dst)
SUB.NE $2, len
B copyLiteralLoopCond
copyLiteralLoop:
// Aligned load, unaligned write.
MOVW.P 4(src), tmp1
MOVW tmp1 >> 8, tmp2
MOVB tmp2, 1(dst)
MOVW tmp1 >> 16, tmp3
MOVB tmp3, 2(dst)
MOVW tmp1 >> 24, tmp2
MOVB tmp2, 3(dst)
MOVB.P tmp1, 4(dst)
copyLiteralLoopCond:
// Loop until len-4 < 0.
SUB.S $4, len
BPL copyLiteralLoop
copyLiteralFinish:
// Copy remaining 0-3 bytes.
// At this point, len may be < 0, but len&3 is still accurate.
TST $1, len
MOVB.NE.P 1(src), tmp3
MOVB.NE.P tmp3, 1(dst)
TST $2, len
MOVB.NE.P 2(src), tmp1
MOVB.NE.P tmp1, 2(dst)
MOVB.NE -1(src), tmp2
MOVB.NE tmp2, -1(dst)
copyLiteralDone:
// Initial part of match length.
// This frees up the token register for reuse as offset.
AND $15, token, len
CMP src, srcend
BEQ end
// Read offset.
ADD.S $2, src
BCS shortSrc
CMP srcend, src
BHI shortSrc
MOVBU -2(src), offset
MOVBU -1(src), tmp1
ORR.S tmp1 << 8, offset
BEQ corrupt
// Read rest of match length.
CMP $15, len
BNE readMatchlenDone
readMatchlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADD.S tmp1, len
BVS shortDst
CMP $255, tmp1
BEQ readMatchlenLoop
readMatchlenDone:
// Bounds check dst+len+minMatch.
ADD.S dst, len, tmp1
ADD.CC.S $const_minMatch, tmp1
BCS shortDst
CMP dstend, tmp1
BHI shortDst
RSB dst, offset, match
CMP dstorig, match
BGE copyMatch4
// match < dstorig means the match starts in the dictionary,
// at len(dict) - offset + (dst - dstorig).
MOVW dict_base+24(FP), match
MOVW dict_len +28(FP), dictend
ADD $const_minMatch, len
RSB dst, dstorig, tmp1
RSB dictend, offset, tmp2
ADD.S tmp2, tmp1
BMI shortDict
ADD match, dictend
ADD tmp1, match
copyDict:
MOVBU.P 1(match), tmp1
MOVB.P tmp1, 1(dst)
SUB.S $1, len
CMP.NE match, dictend
BNE copyDict
// If the match extends beyond the dictionary, the rest is at dstorig.
CMP $0, len
BEQ copyMatchDone
MOVW dstorig, match
B copyMatch
// Copy a regular match.
// Since len+minMatch is at least four, we can do a 4× unrolled
// byte copy loop. Using MOVW instead of four byte loads is faster,
// but to remain portable we'd have to align match first, which is
// too expensive. By alternating loads and stores, we also handle
// the case offset < 4.
copyMatch4:
SUB.S $4, len
MOVBU.P 4(match), tmp1
MOVB.P tmp1, 4(dst)
MOVBU -3(match), tmp2
MOVB tmp2, -3(dst)
MOVBU -2(match), tmp3
MOVB tmp3, -2(dst)
MOVBU -1(match), tmp1
MOVB tmp1, -1(dst)
BPL copyMatch4
// Restore len, which is now negative.
ADD.S $4, len
BEQ copyMatchDone
copyMatch:
// Finish with a byte-at-a-time copy.
SUB.S $1, len
MOVBU.P 1(match), tmp2
MOVB.P tmp2, 1(dst)
BNE copyMatch
copyMatchDone:
CMP src, srcend
BNE loop
end:
CMP $0, len
BNE corrupt
SUB dstorig, dst, tmp1
MOVW tmp1, ret+36(FP)
RET
// The error cases have distinct labels so we can put different
// return codes here when debugging, or if the error returns need to
// be changed.
shortDict:
shortDst:
shortSrc:
corrupt:
MOVW $-1, tmp1
MOVW tmp1, ret+36(FP)
RET


@@ -0,0 +1,241 @@
// +build gc
// +build !noasm
// This implementation assumes that strict alignment checking is turned off.
// The Go compiler makes the same assumption.
#include "go_asm.h"
#include "textflag.h"
// Register allocation.
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define dstend16 R4 // dstend - 16
#define srcend R5
#define srcend16 R6 // srcend - 16
#define match R7 // Match address.
#define dict R8
#define dictlen R9
#define dictend R10
#define token R11
#define len R12 // Literal and match lengths.
#define lenRem R13
#define offset R14 // Match offset.
#define tmp1 R15
#define tmp2 R16
#define tmp3 R17
#define tmp4 R19
// func decodeBlock(dst, src, dict []byte) int
TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
LDP dst_base+0(FP), (dst, dstend)
ADD dst, dstend
MOVD dst, dstorig
LDP src_base+24(FP), (src, srcend)
CBZ srcend, shortSrc
ADD src, srcend
// dstend16 = max(dstend-16, 0) and similarly for srcend16.
SUBS $16, dstend, dstend16
CSEL LO, ZR, dstend16, dstend16
SUBS $16, srcend, srcend16
CSEL LO, ZR, srcend16, srcend16
LDP dict_base+48(FP), (dict, dictlen)
ADD dict, dictlen, dictend
loop:
// Read token. Extract literal length.
MOVBU.P 1(src), token
LSR $4, token, len
CMP $15, len
BNE readLitlenDone
readLitlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADDS tmp1, len
BVS shortDst
CMP $255, tmp1
BEQ readLitlenLoop
readLitlenDone:
CBZ len, copyLiteralDone
// Bounds check dst+len and src+len.
ADDS dst, len, tmp1
BCS shortSrc
ADDS src, len, tmp2
BCS shortSrc
CMP dstend, tmp1
BHI shortDst
CMP srcend, tmp2
BHI shortSrc
// Copy literal.
SUBS $16, len
BLO copyLiteralShort
copyLiteralLoop:
LDP.P 16(src), (tmp1, tmp2)
STP.P (tmp1, tmp2), 16(dst)
SUBS $16, len
BPL copyLiteralLoop
// Copy (final part of) literal of length 0-15.
// If we have >=16 bytes left in src and dst, just copy 16 bytes.
copyLiteralShort:
CMP dstend16, dst
CCMP LO, src, srcend16, $0b0010 // 0010 = preserve carry (LO).
BHS copyLiteralShortEnd
AND $15, len
LDP (src), (tmp1, tmp2)
ADD len, src
STP (tmp1, tmp2), (dst)
ADD len, dst
B copyLiteralDone
// Safe but slow copy near the end of src, dst.
copyLiteralShortEnd:
TBZ $3, len, 3(PC)
MOVD.P 8(src), tmp1
MOVD.P tmp1, 8(dst)
TBZ $2, len, 3(PC)
MOVW.P 4(src), tmp2
MOVW.P tmp2, 4(dst)
TBZ $1, len, 3(PC)
MOVH.P 2(src), tmp3
MOVH.P tmp3, 2(dst)
TBZ $0, len, 3(PC)
MOVBU.P 1(src), tmp4
MOVB.P tmp4, 1(dst)
copyLiteralDone:
// Initial part of match length.
AND $15, token, len
CMP src, srcend
BEQ end
// Read offset.
ADDS $2, src
BCS shortSrc
CMP srcend, src
BHI shortSrc
MOVHU -2(src), offset
CBZ offset, corrupt
// Read rest of match length.
CMP $15, len
BNE readMatchlenDone
readMatchlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADDS tmp1, len
BVS shortDst
CMP $255, tmp1
BEQ readMatchlenLoop
readMatchlenDone:
ADD $const_minMatch, len
// Bounds check dst+len.
ADDS dst, len, tmp2
BCS shortDst
CMP dstend, tmp2
BHI shortDst
SUB offset, dst, match
CMP dstorig, match
BHS copyMatchTry8
// match < dstorig means the match starts in the dictionary,
// at len(dict) - offset + (dst - dstorig).
SUB dstorig, dst, tmp1
SUB offset, dictlen, tmp2
ADDS tmp2, tmp1
BMI shortDict
ADD dict, tmp1, match
copyDict:
MOVBU.P 1(match), tmp3
MOVB.P tmp3, 1(dst)
SUBS $1, len
CCMP NE, dictend, match, $0b0100 // 0100 sets the Z (EQ) flag.
BNE copyDict
CBZ len, copyMatchDone
// If the match extends beyond the dictionary, the rest is at dstorig.
// Recompute the offset for the next check.
MOVD dstorig, match
SUB dstorig, dst, offset
copyMatchTry8:
// Copy doublewords if both len and offset are at least eight.
// A 16-at-a-time loop doesn't provide a further speedup.
CMP $8, len
CCMP HS, offset, $8, $0
BLO copyMatchTry4
AND $7, len, lenRem
SUB $8, len
copyMatchLoop8:
MOVD.P 8(match), tmp1
MOVD.P tmp1, 8(dst)
SUBS $8, len
BPL copyMatchLoop8
MOVD (match)(len), tmp2 // match+len == match+lenRem-8.
ADD lenRem, dst
MOVD $0, len
MOVD tmp2, -8(dst)
B copyMatchDone
copyMatchTry4:
// Copy words if both len and offset are at least four.
CMP $4, len
CCMP HS, offset, $4, $0
BLO copyMatchLoop1
MOVWU.P 4(match), tmp2
MOVWU.P tmp2, 4(dst)
SUBS $4, len
BEQ copyMatchDone
copyMatchLoop1:
// Byte-at-a-time copy for small offsets <= 3.
MOVBU.P 1(match), tmp2
MOVB.P tmp2, 1(dst)
SUBS $1, len
BNE copyMatchLoop1
copyMatchDone:
CMP src, srcend
BNE loop
end:
CBNZ len, corrupt
SUB dstorig, dst, tmp1
MOVD tmp1, ret+72(FP)
RET
// The error cases have distinct labels so we can put different
// return codes here when debugging, or if the error returns need to
// be changed.
shortDict:
shortDst:
shortSrc:
corrupt:
MOVD $-1, tmp1
MOVD tmp1, ret+72(FP)
RET


@@ -0,0 +1,10 @@
//go:build (amd64 || arm || arm64) && !appengine && gc && !noasm
// +build amd64 arm arm64
// +build !appengine
// +build gc
// +build !noasm
package lz4block
//go:noescape
func decodeBlock(dst, src, dict []byte) int


@@ -0,0 +1,139 @@
//go:build (!amd64 && !arm && !arm64) || appengine || !gc || noasm
// +build !amd64,!arm,!arm64 appengine !gc noasm
package lz4block
import (
"encoding/binary"
)
func decodeBlock(dst, src, dict []byte) (ret int) {
// Restrict capacities so we don't read or write out of bounds.
dst = dst[:len(dst):len(dst)]
src = src[:len(src):len(src)]
const hasError = -2
if len(src) == 0 {
return hasError
}
defer func() {
if recover() != nil {
ret = hasError
}
}()
var si, di uint
for si < uint(len(src)) {
// Literals and match lengths (token).
b := uint(src[si])
si++
// Literals.
if lLen := b >> 4; lLen > 0 {
switch {
case lLen < 0xF && si+16 < uint(len(src)):
// Shortcut 1
// if we have enough room in src and dst, and the literals length
// is small enough (0..14) then copy all 16 bytes, even if not all
// are part of the literals.
copy(dst[di:], src[si:si+16])
si += lLen
di += lLen
if mLen := b & 0xF; mLen < 0xF {
// Shortcut 2
// if the match length (4..18) fits within the literals, then copy
// all 18 bytes, even if not all are part of the literals.
mLen += 4
if offset := u16(src[si:]); mLen <= offset && offset < di {
i := di - offset
// The remaining buffer may not hold 18 bytes.
// See https://github.com/pierrec/lz4/issues/51.
if end := i + 18; end <= uint(len(dst)) {
copy(dst[di:], dst[i:end])
si += 2
di += mLen
continue
}
}
}
case lLen == 0xF:
for {
x := uint(src[si])
if lLen += x; int(lLen) < 0 {
return hasError
}
si++
if x != 0xFF {
break
}
}
fallthrough
default:
copy(dst[di:di+lLen], src[si:si+lLen])
si += lLen
di += lLen
}
}
mLen := b & 0xF
if si == uint(len(src)) && mLen == 0 {
break
} else if si >= uint(len(src)) {
return hasError
}
offset := u16(src[si:])
if offset == 0 {
return hasError
}
si += 2
// Match.
mLen += minMatch
if mLen == minMatch+0xF {
for {
x := uint(src[si])
if mLen += x; int(mLen) < 0 {
return hasError
}
si++
if x != 0xFF {
break
}
}
}
// Copy the match.
if di < offset {
// The match is beyond our block, meaning the first part
// is in the dictionary.
fromDict := dict[uint(len(dict))+di-offset:]
n := uint(copy(dst[di:di+mLen], fromDict))
di += n
if mLen -= n; mLen == 0 {
continue
}
// We copied n = offset-di bytes from the dictionary,
// then set di = di+n = offset, so the following code
// copies from dst[di-offset:] = dst[0:].
}
expanded := dst[di-offset:]
if mLen > offset {
// Efficiently copy the match dst[di-offset:di] into the dst slice.
bytesToCopy := offset * (mLen / offset)
for n := offset; n <= bytesToCopy+offset; n *= 2 {
copy(expanded[n:], expanded[:n])
}
di += bytesToCopy
mLen -= bytesToCopy
}
di += uint(copy(dst[di:di+mLen], expanded[:mLen]))
}
return int(di)
}
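The overlapping-match branch above expands the output by doubling instead of copying byte by byte. A standalone trace with illustrative values (offset 2, match length 10, two bytes already decoded):

package main

import "fmt"

func main() {
	// Decoded so far: "ab" (di == 2); the match overlaps its own output.
	dst := make([]byte, 16)
	copy(dst, "ab")
	di, offset, mLen := uint(2), uint(2), uint(10)
	expanded := dst[di-offset:]
	bytesToCopy := offset * (mLen / offset) // 10: whole multiples of the offset
	for n := offset; n <= bytesToCopy+offset; n *= 2 {
		// Each pass doubles the valid prefix: ab -> abab -> abababab ...
		// It may write a little past di+mLen, but stays inside dst.
		copy(expanded[n:], expanded[:n])
	}
	di += bytesToCopy
	mLen -= bytesToCopy // remainder when mLen is not a multiple of offset
	di += uint(copy(dst[di:di+mLen], expanded[:mLen]))
	fmt.Println(string(dst[:di])) // abababababab
}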
func u16(p []byte) uint { return uint(binary.LittleEndian.Uint16(p)) }