Update dependencies

This commit is contained in:
bluepython508
2024-11-01 17:33:34 +00:00
parent 033ac0b400
commit 5cdfab398d
3596 changed files with 1033483 additions and 259 deletions

27
vendor/golang.org/x/crypto/LICENSE generated vendored Normal file
View File

@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

22
vendor/golang.org/x/crypto/PATENTS generated vendored Normal file
View File

@@ -0,0 +1,22 @@
Additional IP Rights Grant (Patents)
"This implementation" means the copyrightable works distributed by
Google as part of the Go project.
Google hereby grants to You a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer and otherwise run, modify and propagate the contents of this
implementation of Go, where such license applies only to those patent
claims, both currently owned or controlled by Google and acquired in
the future, licensable by Google that are necessarily infringed by this
implementation of Go. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of Go or any code incorporated within this
implementation of Go constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of Go
shall terminate as of the date such litigation is filed.

283
vendor/golang.org/x/crypto/argon2/argon2.go generated vendored Normal file
View File

@@ -0,0 +1,283 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package argon2 implements the key derivation function Argon2.
// Argon2 was selected as the winner of the Password Hashing Competition and can
// be used to derive cryptographic keys from passwords.
//
// For a detailed specification of Argon2 see [1].
//
// If you aren't sure which function you need, use Argon2id (IDKey) and
// the parameter recommendations for your scenario.
//
// # Argon2i
//
// Argon2i (implemented by Key) is the side-channel resistant version of Argon2.
// It uses data-independent memory access, which is preferred for password
// hashing and password-based key derivation. Argon2i requires more passes over
// memory than Argon2id to protect from trade-off attacks. The recommended
// parameters (taken from [2]) for non-interactive operations are time=3 and to
// use the maximum available memory.
//
// # Argon2id
//
// Argon2id (implemented by IDKey) is a hybrid version of Argon2 combining
// Argon2i and Argon2d. It uses data-independent memory access for the first
// half of the first iteration over the memory and data-dependent memory access
// for the rest. Argon2id is side-channel resistant and provides better brute-
// force cost savings due to time-memory tradeoffs than Argon2i. The recommended
// parameters for non-interactive operations (taken from [2]) are time=1 and to
// use the maximum available memory.
//
// [1] https://github.com/P-H-C/phc-winner-argon2/blob/master/argon2-specs.pdf
// [2] https://tools.ietf.org/html/draft-irtf-cfrg-argon2-03#section-9.3
package argon2
import (
"encoding/binary"
"sync"
"golang.org/x/crypto/blake2b"
)
// The Argon2 version implemented by this package.
const Version = 0x13
const (
argon2d = iota
argon2i
argon2id
)
// Key derives a key from the password, salt, and cost parameters using Argon2i
// returning a byte slice of length keyLen that can be used as cryptographic
// key. The CPU cost and parallelism degree must be greater than zero.
//
// For example, you can get a derived key for e.g. AES-256 (which needs a
// 32-byte key) by doing:
//
// key := argon2.Key([]byte("some password"), salt, 3, 32*1024, 4, 32)
//
// The draft RFC recommends[2] time=3, and memory=32*1024 is a sensible number.
// If using that amount of memory (32 MB) is not possible in some contexts then
// the time parameter can be increased to compensate.
//
// The time parameter specifies the number of passes over the memory and the
// memory parameter specifies the size of the memory in KiB. For example
// memory=32*1024 sets the memory cost to ~32 MB. The number of threads can be
// adjusted to the number of available CPUs. The cost parameters should be
// increased as memory latency and CPU parallelism increases. Remember to get a
// good random salt.
func Key(password, salt []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
return deriveKey(argon2i, password, salt, nil, nil, time, memory, threads, keyLen)
}
// IDKey derives a key from the password, salt, and cost parameters using
// Argon2id returning a byte slice of length keyLen that can be used as
// cryptographic key. The CPU cost and parallelism degree must be greater than
// zero.
//
// For example, you can get a derived key for e.g. AES-256 (which needs a
// 32-byte key) by doing:
//
// key := argon2.IDKey([]byte("some password"), salt, 1, 64*1024, 4, 32)
//
// The draft RFC recommends[2] time=1, and memory=64*1024 is a sensible number.
// If using that amount of memory (64 MB) is not possible in some contexts then
// the time parameter can be increased to compensate.
//
// The time parameter specifies the number of passes over the memory and the
// memory parameter specifies the size of the memory in KiB. For example
// memory=64*1024 sets the memory cost to ~64 MB. The number of threads can be
// adjusted to the numbers of available CPUs. The cost parameters should be
// increased as memory latency and CPU parallelism increases. Remember to get a
// good random salt.
func IDKey(password, salt []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
return deriveKey(argon2id, password, salt, nil, nil, time, memory, threads, keyLen)
}
func deriveKey(mode int, password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []byte {
if time < 1 {
panic("argon2: number of rounds too small")
}
if threads < 1 {
panic("argon2: parallelism degree too low")
}
h0 := initHash(password, salt, secret, data, time, memory, uint32(threads), keyLen, mode)
memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
if memory < 2*syncPoints*uint32(threads) {
memory = 2 * syncPoints * uint32(threads)
}
B := initBlocks(&h0, memory, uint32(threads))
processBlocks(B, time, memory, uint32(threads), mode)
return extractKey(B, memory, uint32(threads), keyLen)
}
const (
blockLength = 128
syncPoints = 4
)
type block [blockLength]uint64
func initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte {
var (
h0 [blake2b.Size + 8]byte
params [24]byte
tmp [4]byte
)
b2, _ := blake2b.New512(nil)
binary.LittleEndian.PutUint32(params[0:4], threads)
binary.LittleEndian.PutUint32(params[4:8], keyLen)
binary.LittleEndian.PutUint32(params[8:12], memory)
binary.LittleEndian.PutUint32(params[12:16], time)
binary.LittleEndian.PutUint32(params[16:20], uint32(Version))
binary.LittleEndian.PutUint32(params[20:24], uint32(mode))
b2.Write(params[:])
binary.LittleEndian.PutUint32(tmp[:], uint32(len(password)))
b2.Write(tmp[:])
b2.Write(password)
binary.LittleEndian.PutUint32(tmp[:], uint32(len(salt)))
b2.Write(tmp[:])
b2.Write(salt)
binary.LittleEndian.PutUint32(tmp[:], uint32(len(key)))
b2.Write(tmp[:])
b2.Write(key)
binary.LittleEndian.PutUint32(tmp[:], uint32(len(data)))
b2.Write(tmp[:])
b2.Write(data)
b2.Sum(h0[:0])
return h0
}
func initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []block {
var block0 [1024]byte
B := make([]block, memory)
for lane := uint32(0); lane < threads; lane++ {
j := lane * (memory / threads)
binary.LittleEndian.PutUint32(h0[blake2b.Size+4:], lane)
binary.LittleEndian.PutUint32(h0[blake2b.Size:], 0)
blake2bHash(block0[:], h0[:])
for i := range B[j+0] {
B[j+0][i] = binary.LittleEndian.Uint64(block0[i*8:])
}
binary.LittleEndian.PutUint32(h0[blake2b.Size:], 1)
blake2bHash(block0[:], h0[:])
for i := range B[j+1] {
B[j+1][i] = binary.LittleEndian.Uint64(block0[i*8:])
}
}
return B
}
func processBlocks(B []block, time, memory, threads uint32, mode int) {
lanes := memory / threads
segments := lanes / syncPoints
processSegment := func(n, slice, lane uint32, wg *sync.WaitGroup) {
var addresses, in, zero block
if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
in[0] = uint64(n)
in[1] = uint64(lane)
in[2] = uint64(slice)
in[3] = uint64(memory)
in[4] = uint64(time)
in[5] = uint64(mode)
}
index := uint32(0)
if n == 0 && slice == 0 {
index = 2 // we have already generated the first two blocks
if mode == argon2i || mode == argon2id {
in[6]++
processBlock(&addresses, &in, &zero)
processBlock(&addresses, &addresses, &zero)
}
}
offset := lane*lanes + slice*segments + index
var random uint64
for index < segments {
prev := offset - 1
if index == 0 && slice == 0 {
prev += lanes // last block in lane
}
if mode == argon2i || (mode == argon2id && n == 0 && slice < syncPoints/2) {
if index%blockLength == 0 {
in[6]++
processBlock(&addresses, &in, &zero)
processBlock(&addresses, &addresses, &zero)
}
random = addresses[index%blockLength]
} else {
random = B[prev][0]
}
newOffset := indexAlpha(random, lanes, segments, threads, n, slice, lane, index)
processBlockXOR(&B[offset], &B[prev], &B[newOffset])
index, offset = index+1, offset+1
}
wg.Done()
}
for n := uint32(0); n < time; n++ {
for slice := uint32(0); slice < syncPoints; slice++ {
var wg sync.WaitGroup
for lane := uint32(0); lane < threads; lane++ {
wg.Add(1)
go processSegment(n, slice, lane, &wg)
}
wg.Wait()
}
}
}
func extractKey(B []block, memory, threads, keyLen uint32) []byte {
lanes := memory / threads
for lane := uint32(0); lane < threads-1; lane++ {
for i, v := range B[(lane*lanes)+lanes-1] {
B[memory-1][i] ^= v
}
}
var block [1024]byte
for i, v := range B[memory-1] {
binary.LittleEndian.PutUint64(block[i*8:], v)
}
key := make([]byte, keyLen)
blake2bHash(key, block[:])
return key
}
func indexAlpha(rand uint64, lanes, segments, threads, n, slice, lane, index uint32) uint32 {
refLane := uint32(rand>>32) % threads
if n == 0 && slice == 0 {
refLane = lane
}
m, s := 3*segments, ((slice+1)%syncPoints)*segments
if lane == refLane {
m += index
}
if n == 0 {
m, s = slice*segments, 0
if slice == 0 || lane == refLane {
m += index
}
}
if index == 0 || lane == refLane {
m--
}
return phi(rand, uint64(m), uint64(s), refLane, lanes)
}
func phi(rand, m, s uint64, lane, lanes uint32) uint32 {
p := rand & 0xFFFFFFFF
p = (p * p) >> 32
p = (p * m) >> 32
return lane*lanes + uint32((s+m-(p+1))%uint64(lanes))
}

53
vendor/golang.org/x/crypto/argon2/blake2b.go generated vendored Normal file
View File

@@ -0,0 +1,53 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package argon2
import (
"encoding/binary"
"hash"
"golang.org/x/crypto/blake2b"
)
// blake2bHash computes an arbitrary long hash value of in
// and writes the hash to out.
func blake2bHash(out []byte, in []byte) {
var b2 hash.Hash
if n := len(out); n < blake2b.Size {
b2, _ = blake2b.New(n, nil)
} else {
b2, _ = blake2b.New512(nil)
}
var buffer [blake2b.Size]byte
binary.LittleEndian.PutUint32(buffer[:4], uint32(len(out)))
b2.Write(buffer[:4])
b2.Write(in)
if len(out) <= blake2b.Size {
b2.Sum(out[:0])
return
}
outLen := len(out)
b2.Sum(buffer[:0])
b2.Reset()
copy(out, buffer[:32])
out = out[32:]
for len(out) > blake2b.Size {
b2.Write(buffer[:])
b2.Sum(buffer[:0])
copy(out, buffer[:32])
out = out[32:]
b2.Reset()
}
if outLen%blake2b.Size > 0 { // outLen > 64
r := ((outLen + 31) / 32) - 2 // ⌈τ /32⌉-2
b2, _ = blake2b.New(outLen-32*r, nil)
}
b2.Write(buffer[:])
b2.Sum(out[:0])
}

60
vendor/golang.org/x/crypto/argon2/blamka_amd64.go generated vendored Normal file
View File

@@ -0,0 +1,60 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
package argon2
import "golang.org/x/sys/cpu"
func init() {
useSSE4 = cpu.X86.HasSSE41
}
//go:noescape
func mixBlocksSSE2(out, a, b, c *block)
//go:noescape
func xorBlocksSSE2(out, a, b, c *block)
//go:noescape
func blamkaSSE4(b *block)
func processBlockSSE(out, in1, in2 *block, xor bool) {
var t block
mixBlocksSSE2(&t, in1, in2, &t)
if useSSE4 {
blamkaSSE4(&t)
} else {
for i := 0; i < blockLength; i += 16 {
blamkaGeneric(
&t[i+0], &t[i+1], &t[i+2], &t[i+3],
&t[i+4], &t[i+5], &t[i+6], &t[i+7],
&t[i+8], &t[i+9], &t[i+10], &t[i+11],
&t[i+12], &t[i+13], &t[i+14], &t[i+15],
)
}
for i := 0; i < blockLength/8; i += 2 {
blamkaGeneric(
&t[i], &t[i+1], &t[16+i], &t[16+i+1],
&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
)
}
}
if xor {
xorBlocksSSE2(out, in1, in2, &t)
} else {
mixBlocksSSE2(out, in1, in2, &t)
}
}
func processBlock(out, in1, in2 *block) {
processBlockSSE(out, in1, in2, false)
}
func processBlockXOR(out, in1, in2 *block) {
processBlockSSE(out, in1, in2, true)
}

243
vendor/golang.org/x/crypto/argon2/blamka_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,243 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
#include "textflag.h"
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v6, t1; \
PUNPCKLQDQ v6, t2; \
PUNPCKHQDQ v7, v6; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ v7, t2; \
MOVO t1, v7; \
MOVO v2, t1; \
PUNPCKHQDQ t2, v7; \
PUNPCKLQDQ v3, t2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v3
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v2, t1; \
PUNPCKLQDQ v2, t2; \
PUNPCKHQDQ v3, v2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ v3, t2; \
MOVO t1, v3; \
MOVO v6, t1; \
PUNPCKHQDQ t2, v3; \
PUNPCKLQDQ v7, t2; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v7
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
MOVO v0, t0; \
PMULULQ v2, t0; \
PADDQ v2, v0; \
PADDQ t0, v0; \
PADDQ t0, v0; \
PXOR v0, v6; \
PSHUFD $0xB1, v6, v6; \
MOVO v4, t0; \
PMULULQ v6, t0; \
PADDQ v6, v4; \
PADDQ t0, v4; \
PADDQ t0, v4; \
PXOR v4, v2; \
PSHUFB c40, v2; \
MOVO v0, t0; \
PMULULQ v2, t0; \
PADDQ v2, v0; \
PADDQ t0, v0; \
PADDQ t0, v0; \
PXOR v0, v6; \
PSHUFB c48, v6; \
MOVO v4, t0; \
PMULULQ v6, t0; \
PADDQ v6, v4; \
PADDQ t0, v4; \
PADDQ t0, v4; \
PXOR v4, v2; \
MOVO v2, t0; \
PADDQ v2, t0; \
PSRLQ $63, v2; \
PXOR t0, v2; \
MOVO v1, t0; \
PMULULQ v3, t0; \
PADDQ v3, v1; \
PADDQ t0, v1; \
PADDQ t0, v1; \
PXOR v1, v7; \
PSHUFD $0xB1, v7, v7; \
MOVO v5, t0; \
PMULULQ v7, t0; \
PADDQ v7, v5; \
PADDQ t0, v5; \
PADDQ t0, v5; \
PXOR v5, v3; \
PSHUFB c40, v3; \
MOVO v1, t0; \
PMULULQ v3, t0; \
PADDQ v3, v1; \
PADDQ t0, v1; \
PADDQ t0, v1; \
PXOR v1, v7; \
PSHUFB c48, v7; \
MOVO v5, t0; \
PMULULQ v7, t0; \
PADDQ v7, v5; \
PADDQ t0, v5; \
PADDQ t0, v5; \
PXOR v5, v3; \
MOVO v3, t0; \
PADDQ v3, t0; \
PSRLQ $63, v3; \
PXOR t0, v3
#define LOAD_MSG_0(block, off) \
MOVOU 8*(off+0)(block), X0; \
MOVOU 8*(off+2)(block), X1; \
MOVOU 8*(off+4)(block), X2; \
MOVOU 8*(off+6)(block), X3; \
MOVOU 8*(off+8)(block), X4; \
MOVOU 8*(off+10)(block), X5; \
MOVOU 8*(off+12)(block), X6; \
MOVOU 8*(off+14)(block), X7
#define STORE_MSG_0(block, off) \
MOVOU X0, 8*(off+0)(block); \
MOVOU X1, 8*(off+2)(block); \
MOVOU X2, 8*(off+4)(block); \
MOVOU X3, 8*(off+6)(block); \
MOVOU X4, 8*(off+8)(block); \
MOVOU X5, 8*(off+10)(block); \
MOVOU X6, 8*(off+12)(block); \
MOVOU X7, 8*(off+14)(block)
#define LOAD_MSG_1(block, off) \
MOVOU 8*off+0*8(block), X0; \
MOVOU 8*off+16*8(block), X1; \
MOVOU 8*off+32*8(block), X2; \
MOVOU 8*off+48*8(block), X3; \
MOVOU 8*off+64*8(block), X4; \
MOVOU 8*off+80*8(block), X5; \
MOVOU 8*off+96*8(block), X6; \
MOVOU 8*off+112*8(block), X7
#define STORE_MSG_1(block, off) \
MOVOU X0, 8*off+0*8(block); \
MOVOU X1, 8*off+16*8(block); \
MOVOU X2, 8*off+32*8(block); \
MOVOU X3, 8*off+48*8(block); \
MOVOU X4, 8*off+64*8(block); \
MOVOU X5, 8*off+80*8(block); \
MOVOU X6, 8*off+96*8(block); \
MOVOU X7, 8*off+112*8(block)
#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
LOAD_MSG_0(block, off); \
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
STORE_MSG_0(block, off)
#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
LOAD_MSG_1(block, off); \
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
STORE_MSG_1(block, off)
// func blamkaSSE4(b *block)
TEXT ·blamkaSSE4(SB), 4, $0-8
MOVQ b+0(FP), AX
MOVOU ·c40<>(SB), X10
MOVOU ·c48<>(SB), X11
BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
RET
// func mixBlocksSSE2(out, a, b, c *block)
TEXT ·mixBlocksSSE2(SB), 4, $0-32
MOVQ out+0(FP), DX
MOVQ a+8(FP), AX
MOVQ b+16(FP), BX
MOVQ c+24(FP), CX
MOVQ $128, DI
loop:
MOVOU 0(AX), X0
MOVOU 0(BX), X1
MOVOU 0(CX), X2
PXOR X1, X0
PXOR X2, X0
MOVOU X0, 0(DX)
ADDQ $16, AX
ADDQ $16, BX
ADDQ $16, CX
ADDQ $16, DX
SUBQ $2, DI
JA loop
RET
// func xorBlocksSSE2(out, a, b, c *block)
TEXT ·xorBlocksSSE2(SB), 4, $0-32
MOVQ out+0(FP), DX
MOVQ a+8(FP), AX
MOVQ b+16(FP), BX
MOVQ c+24(FP), CX
MOVQ $128, DI
loop:
MOVOU 0(AX), X0
MOVOU 0(BX), X1
MOVOU 0(CX), X2
MOVOU 0(DX), X3
PXOR X1, X0
PXOR X2, X0
PXOR X3, X0
MOVOU X0, 0(DX)
ADDQ $16, AX
ADDQ $16, BX
ADDQ $16, CX
ADDQ $16, DX
SUBQ $2, DI
JA loop
RET

163
vendor/golang.org/x/crypto/argon2/blamka_generic.go generated vendored Normal file
View File

@@ -0,0 +1,163 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package argon2
var useSSE4 bool
func processBlockGeneric(out, in1, in2 *block, xor bool) {
var t block
for i := range t {
t[i] = in1[i] ^ in2[i]
}
for i := 0; i < blockLength; i += 16 {
blamkaGeneric(
&t[i+0], &t[i+1], &t[i+2], &t[i+3],
&t[i+4], &t[i+5], &t[i+6], &t[i+7],
&t[i+8], &t[i+9], &t[i+10], &t[i+11],
&t[i+12], &t[i+13], &t[i+14], &t[i+15],
)
}
for i := 0; i < blockLength/8; i += 2 {
blamkaGeneric(
&t[i], &t[i+1], &t[16+i], &t[16+i+1],
&t[32+i], &t[32+i+1], &t[48+i], &t[48+i+1],
&t[64+i], &t[64+i+1], &t[80+i], &t[80+i+1],
&t[96+i], &t[96+i+1], &t[112+i], &t[112+i+1],
)
}
if xor {
for i := range t {
out[i] ^= in1[i] ^ in2[i] ^ t[i]
}
} else {
for i := range t {
out[i] = in1[i] ^ in2[i] ^ t[i]
}
}
}
func blamkaGeneric(t00, t01, t02, t03, t04, t05, t06, t07, t08, t09, t10, t11, t12, t13, t14, t15 *uint64) {
v00, v01, v02, v03 := *t00, *t01, *t02, *t03
v04, v05, v06, v07 := *t04, *t05, *t06, *t07
v08, v09, v10, v11 := *t08, *t09, *t10, *t11
v12, v13, v14, v15 := *t12, *t13, *t14, *t15
v00 += v04 + 2*uint64(uint32(v00))*uint64(uint32(v04))
v12 ^= v00
v12 = v12>>32 | v12<<32
v08 += v12 + 2*uint64(uint32(v08))*uint64(uint32(v12))
v04 ^= v08
v04 = v04>>24 | v04<<40
v00 += v04 + 2*uint64(uint32(v00))*uint64(uint32(v04))
v12 ^= v00
v12 = v12>>16 | v12<<48
v08 += v12 + 2*uint64(uint32(v08))*uint64(uint32(v12))
v04 ^= v08
v04 = v04>>63 | v04<<1
v01 += v05 + 2*uint64(uint32(v01))*uint64(uint32(v05))
v13 ^= v01
v13 = v13>>32 | v13<<32
v09 += v13 + 2*uint64(uint32(v09))*uint64(uint32(v13))
v05 ^= v09
v05 = v05>>24 | v05<<40
v01 += v05 + 2*uint64(uint32(v01))*uint64(uint32(v05))
v13 ^= v01
v13 = v13>>16 | v13<<48
v09 += v13 + 2*uint64(uint32(v09))*uint64(uint32(v13))
v05 ^= v09
v05 = v05>>63 | v05<<1
v02 += v06 + 2*uint64(uint32(v02))*uint64(uint32(v06))
v14 ^= v02
v14 = v14>>32 | v14<<32
v10 += v14 + 2*uint64(uint32(v10))*uint64(uint32(v14))
v06 ^= v10
v06 = v06>>24 | v06<<40
v02 += v06 + 2*uint64(uint32(v02))*uint64(uint32(v06))
v14 ^= v02
v14 = v14>>16 | v14<<48
v10 += v14 + 2*uint64(uint32(v10))*uint64(uint32(v14))
v06 ^= v10
v06 = v06>>63 | v06<<1
v03 += v07 + 2*uint64(uint32(v03))*uint64(uint32(v07))
v15 ^= v03
v15 = v15>>32 | v15<<32
v11 += v15 + 2*uint64(uint32(v11))*uint64(uint32(v15))
v07 ^= v11
v07 = v07>>24 | v07<<40
v03 += v07 + 2*uint64(uint32(v03))*uint64(uint32(v07))
v15 ^= v03
v15 = v15>>16 | v15<<48
v11 += v15 + 2*uint64(uint32(v11))*uint64(uint32(v15))
v07 ^= v11
v07 = v07>>63 | v07<<1
v00 += v05 + 2*uint64(uint32(v00))*uint64(uint32(v05))
v15 ^= v00
v15 = v15>>32 | v15<<32
v10 += v15 + 2*uint64(uint32(v10))*uint64(uint32(v15))
v05 ^= v10
v05 = v05>>24 | v05<<40
v00 += v05 + 2*uint64(uint32(v00))*uint64(uint32(v05))
v15 ^= v00
v15 = v15>>16 | v15<<48
v10 += v15 + 2*uint64(uint32(v10))*uint64(uint32(v15))
v05 ^= v10
v05 = v05>>63 | v05<<1
v01 += v06 + 2*uint64(uint32(v01))*uint64(uint32(v06))
v12 ^= v01
v12 = v12>>32 | v12<<32
v11 += v12 + 2*uint64(uint32(v11))*uint64(uint32(v12))
v06 ^= v11
v06 = v06>>24 | v06<<40
v01 += v06 + 2*uint64(uint32(v01))*uint64(uint32(v06))
v12 ^= v01
v12 = v12>>16 | v12<<48
v11 += v12 + 2*uint64(uint32(v11))*uint64(uint32(v12))
v06 ^= v11
v06 = v06>>63 | v06<<1
v02 += v07 + 2*uint64(uint32(v02))*uint64(uint32(v07))
v13 ^= v02
v13 = v13>>32 | v13<<32
v08 += v13 + 2*uint64(uint32(v08))*uint64(uint32(v13))
v07 ^= v08
v07 = v07>>24 | v07<<40
v02 += v07 + 2*uint64(uint32(v02))*uint64(uint32(v07))
v13 ^= v02
v13 = v13>>16 | v13<<48
v08 += v13 + 2*uint64(uint32(v08))*uint64(uint32(v13))
v07 ^= v08
v07 = v07>>63 | v07<<1
v03 += v04 + 2*uint64(uint32(v03))*uint64(uint32(v04))
v14 ^= v03
v14 = v14>>32 | v14<<32
v09 += v14 + 2*uint64(uint32(v09))*uint64(uint32(v14))
v04 ^= v09
v04 = v04>>24 | v04<<40
v03 += v04 + 2*uint64(uint32(v03))*uint64(uint32(v04))
v14 ^= v03
v14 = v14>>16 | v14<<48
v09 += v14 + 2*uint64(uint32(v09))*uint64(uint32(v14))
v04 ^= v09
v04 = v04>>63 | v04<<1
*t00, *t01, *t02, *t03 = v00, v01, v02, v03
*t04, *t05, *t06, *t07 = v04, v05, v06, v07
*t08, *t09, *t10, *t11 = v08, v09, v10, v11
*t12, *t13, *t14, *t15 = v12, v13, v14, v15
}

15
vendor/golang.org/x/crypto/argon2/blamka_ref.go generated vendored Normal file
View File

@@ -0,0 +1,15 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || purego || !gc
package argon2
func processBlock(out, in1, in2 *block) {
processBlockGeneric(out, in1, in2, false)
}
func processBlockXOR(out, in1, in2 *block) {
processBlockGeneric(out, in1, in2, true)
}

291
vendor/golang.org/x/crypto/blake2b/blake2b.go generated vendored Normal file
View File

@@ -0,0 +1,291 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package blake2b implements the BLAKE2b hash algorithm defined by RFC 7693
// and the extendable output function (XOF) BLAKE2Xb.
//
// BLAKE2b is optimized for 64-bit platforms—including NEON-enabled ARMs—and
// produces digests of any size between 1 and 64 bytes.
// For a detailed specification of BLAKE2b see https://blake2.net/blake2.pdf
// and for BLAKE2Xb see https://blake2.net/blake2x.pdf
//
// If you aren't sure which function you need, use BLAKE2b (Sum512 or New512).
// If you need a secret-key MAC (message authentication code), use the New512
// function with a non-nil key.
//
// BLAKE2X is a construction to compute hash values larger than 64 bytes. It
// can produce hash values between 0 and 4 GiB.
package blake2b
import (
"encoding/binary"
"errors"
"hash"
)
const (
// The blocksize of BLAKE2b in bytes.
BlockSize = 128
// The hash size of BLAKE2b-512 in bytes.
Size = 64
// The hash size of BLAKE2b-384 in bytes.
Size384 = 48
// The hash size of BLAKE2b-256 in bytes.
Size256 = 32
)
var (
useAVX2 bool
useAVX bool
useSSE4 bool
)
var (
errKeySize = errors.New("blake2b: invalid key size")
errHashSize = errors.New("blake2b: invalid hash size")
)
var iv = [8]uint64{
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
}
// Sum512 returns the BLAKE2b-512 checksum of the data.
func Sum512(data []byte) [Size]byte {
var sum [Size]byte
checkSum(&sum, Size, data)
return sum
}
// Sum384 returns the BLAKE2b-384 checksum of the data.
func Sum384(data []byte) [Size384]byte {
var sum [Size]byte
var sum384 [Size384]byte
checkSum(&sum, Size384, data)
copy(sum384[:], sum[:Size384])
return sum384
}
// Sum256 returns the BLAKE2b-256 checksum of the data.
func Sum256(data []byte) [Size256]byte {
var sum [Size]byte
var sum256 [Size256]byte
checkSum(&sum, Size256, data)
copy(sum256[:], sum[:Size256])
return sum256
}
// New512 returns a new hash.Hash computing the BLAKE2b-512 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New512(key []byte) (hash.Hash, error) { return newDigest(Size, key) }
// New384 returns a new hash.Hash computing the BLAKE2b-384 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New384(key []byte) (hash.Hash, error) { return newDigest(Size384, key) }
// New256 returns a new hash.Hash computing the BLAKE2b-256 checksum. A non-nil
// key turns the hash into a MAC. The key must be between zero and 64 bytes long.
func New256(key []byte) (hash.Hash, error) { return newDigest(Size256, key) }
// New returns a new hash.Hash computing the BLAKE2b checksum with a custom length.
// A non-nil key turns the hash into a MAC. The key must be between zero and 64 bytes long.
// The hash size can be a value between 1 and 64 but it is highly recommended to use
// values equal or greater than:
// - 32 if BLAKE2b is used as a hash function (The key is zero bytes long).
// - 16 if BLAKE2b is used as a MAC function (The key is at least 16 bytes long).
// When the key is nil, the returned hash.Hash implements BinaryMarshaler
// and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash.
func New(size int, key []byte) (hash.Hash, error) { return newDigest(size, key) }
func newDigest(hashSize int, key []byte) (*digest, error) {
if hashSize < 1 || hashSize > Size {
return nil, errHashSize
}
if len(key) > Size {
return nil, errKeySize
}
d := &digest{
size: hashSize,
keyLen: len(key),
}
copy(d.key[:], key)
d.Reset()
return d, nil
}
func checkSum(sum *[Size]byte, hashSize int, data []byte) {
h := iv
h[0] ^= uint64(hashSize) | (1 << 16) | (1 << 24)
var c [2]uint64
if length := len(data); length > BlockSize {
n := length &^ (BlockSize - 1)
if length == n {
n -= BlockSize
}
hashBlocks(&h, &c, 0, data[:n])
data = data[n:]
}
var block [BlockSize]byte
offset := copy(block[:], data)
remaining := uint64(BlockSize - offset)
if c[0] < remaining {
c[1]--
}
c[0] -= remaining
hashBlocks(&h, &c, 0xFFFFFFFFFFFFFFFF, block[:])
for i, v := range h[:(hashSize+7)/8] {
binary.LittleEndian.PutUint64(sum[8*i:], v)
}
}
type digest struct {
h [8]uint64
c [2]uint64
size int
block [BlockSize]byte
offset int
key [BlockSize]byte
keyLen int
}
const (
magic = "b2b"
marshaledSize = len(magic) + 8*8 + 2*8 + 1 + BlockSize + 1
)
func (d *digest) MarshalBinary() ([]byte, error) {
if d.keyLen != 0 {
return nil, errors.New("crypto/blake2b: cannot marshal MACs")
}
b := make([]byte, 0, marshaledSize)
b = append(b, magic...)
for i := 0; i < 8; i++ {
b = appendUint64(b, d.h[i])
}
b = appendUint64(b, d.c[0])
b = appendUint64(b, d.c[1])
// Maximum value for size is 64
b = append(b, byte(d.size))
b = append(b, d.block[:]...)
b = append(b, byte(d.offset))
return b, nil
}
func (d *digest) UnmarshalBinary(b []byte) error {
if len(b) < len(magic) || string(b[:len(magic)]) != magic {
return errors.New("crypto/blake2b: invalid hash state identifier")
}
if len(b) != marshaledSize {
return errors.New("crypto/blake2b: invalid hash state size")
}
b = b[len(magic):]
for i := 0; i < 8; i++ {
b, d.h[i] = consumeUint64(b)
}
b, d.c[0] = consumeUint64(b)
b, d.c[1] = consumeUint64(b)
d.size = int(b[0])
b = b[1:]
copy(d.block[:], b[:BlockSize])
b = b[BlockSize:]
d.offset = int(b[0])
return nil
}
func (d *digest) BlockSize() int { return BlockSize }
func (d *digest) Size() int { return d.size }
func (d *digest) Reset() {
d.h = iv
d.h[0] ^= uint64(d.size) | (uint64(d.keyLen) << 8) | (1 << 16) | (1 << 24)
d.offset, d.c[0], d.c[1] = 0, 0, 0
if d.keyLen > 0 {
d.block = d.key
d.offset = BlockSize
}
}
func (d *digest) Write(p []byte) (n int, err error) {
n = len(p)
if d.offset > 0 {
remaining := BlockSize - d.offset
if n <= remaining {
d.offset += copy(d.block[d.offset:], p)
return
}
copy(d.block[d.offset:], p[:remaining])
hashBlocks(&d.h, &d.c, 0, d.block[:])
d.offset = 0
p = p[remaining:]
}
if length := len(p); length > BlockSize {
nn := length &^ (BlockSize - 1)
if length == nn {
nn -= BlockSize
}
hashBlocks(&d.h, &d.c, 0, p[:nn])
p = p[nn:]
}
if len(p) > 0 {
d.offset += copy(d.block[:], p)
}
return
}
func (d *digest) Sum(sum []byte) []byte {
var hash [Size]byte
d.finalize(&hash)
return append(sum, hash[:d.size]...)
}
func (d *digest) finalize(hash *[Size]byte) {
var block [BlockSize]byte
copy(block[:], d.block[:d.offset])
remaining := uint64(BlockSize - d.offset)
c := d.c
if c[0] < remaining {
c[1]--
}
c[0] -= remaining
h := d.h
hashBlocks(&h, &c, 0xFFFFFFFFFFFFFFFF, block[:])
for i, v := range h {
binary.LittleEndian.PutUint64(hash[8*i:], v)
}
}
func appendUint64(b []byte, x uint64) []byte {
var a [8]byte
binary.BigEndian.PutUint64(a[:], x)
return append(b, a[:]...)
}
func appendUint32(b []byte, x uint32) []byte {
var a [4]byte
binary.BigEndian.PutUint32(a[:], x)
return append(b, a[:]...)
}
func consumeUint64(b []byte) ([]byte, uint64) {
x := binary.BigEndian.Uint64(b)
return b[8:], x
}
func consumeUint32(b []byte) ([]byte, uint32) {
x := binary.BigEndian.Uint32(b)
return b[4:], x
}

View File

@@ -0,0 +1,37 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
package blake2b
import "golang.org/x/sys/cpu"
func init() {
useAVX2 = cpu.X86.HasAVX2
useAVX = cpu.X86.HasAVX
useSSE4 = cpu.X86.HasSSE41
}
//go:noescape
func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//go:noescape
func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
//go:noescape
func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
switch {
case useAVX2:
hashBlocksAVX2(h, c, flag, blocks)
case useAVX:
hashBlocksAVX(h, c, flag, blocks)
case useSSE4:
hashBlocksSSE4(h, c, flag, blocks)
default:
hashBlocksGeneric(h, c, flag, blocks)
}
}

744
vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,744 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
#include "textflag.h"
DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
VPADDQ m0, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFD $-79, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPSHUFB c40, Y1, Y1; \
VPADDQ m1, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFB c48, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPADDQ Y1, Y1, t; \
VPSRLQ $63, Y1, Y1; \
VPXOR t, Y1, Y1; \
VPERMQ_0x39_Y1_Y1; \
VPERMQ_0x4E_Y2_Y2; \
VPERMQ_0x93_Y3_Y3; \
VPADDQ m2, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFD $-79, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPSHUFB c40, Y1, Y1; \
VPADDQ m3, Y0, Y0; \
VPADDQ Y1, Y0, Y0; \
VPXOR Y0, Y3, Y3; \
VPSHUFB c48, Y3, Y3; \
VPADDQ Y3, Y2, Y2; \
VPXOR Y2, Y1, Y1; \
VPADDQ Y1, Y1, t; \
VPSRLQ $63, Y1, Y1; \
VPXOR t, Y1, Y1; \
VPERMQ_0x39_Y3_Y3; \
VPERMQ_0x4E_Y2_Y2; \
VPERMQ_0x93_Y1_Y1
#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
VMOVQ_SI_X12(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X12(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y12, Y12
// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
VMOVQ_SI_X13(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X13(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y13, Y13
// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
VMOVQ_SI_X14(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X14(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y14, Y14
// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
VMOVQ_SI_X15(i0*8); \
VMOVQ_SI_X11(i2*8); \
VPINSRQ_1_SI_X15(i1*8); \
VPINSRQ_1_SI_X11(i3*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
VMOVQ_SI_X12_0; \
VMOVQ_SI_X11(4*8); \
VPINSRQ_1_SI_X12(2*8); \
VPINSRQ_1_SI_X11(6*8); \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \
LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
VMOVQ_SI_X11(11*8); \
VPSHUFD $0x4E, 0*8(SI), X14; \
VPINSRQ_1_SI_X11(5*8); \
VINSERTI128 $1, X11, Y14, Y14; \
LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
VMOVQ_SI_X11(5*8); \
VMOVDQU 11*8(SI), X12; \
VPINSRQ_1_SI_X11(15*8); \
VINSERTI128 $1, X11, Y12, Y12; \
VMOVQ_SI_X13(8*8); \
VMOVQ_SI_X11(2*8); \
VPINSRQ_1_SI_X13_0; \
VPINSRQ_1_SI_X11(13*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \
VMOVQ_SI_X15(6*8); \
VMOVQ_SI_X11_0; \
VPINSRQ_1_SI_X15(10*8); \
VPINSRQ_1_SI_X11(8*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \
VMOVQ_SI_X13_0; \
VMOVQ_SI_X11(4*8); \
VPINSRQ_1_SI_X13(7*8); \
VPINSRQ_1_SI_X11(15*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X11_0; \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X11(8*8); \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \
LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \
LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
VMOVQ_SI_X14_0; \
VPSHUFD $0x4E, 8*8(SI), X11; \
VPINSRQ_1_SI_X14(6*8); \
VINSERTI128 $1, X11, Y14, Y14; \
LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \
VMOVQ_SI_X15_0; \
VMOVQ_SI_X11(6*8); \
VPINSRQ_1_SI_X15(4*8); \
VPINSRQ_1_SI_X11(10*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
VMOVQ_SI_X12(6*8); \
VMOVQ_SI_X11(11*8); \
VPINSRQ_1_SI_X12(14*8); \
VPINSRQ_1_SI_X11_0; \
VINSERTI128 $1, X11, Y12, Y12; \
LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
VMOVQ_SI_X11(1*8); \
VMOVDQU 12*8(SI), X14; \
VPINSRQ_1_SI_X11(10*8); \
VINSERTI128 $1, X11, Y14, Y14; \
VMOVQ_SI_X15(2*8); \
VMOVDQU 4*8(SI), X11; \
VPINSRQ_1_SI_X15(7*8); \
VINSERTI128 $1, X11, Y15, Y15
#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \
VMOVQ_SI_X13(2*8); \
VPSHUFD $0x4E, 5*8(SI), X11; \
VPINSRQ_1_SI_X13(4*8); \
VINSERTI128 $1, X11, Y13, Y13; \
LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
VMOVQ_SI_X15(11*8); \
VMOVQ_SI_X11(12*8); \
VPINSRQ_1_SI_X15(14*8); \
VPINSRQ_1_SI_X11_0; \
VINSERTI128 $1, X11, Y15, Y15
// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, DX
ADDQ $31, DX
ANDQ $~31, DX
MOVQ CX, 16(DX)
XORQ CX, CX
MOVQ CX, 24(DX)
VMOVDQU ·AVX2_c40<>(SB), Y4
VMOVDQU ·AVX2_c48<>(SB), Y5
VMOVDQU 0(AX), Y8
VMOVDQU 32(AX), Y9
VMOVDQU ·AVX2_iv0<>(SB), Y6
VMOVDQU ·AVX2_iv1<>(SB), Y7
MOVQ 0(BX), R8
MOVQ 8(BX), R9
MOVQ R9, 8(DX)
loop:
ADDQ $128, R8
MOVQ R8, 0(DX)
CMPQ R8, $128
JGE noinc
INCQ R9
MOVQ R9, 8(DX)
noinc:
VMOVDQA Y8, Y0
VMOVDQA Y9, Y1
VMOVDQA Y6, Y2
VPXOR 0(DX), Y7, Y3
LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
VMOVDQA Y12, 32(DX)
VMOVDQA Y13, 64(DX)
VMOVDQA Y14, 96(DX)
VMOVDQA Y15, 128(DX)
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
VMOVDQA Y12, 160(DX)
VMOVDQA Y13, 192(DX)
VMOVDQA Y14, 224(DX)
VMOVDQA Y15, 256(DX)
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
VPXOR Y0, Y8, Y8
VPXOR Y1, Y9, Y9
VPXOR Y2, Y8, Y8
VPXOR Y3, Y9, Y9
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
VMOVDQU Y8, 0(AX)
VMOVDQU Y9, 32(AX)
VZEROUPPER
RET
#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
#define SHUFFLE_AVX() \
VMOVDQA X6, X13; \
VMOVDQA X2, X14; \
VMOVDQA X4, X6; \
VPUNPCKLQDQ_X13_X13_X15; \
VMOVDQA X5, X4; \
VMOVDQA X6, X5; \
VPUNPCKHQDQ_X15_X7_X6; \
VPUNPCKLQDQ_X7_X7_X15; \
VPUNPCKHQDQ_X15_X13_X7; \
VPUNPCKLQDQ_X3_X3_X15; \
VPUNPCKHQDQ_X15_X2_X2; \
VPUNPCKLQDQ_X14_X14_X15; \
VPUNPCKHQDQ_X15_X3_X3; \
#define SHUFFLE_AVX_INV() \
VMOVDQA X2, X13; \
VMOVDQA X4, X14; \
VPUNPCKLQDQ_X2_X2_X15; \
VMOVDQA X5, X4; \
VPUNPCKHQDQ_X15_X3_X2; \
VMOVDQA X14, X5; \
VPUNPCKLQDQ_X3_X3_X15; \
VMOVDQA X6, X14; \
VPUNPCKHQDQ_X15_X13_X3; \
VPUNPCKLQDQ_X7_X7_X15; \
VPUNPCKHQDQ_X15_X6_X6; \
VPUNPCKLQDQ_X14_X14_X15; \
VPUNPCKHQDQ_X15_X7_X7; \
#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
VPADDQ m0, v0, v0; \
VPADDQ v2, v0, v0; \
VPADDQ m1, v1, v1; \
VPADDQ v3, v1, v1; \
VPXOR v0, v6, v6; \
VPXOR v1, v7, v7; \
VPSHUFD $-79, v6, v6; \
VPSHUFD $-79, v7, v7; \
VPADDQ v6, v4, v4; \
VPADDQ v7, v5, v5; \
VPXOR v4, v2, v2; \
VPXOR v5, v3, v3; \
VPSHUFB c40, v2, v2; \
VPSHUFB c40, v3, v3; \
VPADDQ m2, v0, v0; \
VPADDQ v2, v0, v0; \
VPADDQ m3, v1, v1; \
VPADDQ v3, v1, v1; \
VPXOR v0, v6, v6; \
VPXOR v1, v7, v7; \
VPSHUFB c48, v6, v6; \
VPSHUFB c48, v7, v7; \
VPADDQ v6, v4, v4; \
VPADDQ v7, v5, v5; \
VPXOR v4, v2, v2; \
VPXOR v5, v3, v3; \
VPADDQ v2, v2, t0; \
VPSRLQ $63, v2, v2; \
VPXOR t0, v2, v2; \
VPADDQ v3, v3, t0; \
VPSRLQ $63, v3, v3; \
VPXOR t0, v3, v3
// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
VMOVQ_SI_X12(i0*8); \
VMOVQ_SI_X13(i2*8); \
VMOVQ_SI_X14(i4*8); \
VMOVQ_SI_X15(i6*8); \
VPINSRQ_1_SI_X12(i1*8); \
VPINSRQ_1_SI_X13(i3*8); \
VPINSRQ_1_SI_X14(i5*8); \
VPINSRQ_1_SI_X15(i7*8)
// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
VMOVQ_SI_X12_0; \
VMOVQ_SI_X13(4*8); \
VMOVQ_SI_X14(1*8); \
VMOVQ_SI_X15(5*8); \
VPINSRQ_1_SI_X12(2*8); \
VPINSRQ_1_SI_X13(6*8); \
VPINSRQ_1_SI_X14(3*8); \
VPINSRQ_1_SI_X15(7*8)
// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
VPSHUFD $0x4E, 0*8(SI), X12; \
VMOVQ_SI_X13(11*8); \
VMOVQ_SI_X14(12*8); \
VMOVQ_SI_X15(7*8); \
VPINSRQ_1_SI_X13(5*8); \
VPINSRQ_1_SI_X14(2*8); \
VPINSRQ_1_SI_X15(3*8)
// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
VMOVDQU 11*8(SI), X12; \
VMOVQ_SI_X13(5*8); \
VMOVQ_SI_X14(8*8); \
VMOVQ_SI_X15(2*8); \
VPINSRQ_1_SI_X13(15*8); \
VPINSRQ_1_SI_X14_0; \
VPINSRQ_1_SI_X15(13*8)
// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X13(4*8); \
VMOVQ_SI_X14(6*8); \
VMOVQ_SI_X15_0; \
VPINSRQ_1_SI_X12(5*8); \
VPINSRQ_1_SI_X13(15*8); \
VPINSRQ_1_SI_X14(10*8); \
VPINSRQ_1_SI_X15(8*8)
// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
VMOVQ_SI_X12(9*8); \
VMOVQ_SI_X13(2*8); \
VMOVQ_SI_X14_0; \
VMOVQ_SI_X15(4*8); \
VPINSRQ_1_SI_X12(5*8); \
VPINSRQ_1_SI_X13(10*8); \
VPINSRQ_1_SI_X14(7*8); \
VPINSRQ_1_SI_X15(15*8)
// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
VMOVQ_SI_X12(2*8); \
VMOVQ_SI_X13_0; \
VMOVQ_SI_X14(12*8); \
VMOVQ_SI_X15(11*8); \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X13(8*8); \
VPINSRQ_1_SI_X14(10*8); \
VPINSRQ_1_SI_X15(3*8)
// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
MOVQ 0*8(SI), X12; \
VPSHUFD $0x4E, 8*8(SI), X13; \
MOVQ 7*8(SI), X14; \
MOVQ 2*8(SI), X15; \
VPINSRQ_1_SI_X12(6*8); \
VPINSRQ_1_SI_X14(3*8); \
VPINSRQ_1_SI_X15(11*8)
// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
MOVQ 6*8(SI), X12; \
MOVQ 11*8(SI), X13; \
MOVQ 15*8(SI), X14; \
MOVQ 3*8(SI), X15; \
VPINSRQ_1_SI_X12(14*8); \
VPINSRQ_1_SI_X13_0; \
VPINSRQ_1_SI_X14(9*8); \
VPINSRQ_1_SI_X15(8*8)
// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
MOVQ 5*8(SI), X12; \
MOVQ 8*8(SI), X13; \
MOVQ 0*8(SI), X14; \
MOVQ 6*8(SI), X15; \
VPINSRQ_1_SI_X12(15*8); \
VPINSRQ_1_SI_X13(2*8); \
VPINSRQ_1_SI_X14(4*8); \
VPINSRQ_1_SI_X15(10*8)
// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
VMOVDQU 12*8(SI), X12; \
MOVQ 1*8(SI), X13; \
MOVQ 2*8(SI), X14; \
VPINSRQ_1_SI_X13(10*8); \
VPINSRQ_1_SI_X14(7*8); \
VMOVDQU 4*8(SI), X15
// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
MOVQ 15*8(SI), X12; \
MOVQ 3*8(SI), X13; \
MOVQ 11*8(SI), X14; \
MOVQ 12*8(SI), X15; \
VPINSRQ_1_SI_X12(9*8); \
VPINSRQ_1_SI_X13(13*8); \
VPINSRQ_1_SI_X14(14*8); \
VPINSRQ_1_SI_X15_0
// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, R10
ADDQ $15, R10
ANDQ $~15, R10
VMOVDQU ·AVX_c40<>(SB), X0
VMOVDQU ·AVX_c48<>(SB), X1
VMOVDQA X0, X8
VMOVDQA X1, X9
VMOVDQU ·AVX_iv3<>(SB), X0
VMOVDQA X0, 0(R10)
XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0)
VMOVDQU 0(AX), X10
VMOVDQU 16(AX), X11
VMOVDQU 32(AX), X2
VMOVDQU 48(AX), X3
MOVQ 0(BX), R8
MOVQ 8(BX), R9
loop:
ADDQ $128, R8
CMPQ R8, $128
JGE noinc
INCQ R9
noinc:
VMOVQ_R8_X15
VPINSRQ_1_R9_X15
VMOVDQA X10, X0
VMOVDQA X11, X1
VMOVDQU ·AVX_iv0<>(SB), X4
VMOVDQU ·AVX_iv1<>(SB), X5
VMOVDQU ·AVX_iv2<>(SB), X6
VPXOR X15, X6, X6
VMOVDQA 0(R10), X7
LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
VMOVDQA X12, 16(R10)
VMOVDQA X13, 32(R10)
VMOVDQA X14, 48(R10)
VMOVDQA X15, 64(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
VMOVDQA X12, 80(R10)
VMOVDQA X13, 96(R10)
VMOVDQA X14, 112(R10)
VMOVDQA X15, 128(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
VMOVDQA X12, 144(R10)
VMOVDQA X13, 160(R10)
VMOVDQA X14, 176(R10)
VMOVDQA X15, 192(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
VMOVDQA X12, 208(R10)
VMOVDQA X13, 224(R10)
VMOVDQA X14, 240(R10)
VMOVDQA X15, 256(R10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX()
LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
SHUFFLE_AVX_INV()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
SHUFFLE_AVX()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
SHUFFLE_AVX_INV()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
SHUFFLE_AVX()
HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
SHUFFLE_AVX_INV()
VMOVDQU 32(AX), X14
VMOVDQU 48(AX), X15
VPXOR X0, X10, X10
VPXOR X1, X11, X11
VPXOR X2, X14, X14
VPXOR X3, X15, X15
VPXOR X4, X10, X10
VPXOR X5, X11, X11
VPXOR X6, X14, X2
VPXOR X7, X15, X3
VMOVDQU X2, 32(AX)
VMOVDQU X3, 48(AX)
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
VMOVDQU X10, 0(AX)
VMOVDQU X11, 16(AX)
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
VZEROUPPER
RET

278
vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,278 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
#include "textflag.h"
DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v6, t1; \
PUNPCKLQDQ v6, t2; \
PUNPCKHQDQ v7, v6; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ v7, t2; \
MOVO t1, v7; \
MOVO v2, t1; \
PUNPCKHQDQ t2, v7; \
PUNPCKLQDQ v3, t2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v3
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
MOVO v4, t1; \
MOVO v5, v4; \
MOVO t1, v5; \
MOVO v2, t1; \
PUNPCKLQDQ v2, t2; \
PUNPCKHQDQ v3, v2; \
PUNPCKHQDQ t2, v2; \
PUNPCKLQDQ v3, t2; \
MOVO t1, v3; \
MOVO v6, t1; \
PUNPCKHQDQ t2, v3; \
PUNPCKLQDQ v7, t2; \
PUNPCKHQDQ t2, v6; \
PUNPCKLQDQ t1, t2; \
PUNPCKHQDQ t2, v7
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
PADDQ m0, v0; \
PADDQ m1, v1; \
PADDQ v2, v0; \
PADDQ v3, v1; \
PXOR v0, v6; \
PXOR v1, v7; \
PSHUFD $0xB1, v6, v6; \
PSHUFD $0xB1, v7, v7; \
PADDQ v6, v4; \
PADDQ v7, v5; \
PXOR v4, v2; \
PXOR v5, v3; \
PSHUFB c40, v2; \
PSHUFB c40, v3; \
PADDQ m2, v0; \
PADDQ m3, v1; \
PADDQ v2, v0; \
PADDQ v3, v1; \
PXOR v0, v6; \
PXOR v1, v7; \
PSHUFB c48, v6; \
PSHUFB c48, v7; \
PADDQ v6, v4; \
PADDQ v7, v5; \
PXOR v4, v2; \
PXOR v5, v3; \
MOVOU v2, t0; \
PADDQ v2, t0; \
PSRLQ $63, v2; \
PXOR t0, v2; \
MOVOU v3, t0; \
PADDQ v3, t0; \
PSRLQ $63, v3; \
PXOR t0, v3
#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
MOVQ i0*8(src), m0; \
PINSRQ $1, i1*8(src), m0; \
MOVQ i2*8(src), m1; \
PINSRQ $1, i3*8(src), m1; \
MOVQ i4*8(src), m2; \
PINSRQ $1, i5*8(src), m2; \
MOVQ i6*8(src), m3; \
PINSRQ $1, i7*8(src), m3
// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
MOVQ h+0(FP), AX
MOVQ c+8(FP), BX
MOVQ flag+16(FP), CX
MOVQ blocks_base+24(FP), SI
MOVQ blocks_len+32(FP), DI
MOVQ SP, R10
ADDQ $15, R10
ANDQ $~15, R10
MOVOU ·iv3<>(SB), X0
MOVO X0, 0(R10)
XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0)
MOVOU ·c40<>(SB), X13
MOVOU ·c48<>(SB), X14
MOVOU 0(AX), X12
MOVOU 16(AX), X15
MOVQ 0(BX), R8
MOVQ 8(BX), R9
loop:
ADDQ $128, R8
CMPQ R8, $128
JGE noinc
INCQ R9
noinc:
MOVQ R8, X8
PINSRQ $1, R9, X8
MOVO X12, X0
MOVO X15, X1
MOVOU 32(AX), X2
MOVOU 48(AX), X3
MOVOU ·iv0<>(SB), X4
MOVOU ·iv1<>(SB), X5
MOVOU ·iv2<>(SB), X6
PXOR X8, X6
MOVO 0(R10), X7
LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
MOVO X8, 16(R10)
MOVO X9, 32(R10)
MOVO X10, 48(R10)
MOVO X11, 64(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
MOVO X8, 80(R10)
MOVO X9, 96(R10)
MOVO X10, 112(R10)
MOVO X11, 128(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
MOVO X8, 144(R10)
MOVO X9, 160(R10)
MOVO X10, 176(R10)
MOVO X11, 192(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
MOVO X8, 208(R10)
MOVO X9, 224(R10)
MOVO X10, 240(R10)
MOVO X11, 256(R10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
MOVOU 32(AX), X10
MOVOU 48(AX), X11
PXOR X0, X12
PXOR X1, X15
PXOR X2, X10
PXOR X3, X11
PXOR X4, X12
PXOR X5, X15
PXOR X6, X10
PXOR X7, X11
MOVOU X10, 32(AX)
MOVOU X11, 48(AX)
LEAQ 128(SI), SI
SUBQ $128, DI
JNE loop
MOVOU X12, 0(AX)
MOVOU X15, 16(AX)
MOVQ R8, 0(BX)
MOVQ R9, 8(BX)
RET

182
vendor/golang.org/x/crypto/blake2b/blake2b_generic.go generated vendored Normal file
View File

@@ -0,0 +1,182 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2b
import (
"encoding/binary"
"math/bits"
)
// the precomputed values for BLAKE2b
// there are 12 16-byte arrays - one for each round
// the entries are calculated from the sigma constants.
var precomputed = [12][16]byte{
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, // equal to the first
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, // equal to the second
}
func hashBlocksGeneric(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
var m [16]uint64
c0, c1 := c[0], c[1]
for i := 0; i < len(blocks); {
c0 += BlockSize
if c0 < BlockSize {
c1++
}
v0, v1, v2, v3, v4, v5, v6, v7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
v8, v9, v10, v11, v12, v13, v14, v15 := iv[0], iv[1], iv[2], iv[3], iv[4], iv[5], iv[6], iv[7]
v12 ^= c0
v13 ^= c1
v14 ^= flag
for j := range m {
m[j] = binary.LittleEndian.Uint64(blocks[i:])
i += 8
}
for j := range precomputed {
s := &(precomputed[j])
v0 += m[s[0]]
v0 += v4
v12 ^= v0
v12 = bits.RotateLeft64(v12, -32)
v8 += v12
v4 ^= v8
v4 = bits.RotateLeft64(v4, -24)
v1 += m[s[1]]
v1 += v5
v13 ^= v1
v13 = bits.RotateLeft64(v13, -32)
v9 += v13
v5 ^= v9
v5 = bits.RotateLeft64(v5, -24)
v2 += m[s[2]]
v2 += v6
v14 ^= v2
v14 = bits.RotateLeft64(v14, -32)
v10 += v14
v6 ^= v10
v6 = bits.RotateLeft64(v6, -24)
v3 += m[s[3]]
v3 += v7
v15 ^= v3
v15 = bits.RotateLeft64(v15, -32)
v11 += v15
v7 ^= v11
v7 = bits.RotateLeft64(v7, -24)
v0 += m[s[4]]
v0 += v4
v12 ^= v0
v12 = bits.RotateLeft64(v12, -16)
v8 += v12
v4 ^= v8
v4 = bits.RotateLeft64(v4, -63)
v1 += m[s[5]]
v1 += v5
v13 ^= v1
v13 = bits.RotateLeft64(v13, -16)
v9 += v13
v5 ^= v9
v5 = bits.RotateLeft64(v5, -63)
v2 += m[s[6]]
v2 += v6
v14 ^= v2
v14 = bits.RotateLeft64(v14, -16)
v10 += v14
v6 ^= v10
v6 = bits.RotateLeft64(v6, -63)
v3 += m[s[7]]
v3 += v7
v15 ^= v3
v15 = bits.RotateLeft64(v15, -16)
v11 += v15
v7 ^= v11
v7 = bits.RotateLeft64(v7, -63)
v0 += m[s[8]]
v0 += v5
v15 ^= v0
v15 = bits.RotateLeft64(v15, -32)
v10 += v15
v5 ^= v10
v5 = bits.RotateLeft64(v5, -24)
v1 += m[s[9]]
v1 += v6
v12 ^= v1
v12 = bits.RotateLeft64(v12, -32)
v11 += v12
v6 ^= v11
v6 = bits.RotateLeft64(v6, -24)
v2 += m[s[10]]
v2 += v7
v13 ^= v2
v13 = bits.RotateLeft64(v13, -32)
v8 += v13
v7 ^= v8
v7 = bits.RotateLeft64(v7, -24)
v3 += m[s[11]]
v3 += v4
v14 ^= v3
v14 = bits.RotateLeft64(v14, -32)
v9 += v14
v4 ^= v9
v4 = bits.RotateLeft64(v4, -24)
v0 += m[s[12]]
v0 += v5
v15 ^= v0
v15 = bits.RotateLeft64(v15, -16)
v10 += v15
v5 ^= v10
v5 = bits.RotateLeft64(v5, -63)
v1 += m[s[13]]
v1 += v6
v12 ^= v1
v12 = bits.RotateLeft64(v12, -16)
v11 += v12
v6 ^= v11
v6 = bits.RotateLeft64(v6, -63)
v2 += m[s[14]]
v2 += v7
v13 ^= v2
v13 = bits.RotateLeft64(v13, -16)
v8 += v13
v7 ^= v8
v7 = bits.RotateLeft64(v7, -63)
v3 += m[s[15]]
v3 += v4
v14 ^= v3
v14 = bits.RotateLeft64(v14, -16)
v9 += v14
v4 ^= v9
v4 = bits.RotateLeft64(v4, -63)
}
h[0] ^= v0 ^ v8
h[1] ^= v1 ^ v9
h[2] ^= v2 ^ v10
h[3] ^= v3 ^ v11
h[4] ^= v4 ^ v12
h[5] ^= v5 ^ v13
h[6] ^= v6 ^ v14
h[7] ^= v7 ^ v15
}
c[0], c[1] = c0, c1
}

11
vendor/golang.org/x/crypto/blake2b/blake2b_ref.go generated vendored Normal file
View File

@@ -0,0 +1,11 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || purego || !gc
package blake2b
func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
hashBlocksGeneric(h, c, flag, blocks)
}

177
vendor/golang.org/x/crypto/blake2b/blake2x.go generated vendored Normal file
View File

@@ -0,0 +1,177 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2b
import (
"encoding/binary"
"errors"
"io"
)
// XOF defines the interface to hash functions that
// support arbitrary-length output.
type XOF interface {
// Write absorbs more data into the hash's state. It panics if called
// after Read.
io.Writer
// Read reads more output from the hash. It returns io.EOF if the limit
// has been reached.
io.Reader
// Clone returns a copy of the XOF in its current state.
Clone() XOF
// Reset resets the XOF to its initial state.
Reset()
}
// OutputLengthUnknown can be used as the size argument to NewXOF to indicate
// the length of the output is not known in advance.
const OutputLengthUnknown = 0
// magicUnknownOutputLength is a magic value for the output size that indicates
// an unknown number of output bytes.
const magicUnknownOutputLength = (1 << 32) - 1
// maxOutputLength is the absolute maximum number of bytes to produce when the
// number of output bytes is unknown.
const maxOutputLength = (1 << 32) * 64
// NewXOF creates a new variable-output-length hash. The hash either produce a
// known number of bytes (1 <= size < 2**32-1), or an unknown number of bytes
// (size == OutputLengthUnknown). In the latter case, an absolute limit of
// 256GiB applies.
//
// A non-nil key turns the hash into a MAC. The key must between
// zero and 32 bytes long.
func NewXOF(size uint32, key []byte) (XOF, error) {
if len(key) > Size {
return nil, errKeySize
}
if size == magicUnknownOutputLength {
// 2^32-1 indicates an unknown number of bytes and thus isn't a
// valid length.
return nil, errors.New("blake2b: XOF length too large")
}
if size == OutputLengthUnknown {
size = magicUnknownOutputLength
}
x := &xof{
d: digest{
size: Size,
keyLen: len(key),
},
length: size,
}
copy(x.d.key[:], key)
x.Reset()
return x, nil
}
type xof struct {
d digest
length uint32
remaining uint64
cfg, root, block [Size]byte
offset int
nodeOffset uint32
readMode bool
}
func (x *xof) Write(p []byte) (n int, err error) {
if x.readMode {
panic("blake2b: write to XOF after read")
}
return x.d.Write(p)
}
func (x *xof) Clone() XOF {
clone := *x
return &clone
}
func (x *xof) Reset() {
x.cfg[0] = byte(Size)
binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length
binary.LittleEndian.PutUint32(x.cfg[12:], x.length) // XOF length
x.cfg[17] = byte(Size) // inner hash size
x.d.Reset()
x.d.h[1] ^= uint64(x.length) << 32
x.remaining = uint64(x.length)
if x.remaining == magicUnknownOutputLength {
x.remaining = maxOutputLength
}
x.offset, x.nodeOffset = 0, 0
x.readMode = false
}
func (x *xof) Read(p []byte) (n int, err error) {
if !x.readMode {
x.d.finalize(&x.root)
x.readMode = true
}
if x.remaining == 0 {
return 0, io.EOF
}
n = len(p)
if uint64(n) > x.remaining {
n = int(x.remaining)
p = p[:n]
}
if x.offset > 0 {
blockRemaining := Size - x.offset
if n < blockRemaining {
x.offset += copy(p, x.block[x.offset:])
x.remaining -= uint64(n)
return
}
copy(p, x.block[x.offset:])
p = p[blockRemaining:]
x.offset = 0
x.remaining -= uint64(blockRemaining)
}
for len(p) >= Size {
binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
x.nodeOffset++
x.d.initConfig(&x.cfg)
x.d.Write(x.root[:])
x.d.finalize(&x.block)
copy(p, x.block[:])
p = p[Size:]
x.remaining -= uint64(Size)
}
if todo := len(p); todo > 0 {
if x.remaining < uint64(Size) {
x.cfg[0] = byte(x.remaining)
}
binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
x.nodeOffset++
x.d.initConfig(&x.cfg)
x.d.Write(x.root[:])
x.d.finalize(&x.block)
x.offset = copy(p, x.block[:todo])
x.remaining -= uint64(todo)
}
return
}
func (d *digest) initConfig(cfg *[Size]byte) {
d.offset, d.c[0], d.c[1] = 0, 0, 0
for i := range d.h {
d.h[i] = iv[i] ^ binary.LittleEndian.Uint64(cfg[i*8:])
}
}

30
vendor/golang.org/x/crypto/blake2b/register.go generated vendored Normal file
View File

@@ -0,0 +1,30 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2b
import (
"crypto"
"hash"
)
func init() {
newHash256 := func() hash.Hash {
h, _ := New256(nil)
return h
}
newHash384 := func() hash.Hash {
h, _ := New384(nil)
return h
}
newHash512 := func() hash.Hash {
h, _ := New512(nil)
return h
}
crypto.RegisterHash(crypto.BLAKE2b_256, newHash256)
crypto.RegisterHash(crypto.BLAKE2b_384, newHash384)
crypto.RegisterHash(crypto.BLAKE2b_512, newHash512)
}

254
vendor/golang.org/x/crypto/blake2s/blake2s.go generated vendored Normal file
View File

@@ -0,0 +1,254 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package blake2s implements the BLAKE2s hash algorithm defined by RFC 7693
// and the extendable output function (XOF) BLAKE2Xs.
//
// BLAKE2s is optimized for 8- to 32-bit platforms and produces digests of any
// size between 1 and 32 bytes.
// For a detailed specification of BLAKE2s see https://blake2.net/blake2.pdf
// and for BLAKE2Xs see https://blake2.net/blake2x.pdf
//
// If you aren't sure which function you need, use BLAKE2s (Sum256 or New256).
// If you need a secret-key MAC (message authentication code), use the New256
// function with a non-nil key.
//
// BLAKE2X is a construction to compute hash values larger than 32 bytes. It
// can produce hash values between 0 and 65535 bytes.
package blake2s
import (
"crypto"
"encoding/binary"
"errors"
"hash"
)
const (
// The blocksize of BLAKE2s in bytes.
BlockSize = 64
// The hash size of BLAKE2s-256 in bytes.
Size = 32
// The hash size of BLAKE2s-128 in bytes.
Size128 = 16
)
var errKeySize = errors.New("blake2s: invalid key size")
var iv = [8]uint32{
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
}
// Sum256 returns the BLAKE2s-256 checksum of the data.
func Sum256(data []byte) [Size]byte {
var sum [Size]byte
checkSum(&sum, Size, data)
return sum
}
// New256 returns a new hash.Hash computing the BLAKE2s-256 checksum. A non-nil
// key turns the hash into a MAC. The key must between zero and 32 bytes long.
// When the key is nil, the returned hash.Hash implements BinaryMarshaler
// and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash.
func New256(key []byte) (hash.Hash, error) { return newDigest(Size, key) }
func init() {
crypto.RegisterHash(crypto.BLAKE2s_256, func() hash.Hash {
h, _ := New256(nil)
return h
})
}
// New128 returns a new hash.Hash computing the BLAKE2s-128 checksum given a
// non-empty key. Note that a 128-bit digest is too small to be secure as a
// cryptographic hash and should only be used as a MAC, thus the key argument
// is not optional.
func New128(key []byte) (hash.Hash, error) {
if len(key) == 0 {
return nil, errors.New("blake2s: a key is required for a 128-bit hash")
}
return newDigest(Size128, key)
}
func newDigest(hashSize int, key []byte) (*digest, error) {
if len(key) > Size {
return nil, errKeySize
}
d := &digest{
size: hashSize,
keyLen: len(key),
}
copy(d.key[:], key)
d.Reset()
return d, nil
}
func checkSum(sum *[Size]byte, hashSize int, data []byte) {
var (
h [8]uint32
c [2]uint32
)
h = iv
h[0] ^= uint32(hashSize) | (1 << 16) | (1 << 24)
if length := len(data); length > BlockSize {
n := length &^ (BlockSize - 1)
if length == n {
n -= BlockSize
}
hashBlocks(&h, &c, 0, data[:n])
data = data[n:]
}
var block [BlockSize]byte
offset := copy(block[:], data)
remaining := uint32(BlockSize - offset)
if c[0] < remaining {
c[1]--
}
c[0] -= remaining
hashBlocks(&h, &c, 0xFFFFFFFF, block[:])
for i, v := range h {
binary.LittleEndian.PutUint32(sum[4*i:], v)
}
}
type digest struct {
h [8]uint32
c [2]uint32
size int
block [BlockSize]byte
offset int
key [BlockSize]byte
keyLen int
}
const (
magic = "b2s"
marshaledSize = len(magic) + 8*4 + 2*4 + 1 + BlockSize + 1
)
func (d *digest) MarshalBinary() ([]byte, error) {
if d.keyLen != 0 {
return nil, errors.New("crypto/blake2s: cannot marshal MACs")
}
b := make([]byte, 0, marshaledSize)
b = append(b, magic...)
for i := 0; i < 8; i++ {
b = appendUint32(b, d.h[i])
}
b = appendUint32(b, d.c[0])
b = appendUint32(b, d.c[1])
// Maximum value for size is 32
b = append(b, byte(d.size))
b = append(b, d.block[:]...)
b = append(b, byte(d.offset))
return b, nil
}
func (d *digest) UnmarshalBinary(b []byte) error {
if len(b) < len(magic) || string(b[:len(magic)]) != magic {
return errors.New("crypto/blake2s: invalid hash state identifier")
}
if len(b) != marshaledSize {
return errors.New("crypto/blake2s: invalid hash state size")
}
b = b[len(magic):]
for i := 0; i < 8; i++ {
b, d.h[i] = consumeUint32(b)
}
b, d.c[0] = consumeUint32(b)
b, d.c[1] = consumeUint32(b)
d.size = int(b[0])
b = b[1:]
copy(d.block[:], b[:BlockSize])
b = b[BlockSize:]
d.offset = int(b[0])
return nil
}
func (d *digest) BlockSize() int { return BlockSize }
func (d *digest) Size() int { return d.size }
func (d *digest) Reset() {
d.h = iv
d.h[0] ^= uint32(d.size) | (uint32(d.keyLen) << 8) | (1 << 16) | (1 << 24)
d.offset, d.c[0], d.c[1] = 0, 0, 0
if d.keyLen > 0 {
d.block = d.key
d.offset = BlockSize
}
}
func (d *digest) Write(p []byte) (n int, err error) {
n = len(p)
if d.offset > 0 {
remaining := BlockSize - d.offset
if n <= remaining {
d.offset += copy(d.block[d.offset:], p)
return
}
copy(d.block[d.offset:], p[:remaining])
hashBlocks(&d.h, &d.c, 0, d.block[:])
d.offset = 0
p = p[remaining:]
}
if length := len(p); length > BlockSize {
nn := length &^ (BlockSize - 1)
if length == nn {
nn -= BlockSize
}
hashBlocks(&d.h, &d.c, 0, p[:nn])
p = p[nn:]
}
d.offset += copy(d.block[:], p)
return
}
func (d *digest) Sum(sum []byte) []byte {
var hash [Size]byte
d.finalize(&hash)
return append(sum, hash[:d.size]...)
}
func (d *digest) finalize(hash *[Size]byte) {
var block [BlockSize]byte
h := d.h
c := d.c
copy(block[:], d.block[:d.offset])
remaining := uint32(BlockSize - d.offset)
if c[0] < remaining {
c[1]--
}
c[0] -= remaining
hashBlocks(&h, &c, 0xFFFFFFFF, block[:])
for i, v := range h {
binary.LittleEndian.PutUint32(hash[4*i:], v)
}
}
func appendUint32(b []byte, x uint32) []byte {
var a [4]byte
binary.BigEndian.PutUint32(a[:], x)
return append(b, a[:]...)
}
func consumeUint32(b []byte) ([]byte, uint32) {
x := binary.BigEndian.Uint32(b)
return b[4:], x
}

32
vendor/golang.org/x/crypto/blake2s/blake2s_386.go generated vendored Normal file
View File

@@ -0,0 +1,32 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build 386 && gc && !purego
package blake2s
import "golang.org/x/sys/cpu"
var (
useSSE4 = false
useSSSE3 = cpu.X86.HasSSSE3
useSSE2 = cpu.X86.HasSSE2
)
//go:noescape
func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//go:noescape
func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
switch {
case useSSSE3:
hashBlocksSSSE3(h, c, flag, blocks)
case useSSE2:
hashBlocksSSE2(h, c, flag, blocks)
default:
hashBlocksGeneric(h, c, flag, blocks)
}
}

429
vendor/golang.org/x/crypto/blake2s/blake2s_386.s generated vendored Normal file
View File

@@ -0,0 +1,429 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build 386 && gc && !purego
#include "textflag.h"
DATA iv0<>+0x00(SB)/4, $0x6a09e667
DATA iv0<>+0x04(SB)/4, $0xbb67ae85
DATA iv0<>+0x08(SB)/4, $0x3c6ef372
DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
GLOBL iv0<>(SB), (NOPTR+RODATA), $16
DATA iv1<>+0x00(SB)/4, $0x510e527f
DATA iv1<>+0x04(SB)/4, $0x9b05688c
DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
GLOBL iv1<>(SB), (NOPTR+RODATA), $16
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL rol16<>(SB), (NOPTR+RODATA), $16
DATA rol8<>+0x00(SB)/8, $0x0407060500030201
DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL rol8<>(SB), (NOPTR+RODATA), $16
DATA counter<>+0x00(SB)/8, $0x40
DATA counter<>+0x08(SB)/8, $0x0
GLOBL counter<>(SB), (NOPTR+RODATA), $16
#define ROTL_SSE2(n, t, v) \
MOVO v, t; \
PSLLL $n, t; \
PSRLL $(32-n), v; \
PXOR t, v
#define ROTL_SSSE3(c, v) \
PSHUFB c, v
#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
PADDL m0, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(16, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(20, t, v1); \
PADDL m1, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(24, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(25, t, v1); \
PSHUFL $0x39, v1, v1; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v3, v3; \
PADDL m2, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(16, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(20, t, v1); \
PADDL m3, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(24, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(25, t, v1); \
PSHUFL $0x39, v3, v3; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v1, v1
#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
PADDL m0, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSSE3(c16, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(20, t, v1); \
PADDL m1, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSSE3(c8, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(25, t, v1); \
PSHUFL $0x39, v1, v1; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v3, v3; \
PADDL m2, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSSE3(c16, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(20, t, v1); \
PADDL m3, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSSE3(c8, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(25, t, v1); \
PSHUFL $0x39, v3, v3; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v1, v1
#define PRECOMPUTE(dst, off, src, t) \
MOVL 0*4(src), t; \
MOVL t, 0*4+off+0(dst); \
MOVL t, 9*4+off+64(dst); \
MOVL t, 5*4+off+128(dst); \
MOVL t, 14*4+off+192(dst); \
MOVL t, 4*4+off+256(dst); \
MOVL t, 2*4+off+320(dst); \
MOVL t, 8*4+off+384(dst); \
MOVL t, 12*4+off+448(dst); \
MOVL t, 3*4+off+512(dst); \
MOVL t, 15*4+off+576(dst); \
MOVL 1*4(src), t; \
MOVL t, 4*4+off+0(dst); \
MOVL t, 8*4+off+64(dst); \
MOVL t, 14*4+off+128(dst); \
MOVL t, 5*4+off+192(dst); \
MOVL t, 12*4+off+256(dst); \
MOVL t, 11*4+off+320(dst); \
MOVL t, 1*4+off+384(dst); \
MOVL t, 6*4+off+448(dst); \
MOVL t, 10*4+off+512(dst); \
MOVL t, 3*4+off+576(dst); \
MOVL 2*4(src), t; \
MOVL t, 1*4+off+0(dst); \
MOVL t, 13*4+off+64(dst); \
MOVL t, 6*4+off+128(dst); \
MOVL t, 8*4+off+192(dst); \
MOVL t, 2*4+off+256(dst); \
MOVL t, 0*4+off+320(dst); \
MOVL t, 14*4+off+384(dst); \
MOVL t, 11*4+off+448(dst); \
MOVL t, 12*4+off+512(dst); \
MOVL t, 4*4+off+576(dst); \
MOVL 3*4(src), t; \
MOVL t, 5*4+off+0(dst); \
MOVL t, 15*4+off+64(dst); \
MOVL t, 9*4+off+128(dst); \
MOVL t, 1*4+off+192(dst); \
MOVL t, 11*4+off+256(dst); \
MOVL t, 7*4+off+320(dst); \
MOVL t, 13*4+off+384(dst); \
MOVL t, 3*4+off+448(dst); \
MOVL t, 6*4+off+512(dst); \
MOVL t, 10*4+off+576(dst); \
MOVL 4*4(src), t; \
MOVL t, 2*4+off+0(dst); \
MOVL t, 1*4+off+64(dst); \
MOVL t, 15*4+off+128(dst); \
MOVL t, 10*4+off+192(dst); \
MOVL t, 6*4+off+256(dst); \
MOVL t, 8*4+off+320(dst); \
MOVL t, 3*4+off+384(dst); \
MOVL t, 13*4+off+448(dst); \
MOVL t, 14*4+off+512(dst); \
MOVL t, 5*4+off+576(dst); \
MOVL 5*4(src), t; \
MOVL t, 6*4+off+0(dst); \
MOVL t, 11*4+off+64(dst); \
MOVL t, 2*4+off+128(dst); \
MOVL t, 9*4+off+192(dst); \
MOVL t, 1*4+off+256(dst); \
MOVL t, 13*4+off+320(dst); \
MOVL t, 4*4+off+384(dst); \
MOVL t, 8*4+off+448(dst); \
MOVL t, 15*4+off+512(dst); \
MOVL t, 7*4+off+576(dst); \
MOVL 6*4(src), t; \
MOVL t, 3*4+off+0(dst); \
MOVL t, 7*4+off+64(dst); \
MOVL t, 13*4+off+128(dst); \
MOVL t, 12*4+off+192(dst); \
MOVL t, 10*4+off+256(dst); \
MOVL t, 1*4+off+320(dst); \
MOVL t, 9*4+off+384(dst); \
MOVL t, 14*4+off+448(dst); \
MOVL t, 0*4+off+512(dst); \
MOVL t, 6*4+off+576(dst); \
MOVL 7*4(src), t; \
MOVL t, 7*4+off+0(dst); \
MOVL t, 14*4+off+64(dst); \
MOVL t, 10*4+off+128(dst); \
MOVL t, 0*4+off+192(dst); \
MOVL t, 5*4+off+256(dst); \
MOVL t, 9*4+off+320(dst); \
MOVL t, 12*4+off+384(dst); \
MOVL t, 1*4+off+448(dst); \
MOVL t, 13*4+off+512(dst); \
MOVL t, 2*4+off+576(dst); \
MOVL 8*4(src), t; \
MOVL t, 8*4+off+0(dst); \
MOVL t, 5*4+off+64(dst); \
MOVL t, 4*4+off+128(dst); \
MOVL t, 15*4+off+192(dst); \
MOVL t, 14*4+off+256(dst); \
MOVL t, 3*4+off+320(dst); \
MOVL t, 11*4+off+384(dst); \
MOVL t, 10*4+off+448(dst); \
MOVL t, 7*4+off+512(dst); \
MOVL t, 1*4+off+576(dst); \
MOVL 9*4(src), t; \
MOVL t, 12*4+off+0(dst); \
MOVL t, 2*4+off+64(dst); \
MOVL t, 11*4+off+128(dst); \
MOVL t, 4*4+off+192(dst); \
MOVL t, 0*4+off+256(dst); \
MOVL t, 15*4+off+320(dst); \
MOVL t, 10*4+off+384(dst); \
MOVL t, 7*4+off+448(dst); \
MOVL t, 5*4+off+512(dst); \
MOVL t, 9*4+off+576(dst); \
MOVL 10*4(src), t; \
MOVL t, 9*4+off+0(dst); \
MOVL t, 4*4+off+64(dst); \
MOVL t, 8*4+off+128(dst); \
MOVL t, 13*4+off+192(dst); \
MOVL t, 3*4+off+256(dst); \
MOVL t, 5*4+off+320(dst); \
MOVL t, 7*4+off+384(dst); \
MOVL t, 15*4+off+448(dst); \
MOVL t, 11*4+off+512(dst); \
MOVL t, 0*4+off+576(dst); \
MOVL 11*4(src), t; \
MOVL t, 13*4+off+0(dst); \
MOVL t, 10*4+off+64(dst); \
MOVL t, 0*4+off+128(dst); \
MOVL t, 3*4+off+192(dst); \
MOVL t, 9*4+off+256(dst); \
MOVL t, 6*4+off+320(dst); \
MOVL t, 15*4+off+384(dst); \
MOVL t, 4*4+off+448(dst); \
MOVL t, 2*4+off+512(dst); \
MOVL t, 12*4+off+576(dst); \
MOVL 12*4(src), t; \
MOVL t, 10*4+off+0(dst); \
MOVL t, 12*4+off+64(dst); \
MOVL t, 1*4+off+128(dst); \
MOVL t, 6*4+off+192(dst); \
MOVL t, 13*4+off+256(dst); \
MOVL t, 4*4+off+320(dst); \
MOVL t, 0*4+off+384(dst); \
MOVL t, 2*4+off+448(dst); \
MOVL t, 8*4+off+512(dst); \
MOVL t, 14*4+off+576(dst); \
MOVL 13*4(src), t; \
MOVL t, 14*4+off+0(dst); \
MOVL t, 3*4+off+64(dst); \
MOVL t, 7*4+off+128(dst); \
MOVL t, 2*4+off+192(dst); \
MOVL t, 15*4+off+256(dst); \
MOVL t, 12*4+off+320(dst); \
MOVL t, 6*4+off+384(dst); \
MOVL t, 0*4+off+448(dst); \
MOVL t, 9*4+off+512(dst); \
MOVL t, 11*4+off+576(dst); \
MOVL 14*4(src), t; \
MOVL t, 11*4+off+0(dst); \
MOVL t, 0*4+off+64(dst); \
MOVL t, 12*4+off+128(dst); \
MOVL t, 7*4+off+192(dst); \
MOVL t, 8*4+off+256(dst); \
MOVL t, 14*4+off+320(dst); \
MOVL t, 2*4+off+384(dst); \
MOVL t, 5*4+off+448(dst); \
MOVL t, 1*4+off+512(dst); \
MOVL t, 13*4+off+576(dst); \
MOVL 15*4(src), t; \
MOVL t, 15*4+off+0(dst); \
MOVL t, 6*4+off+64(dst); \
MOVL t, 3*4+off+128(dst); \
MOVL t, 11*4+off+192(dst); \
MOVL t, 7*4+off+256(dst); \
MOVL t, 10*4+off+320(dst); \
MOVL t, 5*4+off+384(dst); \
MOVL t, 9*4+off+448(dst); \
MOVL t, 4*4+off+512(dst); \
MOVL t, 8*4+off+576(dst)
// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
MOVL h+0(FP), AX
MOVL c+4(FP), BX
MOVL flag+8(FP), CX
MOVL blocks_base+12(FP), SI
MOVL blocks_len+16(FP), DX
MOVL SP, DI
ADDL $15, DI
ANDL $~15, DI
MOVL CX, 8(DI)
MOVL 0(BX), CX
MOVL CX, 0(DI)
MOVL 4(BX), CX
MOVL CX, 4(DI)
XORL CX, CX
MOVL CX, 12(DI)
MOVOU 0(AX), X0
MOVOU 16(AX), X1
MOVOU counter<>(SB), X2
loop:
MOVO X0, X4
MOVO X1, X5
MOVOU iv0<>(SB), X6
MOVOU iv1<>(SB), X7
MOVO 0(DI), X3
PADDQ X2, X3
PXOR X3, X7
MOVO X3, 0(DI)
PRECOMPUTE(DI, 16, SI, CX)
ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3)
ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3)
PXOR X4, X0
PXOR X5, X1
PXOR X6, X0
PXOR X7, X1
LEAL 64(SI), SI
SUBL $64, DX
JNE loop
MOVL 0(DI), CX
MOVL CX, 0(BX)
MOVL 4(DI), CX
MOVL CX, 4(BX)
MOVOU X0, 0(AX)
MOVOU X1, 16(AX)
RET
// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
MOVL h+0(FP), AX
MOVL c+4(FP), BX
MOVL flag+8(FP), CX
MOVL blocks_base+12(FP), SI
MOVL blocks_len+16(FP), DX
MOVL SP, DI
ADDL $15, DI
ANDL $~15, DI
MOVL CX, 8(DI)
MOVL 0(BX), CX
MOVL CX, 0(DI)
MOVL 4(BX), CX
MOVL CX, 4(DI)
XORL CX, CX
MOVL CX, 12(DI)
MOVOU 0(AX), X0
MOVOU 16(AX), X1
MOVOU counter<>(SB), X2
loop:
MOVO X0, 656(DI)
MOVO X1, 672(DI)
MOVO X0, X4
MOVO X1, X5
MOVOU iv0<>(SB), X6
MOVOU iv1<>(SB), X7
MOVO 0(DI), X3
PADDQ X2, X3
PXOR X3, X7
MOVO X3, 0(DI)
MOVOU rol16<>(SB), X0
MOVOU rol8<>(SB), X1
PRECOMPUTE(DI, 16, SI, CX)
ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1)
ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1)
MOVO 656(DI), X0
MOVO 672(DI), X1
PXOR X4, X0
PXOR X5, X1
PXOR X6, X0
PXOR X7, X1
LEAL 64(SI), SI
SUBL $64, DX
JNE loop
MOVL 0(DI), CX
MOVL CX, 0(BX)
MOVL 4(DI), CX
MOVL CX, 4(BX)
MOVOU X0, 0(AX)
MOVOU X1, 16(AX)
RET

37
vendor/golang.org/x/crypto/blake2s/blake2s_amd64.go generated vendored Normal file
View File

@@ -0,0 +1,37 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
package blake2s
import "golang.org/x/sys/cpu"
var (
useSSE4 = cpu.X86.HasSSE41
useSSSE3 = cpu.X86.HasSSSE3
useSSE2 = cpu.X86.HasSSE2
)
//go:noescape
func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//go:noescape
func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
//go:noescape
func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
switch {
case useSSE4:
hashBlocksSSE4(h, c, flag, blocks)
case useSSSE3:
hashBlocksSSSE3(h, c, flag, blocks)
case useSSE2:
hashBlocksSSE2(h, c, flag, blocks)
default:
hashBlocksGeneric(h, c, flag, blocks)
}
}

432
vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,432 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && gc && !purego
#include "textflag.h"
DATA iv0<>+0x00(SB)/4, $0x6a09e667
DATA iv0<>+0x04(SB)/4, $0xbb67ae85
DATA iv0<>+0x08(SB)/4, $0x3c6ef372
DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
GLOBL iv0<>(SB), (NOPTR+RODATA), $16
DATA iv1<>+0x00(SB)/4, $0x510e527f
DATA iv1<>+0x04(SB)/4, $0x9b05688c
DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
GLOBL iv1<>(SB), (NOPTR+RODATA), $16
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL rol16<>(SB), (NOPTR+RODATA), $16
DATA rol8<>+0x00(SB)/8, $0x0407060500030201
DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL rol8<>(SB), (NOPTR+RODATA), $16
DATA counter<>+0x00(SB)/8, $0x40
DATA counter<>+0x08(SB)/8, $0x0
GLOBL counter<>(SB), (NOPTR+RODATA), $16
#define ROTL_SSE2(n, t, v) \
MOVO v, t; \
PSLLL $n, t; \
PSRLL $(32-n), v; \
PXOR t, v
#define ROTL_SSSE3(c, v) \
PSHUFB c, v
#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
PADDL m0, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(16, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(20, t, v1); \
PADDL m1, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(24, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(25, t, v1); \
PSHUFL $0x39, v1, v1; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v3, v3; \
PADDL m2, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(16, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(20, t, v1); \
PADDL m3, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(24, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(25, t, v1); \
PSHUFL $0x39, v3, v3; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v1, v1
#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
PADDL m0, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSSE3(c16, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(20, t, v1); \
PADDL m1, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSSE3(c8, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(25, t, v1); \
PSHUFL $0x39, v1, v1; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v3, v3; \
PADDL m2, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSSE3(c16, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(20, t, v1); \
PADDL m3, v0; \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSSE3(c8, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(25, t, v1); \
PSHUFL $0x39, v3, v3; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v1, v1
#define LOAD_MSG_SSE4(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \
MOVL i0*4(src), m0; \
PINSRD $1, i1*4(src), m0; \
PINSRD $2, i2*4(src), m0; \
PINSRD $3, i3*4(src), m0; \
MOVL i4*4(src), m1; \
PINSRD $1, i5*4(src), m1; \
PINSRD $2, i6*4(src), m1; \
PINSRD $3, i7*4(src), m1; \
MOVL i8*4(src), m2; \
PINSRD $1, i9*4(src), m2; \
PINSRD $2, i10*4(src), m2; \
PINSRD $3, i11*4(src), m2; \
MOVL i12*4(src), m3; \
PINSRD $1, i13*4(src), m3; \
PINSRD $2, i14*4(src), m3; \
PINSRD $3, i15*4(src), m3
#define PRECOMPUTE_MSG(dst, off, src, R8, R9, R10, R11, R12, R13, R14, R15) \
MOVQ 0*4(src), R8; \
MOVQ 2*4(src), R9; \
MOVQ 4*4(src), R10; \
MOVQ 6*4(src), R11; \
MOVQ 8*4(src), R12; \
MOVQ 10*4(src), R13; \
MOVQ 12*4(src), R14; \
MOVQ 14*4(src), R15; \
\
MOVL R8, 0*4+off+0(dst); \
MOVL R8, 9*4+off+64(dst); \
MOVL R8, 5*4+off+128(dst); \
MOVL R8, 14*4+off+192(dst); \
MOVL R8, 4*4+off+256(dst); \
MOVL R8, 2*4+off+320(dst); \
MOVL R8, 8*4+off+384(dst); \
MOVL R8, 12*4+off+448(dst); \
MOVL R8, 3*4+off+512(dst); \
MOVL R8, 15*4+off+576(dst); \
SHRQ $32, R8; \
MOVL R8, 4*4+off+0(dst); \
MOVL R8, 8*4+off+64(dst); \
MOVL R8, 14*4+off+128(dst); \
MOVL R8, 5*4+off+192(dst); \
MOVL R8, 12*4+off+256(dst); \
MOVL R8, 11*4+off+320(dst); \
MOVL R8, 1*4+off+384(dst); \
MOVL R8, 6*4+off+448(dst); \
MOVL R8, 10*4+off+512(dst); \
MOVL R8, 3*4+off+576(dst); \
\
MOVL R9, 1*4+off+0(dst); \
MOVL R9, 13*4+off+64(dst); \
MOVL R9, 6*4+off+128(dst); \
MOVL R9, 8*4+off+192(dst); \
MOVL R9, 2*4+off+256(dst); \
MOVL R9, 0*4+off+320(dst); \
MOVL R9, 14*4+off+384(dst); \
MOVL R9, 11*4+off+448(dst); \
MOVL R9, 12*4+off+512(dst); \
MOVL R9, 4*4+off+576(dst); \
SHRQ $32, R9; \
MOVL R9, 5*4+off+0(dst); \
MOVL R9, 15*4+off+64(dst); \
MOVL R9, 9*4+off+128(dst); \
MOVL R9, 1*4+off+192(dst); \
MOVL R9, 11*4+off+256(dst); \
MOVL R9, 7*4+off+320(dst); \
MOVL R9, 13*4+off+384(dst); \
MOVL R9, 3*4+off+448(dst); \
MOVL R9, 6*4+off+512(dst); \
MOVL R9, 10*4+off+576(dst); \
\
MOVL R10, 2*4+off+0(dst); \
MOVL R10, 1*4+off+64(dst); \
MOVL R10, 15*4+off+128(dst); \
MOVL R10, 10*4+off+192(dst); \
MOVL R10, 6*4+off+256(dst); \
MOVL R10, 8*4+off+320(dst); \
MOVL R10, 3*4+off+384(dst); \
MOVL R10, 13*4+off+448(dst); \
MOVL R10, 14*4+off+512(dst); \
MOVL R10, 5*4+off+576(dst); \
SHRQ $32, R10; \
MOVL R10, 6*4+off+0(dst); \
MOVL R10, 11*4+off+64(dst); \
MOVL R10, 2*4+off+128(dst); \
MOVL R10, 9*4+off+192(dst); \
MOVL R10, 1*4+off+256(dst); \
MOVL R10, 13*4+off+320(dst); \
MOVL R10, 4*4+off+384(dst); \
MOVL R10, 8*4+off+448(dst); \
MOVL R10, 15*4+off+512(dst); \
MOVL R10, 7*4+off+576(dst); \
\
MOVL R11, 3*4+off+0(dst); \
MOVL R11, 7*4+off+64(dst); \
MOVL R11, 13*4+off+128(dst); \
MOVL R11, 12*4+off+192(dst); \
MOVL R11, 10*4+off+256(dst); \
MOVL R11, 1*4+off+320(dst); \
MOVL R11, 9*4+off+384(dst); \
MOVL R11, 14*4+off+448(dst); \
MOVL R11, 0*4+off+512(dst); \
MOVL R11, 6*4+off+576(dst); \
SHRQ $32, R11; \
MOVL R11, 7*4+off+0(dst); \
MOVL R11, 14*4+off+64(dst); \
MOVL R11, 10*4+off+128(dst); \
MOVL R11, 0*4+off+192(dst); \
MOVL R11, 5*4+off+256(dst); \
MOVL R11, 9*4+off+320(dst); \
MOVL R11, 12*4+off+384(dst); \
MOVL R11, 1*4+off+448(dst); \
MOVL R11, 13*4+off+512(dst); \
MOVL R11, 2*4+off+576(dst); \
\
MOVL R12, 8*4+off+0(dst); \
MOVL R12, 5*4+off+64(dst); \
MOVL R12, 4*4+off+128(dst); \
MOVL R12, 15*4+off+192(dst); \
MOVL R12, 14*4+off+256(dst); \
MOVL R12, 3*4+off+320(dst); \
MOVL R12, 11*4+off+384(dst); \
MOVL R12, 10*4+off+448(dst); \
MOVL R12, 7*4+off+512(dst); \
MOVL R12, 1*4+off+576(dst); \
SHRQ $32, R12; \
MOVL R12, 12*4+off+0(dst); \
MOVL R12, 2*4+off+64(dst); \
MOVL R12, 11*4+off+128(dst); \
MOVL R12, 4*4+off+192(dst); \
MOVL R12, 0*4+off+256(dst); \
MOVL R12, 15*4+off+320(dst); \
MOVL R12, 10*4+off+384(dst); \
MOVL R12, 7*4+off+448(dst); \
MOVL R12, 5*4+off+512(dst); \
MOVL R12, 9*4+off+576(dst); \
\
MOVL R13, 9*4+off+0(dst); \
MOVL R13, 4*4+off+64(dst); \
MOVL R13, 8*4+off+128(dst); \
MOVL R13, 13*4+off+192(dst); \
MOVL R13, 3*4+off+256(dst); \
MOVL R13, 5*4+off+320(dst); \
MOVL R13, 7*4+off+384(dst); \
MOVL R13, 15*4+off+448(dst); \
MOVL R13, 11*4+off+512(dst); \
MOVL R13, 0*4+off+576(dst); \
SHRQ $32, R13; \
MOVL R13, 13*4+off+0(dst); \
MOVL R13, 10*4+off+64(dst); \
MOVL R13, 0*4+off+128(dst); \
MOVL R13, 3*4+off+192(dst); \
MOVL R13, 9*4+off+256(dst); \
MOVL R13, 6*4+off+320(dst); \
MOVL R13, 15*4+off+384(dst); \
MOVL R13, 4*4+off+448(dst); \
MOVL R13, 2*4+off+512(dst); \
MOVL R13, 12*4+off+576(dst); \
\
MOVL R14, 10*4+off+0(dst); \
MOVL R14, 12*4+off+64(dst); \
MOVL R14, 1*4+off+128(dst); \
MOVL R14, 6*4+off+192(dst); \
MOVL R14, 13*4+off+256(dst); \
MOVL R14, 4*4+off+320(dst); \
MOVL R14, 0*4+off+384(dst); \
MOVL R14, 2*4+off+448(dst); \
MOVL R14, 8*4+off+512(dst); \
MOVL R14, 14*4+off+576(dst); \
SHRQ $32, R14; \
MOVL R14, 14*4+off+0(dst); \
MOVL R14, 3*4+off+64(dst); \
MOVL R14, 7*4+off+128(dst); \
MOVL R14, 2*4+off+192(dst); \
MOVL R14, 15*4+off+256(dst); \
MOVL R14, 12*4+off+320(dst); \
MOVL R14, 6*4+off+384(dst); \
MOVL R14, 0*4+off+448(dst); \
MOVL R14, 9*4+off+512(dst); \
MOVL R14, 11*4+off+576(dst); \
\
MOVL R15, 11*4+off+0(dst); \
MOVL R15, 0*4+off+64(dst); \
MOVL R15, 12*4+off+128(dst); \
MOVL R15, 7*4+off+192(dst); \
MOVL R15, 8*4+off+256(dst); \
MOVL R15, 14*4+off+320(dst); \
MOVL R15, 2*4+off+384(dst); \
MOVL R15, 5*4+off+448(dst); \
MOVL R15, 1*4+off+512(dst); \
MOVL R15, 13*4+off+576(dst); \
SHRQ $32, R15; \
MOVL R15, 15*4+off+0(dst); \
MOVL R15, 6*4+off+64(dst); \
MOVL R15, 3*4+off+128(dst); \
MOVL R15, 11*4+off+192(dst); \
MOVL R15, 7*4+off+256(dst); \
MOVL R15, 10*4+off+320(dst); \
MOVL R15, 5*4+off+384(dst); \
MOVL R15, 9*4+off+448(dst); \
MOVL R15, 4*4+off+512(dst); \
MOVL R15, 8*4+off+576(dst)
#define BLAKE2s_SSE2() \
PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \
ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \
ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8)
#define BLAKE2s_SSSE3() \
PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \
ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \
ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14)
#define BLAKE2s_SSE4() \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \
ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14)
#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \
MOVQ h, AX; \
MOVQ c, BX; \
MOVL flag, CX; \
MOVQ blocks_base, SI; \
MOVQ blocks_len, DX; \
\
MOVQ SP, BP; \
ADDQ $15, BP; \
ANDQ $~15, BP; \
\
MOVQ 0(BX), R9; \
MOVQ R9, 0(BP); \
MOVQ CX, 8(BP); \
\
MOVOU 0(AX), X0; \
MOVOU 16(AX), X1; \
MOVOU iv0<>(SB), X2; \
MOVOU iv1<>(SB), X3 \
\
MOVOU counter<>(SB), X12; \
MOVOU rol16<>(SB), X13; \
MOVOU rol8<>(SB), X14; \
MOVO 0(BP), X15; \
\
loop: \
MOVO X0, X4; \
MOVO X1, X5; \
MOVO X2, X6; \
MOVO X3, X7; \
\
PADDQ X12, X15; \
PXOR X15, X7; \
\
BLAKE2s_FUNC(); \
\
PXOR X4, X0; \
PXOR X5, X1; \
PXOR X6, X0; \
PXOR X7, X1; \
\
LEAQ 64(SI), SI; \
SUBQ $64, DX; \
JNE loop; \
\
MOVO X15, 0(BP); \
MOVQ 0(BP), R9; \
MOVQ R9, 0(BX); \
\
MOVOU X0, 0(AX); \
MOVOU X1, 16(AX)
// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment
HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2)
RET
// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
TEXT ·hashBlocksSSSE3(SB), 0, $672-48 // frame = 656 + 16 byte alignment
HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3)
RET
// func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
TEXT ·hashBlocksSSE4(SB), 0, $32-48 // frame = 16 + 16 byte alignment
HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4)
RET

178
vendor/golang.org/x/crypto/blake2s/blake2s_generic.go generated vendored Normal file
View File

@@ -0,0 +1,178 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2s
import (
"math/bits"
)
// the precomputed values for BLAKE2s
// there are 10 16-byte arrays - one for each round
// the entries are calculated from the sigma constants.
var precomputed = [10][16]byte{
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
}
func hashBlocksGeneric(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
var m [16]uint32
c0, c1 := c[0], c[1]
for i := 0; i < len(blocks); {
c0 += BlockSize
if c0 < BlockSize {
c1++
}
v0, v1, v2, v3, v4, v5, v6, v7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
v8, v9, v10, v11, v12, v13, v14, v15 := iv[0], iv[1], iv[2], iv[3], iv[4], iv[5], iv[6], iv[7]
v12 ^= c0
v13 ^= c1
v14 ^= flag
for j := range m {
m[j] = uint32(blocks[i]) | uint32(blocks[i+1])<<8 | uint32(blocks[i+2])<<16 | uint32(blocks[i+3])<<24
i += 4
}
for k := range precomputed {
s := &(precomputed[k])
v0 += m[s[0]]
v0 += v4
v12 ^= v0
v12 = bits.RotateLeft32(v12, -16)
v8 += v12
v4 ^= v8
v4 = bits.RotateLeft32(v4, -12)
v1 += m[s[1]]
v1 += v5
v13 ^= v1
v13 = bits.RotateLeft32(v13, -16)
v9 += v13
v5 ^= v9
v5 = bits.RotateLeft32(v5, -12)
v2 += m[s[2]]
v2 += v6
v14 ^= v2
v14 = bits.RotateLeft32(v14, -16)
v10 += v14
v6 ^= v10
v6 = bits.RotateLeft32(v6, -12)
v3 += m[s[3]]
v3 += v7
v15 ^= v3
v15 = bits.RotateLeft32(v15, -16)
v11 += v15
v7 ^= v11
v7 = bits.RotateLeft32(v7, -12)
v0 += m[s[4]]
v0 += v4
v12 ^= v0
v12 = bits.RotateLeft32(v12, -8)
v8 += v12
v4 ^= v8
v4 = bits.RotateLeft32(v4, -7)
v1 += m[s[5]]
v1 += v5
v13 ^= v1
v13 = bits.RotateLeft32(v13, -8)
v9 += v13
v5 ^= v9
v5 = bits.RotateLeft32(v5, -7)
v2 += m[s[6]]
v2 += v6
v14 ^= v2
v14 = bits.RotateLeft32(v14, -8)
v10 += v14
v6 ^= v10
v6 = bits.RotateLeft32(v6, -7)
v3 += m[s[7]]
v3 += v7
v15 ^= v3
v15 = bits.RotateLeft32(v15, -8)
v11 += v15
v7 ^= v11
v7 = bits.RotateLeft32(v7, -7)
v0 += m[s[8]]
v0 += v5
v15 ^= v0
v15 = bits.RotateLeft32(v15, -16)
v10 += v15
v5 ^= v10
v5 = bits.RotateLeft32(v5, -12)
v1 += m[s[9]]
v1 += v6
v12 ^= v1
v12 = bits.RotateLeft32(v12, -16)
v11 += v12
v6 ^= v11
v6 = bits.RotateLeft32(v6, -12)
v2 += m[s[10]]
v2 += v7
v13 ^= v2
v13 = bits.RotateLeft32(v13, -16)
v8 += v13
v7 ^= v8
v7 = bits.RotateLeft32(v7, -12)
v3 += m[s[11]]
v3 += v4
v14 ^= v3
v14 = bits.RotateLeft32(v14, -16)
v9 += v14
v4 ^= v9
v4 = bits.RotateLeft32(v4, -12)
v0 += m[s[12]]
v0 += v5
v15 ^= v0
v15 = bits.RotateLeft32(v15, -8)
v10 += v15
v5 ^= v10
v5 = bits.RotateLeft32(v5, -7)
v1 += m[s[13]]
v1 += v6
v12 ^= v1
v12 = bits.RotateLeft32(v12, -8)
v11 += v12
v6 ^= v11
v6 = bits.RotateLeft32(v6, -7)
v2 += m[s[14]]
v2 += v7
v13 ^= v2
v13 = bits.RotateLeft32(v13, -8)
v8 += v13
v7 ^= v8
v7 = bits.RotateLeft32(v7, -7)
v3 += m[s[15]]
v3 += v4
v14 ^= v3
v14 = bits.RotateLeft32(v14, -8)
v9 += v14
v4 ^= v9
v4 = bits.RotateLeft32(v4, -7)
}
h[0] ^= v0 ^ v8
h[1] ^= v1 ^ v9
h[2] ^= v2 ^ v10
h[3] ^= v3 ^ v11
h[4] ^= v4 ^ v12
h[5] ^= v5 ^ v13
h[6] ^= v6 ^ v14
h[7] ^= v7 ^ v15
}
c[0], c[1] = c0, c1
}

17
vendor/golang.org/x/crypto/blake2s/blake2s_ref.go generated vendored Normal file
View File

@@ -0,0 +1,17 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !386) || !gc || purego
package blake2s
var (
useSSE4 = false
useSSSE3 = false
useSSE2 = false
)
func hashBlocks(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte) {
hashBlocksGeneric(h, c, flag, blocks)
}

178
vendor/golang.org/x/crypto/blake2s/blake2x.go generated vendored Normal file
View File

@@ -0,0 +1,178 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blake2s
import (
"encoding/binary"
"errors"
"io"
)
// XOF defines the interface to hash functions that
// support arbitrary-length output.
type XOF interface {
// Write absorbs more data into the hash's state. It panics if called
// after Read.
io.Writer
// Read reads more output from the hash. It returns io.EOF if the limit
// has been reached.
io.Reader
// Clone returns a copy of the XOF in its current state.
Clone() XOF
// Reset resets the XOF to its initial state.
Reset()
}
// OutputLengthUnknown can be used as the size argument to NewXOF to indicate
// the length of the output is not known in advance.
const OutputLengthUnknown = 0
// magicUnknownOutputLength is a magic value for the output size that indicates
// an unknown number of output bytes.
const magicUnknownOutputLength = 65535
// maxOutputLength is the absolute maximum number of bytes to produce when the
// number of output bytes is unknown.
const maxOutputLength = (1 << 32) * 32
// NewXOF creates a new variable-output-length hash. The hash either produce a
// known number of bytes (1 <= size < 65535), or an unknown number of bytes
// (size == OutputLengthUnknown). In the latter case, an absolute limit of
// 128GiB applies.
//
// A non-nil key turns the hash into a MAC. The key must between
// zero and 32 bytes long.
func NewXOF(size uint16, key []byte) (XOF, error) {
if len(key) > Size {
return nil, errKeySize
}
if size == magicUnknownOutputLength {
// 2^16-1 indicates an unknown number of bytes and thus isn't a
// valid length.
return nil, errors.New("blake2s: XOF length too large")
}
if size == OutputLengthUnknown {
size = magicUnknownOutputLength
}
x := &xof{
d: digest{
size: Size,
keyLen: len(key),
},
length: size,
}
copy(x.d.key[:], key)
x.Reset()
return x, nil
}
type xof struct {
d digest
length uint16
remaining uint64
cfg, root, block [Size]byte
offset int
nodeOffset uint32
readMode bool
}
func (x *xof) Write(p []byte) (n int, err error) {
if x.readMode {
panic("blake2s: write to XOF after read")
}
return x.d.Write(p)
}
func (x *xof) Clone() XOF {
clone := *x
return &clone
}
func (x *xof) Reset() {
x.cfg[0] = byte(Size)
binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length
binary.LittleEndian.PutUint16(x.cfg[12:], x.length) // XOF length
x.cfg[15] = byte(Size) // inner hash size
x.d.Reset()
x.d.h[3] ^= uint32(x.length)
x.remaining = uint64(x.length)
if x.remaining == magicUnknownOutputLength {
x.remaining = maxOutputLength
}
x.offset, x.nodeOffset = 0, 0
x.readMode = false
}
func (x *xof) Read(p []byte) (n int, err error) {
if !x.readMode {
x.d.finalize(&x.root)
x.readMode = true
}
if x.remaining == 0 {
return 0, io.EOF
}
n = len(p)
if uint64(n) > x.remaining {
n = int(x.remaining)
p = p[:n]
}
if x.offset > 0 {
blockRemaining := Size - x.offset
if n < blockRemaining {
x.offset += copy(p, x.block[x.offset:])
x.remaining -= uint64(n)
return
}
copy(p, x.block[x.offset:])
p = p[blockRemaining:]
x.offset = 0
x.remaining -= uint64(blockRemaining)
}
for len(p) >= Size {
binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
x.nodeOffset++
x.d.initConfig(&x.cfg)
x.d.Write(x.root[:])
x.d.finalize(&x.block)
copy(p, x.block[:])
p = p[Size:]
x.remaining -= uint64(Size)
}
if todo := len(p); todo > 0 {
if x.remaining < uint64(Size) {
x.cfg[0] = byte(x.remaining)
}
binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
x.nodeOffset++
x.d.initConfig(&x.cfg)
x.d.Write(x.root[:])
x.d.finalize(&x.block)
x.offset = copy(p, x.block[:todo])
x.remaining -= uint64(todo)
}
return
}
func (d *digest) initConfig(cfg *[Size]byte) {
d.offset, d.c[0], d.c[1] = 0, 0, 0
for i := range d.h {
d.h[i] = iv[i] ^ binary.LittleEndian.Uint32(cfg[i*4:])
}
}

159
vendor/golang.org/x/crypto/blowfish/block.go generated vendored Normal file
View File

@@ -0,0 +1,159 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blowfish
// getNextWord returns the next big-endian uint32 value from the byte slice
// at the given position in a circular manner, updating the position.
func getNextWord(b []byte, pos *int) uint32 {
var w uint32
j := *pos
for i := 0; i < 4; i++ {
w = w<<8 | uint32(b[j])
j++
if j >= len(b) {
j = 0
}
}
*pos = j
return w
}
// ExpandKey performs a key expansion on the given *Cipher. Specifically, it
// performs the Blowfish algorithm's key schedule which sets up the *Cipher's
// pi and substitution tables for calls to Encrypt. This is used, primarily,
// by the bcrypt package to reuse the Blowfish key schedule during its
// set up. It's unlikely that you need to use this directly.
func ExpandKey(key []byte, c *Cipher) {
j := 0
for i := 0; i < 18; i++ {
// Using inlined getNextWord for performance.
var d uint32
for k := 0; k < 4; k++ {
d = d<<8 | uint32(key[j])
j++
if j >= len(key) {
j = 0
}
}
c.p[i] ^= d
}
var l, r uint32
for i := 0; i < 18; i += 2 {
l, r = encryptBlock(l, r, c)
c.p[i], c.p[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l, r = encryptBlock(l, r, c)
c.s0[i], c.s0[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l, r = encryptBlock(l, r, c)
c.s1[i], c.s1[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l, r = encryptBlock(l, r, c)
c.s2[i], c.s2[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l, r = encryptBlock(l, r, c)
c.s3[i], c.s3[i+1] = l, r
}
}
// This is similar to ExpandKey, but folds the salt during the key
// schedule. While ExpandKey is essentially expandKeyWithSalt with an all-zero
// salt passed in, reusing ExpandKey turns out to be a place of inefficiency
// and specializing it here is useful.
func expandKeyWithSalt(key []byte, salt []byte, c *Cipher) {
j := 0
for i := 0; i < 18; i++ {
c.p[i] ^= getNextWord(key, &j)
}
j = 0
var l, r uint32
for i := 0; i < 18; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.p[i], c.p[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.s0[i], c.s0[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.s1[i], c.s1[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.s2[i], c.s2[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.s3[i], c.s3[i+1] = l, r
}
}
func encryptBlock(l, r uint32, c *Cipher) (uint32, uint32) {
xl, xr := l, r
xl ^= c.p[0]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[1]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[2]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[3]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[4]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[5]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[6]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[7]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[8]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[9]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[10]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[11]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[12]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[13]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[14]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[15]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[16]
xr ^= c.p[17]
return xr, xl
}
func decryptBlock(l, r uint32, c *Cipher) (uint32, uint32) {
xl, xr := l, r
xl ^= c.p[17]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[16]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[15]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[14]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[13]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[12]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[11]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[10]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[9]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[8]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[7]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[6]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[5]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[4]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[3]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[2]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[1]
xr ^= c.p[0]
return xr, xl
}

99
vendor/golang.org/x/crypto/blowfish/cipher.go generated vendored Normal file
View File

@@ -0,0 +1,99 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package blowfish implements Bruce Schneier's Blowfish encryption algorithm.
//
// Blowfish is a legacy cipher and its short block size makes it vulnerable to
// birthday bound attacks (see https://sweet32.info). It should only be used
// where compatibility with legacy systems, not security, is the goal.
//
// Deprecated: any new system should use AES (from crypto/aes, if necessary in
// an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from
// golang.org/x/crypto/chacha20poly1305).
package blowfish
// The code is a port of Bruce Schneier's C implementation.
// See https://www.schneier.com/blowfish.html.
import "strconv"
// The Blowfish block size in bytes.
const BlockSize = 8
// A Cipher is an instance of Blowfish encryption using a particular key.
type Cipher struct {
p [18]uint32
s0, s1, s2, s3 [256]uint32
}
type KeySizeError int
func (k KeySizeError) Error() string {
return "crypto/blowfish: invalid key size " + strconv.Itoa(int(k))
}
// NewCipher creates and returns a Cipher.
// The key argument should be the Blowfish key, from 1 to 56 bytes.
func NewCipher(key []byte) (*Cipher, error) {
var result Cipher
if k := len(key); k < 1 || k > 56 {
return nil, KeySizeError(k)
}
initCipher(&result)
ExpandKey(key, &result)
return &result, nil
}
// NewSaltedCipher creates a returns a Cipher that folds a salt into its key
// schedule. For most purposes, NewCipher, instead of NewSaltedCipher, is
// sufficient and desirable. For bcrypt compatibility, the key can be over 56
// bytes.
func NewSaltedCipher(key, salt []byte) (*Cipher, error) {
if len(salt) == 0 {
return NewCipher(key)
}
var result Cipher
if k := len(key); k < 1 {
return nil, KeySizeError(k)
}
initCipher(&result)
expandKeyWithSalt(key, salt, &result)
return &result, nil
}
// BlockSize returns the Blowfish block size, 8 bytes.
// It is necessary to satisfy the Block interface in the
// package "crypto/cipher".
func (c *Cipher) BlockSize() int { return BlockSize }
// Encrypt encrypts the 8-byte buffer src using the key k
// and stores the result in dst.
// Note that for amounts of data larger than a block,
// it is not safe to just call Encrypt on successive blocks;
// instead, use an encryption mode like CBC (see crypto/cipher/cbc.go).
func (c *Cipher) Encrypt(dst, src []byte) {
l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
l, r = encryptBlock(l, r, c)
dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l)
dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r)
}
// Decrypt decrypts the 8-byte buffer src using the key k
// and stores the result in dst.
func (c *Cipher) Decrypt(dst, src []byte) {
l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
l, r = decryptBlock(l, r, c)
dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l)
dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r)
}
func initCipher(c *Cipher) {
copy(c.p[0:], p[0:])
copy(c.s0[0:], s0[0:])
copy(c.s1[0:], s1[0:])
copy(c.s2[0:], s2[0:])
copy(c.s3[0:], s3[0:])
}

199
vendor/golang.org/x/crypto/blowfish/const.go generated vendored Normal file
View File

@@ -0,0 +1,199 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The startup permutation array and substitution boxes.
// They are the hexadecimal digits of PI; see:
// https://www.schneier.com/code/constants.txt.
package blowfish
var s0 = [256]uint32{
0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7, 0xb8e1afed, 0x6a267e96,
0xba7c9045, 0xf12c7f99, 0x24a19947, 0xb3916cf7, 0x0801f2e2, 0x858efc16,
0x636920d8, 0x71574e69, 0xa458fea3, 0xf4933d7e, 0x0d95748f, 0x728eb658,
0x718bcd58, 0x82154aee, 0x7b54a41d, 0xc25a59b5, 0x9c30d539, 0x2af26013,
0xc5d1b023, 0x286085f0, 0xca417918, 0xb8db38ef, 0x8e79dcb0, 0x603a180e,
0x6c9e0e8b, 0xb01e8a3e, 0xd71577c1, 0xbd314b27, 0x78af2fda, 0x55605c60,
0xe65525f3, 0xaa55ab94, 0x57489862, 0x63e81440, 0x55ca396a, 0x2aab10b6,
0xb4cc5c34, 0x1141e8ce, 0xa15486af, 0x7c72e993, 0xb3ee1411, 0x636fbc2a,
0x2ba9c55d, 0x741831f6, 0xce5c3e16, 0x9b87931e, 0xafd6ba33, 0x6c24cf5c,
0x7a325381, 0x28958677, 0x3b8f4898, 0x6b4bb9af, 0xc4bfe81b, 0x66282193,
0x61d809cc, 0xfb21a991, 0x487cac60, 0x5dec8032, 0xef845d5d, 0xe98575b1,
0xdc262302, 0xeb651b88, 0x23893e81, 0xd396acc5, 0x0f6d6ff3, 0x83f44239,
0x2e0b4482, 0xa4842004, 0x69c8f04a, 0x9e1f9b5e, 0x21c66842, 0xf6e96c9a,
0x670c9c61, 0xabd388f0, 0x6a51a0d2, 0xd8542f68, 0x960fa728, 0xab5133a3,
0x6eef0b6c, 0x137a3be4, 0xba3bf050, 0x7efb2a98, 0xa1f1651d, 0x39af0176,
0x66ca593e, 0x82430e88, 0x8cee8619, 0x456f9fb4, 0x7d84a5c3, 0x3b8b5ebe,
0xe06f75d8, 0x85c12073, 0x401a449f, 0x56c16aa6, 0x4ed3aa62, 0x363f7706,
0x1bfedf72, 0x429b023d, 0x37d0d724, 0xd00a1248, 0xdb0fead3, 0x49f1c09b,
0x075372c9, 0x80991b7b, 0x25d479d8, 0xf6e8def7, 0xe3fe501a, 0xb6794c3b,
0x976ce0bd, 0x04c006ba, 0xc1a94fb6, 0x409f60c4, 0x5e5c9ec2, 0x196a2463,
0x68fb6faf, 0x3e6c53b5, 0x1339b2eb, 0x3b52ec6f, 0x6dfc511f, 0x9b30952c,
0xcc814544, 0xaf5ebd09, 0xbee3d004, 0xde334afd, 0x660f2807, 0x192e4bb3,
0xc0cba857, 0x45c8740f, 0xd20b5f39, 0xb9d3fbdb, 0x5579c0bd, 0x1a60320a,
0xd6a100c6, 0x402c7279, 0x679f25fe, 0xfb1fa3cc, 0x8ea5e9f8, 0xdb3222f8,
0x3c7516df, 0xfd616b15, 0x2f501ec8, 0xad0552ab, 0x323db5fa, 0xfd238760,
0x53317b48, 0x3e00df82, 0x9e5c57bb, 0xca6f8ca0, 0x1a87562e, 0xdf1769db,
0xd542a8f6, 0x287effc3, 0xac6732c6, 0x8c4f5573, 0x695b27b0, 0xbbca58c8,
0xe1ffa35d, 0xb8f011a0, 0x10fa3d98, 0xfd2183b8, 0x4afcb56c, 0x2dd1d35b,
0x9a53e479, 0xb6f84565, 0xd28e49bc, 0x4bfb9790, 0xe1ddf2da, 0xa4cb7e33,
0x62fb1341, 0xcee4c6e8, 0xef20cada, 0x36774c01, 0xd07e9efe, 0x2bf11fb4,
0x95dbda4d, 0xae909198, 0xeaad8e71, 0x6b93d5a0, 0xd08ed1d0, 0xafc725e0,
0x8e3c5b2f, 0x8e7594b7, 0x8ff6e2fb, 0xf2122b64, 0x8888b812, 0x900df01c,
0x4fad5ea0, 0x688fc31c, 0xd1cff191, 0xb3a8c1ad, 0x2f2f2218, 0xbe0e1777,
0xea752dfe, 0x8b021fa1, 0xe5a0cc0f, 0xb56f74e8, 0x18acf3d6, 0xce89e299,
0xb4a84fe0, 0xfd13e0b7, 0x7cc43b81, 0xd2ada8d9, 0x165fa266, 0x80957705,
0x93cc7314, 0x211a1477, 0xe6ad2065, 0x77b5fa86, 0xc75442f5, 0xfb9d35cf,
0xebcdaf0c, 0x7b3e89a0, 0xd6411bd3, 0xae1e7e49, 0x00250e2d, 0x2071b35e,
0x226800bb, 0x57b8e0af, 0x2464369b, 0xf009b91e, 0x5563911d, 0x59dfa6aa,
0x78c14389, 0xd95a537f, 0x207d5ba2, 0x02e5b9c5, 0x83260376, 0x6295cfa9,
0x11c81968, 0x4e734a41, 0xb3472dca, 0x7b14a94a, 0x1b510052, 0x9a532915,
0xd60f573f, 0xbc9bc6e4, 0x2b60a476, 0x81e67400, 0x08ba6fb5, 0x571be91f,
0xf296ec6b, 0x2a0dd915, 0xb6636521, 0xe7b9f9b6, 0xff34052e, 0xc5855664,
0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a,
}
var s1 = [256]uint32{
0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623, 0xad6ea6b0, 0x49a7df7d,
0x9cee60b8, 0x8fedb266, 0xecaa8c71, 0x699a17ff, 0x5664526c, 0xc2b19ee1,
0x193602a5, 0x75094c29, 0xa0591340, 0xe4183a3e, 0x3f54989a, 0x5b429d65,
0x6b8fe4d6, 0x99f73fd6, 0xa1d29c07, 0xefe830f5, 0x4d2d38e6, 0xf0255dc1,
0x4cdd2086, 0x8470eb26, 0x6382e9c6, 0x021ecc5e, 0x09686b3f, 0x3ebaefc9,
0x3c971814, 0x6b6a70a1, 0x687f3584, 0x52a0e286, 0xb79c5305, 0xaa500737,
0x3e07841c, 0x7fdeae5c, 0x8e7d44ec, 0x5716f2b8, 0xb03ada37, 0xf0500c0d,
0xf01c1f04, 0x0200b3ff, 0xae0cf51a, 0x3cb574b2, 0x25837a58, 0xdc0921bd,
0xd19113f9, 0x7ca92ff6, 0x94324773, 0x22f54701, 0x3ae5e581, 0x37c2dadc,
0xc8b57634, 0x9af3dda7, 0xa9446146, 0x0fd0030e, 0xecc8c73e, 0xa4751e41,
0xe238cd99, 0x3bea0e2f, 0x3280bba1, 0x183eb331, 0x4e548b38, 0x4f6db908,
0x6f420d03, 0xf60a04bf, 0x2cb81290, 0x24977c79, 0x5679b072, 0xbcaf89af,
0xde9a771f, 0xd9930810, 0xb38bae12, 0xdccf3f2e, 0x5512721f, 0x2e6b7124,
0x501adde6, 0x9f84cd87, 0x7a584718, 0x7408da17, 0xbc9f9abc, 0xe94b7d8c,
0xec7aec3a, 0xdb851dfa, 0x63094366, 0xc464c3d2, 0xef1c1847, 0x3215d908,
0xdd433b37, 0x24c2ba16, 0x12a14d43, 0x2a65c451, 0x50940002, 0x133ae4dd,
0x71dff89e, 0x10314e55, 0x81ac77d6, 0x5f11199b, 0x043556f1, 0xd7a3c76b,
0x3c11183b, 0x5924a509, 0xf28fe6ed, 0x97f1fbfa, 0x9ebabf2c, 0x1e153c6e,
0x86e34570, 0xeae96fb1, 0x860e5e0a, 0x5a3e2ab3, 0x771fe71c, 0x4e3d06fa,
0x2965dcb9, 0x99e71d0f, 0x803e89d6, 0x5266c825, 0x2e4cc978, 0x9c10b36a,
0xc6150eba, 0x94e2ea78, 0xa5fc3c53, 0x1e0a2df4, 0xf2f74ea7, 0x361d2b3d,
0x1939260f, 0x19c27960, 0x5223a708, 0xf71312b6, 0xebadfe6e, 0xeac31f66,
0xe3bc4595, 0xa67bc883, 0xb17f37d1, 0x018cff28, 0xc332ddef, 0xbe6c5aa5,
0x65582185, 0x68ab9802, 0xeecea50f, 0xdb2f953b, 0x2aef7dad, 0x5b6e2f84,
0x1521b628, 0x29076170, 0xecdd4775, 0x619f1510, 0x13cca830, 0xeb61bd96,
0x0334fe1e, 0xaa0363cf, 0xb5735c90, 0x4c70a239, 0xd59e9e0b, 0xcbaade14,
0xeecc86bc, 0x60622ca7, 0x9cab5cab, 0xb2f3846e, 0x648b1eaf, 0x19bdf0ca,
0xa02369b9, 0x655abb50, 0x40685a32, 0x3c2ab4b3, 0x319ee9d5, 0xc021b8f7,
0x9b540b19, 0x875fa099, 0x95f7997e, 0x623d7da8, 0xf837889a, 0x97e32d77,
0x11ed935f, 0x16681281, 0x0e358829, 0xc7e61fd6, 0x96dedfa1, 0x7858ba99,
0x57f584a5, 0x1b227263, 0x9b83c3ff, 0x1ac24696, 0xcdb30aeb, 0x532e3054,
0x8fd948e4, 0x6dbc3128, 0x58ebf2ef, 0x34c6ffea, 0xfe28ed61, 0xee7c3c73,
0x5d4a14d9, 0xe864b7e3, 0x42105d14, 0x203e13e0, 0x45eee2b6, 0xa3aaabea,
0xdb6c4f15, 0xfacb4fd0, 0xc742f442, 0xef6abbb5, 0x654f3b1d, 0x41cd2105,
0xd81e799e, 0x86854dc7, 0xe44b476a, 0x3d816250, 0xcf62a1f2, 0x5b8d2646,
0xfc8883a0, 0xc1c7b6a3, 0x7f1524c3, 0x69cb7492, 0x47848a0b, 0x5692b285,
0x095bbf00, 0xad19489d, 0x1462b174, 0x23820e00, 0x58428d2a, 0x0c55f5ea,
0x1dadf43e, 0x233f7061, 0x3372f092, 0x8d937e41, 0xd65fecf1, 0x6c223bdb,
0x7cde3759, 0xcbee7460, 0x4085f2a7, 0xce77326e, 0xa6078084, 0x19f8509e,
0xe8efd855, 0x61d99735, 0xa969a7aa, 0xc50c06c2, 0x5a04abfc, 0x800bcadc,
0x9e447a2e, 0xc3453484, 0xfdd56705, 0x0e1e9ec9, 0xdb73dbd3, 0x105588cd,
0x675fda79, 0xe3674340, 0xc5c43465, 0x713e38d8, 0x3d28f89e, 0xf16dff20,
0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7,
}
var s2 = [256]uint32{
0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934, 0x411520f7, 0x7602d4f7,
0xbcf46b2e, 0xd4a20068, 0xd4082471, 0x3320f46a, 0x43b7d4b7, 0x500061af,
0x1e39f62e, 0x97244546, 0x14214f74, 0xbf8b8840, 0x4d95fc1d, 0x96b591af,
0x70f4ddd3, 0x66a02f45, 0xbfbc09ec, 0x03bd9785, 0x7fac6dd0, 0x31cb8504,
0x96eb27b3, 0x55fd3941, 0xda2547e6, 0xabca0a9a, 0x28507825, 0x530429f4,
0x0a2c86da, 0xe9b66dfb, 0x68dc1462, 0xd7486900, 0x680ec0a4, 0x27a18dee,
0x4f3ffea2, 0xe887ad8c, 0xb58ce006, 0x7af4d6b6, 0xaace1e7c, 0xd3375fec,
0xce78a399, 0x406b2a42, 0x20fe9e35, 0xd9f385b9, 0xee39d7ab, 0x3b124e8b,
0x1dc9faf7, 0x4b6d1856, 0x26a36631, 0xeae397b2, 0x3a6efa74, 0xdd5b4332,
0x6841e7f7, 0xca7820fb, 0xfb0af54e, 0xd8feb397, 0x454056ac, 0xba489527,
0x55533a3a, 0x20838d87, 0xfe6ba9b7, 0xd096954b, 0x55a867bc, 0xa1159a58,
0xcca92963, 0x99e1db33, 0xa62a4a56, 0x3f3125f9, 0x5ef47e1c, 0x9029317c,
0xfdf8e802, 0x04272f70, 0x80bb155c, 0x05282ce3, 0x95c11548, 0xe4c66d22,
0x48c1133f, 0xc70f86dc, 0x07f9c9ee, 0x41041f0f, 0x404779a4, 0x5d886e17,
0x325f51eb, 0xd59bc0d1, 0xf2bcc18f, 0x41113564, 0x257b7834, 0x602a9c60,
0xdff8e8a3, 0x1f636c1b, 0x0e12b4c2, 0x02e1329e, 0xaf664fd1, 0xcad18115,
0x6b2395e0, 0x333e92e1, 0x3b240b62, 0xeebeb922, 0x85b2a20e, 0xe6ba0d99,
0xde720c8c, 0x2da2f728, 0xd0127845, 0x95b794fd, 0x647d0862, 0xe7ccf5f0,
0x5449a36f, 0x877d48fa, 0xc39dfd27, 0xf33e8d1e, 0x0a476341, 0x992eff74,
0x3a6f6eab, 0xf4f8fd37, 0xa812dc60, 0xa1ebddf8, 0x991be14c, 0xdb6e6b0d,
0xc67b5510, 0x6d672c37, 0x2765d43b, 0xdcd0e804, 0xf1290dc7, 0xcc00ffa3,
0xb5390f92, 0x690fed0b, 0x667b9ffb, 0xcedb7d9c, 0xa091cf0b, 0xd9155ea3,
0xbb132f88, 0x515bad24, 0x7b9479bf, 0x763bd6eb, 0x37392eb3, 0xcc115979,
0x8026e297, 0xf42e312d, 0x6842ada7, 0xc66a2b3b, 0x12754ccc, 0x782ef11c,
0x6a124237, 0xb79251e7, 0x06a1bbe6, 0x4bfb6350, 0x1a6b1018, 0x11caedfa,
0x3d25bdd8, 0xe2e1c3c9, 0x44421659, 0x0a121386, 0xd90cec6e, 0xd5abea2a,
0x64af674e, 0xda86a85f, 0xbebfe988, 0x64e4c3fe, 0x9dbc8057, 0xf0f7c086,
0x60787bf8, 0x6003604d, 0xd1fd8346, 0xf6381fb0, 0x7745ae04, 0xd736fccc,
0x83426b33, 0xf01eab71, 0xb0804187, 0x3c005e5f, 0x77a057be, 0xbde8ae24,
0x55464299, 0xbf582e61, 0x4e58f48f, 0xf2ddfda2, 0xf474ef38, 0x8789bdc2,
0x5366f9c3, 0xc8b38e74, 0xb475f255, 0x46fcd9b9, 0x7aeb2661, 0x8b1ddf84,
0x846a0e79, 0x915f95e2, 0x466e598e, 0x20b45770, 0x8cd55591, 0xc902de4c,
0xb90bace1, 0xbb8205d0, 0x11a86248, 0x7574a99e, 0xb77f19b6, 0xe0a9dc09,
0x662d09a1, 0xc4324633, 0xe85a1f02, 0x09f0be8c, 0x4a99a025, 0x1d6efe10,
0x1ab93d1d, 0x0ba5a4df, 0xa186f20f, 0x2868f169, 0xdcb7da83, 0x573906fe,
0xa1e2ce9b, 0x4fcd7f52, 0x50115e01, 0xa70683fa, 0xa002b5c4, 0x0de6d027,
0x9af88c27, 0x773f8641, 0xc3604c06, 0x61a806b5, 0xf0177a28, 0xc0f586e0,
0x006058aa, 0x30dc7d62, 0x11e69ed7, 0x2338ea63, 0x53c2dd94, 0xc2c21634,
0xbbcbee56, 0x90bcb6de, 0xebfc7da1, 0xce591d76, 0x6f05e409, 0x4b7c0188,
0x39720a3d, 0x7c927c24, 0x86e3725f, 0x724d9db9, 0x1ac15bb4, 0xd39eb8fc,
0xed545578, 0x08fca5b5, 0xd83d7cd3, 0x4dad0fc4, 0x1e50ef5e, 0xb161e6f8,
0xa28514d9, 0x6c51133c, 0x6fd5c7e7, 0x56e14ec4, 0x362abfce, 0xddc6c837,
0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0,
}
var s3 = [256]uint32{
0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b, 0x5cb0679e, 0x4fa33742,
0xd3822740, 0x99bc9bbe, 0xd5118e9d, 0xbf0f7315, 0xd62d1c7e, 0xc700c47b,
0xb78c1b6b, 0x21a19045, 0xb26eb1be, 0x6a366eb4, 0x5748ab2f, 0xbc946e79,
0xc6a376d2, 0x6549c2c8, 0x530ff8ee, 0x468dde7d, 0xd5730a1d, 0x4cd04dc6,
0x2939bbdb, 0xa9ba4650, 0xac9526e8, 0xbe5ee304, 0xa1fad5f0, 0x6a2d519a,
0x63ef8ce2, 0x9a86ee22, 0xc089c2b8, 0x43242ef6, 0xa51e03aa, 0x9cf2d0a4,
0x83c061ba, 0x9be96a4d, 0x8fe51550, 0xba645bd6, 0x2826a2f9, 0xa73a3ae1,
0x4ba99586, 0xef5562e9, 0xc72fefd3, 0xf752f7da, 0x3f046f69, 0x77fa0a59,
0x80e4a915, 0x87b08601, 0x9b09e6ad, 0x3b3ee593, 0xe990fd5a, 0x9e34d797,
0x2cf0b7d9, 0x022b8b51, 0x96d5ac3a, 0x017da67d, 0xd1cf3ed6, 0x7c7d2d28,
0x1f9f25cf, 0xadf2b89b, 0x5ad6b472, 0x5a88f54c, 0xe029ac71, 0xe019a5e6,
0x47b0acfd, 0xed93fa9b, 0xe8d3c48d, 0x283b57cc, 0xf8d56629, 0x79132e28,
0x785f0191, 0xed756055, 0xf7960e44, 0xe3d35e8c, 0x15056dd4, 0x88f46dba,
0x03a16125, 0x0564f0bd, 0xc3eb9e15, 0x3c9057a2, 0x97271aec, 0xa93a072a,
0x1b3f6d9b, 0x1e6321f5, 0xf59c66fb, 0x26dcf319, 0x7533d928, 0xb155fdf5,
0x03563482, 0x8aba3cbb, 0x28517711, 0xc20ad9f8, 0xabcc5167, 0xccad925f,
0x4de81751, 0x3830dc8e, 0x379d5862, 0x9320f991, 0xea7a90c2, 0xfb3e7bce,
0x5121ce64, 0x774fbe32, 0xa8b6e37e, 0xc3293d46, 0x48de5369, 0x6413e680,
0xa2ae0810, 0xdd6db224, 0x69852dfd, 0x09072166, 0xb39a460a, 0x6445c0dd,
0x586cdecf, 0x1c20c8ae, 0x5bbef7dd, 0x1b588d40, 0xccd2017f, 0x6bb4e3bb,
0xdda26a7e, 0x3a59ff45, 0x3e350a44, 0xbcb4cdd5, 0x72eacea8, 0xfa6484bb,
0x8d6612ae, 0xbf3c6f47, 0xd29be463, 0x542f5d9e, 0xaec2771b, 0xf64e6370,
0x740e0d8d, 0xe75b1357, 0xf8721671, 0xaf537d5d, 0x4040cb08, 0x4eb4e2cc,
0x34d2466a, 0x0115af84, 0xe1b00428, 0x95983a1d, 0x06b89fb4, 0xce6ea048,
0x6f3f3b82, 0x3520ab82, 0x011a1d4b, 0x277227f8, 0x611560b1, 0xe7933fdc,
0xbb3a792b, 0x344525bd, 0xa08839e1, 0x51ce794b, 0x2f32c9b7, 0xa01fbac9,
0xe01cc87e, 0xbcc7d1f6, 0xcf0111c3, 0xa1e8aac7, 0x1a908749, 0xd44fbd9a,
0xd0dadecb, 0xd50ada38, 0x0339c32a, 0xc6913667, 0x8df9317c, 0xe0b12b4f,
0xf79e59b7, 0x43f5bb3a, 0xf2d519ff, 0x27d9459c, 0xbf97222c, 0x15e6fc2a,
0x0f91fc71, 0x9b941525, 0xfae59361, 0xceb69ceb, 0xc2a86459, 0x12baa8d1,
0xb6c1075e, 0xe3056a0c, 0x10d25065, 0xcb03a442, 0xe0ec6e0e, 0x1698db3b,
0x4c98a0be, 0x3278e964, 0x9f1f9532, 0xe0d392df, 0xd3a0342b, 0x8971f21e,
0x1b0a7441, 0x4ba3348c, 0xc5be7120, 0xc37632d8, 0xdf359f8d, 0x9b992f2e,
0xe60b6f47, 0x0fe3f11d, 0xe54cda54, 0x1edad891, 0xce6279cf, 0xcd3e7e6f,
0x1618b166, 0xfd2c1d05, 0x848fd2c5, 0xf6fb2299, 0xf523f357, 0xa6327623,
0x93a83531, 0x56cccd02, 0xacf08162, 0x5a75ebb5, 0x6e163697, 0x88d273cc,
0xde966292, 0x81b949d0, 0x4c50901b, 0x71c65614, 0xe6c6c7bd, 0x327a140a,
0x45e1d006, 0xc3f27b9a, 0xc9aa53fd, 0x62a80f00, 0xbb25bfe2, 0x35bdd2f6,
0x71126905, 0xb2040222, 0xb6cbcf7c, 0xcd769c2b, 0x53113ec0, 0x1640e3d3,
0x38abbd60, 0x2547adf0, 0xba38209c, 0xf746ce76, 0x77afa1c5, 0x20756060,
0x85cbfe4e, 0x8ae88dd8, 0x7aaaf9b0, 0x4cf9aa7e, 0x1948c25c, 0x02fb8a8c,
0x01c36ae4, 0xd6ebe1f9, 0x90d4f869, 0xa65cdea0, 0x3f09252d, 0xc208e69f,
0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6,
}
var p = [18]uint32{
0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, 0xa4093822, 0x299f31d0,
0x082efa98, 0xec4e6c89, 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917, 0x9216d5d9, 0x8979fb1b,
}

16
vendor/golang.org/x/crypto/chacha20/chacha_arm64.go generated vendored Normal file
View File

@@ -0,0 +1,16 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
package chacha20
const bufSize = 256
//go:noescape
func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
}

307
vendor/golang.org/x/crypto/chacha20/chacha_arm64.s generated vendored Normal file
View File

@@ -0,0 +1,307 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
#include "textflag.h"
#define NUM_ROUNDS 10
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
MOVD dst+0(FP), R1
MOVD src+24(FP), R2
MOVD src_len+32(FP), R3
MOVD key+48(FP), R4
MOVD nonce+56(FP), R6
MOVD counter+64(FP), R7
MOVD $·constants(SB), R10
MOVD $·incRotMatrix(SB), R11
MOVW (R7), R20
AND $~255, R3, R13
ADD R2, R13, R12 // R12 for block end
AND $255, R3, R13
loop:
MOVD $NUM_ROUNDS, R21
VLD1 (R11), [V30.S4, V31.S4]
// load contants
// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
WORD $0x4D60E940
// load keys
// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
WORD $0x4DFFE884
// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
WORD $0x4DFFE888
SUB $32, R4
// load counter + nonce
// VLD1R (R7), [V12.S4]
WORD $0x4D40C8EC
// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
WORD $0x4D40E8CD
// update counter
VADD V30.S4, V12.S4, V12.S4
chacha:
// V0..V3 += V4..V7
// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
VADD V0.S4, V4.S4, V0.S4
VADD V1.S4, V5.S4, V1.S4
VADD V2.S4, V6.S4, V2.S4
VADD V3.S4, V7.S4, V3.S4
VEOR V12.B16, V0.B16, V12.B16
VEOR V13.B16, V1.B16, V13.B16
VEOR V14.B16, V2.B16, V14.B16
VEOR V15.B16, V3.B16, V15.B16
VREV32 V12.H8, V12.H8
VREV32 V13.H8, V13.H8
VREV32 V14.H8, V14.H8
VREV32 V15.H8, V15.H8
// V8..V11 += V12..V15
// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
VADD V8.S4, V12.S4, V8.S4
VADD V9.S4, V13.S4, V9.S4
VADD V10.S4, V14.S4, V10.S4
VADD V11.S4, V15.S4, V11.S4
VEOR V8.B16, V4.B16, V16.B16
VEOR V9.B16, V5.B16, V17.B16
VEOR V10.B16, V6.B16, V18.B16
VEOR V11.B16, V7.B16, V19.B16
VSHL $12, V16.S4, V4.S4
VSHL $12, V17.S4, V5.S4
VSHL $12, V18.S4, V6.S4
VSHL $12, V19.S4, V7.S4
VSRI $20, V16.S4, V4.S4
VSRI $20, V17.S4, V5.S4
VSRI $20, V18.S4, V6.S4
VSRI $20, V19.S4, V7.S4
// V0..V3 += V4..V7
// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
VADD V0.S4, V4.S4, V0.S4
VADD V1.S4, V5.S4, V1.S4
VADD V2.S4, V6.S4, V2.S4
VADD V3.S4, V7.S4, V3.S4
VEOR V12.B16, V0.B16, V12.B16
VEOR V13.B16, V1.B16, V13.B16
VEOR V14.B16, V2.B16, V14.B16
VEOR V15.B16, V3.B16, V15.B16
VTBL V31.B16, [V12.B16], V12.B16
VTBL V31.B16, [V13.B16], V13.B16
VTBL V31.B16, [V14.B16], V14.B16
VTBL V31.B16, [V15.B16], V15.B16
// V8..V11 += V12..V15
// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
VADD V12.S4, V8.S4, V8.S4
VADD V13.S4, V9.S4, V9.S4
VADD V14.S4, V10.S4, V10.S4
VADD V15.S4, V11.S4, V11.S4
VEOR V8.B16, V4.B16, V16.B16
VEOR V9.B16, V5.B16, V17.B16
VEOR V10.B16, V6.B16, V18.B16
VEOR V11.B16, V7.B16, V19.B16
VSHL $7, V16.S4, V4.S4
VSHL $7, V17.S4, V5.S4
VSHL $7, V18.S4, V6.S4
VSHL $7, V19.S4, V7.S4
VSRI $25, V16.S4, V4.S4
VSRI $25, V17.S4, V5.S4
VSRI $25, V18.S4, V6.S4
VSRI $25, V19.S4, V7.S4
// V0..V3 += V5..V7, V4
// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
VADD V0.S4, V5.S4, V0.S4
VADD V1.S4, V6.S4, V1.S4
VADD V2.S4, V7.S4, V2.S4
VADD V3.S4, V4.S4, V3.S4
VEOR V15.B16, V0.B16, V15.B16
VEOR V12.B16, V1.B16, V12.B16
VEOR V13.B16, V2.B16, V13.B16
VEOR V14.B16, V3.B16, V14.B16
VREV32 V12.H8, V12.H8
VREV32 V13.H8, V13.H8
VREV32 V14.H8, V14.H8
VREV32 V15.H8, V15.H8
// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
// ...
VADD V15.S4, V10.S4, V10.S4
VADD V12.S4, V11.S4, V11.S4
VADD V13.S4, V8.S4, V8.S4
VADD V14.S4, V9.S4, V9.S4
VEOR V10.B16, V5.B16, V16.B16
VEOR V11.B16, V6.B16, V17.B16
VEOR V8.B16, V7.B16, V18.B16
VEOR V9.B16, V4.B16, V19.B16
VSHL $12, V16.S4, V5.S4
VSHL $12, V17.S4, V6.S4
VSHL $12, V18.S4, V7.S4
VSHL $12, V19.S4, V4.S4
VSRI $20, V16.S4, V5.S4
VSRI $20, V17.S4, V6.S4
VSRI $20, V18.S4, V7.S4
VSRI $20, V19.S4, V4.S4
// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
// ...
VADD V5.S4, V0.S4, V0.S4
VADD V6.S4, V1.S4, V1.S4
VADD V7.S4, V2.S4, V2.S4
VADD V4.S4, V3.S4, V3.S4
VEOR V0.B16, V15.B16, V15.B16
VEOR V1.B16, V12.B16, V12.B16
VEOR V2.B16, V13.B16, V13.B16
VEOR V3.B16, V14.B16, V14.B16
VTBL V31.B16, [V12.B16], V12.B16
VTBL V31.B16, [V13.B16], V13.B16
VTBL V31.B16, [V14.B16], V14.B16
VTBL V31.B16, [V15.B16], V15.B16
// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
// ...
VADD V15.S4, V10.S4, V10.S4
VADD V12.S4, V11.S4, V11.S4
VADD V13.S4, V8.S4, V8.S4
VADD V14.S4, V9.S4, V9.S4
VEOR V10.B16, V5.B16, V16.B16
VEOR V11.B16, V6.B16, V17.B16
VEOR V8.B16, V7.B16, V18.B16
VEOR V9.B16, V4.B16, V19.B16
VSHL $7, V16.S4, V5.S4
VSHL $7, V17.S4, V6.S4
VSHL $7, V18.S4, V7.S4
VSHL $7, V19.S4, V4.S4
VSRI $25, V16.S4, V5.S4
VSRI $25, V17.S4, V6.S4
VSRI $25, V18.S4, V7.S4
VSRI $25, V19.S4, V4.S4
SUB $1, R21
CBNZ R21, chacha
// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
WORD $0x4D60E950
// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
WORD $0x4DFFE894
VADD V30.S4, V12.S4, V12.S4
VADD V16.S4, V0.S4, V0.S4
VADD V17.S4, V1.S4, V1.S4
VADD V18.S4, V2.S4, V2.S4
VADD V19.S4, V3.S4, V3.S4
// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
WORD $0x4DFFE898
// restore R4
SUB $32, R4
// load counter + nonce
// VLD1R (R7), [V28.S4]
WORD $0x4D40C8FC
// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
WORD $0x4D40E8DD
VADD V20.S4, V4.S4, V4.S4
VADD V21.S4, V5.S4, V5.S4
VADD V22.S4, V6.S4, V6.S4
VADD V23.S4, V7.S4, V7.S4
VADD V24.S4, V8.S4, V8.S4
VADD V25.S4, V9.S4, V9.S4
VADD V26.S4, V10.S4, V10.S4
VADD V27.S4, V11.S4, V11.S4
VADD V28.S4, V12.S4, V12.S4
VADD V29.S4, V13.S4, V13.S4
VADD V30.S4, V14.S4, V14.S4
VADD V31.S4, V15.S4, V15.S4
VZIP1 V1.S4, V0.S4, V16.S4
VZIP2 V1.S4, V0.S4, V17.S4
VZIP1 V3.S4, V2.S4, V18.S4
VZIP2 V3.S4, V2.S4, V19.S4
VZIP1 V5.S4, V4.S4, V20.S4
VZIP2 V5.S4, V4.S4, V21.S4
VZIP1 V7.S4, V6.S4, V22.S4
VZIP2 V7.S4, V6.S4, V23.S4
VZIP1 V9.S4, V8.S4, V24.S4
VZIP2 V9.S4, V8.S4, V25.S4
VZIP1 V11.S4, V10.S4, V26.S4
VZIP2 V11.S4, V10.S4, V27.S4
VZIP1 V13.S4, V12.S4, V28.S4
VZIP2 V13.S4, V12.S4, V29.S4
VZIP1 V15.S4, V14.S4, V30.S4
VZIP2 V15.S4, V14.S4, V31.S4
VZIP1 V18.D2, V16.D2, V0.D2
VZIP2 V18.D2, V16.D2, V4.D2
VZIP1 V19.D2, V17.D2, V8.D2
VZIP2 V19.D2, V17.D2, V12.D2
VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]
VZIP1 V22.D2, V20.D2, V1.D2
VZIP2 V22.D2, V20.D2, V5.D2
VZIP1 V23.D2, V21.D2, V9.D2
VZIP2 V23.D2, V21.D2, V13.D2
VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
VZIP1 V26.D2, V24.D2, V2.D2
VZIP2 V26.D2, V24.D2, V6.D2
VZIP1 V27.D2, V25.D2, V10.D2
VZIP2 V27.D2, V25.D2, V14.D2
VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
VZIP1 V30.D2, V28.D2, V3.D2
VZIP2 V30.D2, V28.D2, V7.D2
VZIP1 V31.D2, V29.D2, V11.D2
VZIP2 V31.D2, V29.D2, V15.D2
VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
VEOR V0.B16, V16.B16, V16.B16
VEOR V1.B16, V17.B16, V17.B16
VEOR V2.B16, V18.B16, V18.B16
VEOR V3.B16, V19.B16, V19.B16
VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
VEOR V4.B16, V20.B16, V20.B16
VEOR V5.B16, V21.B16, V21.B16
VEOR V6.B16, V22.B16, V22.B16
VEOR V7.B16, V23.B16, V23.B16
VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
VEOR V8.B16, V24.B16, V24.B16
VEOR V9.B16, V25.B16, V25.B16
VEOR V10.B16, V26.B16, V26.B16
VEOR V11.B16, V27.B16, V27.B16
VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
VEOR V12.B16, V28.B16, V28.B16
VEOR V13.B16, V29.B16, V29.B16
VEOR V14.B16, V30.B16, V30.B16
VEOR V15.B16, V31.B16, V31.B16
VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
ADD $4, R20
MOVW R20, (R7) // update counter
CMP R2, R12
BGT loop
RET
DATA ·constants+0x00(SB)/4, $0x61707865
DATA ·constants+0x04(SB)/4, $0x3320646e
DATA ·constants+0x08(SB)/4, $0x79622d32
DATA ·constants+0x0c(SB)/4, $0x6b206574
GLOBL ·constants(SB), NOPTR|RODATA, $32
DATA ·incRotMatrix+0x00(SB)/4, $0x00000000
DATA ·incRotMatrix+0x04(SB)/4, $0x00000001
DATA ·incRotMatrix+0x08(SB)/4, $0x00000002
DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003
DATA ·incRotMatrix+0x10(SB)/4, $0x02010003
DATA ·incRotMatrix+0x14(SB)/4, $0x06050407
DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B
DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32

398
vendor/golang.org/x/crypto/chacha20/chacha_generic.go generated vendored Normal file
View File

@@ -0,0 +1,398 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package chacha20 implements the ChaCha20 and XChaCha20 encryption algorithms
// as specified in RFC 8439 and draft-irtf-cfrg-xchacha-01.
package chacha20
import (
"crypto/cipher"
"encoding/binary"
"errors"
"math/bits"
"golang.org/x/crypto/internal/alias"
)
const (
// KeySize is the size of the key used by this cipher, in bytes.
KeySize = 32
// NonceSize is the size of the nonce used with the standard variant of this
// cipher, in bytes.
//
// Note that this is too short to be safely generated at random if the same
// key is reused more than 2³² times.
NonceSize = 12
// NonceSizeX is the size of the nonce used with the XChaCha20 variant of
// this cipher, in bytes.
NonceSizeX = 24
)
// Cipher is a stateful instance of ChaCha20 or XChaCha20 using a particular key
// and nonce. A *Cipher implements the cipher.Stream interface.
type Cipher struct {
// The ChaCha20 state is 16 words: 4 constant, 8 of key, 1 of counter
// (incremented after each block), and 3 of nonce.
key [8]uint32
counter uint32
nonce [3]uint32
// The last len bytes of buf are leftover key stream bytes from the previous
// XORKeyStream invocation. The size of buf depends on how many blocks are
// computed at a time by xorKeyStreamBlocks.
buf [bufSize]byte
len int
// overflow is set when the counter overflowed, no more blocks can be
// generated, and the next XORKeyStream call should panic.
overflow bool
// The counter-independent results of the first round are cached after they
// are computed the first time.
precompDone bool
p1, p5, p9, p13 uint32
p2, p6, p10, p14 uint32
p3, p7, p11, p15 uint32
}
var _ cipher.Stream = (*Cipher)(nil)
// NewUnauthenticatedCipher creates a new ChaCha20 stream cipher with the given
// 32 bytes key and a 12 or 24 bytes nonce. If a nonce of 24 bytes is provided,
// the XChaCha20 construction will be used. It returns an error if key or nonce
// have any other length.
//
// Note that ChaCha20, like all stream ciphers, is not authenticated and allows
// attackers to silently tamper with the plaintext. For this reason, it is more
// appropriate as a building block than as a standalone encryption mechanism.
// Instead, consider using package golang.org/x/crypto/chacha20poly1305.
func NewUnauthenticatedCipher(key, nonce []byte) (*Cipher, error) {
// This function is split into a wrapper so that the Cipher allocation will
// be inlined, and depending on how the caller uses the return value, won't
// escape to the heap.
c := &Cipher{}
return newUnauthenticatedCipher(c, key, nonce)
}
func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) {
if len(key) != KeySize {
return nil, errors.New("chacha20: wrong key size")
}
if len(nonce) == NonceSizeX {
// XChaCha20 uses the ChaCha20 core to mix 16 bytes of the nonce into a
// derived key, allowing it to operate on a nonce of 24 bytes. See
// draft-irtf-cfrg-xchacha-01, Section 2.3.
key, _ = HChaCha20(key, nonce[0:16])
cNonce := make([]byte, NonceSize)
copy(cNonce[4:12], nonce[16:24])
nonce = cNonce
} else if len(nonce) != NonceSize {
return nil, errors.New("chacha20: wrong nonce size")
}
key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
c.key = [8]uint32{
binary.LittleEndian.Uint32(key[0:4]),
binary.LittleEndian.Uint32(key[4:8]),
binary.LittleEndian.Uint32(key[8:12]),
binary.LittleEndian.Uint32(key[12:16]),
binary.LittleEndian.Uint32(key[16:20]),
binary.LittleEndian.Uint32(key[20:24]),
binary.LittleEndian.Uint32(key[24:28]),
binary.LittleEndian.Uint32(key[28:32]),
}
c.nonce = [3]uint32{
binary.LittleEndian.Uint32(nonce[0:4]),
binary.LittleEndian.Uint32(nonce[4:8]),
binary.LittleEndian.Uint32(nonce[8:12]),
}
return c, nil
}
// The constant first 4 words of the ChaCha20 state.
const (
j0 uint32 = 0x61707865 // expa
j1 uint32 = 0x3320646e // nd 3
j2 uint32 = 0x79622d32 // 2-by
j3 uint32 = 0x6b206574 // te k
)
const blockSize = 64
// quarterRound is the core of ChaCha20. It shuffles the bits of 4 state words.
// It's executed 4 times for each of the 20 ChaCha20 rounds, operating on all 16
// words each round, in columnar or diagonal groups of 4 at a time.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
a += b
d ^= a
d = bits.RotateLeft32(d, 16)
c += d
b ^= c
b = bits.RotateLeft32(b, 12)
a += b
d ^= a
d = bits.RotateLeft32(d, 8)
c += d
b ^= c
b = bits.RotateLeft32(b, 7)
return a, b, c, d
}
// SetCounter sets the Cipher counter. The next invocation of XORKeyStream will
// behave as if (64 * counter) bytes had been encrypted so far.
//
// To prevent accidental counter reuse, SetCounter panics if counter is less
// than the current value.
//
// Note that the execution time of XORKeyStream is not independent of the
// counter value.
func (s *Cipher) SetCounter(counter uint32) {
// Internally, s may buffer multiple blocks, which complicates this
// implementation slightly. When checking whether the counter has rolled
// back, we must use both s.counter and s.len to determine how many blocks
// we have already output.
outputCounter := s.counter - uint32(s.len)/blockSize
if s.overflow || counter < outputCounter {
panic("chacha20: SetCounter attempted to rollback counter")
}
// In the general case, we set the new counter value and reset s.len to 0,
// causing the next call to XORKeyStream to refill the buffer. However, if
// we're advancing within the existing buffer, we can save work by simply
// setting s.len.
if counter < s.counter {
s.len = int(s.counter-counter) * blockSize
} else {
s.counter = counter
s.len = 0
}
}
// XORKeyStream XORs each byte in the given slice with a byte from the
// cipher's key stream. Dst and src must overlap entirely or not at all.
//
// If len(dst) < len(src), XORKeyStream will panic. It is acceptable
// to pass a dst bigger than src, and in that case, XORKeyStream will
// only update dst[:len(src)] and will not touch the rest of dst.
//
// Multiple calls to XORKeyStream behave as if the concatenation of
// the src buffers was passed in a single run. That is, Cipher
// maintains state and does not reset at each XORKeyStream call.
func (s *Cipher) XORKeyStream(dst, src []byte) {
if len(src) == 0 {
return
}
if len(dst) < len(src) {
panic("chacha20: output smaller than input")
}
dst = dst[:len(src)]
if alias.InexactOverlap(dst, src) {
panic("chacha20: invalid buffer overlap")
}
// First, drain any remaining key stream from a previous XORKeyStream.
if s.len != 0 {
keyStream := s.buf[bufSize-s.len:]
if len(src) < len(keyStream) {
keyStream = keyStream[:len(src)]
}
_ = src[len(keyStream)-1] // bounds check elimination hint
for i, b := range keyStream {
dst[i] = src[i] ^ b
}
s.len -= len(keyStream)
dst, src = dst[len(keyStream):], src[len(keyStream):]
}
if len(src) == 0 {
return
}
// If we'd need to let the counter overflow and keep generating output,
// panic immediately. If instead we'd only reach the last block, remember
// not to generate any more output after the buffer is drained.
numBlocks := (uint64(len(src)) + blockSize - 1) / blockSize
if s.overflow || uint64(s.counter)+numBlocks > 1<<32 {
panic("chacha20: counter overflow")
} else if uint64(s.counter)+numBlocks == 1<<32 {
s.overflow = true
}
// xorKeyStreamBlocks implementations expect input lengths that are a
// multiple of bufSize. Platform-specific ones process multiple blocks at a
// time, so have bufSizes that are a multiple of blockSize.
full := len(src) - len(src)%bufSize
if full > 0 {
s.xorKeyStreamBlocks(dst[:full], src[:full])
}
dst, src = dst[full:], src[full:]
// If using a multi-block xorKeyStreamBlocks would overflow, use the generic
// one that does one block at a time.
const blocksPerBuf = bufSize / blockSize
if uint64(s.counter)+blocksPerBuf > 1<<32 {
s.buf = [bufSize]byte{}
numBlocks := (len(src) + blockSize - 1) / blockSize
buf := s.buf[bufSize-numBlocks*blockSize:]
copy(buf, src)
s.xorKeyStreamBlocksGeneric(buf, buf)
s.len = len(buf) - copy(dst, buf)
return
}
// If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
// keep the leftover keystream for the next XORKeyStream invocation.
if len(src) > 0 {
s.buf = [bufSize]byte{}
copy(s.buf[:], src)
s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
s.len = bufSize - copy(dst, s.buf[:])
}
}
func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
if len(dst) != len(src) || len(dst)%blockSize != 0 {
panic("chacha20: internal error: wrong dst and/or src length")
}
// To generate each block of key stream, the initial cipher state
// (represented below) is passed through 20 rounds of shuffling,
// alternatively applying quarterRounds by columns (like 1, 5, 9, 13)
// or by diagonals (like 1, 6, 11, 12).
//
// 0:cccccccc 1:cccccccc 2:cccccccc 3:cccccccc
// 4:kkkkkkkk 5:kkkkkkkk 6:kkkkkkkk 7:kkkkkkkk
// 8:kkkkkkkk 9:kkkkkkkk 10:kkkkkkkk 11:kkkkkkkk
// 12:bbbbbbbb 13:nnnnnnnn 14:nnnnnnnn 15:nnnnnnnn
//
// c=constant k=key b=blockcount n=nonce
var (
c0, c1, c2, c3 = j0, j1, j2, j3
c4, c5, c6, c7 = s.key[0], s.key[1], s.key[2], s.key[3]
c8, c9, c10, c11 = s.key[4], s.key[5], s.key[6], s.key[7]
_, c13, c14, c15 = s.counter, s.nonce[0], s.nonce[1], s.nonce[2]
)
// Three quarters of the first round don't depend on the counter, so we can
// calculate them here, and reuse them for multiple blocks in the loop, and
// for future XORKeyStream invocations.
if !s.precompDone {
s.p1, s.p5, s.p9, s.p13 = quarterRound(c1, c5, c9, c13)
s.p2, s.p6, s.p10, s.p14 = quarterRound(c2, c6, c10, c14)
s.p3, s.p7, s.p11, s.p15 = quarterRound(c3, c7, c11, c15)
s.precompDone = true
}
// A condition of len(src) > 0 would be sufficient, but this also
// acts as a bounds check elimination hint.
for len(src) >= 64 && len(dst) >= 64 {
// The remainder of the first column round.
fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
// The second diagonal round.
x0, x5, x10, x15 := quarterRound(fcr0, s.p5, s.p10, s.p15)
x1, x6, x11, x12 := quarterRound(s.p1, s.p6, s.p11, fcr12)
x2, x7, x8, x13 := quarterRound(s.p2, s.p7, fcr8, s.p13)
x3, x4, x9, x14 := quarterRound(s.p3, fcr4, s.p9, s.p14)
// The remaining 18 rounds.
for i := 0; i < 9; i++ {
// Column round.
x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
// Diagonal round.
x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
}
// Add back the initial state to generate the key stream, then
// XOR the key stream with the source and write out the result.
addXor(dst[0:4], src[0:4], x0, c0)
addXor(dst[4:8], src[4:8], x1, c1)
addXor(dst[8:12], src[8:12], x2, c2)
addXor(dst[12:16], src[12:16], x3, c3)
addXor(dst[16:20], src[16:20], x4, c4)
addXor(dst[20:24], src[20:24], x5, c5)
addXor(dst[24:28], src[24:28], x6, c6)
addXor(dst[28:32], src[28:32], x7, c7)
addXor(dst[32:36], src[32:36], x8, c8)
addXor(dst[36:40], src[36:40], x9, c9)
addXor(dst[40:44], src[40:44], x10, c10)
addXor(dst[44:48], src[44:48], x11, c11)
addXor(dst[48:52], src[48:52], x12, s.counter)
addXor(dst[52:56], src[52:56], x13, c13)
addXor(dst[56:60], src[56:60], x14, c14)
addXor(dst[60:64], src[60:64], x15, c15)
s.counter += 1
src, dst = src[blockSize:], dst[blockSize:]
}
}
// HChaCha20 uses the ChaCha20 core to generate a derived key from a 32 bytes
// key and a 16 bytes nonce. It returns an error if key or nonce have any other
// length. It is used as part of the XChaCha20 construction.
func HChaCha20(key, nonce []byte) ([]byte, error) {
// This function is split into a wrapper so that the slice allocation will
// be inlined, and depending on how the caller uses the return value, won't
// escape to the heap.
out := make([]byte, 32)
return hChaCha20(out, key, nonce)
}
func hChaCha20(out, key, nonce []byte) ([]byte, error) {
if len(key) != KeySize {
return nil, errors.New("chacha20: wrong HChaCha20 key size")
}
if len(nonce) != 16 {
return nil, errors.New("chacha20: wrong HChaCha20 nonce size")
}
x0, x1, x2, x3 := j0, j1, j2, j3
x4 := binary.LittleEndian.Uint32(key[0:4])
x5 := binary.LittleEndian.Uint32(key[4:8])
x6 := binary.LittleEndian.Uint32(key[8:12])
x7 := binary.LittleEndian.Uint32(key[12:16])
x8 := binary.LittleEndian.Uint32(key[16:20])
x9 := binary.LittleEndian.Uint32(key[20:24])
x10 := binary.LittleEndian.Uint32(key[24:28])
x11 := binary.LittleEndian.Uint32(key[28:32])
x12 := binary.LittleEndian.Uint32(nonce[0:4])
x13 := binary.LittleEndian.Uint32(nonce[4:8])
x14 := binary.LittleEndian.Uint32(nonce[8:12])
x15 := binary.LittleEndian.Uint32(nonce[12:16])
for i := 0; i < 10; i++ {
// Diagonal round.
x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
// Column round.
x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
}
_ = out[31] // bounds check elimination hint
binary.LittleEndian.PutUint32(out[0:4], x0)
binary.LittleEndian.PutUint32(out[4:8], x1)
binary.LittleEndian.PutUint32(out[8:12], x2)
binary.LittleEndian.PutUint32(out[12:16], x3)
binary.LittleEndian.PutUint32(out[16:20], x12)
binary.LittleEndian.PutUint32(out[20:24], x13)
binary.LittleEndian.PutUint32(out[24:28], x14)
binary.LittleEndian.PutUint32(out[28:32], x15)
return out, nil
}

13
vendor/golang.org/x/crypto/chacha20/chacha_noasm.go generated vendored Normal file
View File

@@ -0,0 +1,13 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego
package chacha20
const bufSize = blockSize
func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
s.xorKeyStreamBlocksGeneric(dst, src)
}

16
vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go generated vendored Normal file
View File

@@ -0,0 +1,16 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
package chacha20
const bufSize = 256
//go:noescape
func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)
}

443
vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s generated vendored Normal file
View File

@@ -0,0 +1,443 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================
// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.
// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
//go:build gc && !purego
#include "textflag.h"
#define OUT R3
#define INP R4
#define LEN R5
#define KEY R6
#define CNT R7
#define TMP R15
#define CONSTBASE R16
#define BLOCKS R17
// for VPERMXOR
#define MASK R18
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
DATA consts<>+0xa0(SB)/8, $0x5566774411223300
DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
DATA consts<>+0xb0(SB)/8, $0x6677445522330011
DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
GLOBL consts<>(SB), RODATA, $0xc0
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
MOVD out+0(FP), OUT
MOVD inp+8(FP), INP
MOVD len+16(FP), LEN
MOVD key+24(FP), KEY
MOVD counter+32(FP), CNT
// Addressing for constants
MOVD $consts<>+0x00(SB), CONSTBASE
MOVD $16, R8
MOVD $32, R9
MOVD $48, R10
MOVD $64, R11
SRD $6, LEN, BLOCKS
// for VPERMXOR
MOVD $consts<>+0xa0(SB), MASK
MOVD $16, R20
// V16
LXVW4X (CONSTBASE)(R0), VS48
ADD $80,CONSTBASE
// Load key into V17,V18
LXVW4X (KEY)(R0), VS49
LXVW4X (KEY)(R8), VS50
// Load CNT, NONCE into V19
LXVW4X (CNT)(R0), VS51
// Clear V27
VXOR V27, V27, V27
// V28
LXVW4X (CONSTBASE)(R11), VS60
// Load mask constants for VPERMXOR
LXVW4X (MASK)(R0), V20
LXVW4X (MASK)(R20), V21
// splat slot from V19 -> V26
VSPLTW $0, V19, V26
VSLDOI $4, V19, V27, V19
VSLDOI $12, V27, V19, V19
VADDUWM V26, V28, V26
MOVD $10, R14
MOVD R14, CTR
PCALIGN $16
loop_outer_vsx:
// V0, V1, V2, V3
LXVW4X (R0)(CONSTBASE), VS32
LXVW4X (R8)(CONSTBASE), VS33
LXVW4X (R9)(CONSTBASE), VS34
LXVW4X (R10)(CONSTBASE), VS35
// splat values from V17, V18 into V4-V11
VSPLTW $0, V17, V4
VSPLTW $1, V17, V5
VSPLTW $2, V17, V6
VSPLTW $3, V17, V7
VSPLTW $0, V18, V8
VSPLTW $1, V18, V9
VSPLTW $2, V18, V10
VSPLTW $3, V18, V11
// VOR
VOR V26, V26, V12
// splat values from V19 -> V13, V14, V15
VSPLTW $1, V19, V13
VSPLTW $2, V19, V14
VSPLTW $3, V19, V15
// splat const values
VSPLTISW $-16, V27
VSPLTISW $12, V28
VSPLTISW $8, V29
VSPLTISW $7, V30
PCALIGN $16
loop_vsx:
VADDUWM V0, V4, V0
VADDUWM V1, V5, V1
VADDUWM V2, V6, V2
VADDUWM V3, V7, V3
VPERMXOR V12, V0, V21, V12
VPERMXOR V13, V1, V21, V13
VPERMXOR V14, V2, V21, V14
VPERMXOR V15, V3, V21, V15
VADDUWM V8, V12, V8
VADDUWM V9, V13, V9
VADDUWM V10, V14, V10
VADDUWM V11, V15, V11
VXOR V4, V8, V4
VXOR V5, V9, V5
VXOR V6, V10, V6
VXOR V7, V11, V7
VRLW V4, V28, V4
VRLW V5, V28, V5
VRLW V6, V28, V6
VRLW V7, V28, V7
VADDUWM V0, V4, V0
VADDUWM V1, V5, V1
VADDUWM V2, V6, V2
VADDUWM V3, V7, V3
VPERMXOR V12, V0, V20, V12
VPERMXOR V13, V1, V20, V13
VPERMXOR V14, V2, V20, V14
VPERMXOR V15, V3, V20, V15
VADDUWM V8, V12, V8
VADDUWM V9, V13, V9
VADDUWM V10, V14, V10
VADDUWM V11, V15, V11
VXOR V4, V8, V4
VXOR V5, V9, V5
VXOR V6, V10, V6
VXOR V7, V11, V7
VRLW V4, V30, V4
VRLW V5, V30, V5
VRLW V6, V30, V6
VRLW V7, V30, V7
VADDUWM V0, V5, V0
VADDUWM V1, V6, V1
VADDUWM V2, V7, V2
VADDUWM V3, V4, V3
VPERMXOR V15, V0, V21, V15
VPERMXOR V12, V1, V21, V12
VPERMXOR V13, V2, V21, V13
VPERMXOR V14, V3, V21, V14
VADDUWM V10, V15, V10
VADDUWM V11, V12, V11
VADDUWM V8, V13, V8
VADDUWM V9, V14, V9
VXOR V5, V10, V5
VXOR V6, V11, V6
VXOR V7, V8, V7
VXOR V4, V9, V4
VRLW V5, V28, V5
VRLW V6, V28, V6
VRLW V7, V28, V7
VRLW V4, V28, V4
VADDUWM V0, V5, V0
VADDUWM V1, V6, V1
VADDUWM V2, V7, V2
VADDUWM V3, V4, V3
VPERMXOR V15, V0, V20, V15
VPERMXOR V12, V1, V20, V12
VPERMXOR V13, V2, V20, V13
VPERMXOR V14, V3, V20, V14
VADDUWM V10, V15, V10
VADDUWM V11, V12, V11
VADDUWM V8, V13, V8
VADDUWM V9, V14, V9
VXOR V5, V10, V5
VXOR V6, V11, V6
VXOR V7, V8, V7
VXOR V4, V9, V4
VRLW V5, V30, V5
VRLW V6, V30, V6
VRLW V7, V30, V7
VRLW V4, V30, V4
BDNZ loop_vsx
VADDUWM V12, V26, V12
VMRGEW V0, V1, V27
VMRGEW V2, V3, V28
VMRGOW V0, V1, V0
VMRGOW V2, V3, V2
VMRGEW V4, V5, V29
VMRGEW V6, V7, V30
XXPERMDI VS32, VS34, $0, VS33
XXPERMDI VS32, VS34, $3, VS35
XXPERMDI VS59, VS60, $0, VS32
XXPERMDI VS59, VS60, $3, VS34
VMRGOW V4, V5, V4
VMRGOW V6, V7, V6
VMRGEW V8, V9, V27
VMRGEW V10, V11, V28
XXPERMDI VS36, VS38, $0, VS37
XXPERMDI VS36, VS38, $3, VS39
XXPERMDI VS61, VS62, $0, VS36
XXPERMDI VS61, VS62, $3, VS38
VMRGOW V8, V9, V8
VMRGOW V10, V11, V10
VMRGEW V12, V13, V29
VMRGEW V14, V15, V30
XXPERMDI VS40, VS42, $0, VS41
XXPERMDI VS40, VS42, $3, VS43
XXPERMDI VS59, VS60, $0, VS40
XXPERMDI VS59, VS60, $3, VS42
VMRGOW V12, V13, V12
VMRGOW V14, V15, V14
VSPLTISW $4, V27
VADDUWM V26, V27, V26
XXPERMDI VS44, VS46, $0, VS45
XXPERMDI VS44, VS46, $3, VS47
XXPERMDI VS61, VS62, $0, VS44
XXPERMDI VS61, VS62, $3, VS46
VADDUWM V0, V16, V0
VADDUWM V4, V17, V4
VADDUWM V8, V18, V8
VADDUWM V12, V19, V12
CMPU LEN, $64
BLT tail_vsx
// Bottom of loop
LXVW4X (INP)(R0), VS59
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
STXVW4X VS59, (OUT)(R0)
STXVW4X VS60, (OUT)(R8)
ADD $64, INP
STXVW4X VS61, (OUT)(R9)
ADD $-64, LEN
STXVW4X VS62, (OUT)(R10)
ADD $64, OUT
BEQ done_vsx
VADDUWM V1, V16, V0
VADDUWM V5, V17, V4
VADDUWM V9, V18, V8
VADDUWM V13, V19, V12
CMPU LEN, $64
BLT tail_vsx
LXVW4X (INP)(R0), VS59
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
STXVW4X VS59, (OUT)(R0)
STXVW4X VS60, (OUT)(R8)
ADD $64, INP
STXVW4X VS61, (OUT)(R9)
ADD $-64, LEN
STXVW4X VS62, (OUT)(V10)
ADD $64, OUT
BEQ done_vsx
VADDUWM V2, V16, V0
VADDUWM V6, V17, V4
VADDUWM V10, V18, V8
VADDUWM V14, V19, V12
CMPU LEN, $64
BLT tail_vsx
LXVW4X (INP)(R0), VS59
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
STXVW4X VS59, (OUT)(R0)
STXVW4X VS60, (OUT)(R8)
ADD $64, INP
STXVW4X VS61, (OUT)(R9)
ADD $-64, LEN
STXVW4X VS62, (OUT)(R10)
ADD $64, OUT
BEQ done_vsx
VADDUWM V3, V16, V0
VADDUWM V7, V17, V4
VADDUWM V11, V18, V8
VADDUWM V15, V19, V12
CMPU LEN, $64
BLT tail_vsx
LXVW4X (INP)(R0), VS59
LXVW4X (INP)(R8), VS60
LXVW4X (INP)(R9), VS61
LXVW4X (INP)(R10), VS62
VXOR V27, V0, V27
VXOR V28, V4, V28
VXOR V29, V8, V29
VXOR V30, V12, V30
STXVW4X VS59, (OUT)(R0)
STXVW4X VS60, (OUT)(R8)
ADD $64, INP
STXVW4X VS61, (OUT)(R9)
ADD $-64, LEN
STXVW4X VS62, (OUT)(R10)
ADD $64, OUT
MOVD $10, R14
MOVD R14, CTR
BNE loop_outer_vsx
done_vsx:
// Increment counter by number of 64 byte blocks
MOVD (CNT), R14
ADD BLOCKS, R14
MOVD R14, (CNT)
RET
tail_vsx:
ADD $32, R1, R11
MOVD LEN, CTR
// Save values on stack to copy from
STXVW4X VS32, (R11)(R0)
STXVW4X VS36, (R11)(R8)
STXVW4X VS40, (R11)(R9)
STXVW4X VS44, (R11)(R10)
ADD $-1, R11, R12
ADD $-1, INP
ADD $-1, OUT
PCALIGN $16
looptail_vsx:
// Copying the result to OUT
// in bytes.
MOVBZU 1(R12), KEY
MOVBZU 1(INP), TMP
XOR KEY, TMP, KEY
MOVBU KEY, 1(OUT)
BDNZ looptail_vsx
// Clear the stack values
STXVW4X VS48, (R11)(R0)
STXVW4X VS48, (R11)(R8)
STXVW4X VS48, (R11)(R9)
STXVW4X VS48, (R11)(R10)
BR done_vsx

27
vendor/golang.org/x/crypto/chacha20/chacha_s390x.go generated vendored Normal file
View File

@@ -0,0 +1,27 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
package chacha20
import "golang.org/x/sys/cpu"
var haveAsm = cpu.S390X.HasVX
const bufSize = 256
// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only
// be called when the vector facility is available. Implementation in asm_s390x.s.
//
//go:noescape
func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
if cpu.S390X.HasVX {
xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
} else {
c.xorKeyStreamBlocksGeneric(dst, src)
}
}

224
vendor/golang.org/x/crypto/chacha20/chacha_s390x.s generated vendored Normal file
View File

@@ -0,0 +1,224 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
#include "go_asm.h"
#include "textflag.h"
// This is an implementation of the ChaCha20 encryption algorithm as
// specified in RFC 7539. It uses vector instructions to compute
// 4 keystream blocks in parallel (256 bytes) which are then XORed
// with the bytes in the input slice.
GLOBL ·constants<>(SB), RODATA|NOPTR, $32
// BSWAP: swap bytes in each 4-byte element
DATA ·constants<>+0x00(SB)/4, $0x03020100
DATA ·constants<>+0x04(SB)/4, $0x07060504
DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
// J0: [j0, j1, j2, j3]
DATA ·constants<>+0x10(SB)/4, $0x61707865
DATA ·constants<>+0x14(SB)/4, $0x3320646e
DATA ·constants<>+0x18(SB)/4, $0x79622d32
DATA ·constants<>+0x1c(SB)/4, $0x6b206574
#define BSWAP V5
#define J0 V6
#define KEY0 V7
#define KEY1 V8
#define NONCE V9
#define CTR V10
#define M0 V11
#define M1 V12
#define M2 V13
#define M3 V14
#define INC V15
#define X0 V16
#define X1 V17
#define X2 V18
#define X3 V19
#define X4 V20
#define X5 V21
#define X6 V22
#define X7 V23
#define X8 V24
#define X9 V25
#define X10 V26
#define X11 V27
#define X12 V28
#define X13 V29
#define X14 V30
#define X15 V31
#define NUM_ROUNDS 20
#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \
VAF a1, a0, a0 \
VAF b1, b0, b0 \
VAF c1, c0, c0 \
VAF d1, d0, d0 \
VX a0, a2, a2 \
VX b0, b2, b2 \
VX c0, c2, c2 \
VX d0, d2, d2 \
VERLLF $16, a2, a2 \
VERLLF $16, b2, b2 \
VERLLF $16, c2, c2 \
VERLLF $16, d2, d2 \
VAF a2, a3, a3 \
VAF b2, b3, b3 \
VAF c2, c3, c3 \
VAF d2, d3, d3 \
VX a3, a1, a1 \
VX b3, b1, b1 \
VX c3, c1, c1 \
VX d3, d1, d1 \
VERLLF $12, a1, a1 \
VERLLF $12, b1, b1 \
VERLLF $12, c1, c1 \
VERLLF $12, d1, d1 \
VAF a1, a0, a0 \
VAF b1, b0, b0 \
VAF c1, c0, c0 \
VAF d1, d0, d0 \
VX a0, a2, a2 \
VX b0, b2, b2 \
VX c0, c2, c2 \
VX d0, d2, d2 \
VERLLF $8, a2, a2 \
VERLLF $8, b2, b2 \
VERLLF $8, c2, c2 \
VERLLF $8, d2, d2 \
VAF a2, a3, a3 \
VAF b2, b3, b3 \
VAF c2, c3, c3 \
VAF d2, d3, d3 \
VX a3, a1, a1 \
VX b3, b1, b1 \
VX c3, c1, c1 \
VX d3, d1, d1 \
VERLLF $7, a1, a1 \
VERLLF $7, b1, b1 \
VERLLF $7, c1, c1 \
VERLLF $7, d1, d1
#define PERMUTE(mask, v0, v1, v2, v3) \
VPERM v0, v0, mask, v0 \
VPERM v1, v1, mask, v1 \
VPERM v2, v2, mask, v2 \
VPERM v3, v3, mask, v3
#define ADDV(x, v0, v1, v2, v3) \
VAF x, v0, v0 \
VAF x, v1, v1 \
VAF x, v2, v2 \
VAF x, v3, v3
#define XORV(off, dst, src, v0, v1, v2, v3) \
VLM off(src), M0, M3 \
PERMUTE(BSWAP, v0, v1, v2, v3) \
VX v0, M0, M0 \
VX v1, M1, M1 \
VX v2, M2, M2 \
VX v3, M3, M3 \
VSTM M0, M3, off(dst)
#define SHUFFLE(a, b, c, d, t, u, v, w) \
VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
MOVD $·constants<>(SB), R1
MOVD dst+0(FP), R2 // R2=&dst[0]
LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src)
MOVD key+48(FP), R5 // R5=key
MOVD nonce+56(FP), R6 // R6=nonce
MOVD counter+64(FP), R7 // R7=counter
// load BSWAP and J0
VLM (R1), BSWAP, J0
// setup
MOVD $95, R0
VLM (R5), KEY0, KEY1
VLL R0, (R6), NONCE
VZERO M0
VLEIB $7, $32, M0
VSRLB M0, NONCE, NONCE
// initialize counter values
VLREPF (R7), CTR
VZERO INC
VLEIF $1, $1, INC
VLEIF $2, $2, INC
VLEIF $3, $3, INC
VAF INC, CTR, CTR
VREPIF $4, INC
chacha:
VREPF $0, J0, X0
VREPF $1, J0, X1
VREPF $2, J0, X2
VREPF $3, J0, X3
VREPF $0, KEY0, X4
VREPF $1, KEY0, X5
VREPF $2, KEY0, X6
VREPF $3, KEY0, X7
VREPF $0, KEY1, X8
VREPF $1, KEY1, X9
VREPF $2, KEY1, X10
VREPF $3, KEY1, X11
VLR CTR, X12
VREPF $1, NONCE, X13
VREPF $2, NONCE, X14
VREPF $3, NONCE, X15
MOVD $(NUM_ROUNDS/2), R1
loop:
ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11)
ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9)
ADD $-1, R1
BNE loop
// decrement length
ADD $-256, R4
// rearrange vectors
SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
ADDV(J0, X0, X1, X2, X3)
SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
ADDV(KEY0, X4, X5, X6, X7)
SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
ADDV(KEY1, X8, X9, X10, X11)
VAF CTR, X12, X12
SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
ADDV(NONCE, X12, X13, X14, X15)
// increment counters
VAF INC, CTR, CTR
// xor keystream with plaintext
XORV(0*64, R2, R3, X0, X4, X8, X12)
XORV(1*64, R2, R3, X1, X5, X9, X13)
XORV(2*64, R2, R3, X2, X6, X10, X14)
XORV(3*64, R2, R3, X3, X7, X11, X15)
// increment pointers
MOVD $256(R2), R2
MOVD $256(R3), R3
CMPBNE R4, $0, chacha
VSTEF $0, CTR, (R7)
RET

42
vendor/golang.org/x/crypto/chacha20/xor.go generated vendored Normal file
View File

@@ -0,0 +1,42 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found src the LICENSE file.
package chacha20
import "runtime"
// Platforms that have fast unaligned 32-bit little endian accesses.
const unaligned = runtime.GOARCH == "386" ||
runtime.GOARCH == "amd64" ||
runtime.GOARCH == "arm64" ||
runtime.GOARCH == "ppc64le" ||
runtime.GOARCH == "s390x"
// addXor reads a little endian uint32 from src, XORs it with (a + b) and
// places the result in little endian byte order in dst.
func addXor(dst, src []byte, a, b uint32) {
_, _ = src[3], dst[3] // bounds check elimination hint
if unaligned {
// The compiler should optimize this code into
// 32-bit unaligned little endian loads and stores.
// TODO: delete once the compiler does a reliably
// good job with the generic code below.
// See issue #25111 for more details.
v := uint32(src[0])
v |= uint32(src[1]) << 8
v |= uint32(src[2]) << 16
v |= uint32(src[3]) << 24
v ^= a + b
dst[0] = byte(v)
dst[1] = byte(v >> 8)
dst[2] = byte(v >> 16)
dst[3] = byte(v >> 24)
} else {
a += b
dst[0] = src[0] ^ byte(a)
dst[1] = src[1] ^ byte(a>>8)
dst[2] = src[2] ^ byte(a>>16)
dst[3] = src[3] ^ byte(a>>24)
}
}

View File

@@ -0,0 +1,98 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD and its
// extended nonce variant XChaCha20-Poly1305, as specified in RFC 8439 and
// draft-irtf-cfrg-xchacha-01.
package chacha20poly1305
import (
"crypto/cipher"
"errors"
)
const (
// KeySize is the size of the key used by this AEAD, in bytes.
KeySize = 32
// NonceSize is the size of the nonce used with the standard variant of this
// AEAD, in bytes.
//
// Note that this is too short to be safely generated at random if the same
// key is reused more than 2³² times.
NonceSize = 12
// NonceSizeX is the size of the nonce used with the XChaCha20-Poly1305
// variant of this AEAD, in bytes.
NonceSizeX = 24
// Overhead is the size of the Poly1305 authentication tag, and the
// difference between a ciphertext length and its plaintext.
Overhead = 16
)
type chacha20poly1305 struct {
key [KeySize]byte
}
// New returns a ChaCha20-Poly1305 AEAD that uses the given 256-bit key.
func New(key []byte) (cipher.AEAD, error) {
if len(key) != KeySize {
return nil, errors.New("chacha20poly1305: bad key length")
}
ret := new(chacha20poly1305)
copy(ret.key[:], key)
return ret, nil
}
func (c *chacha20poly1305) NonceSize() int {
return NonceSize
}
func (c *chacha20poly1305) Overhead() int {
return Overhead
}
func (c *chacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte {
if len(nonce) != NonceSize {
panic("chacha20poly1305: bad nonce length passed to Seal")
}
if uint64(len(plaintext)) > (1<<38)-64 {
panic("chacha20poly1305: plaintext too large")
}
return c.seal(dst, nonce, plaintext, additionalData)
}
var errOpen = errors.New("chacha20poly1305: message authentication failed")
func (c *chacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
if len(nonce) != NonceSize {
panic("chacha20poly1305: bad nonce length passed to Open")
}
if len(ciphertext) < 16 {
return nil, errOpen
}
if uint64(len(ciphertext)) > (1<<38)-48 {
panic("chacha20poly1305: ciphertext too large")
}
return c.open(dst, nonce, ciphertext, additionalData)
}
// sliceForAppend takes a slice and a requested number of bytes. It returns a
// slice with the contents of the given slice followed by that many bytes and a
// second slice that aliases into it and contains only the extra bytes. If the
// original slice has sufficient capacity then no allocation is performed.
func sliceForAppend(in []byte, n int) (head, tail []byte) {
if total := len(in) + n; cap(in) >= total {
head = in[:total]
} else {
head = make([]byte, total)
copy(head, in)
}
tail = head[len(in):]
return
}

View File

@@ -0,0 +1,86 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
package chacha20poly1305
import (
"encoding/binary"
"golang.org/x/crypto/internal/alias"
"golang.org/x/sys/cpu"
)
//go:noescape
func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool
//go:noescape
func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte)
var (
useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
)
// setupState writes a ChaCha20 input matrix to state. See
// https://tools.ietf.org/html/rfc7539#section-2.3.
func setupState(state *[16]uint32, key *[32]byte, nonce []byte) {
state[0] = 0x61707865
state[1] = 0x3320646e
state[2] = 0x79622d32
state[3] = 0x6b206574
state[4] = binary.LittleEndian.Uint32(key[0:4])
state[5] = binary.LittleEndian.Uint32(key[4:8])
state[6] = binary.LittleEndian.Uint32(key[8:12])
state[7] = binary.LittleEndian.Uint32(key[12:16])
state[8] = binary.LittleEndian.Uint32(key[16:20])
state[9] = binary.LittleEndian.Uint32(key[20:24])
state[10] = binary.LittleEndian.Uint32(key[24:28])
state[11] = binary.LittleEndian.Uint32(key[28:32])
state[12] = 0
state[13] = binary.LittleEndian.Uint32(nonce[0:4])
state[14] = binary.LittleEndian.Uint32(nonce[4:8])
state[15] = binary.LittleEndian.Uint32(nonce[8:12])
}
func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte {
if !cpu.X86.HasSSSE3 {
return c.sealGeneric(dst, nonce, plaintext, additionalData)
}
var state [16]uint32
setupState(&state, &c.key, nonce)
ret, out := sliceForAppend(dst, len(plaintext)+16)
if alias.InexactOverlap(out, plaintext) {
panic("chacha20poly1305: invalid buffer overlap")
}
chacha20Poly1305Seal(out[:], state[:], plaintext, additionalData)
return ret
}
func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
if !cpu.X86.HasSSSE3 {
return c.openGeneric(dst, nonce, ciphertext, additionalData)
}
var state [16]uint32
setupState(&state, &c.key, nonce)
ciphertext = ciphertext[:len(ciphertext)-16]
ret, out := sliceForAppend(dst, len(ciphertext))
if alias.InexactOverlap(out, ciphertext) {
panic("chacha20poly1305: invalid buffer overlap")
}
if !chacha20Poly1305Open(out, state[:], ciphertext, additionalData) {
for i := range out {
out[i] = 0
}
return nil, errOpen
}
return ret, nil
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,81 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package chacha20poly1305
import (
"encoding/binary"
"golang.org/x/crypto/chacha20"
"golang.org/x/crypto/internal/alias"
"golang.org/x/crypto/internal/poly1305"
)
func writeWithPadding(p *poly1305.MAC, b []byte) {
p.Write(b)
if rem := len(b) % 16; rem != 0 {
var buf [16]byte
padLen := 16 - rem
p.Write(buf[:padLen])
}
}
func writeUint64(p *poly1305.MAC, n int) {
var buf [8]byte
binary.LittleEndian.PutUint64(buf[:], uint64(n))
p.Write(buf[:])
}
func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte {
ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize)
ciphertext, tag := out[:len(plaintext)], out[len(plaintext):]
if alias.InexactOverlap(out, plaintext) {
panic("chacha20poly1305: invalid buffer overlap")
}
var polyKey [32]byte
s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
s.XORKeyStream(polyKey[:], polyKey[:])
s.SetCounter(1) // set the counter to 1, skipping 32 bytes
s.XORKeyStream(ciphertext, plaintext)
p := poly1305.New(&polyKey)
writeWithPadding(p, additionalData)
writeWithPadding(p, ciphertext)
writeUint64(p, len(additionalData))
writeUint64(p, len(plaintext))
p.Sum(tag[:0])
return ret
}
func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
tag := ciphertext[len(ciphertext)-16:]
ciphertext = ciphertext[:len(ciphertext)-16]
var polyKey [32]byte
s, _ := chacha20.NewUnauthenticatedCipher(c.key[:], nonce)
s.XORKeyStream(polyKey[:], polyKey[:])
s.SetCounter(1) // set the counter to 1, skipping 32 bytes
p := poly1305.New(&polyKey)
writeWithPadding(p, additionalData)
writeWithPadding(p, ciphertext)
writeUint64(p, len(additionalData))
writeUint64(p, len(ciphertext))
ret, out := sliceForAppend(dst, len(ciphertext))
if alias.InexactOverlap(out, ciphertext) {
panic("chacha20poly1305: invalid buffer overlap")
}
if !p.Verify(tag) {
for i := range out {
out[i] = 0
}
return nil, errOpen
}
s.XORKeyStream(out, ciphertext)
return ret, nil
}

View File

@@ -0,0 +1,15 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || !gc || purego
package chacha20poly1305
func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte {
return c.sealGeneric(dst, nonce, plaintext, additionalData)
}
func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
return c.openGeneric(dst, nonce, ciphertext, additionalData)
}

View File

@@ -0,0 +1,86 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package chacha20poly1305
import (
"crypto/cipher"
"errors"
"golang.org/x/crypto/chacha20"
)
type xchacha20poly1305 struct {
key [KeySize]byte
}
// NewX returns a XChaCha20-Poly1305 AEAD that uses the given 256-bit key.
//
// XChaCha20-Poly1305 is a ChaCha20-Poly1305 variant that takes a longer nonce,
// suitable to be generated randomly without risk of collisions. It should be
// preferred when nonce uniqueness cannot be trivially ensured, or whenever
// nonces are randomly generated.
func NewX(key []byte) (cipher.AEAD, error) {
if len(key) != KeySize {
return nil, errors.New("chacha20poly1305: bad key length")
}
ret := new(xchacha20poly1305)
copy(ret.key[:], key)
return ret, nil
}
func (*xchacha20poly1305) NonceSize() int {
return NonceSizeX
}
func (*xchacha20poly1305) Overhead() int {
return Overhead
}
func (x *xchacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte {
if len(nonce) != NonceSizeX {
panic("chacha20poly1305: bad nonce length passed to Seal")
}
// XChaCha20-Poly1305 technically supports a 64-bit counter, so there is no
// size limit. However, since we reuse the ChaCha20-Poly1305 implementation,
// the second half of the counter is not available. This is unlikely to be
// an issue because the cipher.AEAD API requires the entire message to be in
// memory, and the counter overflows at 256 GB.
if uint64(len(plaintext)) > (1<<38)-64 {
panic("chacha20poly1305: plaintext too large")
}
c := new(chacha20poly1305)
hKey, _ := chacha20.HChaCha20(x.key[:], nonce[0:16])
copy(c.key[:], hKey)
// The first 4 bytes of the final nonce are unused counter space.
cNonce := make([]byte, NonceSize)
copy(cNonce[4:12], nonce[16:24])
return c.seal(dst, cNonce[:], plaintext, additionalData)
}
func (x *xchacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
if len(nonce) != NonceSizeX {
panic("chacha20poly1305: bad nonce length passed to Open")
}
if len(ciphertext) < 16 {
return nil, errOpen
}
if uint64(len(ciphertext)) > (1<<38)-48 {
panic("chacha20poly1305: ciphertext too large")
}
c := new(chacha20poly1305)
hKey, _ := chacha20.HChaCha20(x.key[:], nonce[0:16])
copy(c.key[:], hKey)
// The first 4 bytes of the final nonce are unused counter space.
cNonce := make([]byte, NonceSize)
copy(cNonce[4:12], nonce[16:24])
return c.open(dst, cNonce[:], ciphertext, additionalData)
}

90
vendor/golang.org/x/crypto/curve25519/curve25519.go generated vendored Normal file
View File

@@ -0,0 +1,90 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package curve25519 provides an implementation of the X25519 function, which
// performs scalar multiplication on the elliptic curve known as Curve25519.
// See RFC 7748.
//
// This package is a wrapper for the X25519 implementation
// in the crypto/ecdh package.
package curve25519
import "crypto/ecdh"
// ScalarMult sets dst to the product scalar * point.
//
// Deprecated: when provided a low-order point, ScalarMult will set dst to all
// zeroes, irrespective of the scalar. Instead, use the X25519 function, which
// will return an error.
func ScalarMult(dst, scalar, point *[32]byte) {
if _, err := x25519(dst, scalar[:], point[:]); err != nil {
// The only error condition for x25519 when the inputs are 32 bytes long
// is if the output would have been the all-zero value.
for i := range dst {
dst[i] = 0
}
}
}
// ScalarBaseMult sets dst to the product scalar * base where base is the
// standard generator.
//
// It is recommended to use the X25519 function with Basepoint instead, as
// copying into fixed size arrays can lead to unexpected bugs.
func ScalarBaseMult(dst, scalar *[32]byte) {
curve := ecdh.X25519()
priv, err := curve.NewPrivateKey(scalar[:])
if err != nil {
panic("curve25519: internal error: scalarBaseMult was not 32 bytes")
}
copy(dst[:], priv.PublicKey().Bytes())
}
const (
// ScalarSize is the size of the scalar input to X25519.
ScalarSize = 32
// PointSize is the size of the point input to X25519.
PointSize = 32
)
// Basepoint is the canonical Curve25519 generator.
var Basepoint []byte
var basePoint = [32]byte{9}
func init() { Basepoint = basePoint[:] }
// X25519 returns the result of the scalar multiplication (scalar * point),
// according to RFC 7748, Section 5. scalar, point and the return value are
// slices of 32 bytes.
//
// scalar can be generated at random, for example with crypto/rand. point should
// be either Basepoint or the output of another X25519 call.
//
// If point is Basepoint (but not if it's a different slice with the same
// contents) a precomputed implementation might be used for performance.
func X25519(scalar, point []byte) ([]byte, error) {
// Outline the body of function, to let the allocation be inlined in the
// caller, and possibly avoid escaping to the heap.
var dst [32]byte
return x25519(&dst, scalar, point)
}
func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) {
curve := ecdh.X25519()
pub, err := curve.NewPublicKey(point)
if err != nil {
return nil, err
}
priv, err := curve.NewPrivateKey(scalar)
if err != nil {
return nil, err
}
out, err := priv.ECDH(pub)
if err != nil {
return nil, err
}
copy(dst[:], out)
return dst[:], nil
}

95
vendor/golang.org/x/crypto/hkdf/hkdf.go generated vendored Normal file
View File

@@ -0,0 +1,95 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package hkdf implements the HMAC-based Extract-and-Expand Key Derivation
// Function (HKDF) as defined in RFC 5869.
//
// HKDF is a cryptographic key derivation function (KDF) with the goal of
// expanding limited input keying material into one or more cryptographically
// strong secret keys.
package hkdf
import (
"crypto/hmac"
"errors"
"hash"
"io"
)
// Extract generates a pseudorandom key for use with Expand from an input secret
// and an optional independent salt.
//
// Only use this function if you need to reuse the extracted key with multiple
// Expand invocations and different context values. Most common scenarios,
// including the generation of multiple keys, should use New instead.
func Extract(hash func() hash.Hash, secret, salt []byte) []byte {
if salt == nil {
salt = make([]byte, hash().Size())
}
extractor := hmac.New(hash, salt)
extractor.Write(secret)
return extractor.Sum(nil)
}
type hkdf struct {
expander hash.Hash
size int
info []byte
counter byte
prev []byte
buf []byte
}
func (f *hkdf) Read(p []byte) (int, error) {
// Check whether enough data can be generated
need := len(p)
remains := len(f.buf) + int(255-f.counter+1)*f.size
if remains < need {
return 0, errors.New("hkdf: entropy limit reached")
}
// Read any leftover from the buffer
n := copy(p, f.buf)
p = p[n:]
// Fill the rest of the buffer
for len(p) > 0 {
if f.counter > 1 {
f.expander.Reset()
}
f.expander.Write(f.prev)
f.expander.Write(f.info)
f.expander.Write([]byte{f.counter})
f.prev = f.expander.Sum(f.prev[:0])
f.counter++
// Copy the new batch into p
f.buf = f.prev
n = copy(p, f.buf)
p = p[n:]
}
// Save leftovers for next run
f.buf = f.buf[n:]
return need, nil
}
// Expand returns a Reader, from which keys can be read, using the given
// pseudorandom key and optional context info, skipping the extraction step.
//
// The pseudorandomKey should have been generated by Extract, or be a uniformly
// random or pseudorandom cryptographically strong key. See RFC 5869, Section
// 3.3. Most common scenarios will want to use New instead.
func Expand(hash func() hash.Hash, pseudorandomKey, info []byte) io.Reader {
expander := hmac.New(hash, pseudorandomKey)
return &hkdf{expander, expander.Size(), info, 1, nil, nil}
}
// New returns a Reader, from which keys can be read, using the given hash,
// secret, salt and context info. Salt and info can be nil.
func New(hash func() hash.Hash, secret, salt, info []byte) io.Reader {
prk := Extract(hash, secret, salt)
return Expand(hash, prk, info)
}

31
vendor/golang.org/x/crypto/internal/alias/alias.go generated vendored Normal file
View File

@@ -0,0 +1,31 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !purego
// Package alias implements memory aliasing tests.
package alias
import "unsafe"
// AnyOverlap reports whether x and y share memory at any (not necessarily
// corresponding) index. The memory beyond the slice length is ignored.
func AnyOverlap(x, y []byte) bool {
return len(x) > 0 && len(y) > 0 &&
uintptr(unsafe.Pointer(&x[0])) <= uintptr(unsafe.Pointer(&y[len(y)-1])) &&
uintptr(unsafe.Pointer(&y[0])) <= uintptr(unsafe.Pointer(&x[len(x)-1]))
}
// InexactOverlap reports whether x and y share memory at any non-corresponding
// index. The memory beyond the slice length is ignored. Note that x and y can
// have different lengths and still not have any inexact overlap.
//
// InexactOverlap can be used to implement the requirements of the crypto/cipher
// AEAD, Block, BlockMode and Stream interfaces.
func InexactOverlap(x, y []byte) bool {
if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] {
return false
}
return AnyOverlap(x, y)
}

View File

@@ -0,0 +1,34 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build purego
// Package alias implements memory aliasing tests.
package alias
// This is the Google App Engine standard variant based on reflect
// because the unsafe package and cgo are disallowed.
import "reflect"
// AnyOverlap reports whether x and y share memory at any (not necessarily
// corresponding) index. The memory beyond the slice length is ignored.
func AnyOverlap(x, y []byte) bool {
return len(x) > 0 && len(y) > 0 &&
reflect.ValueOf(&x[0]).Pointer() <= reflect.ValueOf(&y[len(y)-1]).Pointer() &&
reflect.ValueOf(&y[0]).Pointer() <= reflect.ValueOf(&x[len(x)-1]).Pointer()
}
// InexactOverlap reports whether x and y share memory at any non-corresponding
// index. The memory beyond the slice length is ignored. Note that x and y can
// have different lengths and still not have any inexact overlap.
//
// InexactOverlap can be used to implement the requirements of the crypto/cipher
// AEAD, Block, BlockMode and Stream interfaces.
func InexactOverlap(x, y []byte) bool {
if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] {
return false
}
return AnyOverlap(x, y)
}

View File

@@ -0,0 +1,9 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
package poly1305
type mac struct{ macGeneric }

View File

@@ -0,0 +1,99 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package poly1305 implements Poly1305 one-time message authentication code as
// specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
//
// Poly1305 is a fast, one-time authentication function. It is infeasible for an
// attacker to generate an authenticator for a message without the key. However, a
// key must only be used for a single message. Authenticating two different
// messages with the same key allows an attacker to forge authenticators for other
// messages with the same key.
//
// Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
// used with a fixed key in order to generate one-time keys from an nonce.
// However, in this package AES isn't used and the one-time key is specified
// directly.
package poly1305
import "crypto/subtle"
// TagSize is the size, in bytes, of a poly1305 authenticator.
const TagSize = 16
// Sum generates an authenticator for msg using a one-time key and puts the
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
h := New(key)
h.Write(m)
h.Sum(out[:0])
}
// Verify returns true if mac is a valid authenticator for m with the given key.
func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
var tmp [16]byte
Sum(&tmp, m, key)
return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1
}
// New returns a new MAC computing an authentication
// tag of all data written to it with the given key.
// This allows writing the message progressively instead
// of passing it as a single slice. Common users should use
// the Sum function instead.
//
// The key must be unique for each message, as authenticating
// two different messages with the same key allows an attacker
// to forge messages at will.
func New(key *[32]byte) *MAC {
m := &MAC{}
initialize(key, &m.macState)
return m
}
// MAC is an io.Writer computing an authentication tag
// of the data written to it.
//
// MAC cannot be used like common hash.Hash implementations,
// because using a poly1305 key twice breaks its security.
// Therefore writing data to a running MAC after calling
// Sum or Verify causes it to panic.
type MAC struct {
mac // platform-dependent implementation
finalized bool
}
// Size returns the number of bytes Sum will return.
func (h *MAC) Size() int { return TagSize }
// Write adds more data to the running message authentication code.
// It never returns an error.
//
// It must not be called after the first call of Sum or Verify.
func (h *MAC) Write(p []byte) (n int, err error) {
if h.finalized {
panic("poly1305: write to MAC after Sum or Verify")
}
return h.mac.Write(p)
}
// Sum computes the authenticator of all data written to the
// message authentication code.
func (h *MAC) Sum(b []byte) []byte {
var mac [TagSize]byte
h.mac.Sum(&mac)
h.finalized = true
return append(b, mac[:]...)
}
// Verify returns whether the authenticator of all data written to
// the message authentication code matches the expected value.
func (h *MAC) Verify(expected []byte) bool {
var mac [TagSize]byte
h.mac.Sum(&mac)
h.finalized = true
return subtle.ConstantTimeCompare(expected, mac[:]) == 1
}

View File

@@ -0,0 +1,47 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
package poly1305
//go:noescape
func update(state *macState, msg []byte)
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
// using function pointers would carry a major performance cost.
type mac struct{ macGeneric }
func (h *mac) Write(p []byte) (int, error) {
nn := len(p)
if h.offset > 0 {
n := copy(h.buffer[h.offset:], p)
if h.offset+n < TagSize {
h.offset += n
return nn, nil
}
p = p[n:]
h.offset = 0
update(&h.macState, h.buffer[:])
}
if n := len(p) - (len(p) % TagSize); n > 0 {
update(&h.macState, p[:n])
p = p[n:]
}
if len(p) > 0 {
h.offset += copy(h.buffer[h.offset:], p)
}
return nn, nil
}
func (h *mac) Sum(out *[16]byte) {
state := h.macState
if h.offset > 0 {
update(&state, h.buffer[:h.offset])
}
finalize(out, &state.h, &state.s)
}

View File

@@ -0,0 +1,108 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
#include "textflag.h"
#define POLY1305_ADD(msg, h0, h1, h2) \
ADDQ 0(msg), h0; \
ADCQ 8(msg), h1; \
ADCQ $1, h2; \
LEAQ 16(msg), msg
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
MOVQ r0, AX; \
MULQ h0; \
MOVQ AX, t0; \
MOVQ DX, t1; \
MOVQ r0, AX; \
MULQ h1; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ r0, t2; \
IMULQ h2, t2; \
ADDQ DX, t2; \
\
MOVQ r1, AX; \
MULQ h0; \
ADDQ AX, t1; \
ADCQ $0, DX; \
MOVQ DX, h0; \
MOVQ r1, t3; \
IMULQ h2, t3; \
MOVQ r1, AX; \
MULQ h1; \
ADDQ AX, t2; \
ADCQ DX, t3; \
ADDQ h0, t2; \
ADCQ $0, t3; \
\
MOVQ t0, h0; \
MOVQ t1, h1; \
MOVQ t2, h2; \
ANDQ $3, h2; \
MOVQ t2, t0; \
ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
ADDQ t0, h0; \
ADCQ t3, h1; \
ADCQ $0, h2; \
SHRQ $2, t3, t2; \
SHRQ $2, t3; \
ADDQ t2, h0; \
ADCQ t3, h1; \
ADCQ $0, h2
// func update(state *[7]uint64, msg []byte)
TEXT ·update(SB), $0-32
MOVQ state+0(FP), DI
MOVQ msg_base+8(FP), SI
MOVQ msg_len+16(FP), R15
MOVQ 0(DI), R8 // h0
MOVQ 8(DI), R9 // h1
MOVQ 16(DI), R10 // h2
MOVQ 24(DI), R11 // r0
MOVQ 32(DI), R12 // r1
CMPQ R15, $16
JB bytes_between_0_and_15
loop:
POLY1305_ADD(SI, R8, R9, R10)
multiply:
POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
SUBQ $16, R15
CMPQ R15, $16
JAE loop
bytes_between_0_and_15:
TESTQ R15, R15
JZ done
MOVQ $1, BX
XORQ CX, CX
XORQ R13, R13
ADDQ R15, SI
flush_buffer:
SHLQ $8, BX, CX
SHLQ $8, BX
MOVB -1(SI), R13
XORQ R13, BX
DECQ SI
DECQ R15
JNZ flush_buffer
ADDQ BX, R8
ADCQ CX, R9
ADCQ $0, R10
MOVQ $16, R15
JMP multiply
done:
MOVQ R8, 0(DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
RET

View File

@@ -0,0 +1,312 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file provides the generic implementation of Sum and MAC. Other files
// might provide optimized assembly implementations of some of this code.
package poly1305
import (
"encoding/binary"
"math/bits"
)
// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
// for a 64 bytes message is approximately
//
// s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r mod 2¹³⁰ - 5
//
// for some secret r and s. It can be computed sequentially like
//
// for len(msg) > 0:
// h += read(msg, 16)
// h *= r
// h %= 2¹³⁰ - 5
// return h + s
//
// All the complexity is about doing performant constant-time math on numbers
// larger than any available numeric type.
func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
h := newMACGeneric(key)
h.Write(msg)
h.Sum(out)
}
func newMACGeneric(key *[32]byte) macGeneric {
m := macGeneric{}
initialize(key, &m.macState)
return m
}
// macState holds numbers in saturated 64-bit little-endian limbs. That is,
// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
type macState struct {
// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
// can grow larger during and after rounds. It must, however, remain below
// 2 * (2¹³⁰ - 5).
h [3]uint64
// r and s are the private key components.
r [2]uint64
s [2]uint64
}
type macGeneric struct {
macState
buffer [TagSize]byte
offset int
}
// Write splits the incoming message into TagSize chunks, and passes them to
// update. It buffers incomplete chunks.
func (h *macGeneric) Write(p []byte) (int, error) {
nn := len(p)
if h.offset > 0 {
n := copy(h.buffer[h.offset:], p)
if h.offset+n < TagSize {
h.offset += n
return nn, nil
}
p = p[n:]
h.offset = 0
updateGeneric(&h.macState, h.buffer[:])
}
if n := len(p) - (len(p) % TagSize); n > 0 {
updateGeneric(&h.macState, p[:n])
p = p[n:]
}
if len(p) > 0 {
h.offset += copy(h.buffer[h.offset:], p)
}
return nn, nil
}
// Sum flushes the last incomplete chunk from the buffer, if any, and generates
// the MAC output. It does not modify its state, in order to allow for multiple
// calls to Sum, even if no Write is allowed after Sum.
func (h *macGeneric) Sum(out *[TagSize]byte) {
state := h.macState
if h.offset > 0 {
updateGeneric(&state, h.buffer[:h.offset])
}
finalize(out, &state.h, &state.s)
}
// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
// clears some bits of the secret coefficient to make it possible to implement
// multiplication more efficiently.
const (
rMask0 = 0x0FFFFFFC0FFFFFFF
rMask1 = 0x0FFFFFFC0FFFFFFC
)
// initialize loads the 256-bit key into the two 128-bit secret values r and s.
func initialize(key *[32]byte, m *macState) {
m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
m.s[0] = binary.LittleEndian.Uint64(key[16:24])
m.s[1] = binary.LittleEndian.Uint64(key[24:32])
}
// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
// bits.Mul64 and bits.Add64 intrinsics.
type uint128 struct {
lo, hi uint64
}
func mul64(a, b uint64) uint128 {
hi, lo := bits.Mul64(a, b)
return uint128{lo, hi}
}
func add128(a, b uint128) uint128 {
lo, c := bits.Add64(a.lo, b.lo, 0)
hi, c := bits.Add64(a.hi, b.hi, c)
if c != 0 {
panic("poly1305: unexpected overflow")
}
return uint128{lo, hi}
}
func shiftRightBy2(a uint128) uint128 {
a.lo = a.lo>>2 | (a.hi&3)<<62
a.hi = a.hi >> 2
return a
}
// updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
// 128 bits of message, it computes
//
// h₊ = (h + m) * r mod 2¹³⁰ - 5
//
// If the msg length is not a multiple of TagSize, it assumes the last
// incomplete chunk is the final one.
func updateGeneric(state *macState, msg []byte) {
h0, h1, h2 := state.h[0], state.h[1], state.h[2]
r0, r1 := state.r[0], state.r[1]
for len(msg) > 0 {
var c uint64
// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
// reduced at the end of the multiplication below.
//
// The spec requires us to set a bit just above the message size, not to
// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
// add 1 to the most significant (2¹²⁸) limb, h2.
if len(msg) >= TagSize {
h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
h2 += c + 1
msg = msg[TagSize:]
} else {
var buf [TagSize]byte
copy(buf[:], msg)
buf[len(msg)] = 1
h0, c = bits.Add64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
h1, c = bits.Add64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
h2 += c
msg = nil
}
// Multiplication of big number limbs is similar to elementary school
// columnar multiplication. Instead of digits, there are 64-bit limbs.
//
// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
//
// h2 h1 h0 x
// r1 r0 =
// ----------------
// h2r0 h1r0 h0r0 <-- individual 128-bit products
// + h2r1 h1r1 h0r1
// ------------------------
// m3 m2 m1 m0 <-- result in 128-bit overlapping limbs
// ------------------------
// m3.hi m2.hi m1.hi m0.hi <-- carry propagation
// + m3.lo m2.lo m1.lo m0.lo
// -------------------------------
// t4 t3 t2 t1 t0 <-- final result in 64-bit limbs
//
// The main difference from pen-and-paper multiplication is that we do
// carry propagation in a separate step, as if we wrote two digit sums
// at first (the 128-bit limbs), and then carried the tens all at once.
h0r0 := mul64(h0, r0)
h1r0 := mul64(h1, r0)
h2r0 := mul64(h2, r0)
h0r1 := mul64(h0, r1)
h1r1 := mul64(h1, r1)
h2r1 := mul64(h2, r1)
// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
// top 4 bits cleared by rMask{0,1}, we know that their product is not going
// to overflow 64 bits, so we can ignore the high part of the products.
//
// This also means that the product doesn't have a fifth limb (t4).
if h2r0.hi != 0 {
panic("poly1305: unexpected overflow")
}
if h2r1.hi != 0 {
panic("poly1305: unexpected overflow")
}
m0 := h0r0
m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
m3 := h2r1
t0 := m0.lo
t1, c := bits.Add64(m1.lo, m0.hi, 0)
t2, c := bits.Add64(m2.lo, m1.hi, c)
t3, _ := bits.Add64(m3.lo, m2.hi, c)
// Now we have the result as 4 64-bit limbs, and we need to reduce it
// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
// a cheap partial reduction according to the reduction identity
//
// c * 2¹³⁰ + n = c * 5 + n mod 2¹³⁰ - 5
//
// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
// assumptions we make about h in the rest of the code.
//
// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23
// We split the final result at the 2¹³⁰ mark into h and cc, the carry.
// Note that the carry bits are effectively shifted left by 2, in other
// words, cc = c * 4 for the c in the reduction identity.
h0, h1, h2 = t0, t1, t2&maskLow2Bits
cc := uint128{t2 & maskNotLow2Bits, t3}
// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.
h0, c = bits.Add64(h0, cc.lo, 0)
h1, c = bits.Add64(h1, cc.hi, c)
h2 += c
cc = shiftRightBy2(cc)
h0, c = bits.Add64(h0, cc.lo, 0)
h1, c = bits.Add64(h1, cc.hi, c)
h2 += c
// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
//
// 5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
}
state.h[0], state.h[1], state.h[2] = h0, h1, h2
}
const (
maskLow2Bits uint64 = 0x0000000000000003
maskNotLow2Bits uint64 = ^maskLow2Bits
)
// select64 returns x if v == 1 and y if v == 0, in constant time.
func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }
// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
const (
p0 = 0xFFFFFFFFFFFFFFFB
p1 = 0xFFFFFFFFFFFFFFFF
p2 = 0x0000000000000003
)
// finalize completes the modular reduction of h and computes
//
// out = h + s mod 2¹²⁸
func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
h0, h1, h2 := h[0], h[1], h[2]
// After the partial reduction in updateGeneric, h might be more than
// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction
// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
// result if the subtraction underflows, and t otherwise.
hMinusP0, b := bits.Sub64(h0, p0, 0)
hMinusP1, b := bits.Sub64(h1, p1, b)
_, b = bits.Sub64(h2, p2, b)
// h = h if h < p else h - p
h0 = select64(b, h0, hMinusP0)
h1 = select64(b, h1, hMinusP1)
// Finally, we compute the last Poly1305 step
//
// tag = h + s mod 2¹²⁸
//
// by just doing a wide addition with the 128 low bits of h and discarding
// the overflow.
h0, c := bits.Add64(h0, s[0], 0)
h1, _ = bits.Add64(h1, s[1], c)
binary.LittleEndian.PutUint64(out[0:8], h0)
binary.LittleEndian.PutUint64(out[8:16], h1)
}

View File

@@ -0,0 +1,47 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
package poly1305
//go:noescape
func update(state *macState, msg []byte)
// mac is a wrapper for macGeneric that redirects calls that would have gone to
// updateGeneric to update.
//
// Its Write and Sum methods are otherwise identical to the macGeneric ones, but
// using function pointers would carry a major performance cost.
type mac struct{ macGeneric }
func (h *mac) Write(p []byte) (int, error) {
nn := len(p)
if h.offset > 0 {
n := copy(h.buffer[h.offset:], p)
if h.offset+n < TagSize {
h.offset += n
return nn, nil
}
p = p[n:]
h.offset = 0
update(&h.macState, h.buffer[:])
}
if n := len(p) - (len(p) % TagSize); n > 0 {
update(&h.macState, p[:n])
p = p[n:]
}
if len(p) > 0 {
h.offset += copy(h.buffer[h.offset:], p)
}
return nn, nil
}
func (h *mac) Sum(out *[16]byte) {
state := h.macState
if h.offset > 0 {
update(&state, h.buffer[:h.offset])
}
finalize(out, &state.h, &state.s)
}

View File

@@ -0,0 +1,179 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
#include "textflag.h"
// This was ported from the amd64 implementation.
#define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
MOVD (msg), t0; \
MOVD 8(msg), t1; \
MOVD $1, t2; \
ADDC t0, h0, h0; \
ADDE t1, h1, h1; \
ADDE t2, h2; \
ADD $16, msg
#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3, t4, t5) \
MULLD r0, h0, t0; \
MULHDU r0, h0, t1; \
MULLD r0, h1, t4; \
MULHDU r0, h1, t5; \
ADDC t4, t1, t1; \
MULLD r0, h2, t2; \
MULHDU r1, h0, t4; \
MULLD r1, h0, h0; \
ADDE t5, t2, t2; \
ADDC h0, t1, t1; \
MULLD h2, r1, t3; \
ADDZE t4, h0; \
MULHDU r1, h1, t5; \
MULLD r1, h1, t4; \
ADDC t4, t2, t2; \
ADDE t5, t3, t3; \
ADDC h0, t2, t2; \
MOVD $-4, t4; \
ADDZE t3; \
RLDICL $0, t2, $62, h2; \
AND t2, t4, h0; \
ADDC t0, h0, h0; \
ADDE t3, t1, h1; \
SLD $62, t3, t4; \
SRD $2, t2; \
ADDZE h2; \
OR t4, t2, t2; \
SRD $2, t3; \
ADDC t2, h0, h0; \
ADDE t3, h1, h1; \
ADDZE h2
DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
GLOBL ·poly1305Mask<>(SB), RODATA, $16
// func update(state *[7]uint64, msg []byte)
TEXT ·update(SB), $0-32
MOVD state+0(FP), R3
MOVD msg_base+8(FP), R4
MOVD msg_len+16(FP), R5
MOVD 0(R3), R8 // h0
MOVD 8(R3), R9 // h1
MOVD 16(R3), R10 // h2
MOVD 24(R3), R11 // r0
MOVD 32(R3), R12 // r1
CMP R5, $16
BLT bytes_between_0_and_15
loop:
POLY1305_ADD(R4, R8, R9, R10, R20, R21, R22)
PCALIGN $16
multiply:
POLY1305_MUL(R8, R9, R10, R11, R12, R16, R17, R18, R14, R20, R21)
ADD $-16, R5
CMP R5, $16
BGE loop
bytes_between_0_and_15:
CMP R5, $0
BEQ done
MOVD $0, R16 // h0
MOVD $0, R17 // h1
flush_buffer:
CMP R5, $8
BLE just1
MOVD $8, R21
SUB R21, R5, R21
// Greater than 8 -- load the rightmost remaining bytes in msg
// and put into R17 (h1)
MOVD (R4)(R21), R17
MOVD $16, R22
// Find the offset to those bytes
SUB R5, R22, R22
SLD $3, R22
// Shift to get only the bytes in msg
SRD R22, R17, R17
// Put 1 at high end
MOVD $1, R23
SLD $3, R21
SLD R21, R23, R23
OR R23, R17, R17
// Remainder is 8
MOVD $8, R5
just1:
CMP R5, $8
BLT less8
// Exactly 8
MOVD (R4), R16
CMP R17, $0
// Check if we've already set R17; if not
// set 1 to indicate end of msg.
BNE carry
MOVD $1, R17
BR carry
less8:
MOVD $0, R16 // h0
MOVD $0, R22 // shift count
CMP R5, $4
BLT less4
MOVWZ (R4), R16
ADD $4, R4
ADD $-4, R5
MOVD $32, R22
less4:
CMP R5, $2
BLT less2
MOVHZ (R4), R21
SLD R22, R21, R21
OR R16, R21, R16
ADD $16, R22
ADD $-2, R5
ADD $2, R4
less2:
CMP R5, $0
BEQ insert1
MOVBZ (R4), R21
SLD R22, R21, R21
OR R16, R21, R16
ADD $8, R22
insert1:
// Insert 1 at end of msg
MOVD $1, R21
SLD R22, R21, R21
OR R16, R21, R16
carry:
// Add new values to h0, h1, h2
ADDC R16, R8
ADDE R17, R9
ADDZE R10, R10
MOVD $16, R5
ADD R5, R4
BR multiply
done:
// Save h0, h1, h2 in state
MOVD R8, 0(R3)
MOVD R9, 8(R3)
MOVD R10, 16(R3)
RET

View File

@@ -0,0 +1,76 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
package poly1305
import (
"golang.org/x/sys/cpu"
)
// updateVX is an assembly implementation of Poly1305 that uses vector
// instructions. It must only be called if the vector facility (vx) is
// available.
//
//go:noescape
func updateVX(state *macState, msg []byte)
// mac is a replacement for macGeneric that uses a larger buffer and redirects
// calls that would have gone to updateGeneric to updateVX if the vector
// facility is installed.
//
// A larger buffer is required for good performance because the vector
// implementation has a higher fixed cost per call than the generic
// implementation.
type mac struct {
macState
buffer [16 * TagSize]byte // size must be a multiple of block size (16)
offset int
}
func (h *mac) Write(p []byte) (int, error) {
nn := len(p)
if h.offset > 0 {
n := copy(h.buffer[h.offset:], p)
if h.offset+n < len(h.buffer) {
h.offset += n
return nn, nil
}
p = p[n:]
h.offset = 0
if cpu.S390X.HasVX {
updateVX(&h.macState, h.buffer[:])
} else {
updateGeneric(&h.macState, h.buffer[:])
}
}
tail := len(p) % len(h.buffer) // number of bytes to copy into buffer
body := len(p) - tail // number of bytes to process now
if body > 0 {
if cpu.S390X.HasVX {
updateVX(&h.macState, p[:body])
} else {
updateGeneric(&h.macState, p[:body])
}
}
h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0
return nn, nil
}
func (h *mac) Sum(out *[TagSize]byte) {
state := h.macState
remainder := h.buffer[:h.offset]
// Use the generic implementation if we have 2 or fewer blocks left
// to sum. The vector implementation has a higher startup time.
if cpu.S390X.HasVX && len(remainder) > 2*TagSize {
updateVX(&state, remainder)
} else if len(remainder) > 0 {
updateGeneric(&state, remainder)
}
finalize(out, &state.h, &state.s)
}

View File

@@ -0,0 +1,503 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build gc && !purego
#include "textflag.h"
// This implementation of Poly1305 uses the vector facility (vx)
// to process up to 2 blocks (32 bytes) per iteration using an
// algorithm based on the one described in:
//
// NEON crypto, Daniel J. Bernstein & Peter Schwabe
// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
//
// This algorithm uses 5 26-bit limbs to represent a 130-bit
// value. These limbs are, for the most part, zero extended and
// placed into 64-bit vector register elements. Each vector
// register is 128-bits wide and so holds 2 of these elements.
// Using 26-bit limbs allows us plenty of headroom to accommodate
// accumulations before and after multiplication without
// overflowing either 32-bits (before multiplication) or 64-bits
// (after multiplication).
//
// In order to parallelise the operations required to calculate
// the sum we use two separate accumulators and then sum those
// in an extra final step. For compatibility with the generic
// implementation we perform this summation at the end of every
// updateVX call.
//
// To use two accumulators we must multiply the message blocks
// by r² rather than r. Only the final message block should be
// multiplied by r.
//
// Example:
//
// We want to calculate the sum (h) for a 64 byte message (m):
//
// h = m[0:16]r + m[16:32]r³ + m[32:48]r² + m[48:64]r
//
// To do this we split the calculation into the even indices
// and odd indices of the message. These form our SIMD 'lanes':
//
// h = m[ 0:16]r + m[32:48]r² + <- lane 0
// m[16:32]r³ + m[48:64]r <- lane 1
//
// To calculate this iteratively we refactor so that both lanes
// are written in terms of r² and r:
//
// h = (m[ 0:16]r² + m[32:48])r² + <- lane 0
// (m[16:32]r² + m[48:64])r <- lane 1
// ^ ^
// | coefficients for second iteration
// coefficients for first iteration
//
// So in this case we would have two iterations. In the first
// both lanes are multiplied by r². In the second only the
// first lane is multiplied by r² and the second lane is
// instead multiplied by r. This gives use the odd and even
// powers of r that we need from the original equation.
//
// Notation:
//
// h - accumulator
// r - key
// m - message
//
// [a, b] - SIMD register holding two 64-bit values
// [a, b, c, d] - SIMD register holding four 32-bit values
// x[n] - limb n of variable x with bit width i
//
// Limbs are expressed in little endian order, so for 26-bit
// limbs x[4] will be the most significant limb and x[0]
// will be the least significant limb.
// masking constants
#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits
#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits
// expansion constants (see EXPAND macro)
#define EX0 V2
#define EX1 V3
#define EX2 V4
// key (r², r or 1 depending on context)
#define R_0 V5
#define R_1 V6
#define R_2 V7
#define R_3 V8
#define R_4 V9
// precalculated coefficients (5r², 5r or 0 depending on context)
#define R5_1 V10
#define R5_2 V11
#define R5_3 V12
#define R5_4 V13
// message block (m)
#define M_0 V14
#define M_1 V15
#define M_2 V16
#define M_3 V17
#define M_4 V18
// accumulator (h)
#define H_0 V19
#define H_1 V20
#define H_2 V21
#define H_3 V22
#define H_4 V23
// temporary registers (for short-lived values)
#define T_0 V24
#define T_1 V25
#define T_2 V26
#define T_3 V27
#define T_4 V28
GLOBL ·constants<>(SB), RODATA, $0x30
// EX0
DATA ·constants<>+0x00(SB)/8, $0x0006050403020100
DATA ·constants<>+0x08(SB)/8, $0x1016151413121110
// EX1
DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706
DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716
// EX2
DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d
DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d
// MULTIPLY multiplies each lane of f and g, partially reduced
// modulo 2¹³ - 5. The result, h, consists of partial products
// in each lane that need to be reduced further to produce the
// final result.
//
// h = (fg) % 2¹³ + (5fg) / 2¹³
//
// Note that the multiplication by 5 of the high bits is
// achieved by precalculating the multiplication of four of the
// g coefficients by 5. These are g51-g54.
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
VMLOF f0, g0, h0 \
VMLOF f0, g3, h3 \
VMLOF f0, g1, h1 \
VMLOF f0, g4, h4 \
VMLOF f0, g2, h2 \
VMLOF f1, g54, T_0 \
VMLOF f1, g2, T_3 \
VMLOF f1, g0, T_1 \
VMLOF f1, g3, T_4 \
VMLOF f1, g1, T_2 \
VMALOF f2, g53, h0, h0 \
VMALOF f2, g1, h3, h3 \
VMALOF f2, g54, h1, h1 \
VMALOF f2, g2, h4, h4 \
VMALOF f2, g0, h2, h2 \
VMALOF f3, g52, T_0, T_0 \
VMALOF f3, g0, T_3, T_3 \
VMALOF f3, g53, T_1, T_1 \
VMALOF f3, g1, T_4, T_4 \
VMALOF f3, g54, T_2, T_2 \
VMALOF f4, g51, h0, h0 \
VMALOF f4, g54, h3, h3 \
VMALOF f4, g52, h1, h1 \
VMALOF f4, g0, h4, h4 \
VMALOF f4, g53, h2, h2 \
VAG T_0, h0, h0 \
VAG T_3, h3, h3 \
VAG T_1, h1, h1 \
VAG T_4, h4, h4 \
VAG T_2, h2, h2
// REDUCE performs the following carry operations in four
// stages, as specified in Bernstein & Schwabe:
//
// 1: h[0]->h[1] h[3]->h[4]
// 2: h[1]->h[2] h[4]->h[0]
// 3: h[0]->h[1] h[2]->h[3]
// 4: h[3]->h[4]
//
// The result is that all of the limbs are limited to 26-bits
// except for h[1] and h[4] which are limited to 27-bits.
//
// Note that although each limb is aligned at 26-bit intervals
// they may contain values that exceed 2² - 1, hence the need
// to carry the excess bits in each limb.
#define REDUCE(h0, h1, h2, h3, h4) \
VESRLG $26, h0, T_0 \
VESRLG $26, h3, T_1 \
VN MOD26, h0, h0 \
VN MOD26, h3, h3 \
VAG T_0, h1, h1 \
VAG T_1, h4, h4 \
VESRLG $26, h1, T_2 \
VESRLG $26, h4, T_3 \
VN MOD26, h1, h1 \
VN MOD26, h4, h4 \
VESLG $2, T_3, T_4 \
VAG T_3, T_4, T_4 \
VAG T_2, h2, h2 \
VAG T_4, h0, h0 \
VESRLG $26, h2, T_0 \
VESRLG $26, h0, T_1 \
VN MOD26, h2, h2 \
VN MOD26, h0, h0 \
VAG T_0, h3, h3 \
VAG T_1, h1, h1 \
VESRLG $26, h3, T_2 \
VN MOD26, h3, h3 \
VAG T_2, h4, h4
// EXPAND splits the 128-bit little-endian values in0 and in1
// into 26-bit big-endian limbs and places the results into
// the first and second lane of d[0:4] respectively.
//
// The EX0, EX1 and EX2 constants are arrays of byte indices
// for permutation. The permutation both reverses the bytes
// in the input and ensures the bytes are copied into the
// destination limb ready to be shifted into their final
// position.
#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
VPERM in0, in1, EX0, d0 \
VPERM in0, in1, EX1, d2 \
VPERM in0, in1, EX2, d4 \
VESRLG $26, d0, d1 \
VESRLG $30, d2, d3 \
VESRLG $4, d2, d2 \
VN MOD26, d0, d0 \ // [in0[0], in1[0]]
VN MOD26, d3, d3 \ // [in0[3], in1[3]]
VN MOD26, d1, d1 \ // [in0[1], in1[1]]
VN MOD24, d4, d4 \ // [in0[4], in1[4]]
VN MOD26, d2, d2 // [in0[2], in1[2]]
// func updateVX(state *macState, msg []byte)
TEXT ·updateVX(SB), NOSPLIT, $0
MOVD state+0(FP), R1
LMG msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len
// load EX0, EX1 and EX2
MOVD $·constants<>(SB), R5
VLM (R5), EX0, EX2
// generate masks
VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff]
VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff]
// load h (accumulator) and r (key) from state
VZERO T_1 // [0, 0]
VL 0(R1), T_0 // [h[0], h[1]]
VLEG $0, 16(R1), T_1 // [h[2], 0]
VL 24(R1), T_2 // [r[0], r[1]]
VPDI $0, T_0, T_2, T_3 // [h[0], r[0]]
VPDI $5, T_0, T_2, T_4 // [h[1], r[1]]
// unpack h and r into 26-bit limbs
// note: h[2] may have the low 3 bits set, so h[4] is a 27-bit value
VN MOD26, T_3, H_0 // [h[0], r[0]]
VZERO H_1 // [0, 0]
VZERO H_3 // [0, 0]
VGMG $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out
VESLG $24, T_1, T_1 // [h[2]<<24, 0]
VERIMG $-26&63, T_3, MOD26, H_1 // [h[1], r[1]]
VESRLG $+52&63, T_3, H_2 // [h[2], r[2]] - low 12 bits only
VERIMG $-14&63, T_4, MOD26, H_3 // [h[1], r[1]]
VESRLG $40, T_4, H_4 // [h[4], r[4]] - low 24 bits only
VERIMG $+12&63, T_4, T_0, H_2 // [h[2], r[2]] - complete
VO T_1, H_4, H_4 // [h[4], r[4]] - complete
// replicate r across all 4 vector elements
VREPF $3, H_0, R_0 // [r[0], r[0], r[0], r[0]]
VREPF $3, H_1, R_1 // [r[1], r[1], r[1], r[1]]
VREPF $3, H_2, R_2 // [r[2], r[2], r[2], r[2]]
VREPF $3, H_3, R_3 // [r[3], r[3], r[3], r[3]]
VREPF $3, H_4, R_4 // [r[4], r[4], r[4], r[4]]
// zero out lane 1 of h
VLEIG $1, $0, H_0 // [h[0], 0]
VLEIG $1, $0, H_1 // [h[1], 0]
VLEIG $1, $0, H_2 // [h[2], 0]
VLEIG $1, $0, H_3 // [h[3], 0]
VLEIG $1, $0, H_4 // [h[4], 0]
// calculate 5r (ignore least significant limb)
VREPIF $5, T_0
VMLF T_0, R_1, R5_1 // [5r[1], 5r[1], 5r[1], 5r[1]]
VMLF T_0, R_2, R5_2 // [5r[2], 5r[2], 5r[2], 5r[2]]
VMLF T_0, R_3, R5_3 // [5r[3], 5r[3], 5r[3], 5r[3]]
VMLF T_0, R_4, R5_4 // [5r[4], 5r[4], 5r[4], 5r[4]]
// skip r² calculation if we are only calculating one block
CMPBLE R3, $16, skip
// calculate r²
MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4)
REDUCE(M_0, M_1, M_2, M_3, M_4)
VGBM $0x0f0f, T_0
VERIMG $0, M_0, T_0, R_0 // [r[0], r²[0], r[0], r²[0]]
VERIMG $0, M_1, T_0, R_1 // [r[1], r²[1], r[1], r²[1]]
VERIMG $0, M_2, T_0, R_2 // [r[2], r²[2], r[2], r²[2]]
VERIMG $0, M_3, T_0, R_3 // [r[3], r²[3], r[3], r²[3]]
VERIMG $0, M_4, T_0, R_4 // [r[4], r²[4], r[4], r²[4]]
// calculate 5r² (ignore least significant limb)
VREPIF $5, T_0
VMLF T_0, R_1, R5_1 // [5r[1], 5r²[1], 5r[1], 5r²[1]]
VMLF T_0, R_2, R5_2 // [5r[2], 5r²[2], 5r[2], 5r²[2]]
VMLF T_0, R_3, R5_3 // [5r[3], 5r²[3], 5r[3], 5r²[3]]
VMLF T_0, R_4, R5_4 // [5r[4], 5r²[4], 5r[4], 5r²[4]]
loop:
CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients
// load next 2 blocks from message
VLM (R2), T_0, T_1
// update message slice
SUB $32, R3
MOVD $32(R2), R2
// unpack message blocks into 26-bit big-endian limbs
EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
// add 2¹² to each message block value
VLEIB $4, $1, M_4
VLEIB $12, $1, M_4
multiply:
// accumulate the incoming message
VAG H_0, M_0, M_0
VAG H_3, M_3, M_3
VAG H_1, M_1, M_1
VAG H_4, M_4, M_4
VAG H_2, M_2, M_2
// multiply the accumulator by the key coefficient
MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
// carry and partially reduce the partial products
REDUCE(H_0, H_1, H_2, H_3, H_4)
CMPBNE R3, $0, loop
finish:
// sum lane 0 and lane 1 and put the result in lane 1
VZERO T_0
VSUMQG H_0, T_0, H_0
VSUMQG H_3, T_0, H_3
VSUMQG H_1, T_0, H_1
VSUMQG H_4, T_0, H_4
VSUMQG H_2, T_0, H_2
// reduce again after summation
// TODO(mundaym): there might be a more efficient way to do this
// now that we only have 1 active lane. For example, we could
// simultaneously pack the values as we reduce them.
REDUCE(H_0, H_1, H_2, H_3, H_4)
// carry h[1] through to h[4] so that only h[4] can exceed 2² - 1
// TODO(mundaym): in testing this final carry was unnecessary.
// Needs a proof before it can be removed though.
VESRLG $26, H_1, T_1
VN MOD26, H_1, H_1
VAQ T_1, H_2, H_2
VESRLG $26, H_2, T_2
VN MOD26, H_2, H_2
VAQ T_2, H_3, H_3
VESRLG $26, H_3, T_3
VN MOD26, H_3, H_3
VAQ T_3, H_4, H_4
// h is now < 2(2¹³ - 5)
// Pack each lane in h[0:4] into h[0:1].
VESLG $26, H_1, H_1
VESLG $26, H_3, H_3
VO H_0, H_1, H_0
VO H_2, H_3, H_2
VESLG $4, H_2, H_2
VLEIB $7, $48, H_1
VSLB H_1, H_2, H_2
VO H_0, H_2, H_0
VLEIB $7, $104, H_1
VSLB H_1, H_4, H_3
VO H_3, H_0, H_0
VLEIB $7, $24, H_1
VSRLB H_1, H_4, H_1
// update state
VSTEG $1, H_0, 0(R1)
VSTEG $0, H_0, 8(R1)
VSTEG $1, H_1, 16(R1)
RET
b2: // 2 or fewer blocks remaining
CMPBLE R3, $16, b1
// Load the 2 remaining blocks (17-32 bytes remaining).
MOVD $-17(R3), R0 // index of final byte to load modulo 16
VL (R2), T_0 // load full 16 byte block
VLL R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes
// The Poly1305 algorithm requires that a 1 bit be appended to
// each message block. If the final block is less than 16 bytes
// long then it is easiest to insert the 1 before the message
// block is split into 26-bit limbs. If, on the other hand, the
// final message block is 16 bytes long then we append the 1 bit
// after expansion as normal.
MOVBZ $1, R0
MOVD $-16(R3), R3 // index of byte in last block to insert 1 at (could be 16)
CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long
VLVGB R3, R0, T_1 // insert 1 into the byte at index R3
// Split both blocks into 26-bit limbs in the appropriate lanes.
EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
// Append a 1 byte to the end of the second to last block.
VLEIB $4, $1, M_4
// Append a 1 byte to the end of the last block only if it is a
// full 16 byte block.
CMPBNE R3, $16, 2(PC)
VLEIB $12, $1, M_4
// Finally, set up the coefficients for the final multiplication.
// We have previously saved r and 5r in the 32-bit even indexes
// of the R_[0-4] and R5_[1-4] coefficient registers.
//
// We want lane 0 to be multiplied by r² so that can be kept the
// same. We want lane 1 to be multiplied by r so we need to move
// the saved r value into the 32-bit odd index in lane 1 by
// rotating the 64-bit lane by 32.
VGBM $0x00ff, T_0 // [0, 0xffffffffffffffff] - mask lane 1 only
VERIMG $32, R_0, T_0, R_0 // [_, r²[0], _, r[0]]
VERIMG $32, R_1, T_0, R_1 // [_, r²[1], _, r[1]]
VERIMG $32, R_2, T_0, R_2 // [_, r²[2], _, r[2]]
VERIMG $32, R_3, T_0, R_3 // [_, r²[3], _, r[3]]
VERIMG $32, R_4, T_0, R_4 // [_, r²[4], _, r[4]]
VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²[1], _, 5r[1]]
VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²[2], _, 5r[2]]
VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²[3], _, 5r[3]]
VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²[4], _, 5r[4]]
MOVD $0, R3
BR multiply
skip:
CMPBEQ R3, $0, finish
b1: // 1 block remaining
// Load the final block (1-16 bytes). This will be placed into
// lane 0.
MOVD $-1(R3), R0
VLL R0, (R2), T_0 // pad to 16 bytes with zeros
// The Poly1305 algorithm requires that a 1 bit be appended to
// each message block. If the final block is less than 16 bytes
// long then it is easiest to insert the 1 before the message
// block is split into 26-bit limbs. If, on the other hand, the
// final message block is 16 bytes long then we append the 1 bit
// after expansion as normal.
MOVBZ $1, R0
CMPBEQ R3, $16, 2(PC)
VLVGB R3, R0, T_0
// Set the message block in lane 1 to the value 0 so that it
// can be accumulated without affecting the final result.
VZERO T_1
// Split the final message block into 26-bit limbs in lane 0.
// Lane 1 will be contain 0.
EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
// Append a 1 byte to the end of the last block only if it is a
// full 16 byte block.
CMPBNE R3, $16, 2(PC)
VLEIB $4, $1, M_4
// We have previously saved r and 5r in the 32-bit even indexes
// of the R_[0-4] and R5_[1-4] coefficient registers.
//
// We want lane 0 to be multiplied by r so we need to move the
// saved r value into the 32-bit odd index in lane 0. We want
// lane 1 to be set to the value 1. This makes multiplication
// a no-op. We do this by setting lane 1 in every register to 0
// and then just setting the 32-bit index 3 in R_0 to 1.
VZERO T_0
MOVD $0, R0
MOVD $0x10111213, R12
VLVGP R12, R0, T_1 // [_, 0x10111213, _, 0x00000000]
VPERM T_0, R_0, T_1, R_0 // [_, r[0], _, 0]
VPERM T_0, R_1, T_1, R_1 // [_, r[1], _, 0]
VPERM T_0, R_2, T_1, R_2 // [_, r[2], _, 0]
VPERM T_0, R_3, T_1, R_3 // [_, r[3], _, 0]
VPERM T_0, R_4, T_1, R_4 // [_, r[4], _, 0]
VPERM T_0, R5_1, T_1, R5_1 // [_, 5r[1], _, 0]
VPERM T_0, R5_2, T_1, R5_2 // [_, 5r[2], _, 0]
VPERM T_0, R5_3, T_1, R5_3 // [_, 5r[3], _, 0]
VPERM T_0, R5_4, T_1, R5_4 // [_, 5r[4], _, 0]
// Set the value of lane 1 to be 1.
VLEIF $3, $1, R_0 // [_, r[0], _, 1]
MOVD $0, R3
BR multiply

182
vendor/golang.org/x/crypto/nacl/box/box.go generated vendored Normal file
View File

@@ -0,0 +1,182 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package box authenticates and encrypts small messages using public-key cryptography.
Box uses Curve25519, XSalsa20 and Poly1305 to encrypt and authenticate
messages. The length of messages is not hidden.
It is the caller's responsibility to ensure the uniqueness of nonces—for
example, by using nonce 1 for the first message, nonce 2 for the second
message, etc. Nonces are long enough that randomly generated nonces have
negligible risk of collision.
Messages should be small because:
1. The whole message needs to be held in memory to be processed.
2. Using large messages pressures implementations on small machines to decrypt
and process plaintext before authenticating it. This is very dangerous, and
this API does not allow it, but a protocol that uses excessive message sizes
might present some implementations with no other choice.
3. Fixed overheads will be sufficiently amortised by messages as small as 8KB.
4. Performance may be improved by working with messages that fit into data caches.
Thus large amounts of data should be chunked so that each message is small.
(Each message still needs a unique nonce.) If in doubt, 16KB is a reasonable
chunk size.
This package is interoperable with NaCl: https://nacl.cr.yp.to/box.html.
Anonymous sealing/opening is an extension of NaCl defined by and interoperable
with libsodium:
https://libsodium.gitbook.io/doc/public-key_cryptography/sealed_boxes.
*/
package box
import (
cryptorand "crypto/rand"
"io"
"golang.org/x/crypto/blake2b"
"golang.org/x/crypto/curve25519"
"golang.org/x/crypto/nacl/secretbox"
"golang.org/x/crypto/salsa20/salsa"
)
const (
// Overhead is the number of bytes of overhead when boxing a message.
Overhead = secretbox.Overhead
// AnonymousOverhead is the number of bytes of overhead when using anonymous
// sealed boxes.
AnonymousOverhead = Overhead + 32
)
// GenerateKey generates a new public/private key pair suitable for use with
// Seal and Open.
func GenerateKey(rand io.Reader) (publicKey, privateKey *[32]byte, err error) {
publicKey = new([32]byte)
privateKey = new([32]byte)
_, err = io.ReadFull(rand, privateKey[:])
if err != nil {
publicKey = nil
privateKey = nil
return
}
curve25519.ScalarBaseMult(publicKey, privateKey)
return
}
var zeros [16]byte
// Precompute calculates the shared key between peersPublicKey and privateKey
// and writes it to sharedKey. The shared key can be used with
// OpenAfterPrecomputation and SealAfterPrecomputation to speed up processing
// when using the same pair of keys repeatedly.
func Precompute(sharedKey, peersPublicKey, privateKey *[32]byte) {
curve25519.ScalarMult(sharedKey, privateKey, peersPublicKey)
salsa.HSalsa20(sharedKey, &zeros, sharedKey, &salsa.Sigma)
}
// Seal appends an encrypted and authenticated copy of message to out, which
// will be Overhead bytes longer than the original and must not overlap it. The
// nonce must be unique for each distinct message for a given pair of keys.
func Seal(out, message []byte, nonce *[24]byte, peersPublicKey, privateKey *[32]byte) []byte {
var sharedKey [32]byte
Precompute(&sharedKey, peersPublicKey, privateKey)
return secretbox.Seal(out, message, nonce, &sharedKey)
}
// SealAfterPrecomputation performs the same actions as Seal, but takes a
// shared key as generated by Precompute.
func SealAfterPrecomputation(out, message []byte, nonce *[24]byte, sharedKey *[32]byte) []byte {
return secretbox.Seal(out, message, nonce, sharedKey)
}
// Open authenticates and decrypts a box produced by Seal and appends the
// message to out, which must not overlap box. The output will be Overhead
// bytes smaller than box.
func Open(out, box []byte, nonce *[24]byte, peersPublicKey, privateKey *[32]byte) ([]byte, bool) {
var sharedKey [32]byte
Precompute(&sharedKey, peersPublicKey, privateKey)
return secretbox.Open(out, box, nonce, &sharedKey)
}
// OpenAfterPrecomputation performs the same actions as Open, but takes a
// shared key as generated by Precompute.
func OpenAfterPrecomputation(out, box []byte, nonce *[24]byte, sharedKey *[32]byte) ([]byte, bool) {
return secretbox.Open(out, box, nonce, sharedKey)
}
// SealAnonymous appends an encrypted and authenticated copy of message to out,
// which will be AnonymousOverhead bytes longer than the original and must not
// overlap it. This differs from Seal in that the sender is not required to
// provide a private key.
func SealAnonymous(out, message []byte, recipient *[32]byte, rand io.Reader) ([]byte, error) {
if rand == nil {
rand = cryptorand.Reader
}
ephemeralPub, ephemeralPriv, err := GenerateKey(rand)
if err != nil {
return nil, err
}
var nonce [24]byte
if err := sealNonce(ephemeralPub, recipient, &nonce); err != nil {
return nil, err
}
if total := len(out) + AnonymousOverhead + len(message); cap(out) < total {
original := out
out = make([]byte, 0, total)
out = append(out, original...)
}
out = append(out, ephemeralPub[:]...)
return Seal(out, message, &nonce, recipient, ephemeralPriv), nil
}
// OpenAnonymous authenticates and decrypts a box produced by SealAnonymous and
// appends the message to out, which must not overlap box. The output will be
// AnonymousOverhead bytes smaller than box.
func OpenAnonymous(out, box []byte, publicKey, privateKey *[32]byte) (message []byte, ok bool) {
if len(box) < AnonymousOverhead {
return nil, false
}
var ephemeralPub [32]byte
copy(ephemeralPub[:], box[:32])
var nonce [24]byte
if err := sealNonce(&ephemeralPub, publicKey, &nonce); err != nil {
return nil, false
}
return Open(out, box[32:], &nonce, &ephemeralPub, privateKey)
}
// sealNonce generates a 24 byte nonce that is a blake2b digest of the
// ephemeral public key and the receiver's public key.
func sealNonce(ephemeralPub, peersPublicKey *[32]byte, nonce *[24]byte) error {
h, err := blake2b.New(24, nil)
if err != nil {
return err
}
if _, err = h.Write(ephemeralPub[:]); err != nil {
return err
}
if _, err = h.Write(peersPublicKey[:]); err != nil {
return err
}
h.Sum(nonce[:0])
return nil
}

173
vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go generated vendored Normal file
View File

@@ -0,0 +1,173 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package secretbox encrypts and authenticates small messages.
Secretbox uses XSalsa20 and Poly1305 to encrypt and authenticate messages with
secret-key cryptography. The length of messages is not hidden.
It is the caller's responsibility to ensure the uniqueness of nonces—for
example, by using nonce 1 for the first message, nonce 2 for the second
message, etc. Nonces are long enough that randomly generated nonces have
negligible risk of collision.
Messages should be small because:
1. The whole message needs to be held in memory to be processed.
2. Using large messages pressures implementations on small machines to decrypt
and process plaintext before authenticating it. This is very dangerous, and
this API does not allow it, but a protocol that uses excessive message sizes
might present some implementations with no other choice.
3. Fixed overheads will be sufficiently amortised by messages as small as 8KB.
4. Performance may be improved by working with messages that fit into data caches.
Thus large amounts of data should be chunked so that each message is small.
(Each message still needs a unique nonce.) If in doubt, 16KB is a reasonable
chunk size.
This package is interoperable with NaCl: https://nacl.cr.yp.to/secretbox.html.
*/
package secretbox
import (
"golang.org/x/crypto/internal/alias"
"golang.org/x/crypto/internal/poly1305"
"golang.org/x/crypto/salsa20/salsa"
)
// Overhead is the number of bytes of overhead when boxing a message.
const Overhead = poly1305.TagSize
// setup produces a sub-key and Salsa20 counter given a nonce and key.
func setup(subKey *[32]byte, counter *[16]byte, nonce *[24]byte, key *[32]byte) {
// We use XSalsa20 for encryption so first we need to generate a
// key and nonce with HSalsa20.
var hNonce [16]byte
copy(hNonce[:], nonce[:])
salsa.HSalsa20(subKey, &hNonce, key, &salsa.Sigma)
// The final 8 bytes of the original nonce form the new nonce.
copy(counter[:], nonce[16:])
}
// sliceForAppend takes a slice and a requested number of bytes. It returns a
// slice with the contents of the given slice followed by that many bytes and a
// second slice that aliases into it and contains only the extra bytes. If the
// original slice has sufficient capacity then no allocation is performed.
func sliceForAppend(in []byte, n int) (head, tail []byte) {
if total := len(in) + n; cap(in) >= total {
head = in[:total]
} else {
head = make([]byte, total)
copy(head, in)
}
tail = head[len(in):]
return
}
// Seal appends an encrypted and authenticated copy of message to out, which
// must not overlap message. The key and nonce pair must be unique for each
// distinct message and the output will be Overhead bytes longer than message.
func Seal(out, message []byte, nonce *[24]byte, key *[32]byte) []byte {
var subKey [32]byte
var counter [16]byte
setup(&subKey, &counter, nonce, key)
// The Poly1305 key is generated by encrypting 32 bytes of zeros. Since
// Salsa20 works with 64-byte blocks, we also generate 32 bytes of
// keystream as a side effect.
var firstBlock [64]byte
salsa.XORKeyStream(firstBlock[:], firstBlock[:], &counter, &subKey)
var poly1305Key [32]byte
copy(poly1305Key[:], firstBlock[:])
ret, out := sliceForAppend(out, len(message)+poly1305.TagSize)
if alias.AnyOverlap(out, message) {
panic("nacl: invalid buffer overlap")
}
// We XOR up to 32 bytes of message with the keystream generated from
// the first block.
firstMessageBlock := message
if len(firstMessageBlock) > 32 {
firstMessageBlock = firstMessageBlock[:32]
}
tagOut := out
out = out[poly1305.TagSize:]
for i, x := range firstMessageBlock {
out[i] = firstBlock[32+i] ^ x
}
message = message[len(firstMessageBlock):]
ciphertext := out
out = out[len(firstMessageBlock):]
// Now encrypt the rest.
counter[8] = 1
salsa.XORKeyStream(out, message, &counter, &subKey)
var tag [poly1305.TagSize]byte
poly1305.Sum(&tag, ciphertext, &poly1305Key)
copy(tagOut, tag[:])
return ret
}
// Open authenticates and decrypts a box produced by Seal and appends the
// message to out, which must not overlap box. The output will be Overhead
// bytes smaller than box.
func Open(out, box []byte, nonce *[24]byte, key *[32]byte) ([]byte, bool) {
if len(box) < Overhead {
return nil, false
}
var subKey [32]byte
var counter [16]byte
setup(&subKey, &counter, nonce, key)
// The Poly1305 key is generated by encrypting 32 bytes of zeros. Since
// Salsa20 works with 64-byte blocks, we also generate 32 bytes of
// keystream as a side effect.
var firstBlock [64]byte
salsa.XORKeyStream(firstBlock[:], firstBlock[:], &counter, &subKey)
var poly1305Key [32]byte
copy(poly1305Key[:], firstBlock[:])
var tag [poly1305.TagSize]byte
copy(tag[:], box)
if !poly1305.Verify(&tag, box[poly1305.TagSize:], &poly1305Key) {
return nil, false
}
ret, out := sliceForAppend(out, len(box)-Overhead)
if alias.AnyOverlap(out, box) {
panic("nacl: invalid buffer overlap")
}
// We XOR up to 32 bytes of box with the keystream generated from
// the first block.
box = box[Overhead:]
firstMessageBlock := box
if len(firstMessageBlock) > 32 {
firstMessageBlock = firstMessageBlock[:32]
}
for i, x := range firstMessageBlock {
out[i] = firstBlock[32+i] ^ x
}
box = box[len(firstMessageBlock):]
out = out[len(firstMessageBlock):]
// Now decrypt the rest.
counter[8] = 1
salsa.XORKeyStream(out, box, &counter, &subKey)
return ret, true
}

91
vendor/golang.org/x/crypto/poly1305/poly1305_compat.go generated vendored Normal file
View File

@@ -0,0 +1,91 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package poly1305 implements Poly1305 one-time message authentication code as
// specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
//
// Poly1305 is a fast, one-time authentication function. It is infeasible for an
// attacker to generate an authenticator for a message without the key. However, a
// key must only be used for a single message. Authenticating two different
// messages with the same key allows an attacker to forge authenticators for other
// messages with the same key.
//
// Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
// used with a fixed key in order to generate one-time keys from an nonce.
// However, in this package AES isn't used and the one-time key is specified
// directly.
//
// Deprecated: Poly1305 as implemented by this package is a cryptographic
// building block that is not safe for general purpose use.
// For encryption, use the full ChaCha20-Poly1305 construction implemented by
// golang.org/x/crypto/chacha20poly1305. For authentication, use a general
// purpose MAC such as HMAC implemented by crypto/hmac.
package poly1305
import "golang.org/x/crypto/internal/poly1305"
// TagSize is the size, in bytes, of a poly1305 authenticator.
//
// For use with golang.org/x/crypto/chacha20poly1305, chacha20poly1305.Overhead
// can be used instead.
const TagSize = 16
// Sum generates an authenticator for msg using a one-time key and puts the
// 16-byte result into out. Authenticating two different messages with the same
// key allows an attacker to forge messages at will.
func Sum(out *[16]byte, m []byte, key *[32]byte) {
poly1305.Sum(out, m, key)
}
// Verify returns true if mac is a valid authenticator for m with the given key.
func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
return poly1305.Verify(mac, m, key)
}
// New returns a new MAC computing an authentication
// tag of all data written to it with the given key.
// This allows writing the message progressively instead
// of passing it as a single slice. Common users should use
// the Sum function instead.
//
// The key must be unique for each message, as authenticating
// two different messages with the same key allows an attacker
// to forge messages at will.
func New(key *[32]byte) *MAC {
return &MAC{mac: poly1305.New(key)}
}
// MAC is an io.Writer computing an authentication tag
// of the data written to it.
//
// MAC cannot be used like common hash.Hash implementations,
// because using a poly1305 key twice breaks its security.
// Therefore writing data to a running MAC after calling
// Sum or Verify causes it to panic.
type MAC struct {
mac *poly1305.MAC
}
// Size returns the number of bytes Sum will return.
func (h *MAC) Size() int { return TagSize }
// Write adds more data to the running message authentication code.
// It never returns an error.
//
// It must not be called after the first call of Sum or Verify.
func (h *MAC) Write(p []byte) (n int, err error) {
return h.mac.Write(p)
}
// Sum computes the authenticator of all data written to the
// message authentication code.
func (h *MAC) Sum(b []byte) []byte {
return h.mac.Sum(b)
}
// Verify returns whether the authenticator of all data written to
// the message authentication code matches the expected value.
func (h *MAC) Verify(expected []byte) bool {
return h.mac.Verify(expected)
}

146
vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go generated vendored Normal file
View File

@@ -0,0 +1,146 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package salsa provides low-level access to functions in the Salsa family.
package salsa
import "math/bits"
// Sigma is the Salsa20 constant for 256-bit keys.
var Sigma = [16]byte{'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', '2', '-', 'b', 'y', 't', 'e', ' ', 'k'}
// HSalsa20 applies the HSalsa20 core function to a 16-byte input in, 32-byte
// key k, and 16-byte constant c, and puts the result into the 32-byte array
// out.
func HSalsa20(out *[32]byte, in *[16]byte, k *[32]byte, c *[16]byte) {
x0 := uint32(c[0]) | uint32(c[1])<<8 | uint32(c[2])<<16 | uint32(c[3])<<24
x1 := uint32(k[0]) | uint32(k[1])<<8 | uint32(k[2])<<16 | uint32(k[3])<<24
x2 := uint32(k[4]) | uint32(k[5])<<8 | uint32(k[6])<<16 | uint32(k[7])<<24
x3 := uint32(k[8]) | uint32(k[9])<<8 | uint32(k[10])<<16 | uint32(k[11])<<24
x4 := uint32(k[12]) | uint32(k[13])<<8 | uint32(k[14])<<16 | uint32(k[15])<<24
x5 := uint32(c[4]) | uint32(c[5])<<8 | uint32(c[6])<<16 | uint32(c[7])<<24
x6 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
x7 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
x8 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
x9 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
x10 := uint32(c[8]) | uint32(c[9])<<8 | uint32(c[10])<<16 | uint32(c[11])<<24
x11 := uint32(k[16]) | uint32(k[17])<<8 | uint32(k[18])<<16 | uint32(k[19])<<24
x12 := uint32(k[20]) | uint32(k[21])<<8 | uint32(k[22])<<16 | uint32(k[23])<<24
x13 := uint32(k[24]) | uint32(k[25])<<8 | uint32(k[26])<<16 | uint32(k[27])<<24
x14 := uint32(k[28]) | uint32(k[29])<<8 | uint32(k[30])<<16 | uint32(k[31])<<24
x15 := uint32(c[12]) | uint32(c[13])<<8 | uint32(c[14])<<16 | uint32(c[15])<<24
for i := 0; i < 20; i += 2 {
u := x0 + x12
x4 ^= bits.RotateLeft32(u, 7)
u = x4 + x0
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x4
x12 ^= bits.RotateLeft32(u, 13)
u = x12 + x8
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x1
x9 ^= bits.RotateLeft32(u, 7)
u = x9 + x5
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x9
x1 ^= bits.RotateLeft32(u, 13)
u = x1 + x13
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x6
x14 ^= bits.RotateLeft32(u, 7)
u = x14 + x10
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x14
x6 ^= bits.RotateLeft32(u, 13)
u = x6 + x2
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x11
x3 ^= bits.RotateLeft32(u, 7)
u = x3 + x15
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x3
x11 ^= bits.RotateLeft32(u, 13)
u = x11 + x7
x15 ^= bits.RotateLeft32(u, 18)
u = x0 + x3
x1 ^= bits.RotateLeft32(u, 7)
u = x1 + x0
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x1
x3 ^= bits.RotateLeft32(u, 13)
u = x3 + x2
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x4
x6 ^= bits.RotateLeft32(u, 7)
u = x6 + x5
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x6
x4 ^= bits.RotateLeft32(u, 13)
u = x4 + x7
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x9
x11 ^= bits.RotateLeft32(u, 7)
u = x11 + x10
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x11
x9 ^= bits.RotateLeft32(u, 13)
u = x9 + x8
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x14
x12 ^= bits.RotateLeft32(u, 7)
u = x12 + x15
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x12
x14 ^= bits.RotateLeft32(u, 13)
u = x14 + x13
x15 ^= bits.RotateLeft32(u, 18)
}
out[0] = byte(x0)
out[1] = byte(x0 >> 8)
out[2] = byte(x0 >> 16)
out[3] = byte(x0 >> 24)
out[4] = byte(x5)
out[5] = byte(x5 >> 8)
out[6] = byte(x5 >> 16)
out[7] = byte(x5 >> 24)
out[8] = byte(x10)
out[9] = byte(x10 >> 8)
out[10] = byte(x10 >> 16)
out[11] = byte(x10 >> 24)
out[12] = byte(x15)
out[13] = byte(x15 >> 8)
out[14] = byte(x15 >> 16)
out[15] = byte(x15 >> 24)
out[16] = byte(x6)
out[17] = byte(x6 >> 8)
out[18] = byte(x6 >> 16)
out[19] = byte(x6 >> 24)
out[20] = byte(x7)
out[21] = byte(x7 >> 8)
out[22] = byte(x7 >> 16)
out[23] = byte(x7 >> 24)
out[24] = byte(x8)
out[25] = byte(x8 >> 8)
out[26] = byte(x8 >> 16)
out[27] = byte(x8 >> 24)
out[28] = byte(x9)
out[29] = byte(x9 >> 8)
out[30] = byte(x9 >> 16)
out[31] = byte(x9 >> 24)
}

201
vendor/golang.org/x/crypto/salsa20/salsa/salsa208.go generated vendored Normal file
View File

@@ -0,0 +1,201 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package salsa
import "math/bits"
// Core208 applies the Salsa20/8 core function to the 64-byte array in and puts
// the result into the 64-byte array out. The input and output may be the same array.
func Core208(out *[64]byte, in *[64]byte) {
j0 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
j1 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
j2 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
j3 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
j4 := uint32(in[16]) | uint32(in[17])<<8 | uint32(in[18])<<16 | uint32(in[19])<<24
j5 := uint32(in[20]) | uint32(in[21])<<8 | uint32(in[22])<<16 | uint32(in[23])<<24
j6 := uint32(in[24]) | uint32(in[25])<<8 | uint32(in[26])<<16 | uint32(in[27])<<24
j7 := uint32(in[28]) | uint32(in[29])<<8 | uint32(in[30])<<16 | uint32(in[31])<<24
j8 := uint32(in[32]) | uint32(in[33])<<8 | uint32(in[34])<<16 | uint32(in[35])<<24
j9 := uint32(in[36]) | uint32(in[37])<<8 | uint32(in[38])<<16 | uint32(in[39])<<24
j10 := uint32(in[40]) | uint32(in[41])<<8 | uint32(in[42])<<16 | uint32(in[43])<<24
j11 := uint32(in[44]) | uint32(in[45])<<8 | uint32(in[46])<<16 | uint32(in[47])<<24
j12 := uint32(in[48]) | uint32(in[49])<<8 | uint32(in[50])<<16 | uint32(in[51])<<24
j13 := uint32(in[52]) | uint32(in[53])<<8 | uint32(in[54])<<16 | uint32(in[55])<<24
j14 := uint32(in[56]) | uint32(in[57])<<8 | uint32(in[58])<<16 | uint32(in[59])<<24
j15 := uint32(in[60]) | uint32(in[61])<<8 | uint32(in[62])<<16 | uint32(in[63])<<24
x0, x1, x2, x3, x4, x5, x6, x7, x8 := j0, j1, j2, j3, j4, j5, j6, j7, j8
x9, x10, x11, x12, x13, x14, x15 := j9, j10, j11, j12, j13, j14, j15
for i := 0; i < 8; i += 2 {
u := x0 + x12
x4 ^= bits.RotateLeft32(u, 7)
u = x4 + x0
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x4
x12 ^= bits.RotateLeft32(u, 13)
u = x12 + x8
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x1
x9 ^= bits.RotateLeft32(u, 7)
u = x9 + x5
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x9
x1 ^= bits.RotateLeft32(u, 13)
u = x1 + x13
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x6
x14 ^= bits.RotateLeft32(u, 7)
u = x14 + x10
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x14
x6 ^= bits.RotateLeft32(u, 13)
u = x6 + x2
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x11
x3 ^= bits.RotateLeft32(u, 7)
u = x3 + x15
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x3
x11 ^= bits.RotateLeft32(u, 13)
u = x11 + x7
x15 ^= bits.RotateLeft32(u, 18)
u = x0 + x3
x1 ^= bits.RotateLeft32(u, 7)
u = x1 + x0
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x1
x3 ^= bits.RotateLeft32(u, 13)
u = x3 + x2
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x4
x6 ^= bits.RotateLeft32(u, 7)
u = x6 + x5
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x6
x4 ^= bits.RotateLeft32(u, 13)
u = x4 + x7
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x9
x11 ^= bits.RotateLeft32(u, 7)
u = x11 + x10
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x11
x9 ^= bits.RotateLeft32(u, 13)
u = x9 + x8
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x14
x12 ^= bits.RotateLeft32(u, 7)
u = x12 + x15
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x12
x14 ^= bits.RotateLeft32(u, 13)
u = x14 + x13
x15 ^= bits.RotateLeft32(u, 18)
}
x0 += j0
x1 += j1
x2 += j2
x3 += j3
x4 += j4
x5 += j5
x6 += j6
x7 += j7
x8 += j8
x9 += j9
x10 += j10
x11 += j11
x12 += j12
x13 += j13
x14 += j14
x15 += j15
out[0] = byte(x0)
out[1] = byte(x0 >> 8)
out[2] = byte(x0 >> 16)
out[3] = byte(x0 >> 24)
out[4] = byte(x1)
out[5] = byte(x1 >> 8)
out[6] = byte(x1 >> 16)
out[7] = byte(x1 >> 24)
out[8] = byte(x2)
out[9] = byte(x2 >> 8)
out[10] = byte(x2 >> 16)
out[11] = byte(x2 >> 24)
out[12] = byte(x3)
out[13] = byte(x3 >> 8)
out[14] = byte(x3 >> 16)
out[15] = byte(x3 >> 24)
out[16] = byte(x4)
out[17] = byte(x4 >> 8)
out[18] = byte(x4 >> 16)
out[19] = byte(x4 >> 24)
out[20] = byte(x5)
out[21] = byte(x5 >> 8)
out[22] = byte(x5 >> 16)
out[23] = byte(x5 >> 24)
out[24] = byte(x6)
out[25] = byte(x6 >> 8)
out[26] = byte(x6 >> 16)
out[27] = byte(x6 >> 24)
out[28] = byte(x7)
out[29] = byte(x7 >> 8)
out[30] = byte(x7 >> 16)
out[31] = byte(x7 >> 24)
out[32] = byte(x8)
out[33] = byte(x8 >> 8)
out[34] = byte(x8 >> 16)
out[35] = byte(x8 >> 24)
out[36] = byte(x9)
out[37] = byte(x9 >> 8)
out[38] = byte(x9 >> 16)
out[39] = byte(x9 >> 24)
out[40] = byte(x10)
out[41] = byte(x10 >> 8)
out[42] = byte(x10 >> 16)
out[43] = byte(x10 >> 24)
out[44] = byte(x11)
out[45] = byte(x11 >> 8)
out[46] = byte(x11 >> 16)
out[47] = byte(x11 >> 24)
out[48] = byte(x12)
out[49] = byte(x12 >> 8)
out[50] = byte(x12 >> 16)
out[51] = byte(x12 >> 24)
out[52] = byte(x13)
out[53] = byte(x13 >> 8)
out[54] = byte(x13 >> 16)
out[55] = byte(x13 >> 24)
out[56] = byte(x14)
out[57] = byte(x14 >> 8)
out[58] = byte(x14 >> 16)
out[59] = byte(x14 >> 24)
out[60] = byte(x15)
out[61] = byte(x15 >> 8)
out[62] = byte(x15 >> 16)
out[63] = byte(x15 >> 24)
}

View File

@@ -0,0 +1,23 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && !purego && gc
package salsa
//go:noescape
// salsa2020XORKeyStream is implemented in salsa20_amd64.s.
func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
// XORKeyStream crypts bytes from in to out using the given key and counters.
// In and out must overlap entirely or not at all. Counter
// contains the raw salsa20 counter bytes (both nonce and block counter).
func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
if len(in) == 0 {
return
}
_ = out[len(in)-1]
salsa2020XORKeyStream(&out[0], &in[0], uint64(len(in)), &counter[0], &key[0])
}

View File

@@ -0,0 +1,880 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && !purego && gc
// This code was translated into a form compatible with 6a from the public
// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
MOVQ out+0(FP),DI
MOVQ in+8(FP),SI
MOVQ n+16(FP),DX
MOVQ nonce+24(FP),CX
MOVQ key+32(FP),R8
MOVQ SP,R12
ADDQ $31, R12
ANDQ $~31, R12
MOVQ DX,R9
MOVQ CX,DX
MOVQ R8,R10
CMPQ R9,$0
JBE DONE
START:
MOVL 20(R10),CX
MOVL 0(R10),R8
MOVL 0(DX),AX
MOVL 16(R10),R11
MOVL CX,0(R12)
MOVL R8, 4 (R12)
MOVL AX, 8 (R12)
MOVL R11, 12 (R12)
MOVL 8(DX),CX
MOVL 24(R10),R8
MOVL 4(R10),AX
MOVL 4(DX),R11
MOVL CX,16(R12)
MOVL R8, 20 (R12)
MOVL AX, 24 (R12)
MOVL R11, 28 (R12)
MOVL 12(DX),CX
MOVL 12(R10),DX
MOVL 28(R10),R8
MOVL 8(R10),AX
MOVL DX,32(R12)
MOVL CX, 36 (R12)
MOVL R8, 40 (R12)
MOVL AX, 44 (R12)
MOVQ $1634760805,DX
MOVQ $857760878,CX
MOVQ $2036477234,R8
MOVQ $1797285236,AX
MOVL DX,48(R12)
MOVL CX, 52 (R12)
MOVL R8, 56 (R12)
MOVL AX, 60 (R12)
CMPQ R9,$256
JB BYTESBETWEEN1AND255
MOVOA 48(R12),X0
PSHUFL $0X55,X0,X1
PSHUFL $0XAA,X0,X2
PSHUFL $0XFF,X0,X3
PSHUFL $0X00,X0,X0
MOVOA X1,64(R12)
MOVOA X2,80(R12)
MOVOA X3,96(R12)
MOVOA X0,112(R12)
MOVOA 0(R12),X0
PSHUFL $0XAA,X0,X1
PSHUFL $0XFF,X0,X2
PSHUFL $0X00,X0,X3
PSHUFL $0X55,X0,X0
MOVOA X1,128(R12)
MOVOA X2,144(R12)
MOVOA X3,160(R12)
MOVOA X0,176(R12)
MOVOA 16(R12),X0
PSHUFL $0XFF,X0,X1
PSHUFL $0X55,X0,X2
PSHUFL $0XAA,X0,X0
MOVOA X1,192(R12)
MOVOA X2,208(R12)
MOVOA X0,224(R12)
MOVOA 32(R12),X0
PSHUFL $0X00,X0,X1
PSHUFL $0XAA,X0,X2
PSHUFL $0XFF,X0,X0
MOVOA X1,240(R12)
MOVOA X2,256(R12)
MOVOA X0,272(R12)
BYTESATLEAST256:
MOVL 16(R12),DX
MOVL 36 (R12),CX
MOVL DX,288(R12)
MOVL CX,304(R12)
SHLQ $32,CX
ADDQ CX,DX
ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
MOVL DX, 292 (R12)
MOVL CX, 308 (R12)
ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
MOVL DX, 296 (R12)
MOVL CX, 312 (R12)
ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
MOVL DX, 300 (R12)
MOVL CX, 316 (R12)
ADDQ $1,DX
MOVQ DX,CX
SHRQ $32,CX
MOVL DX,16(R12)
MOVL CX, 36 (R12)
MOVQ R9,352(R12)
MOVQ $20,DX
MOVOA 64(R12),X0
MOVOA 80(R12),X1
MOVOA 96(R12),X2
MOVOA 256(R12),X3
MOVOA 272(R12),X4
MOVOA 128(R12),X5
MOVOA 144(R12),X6
MOVOA 176(R12),X7
MOVOA 192(R12),X8
MOVOA 208(R12),X9
MOVOA 224(R12),X10
MOVOA 304(R12),X11
MOVOA 112(R12),X12
MOVOA 160(R12),X13
MOVOA 240(R12),X14
MOVOA 288(R12),X15
MAINLOOP1:
MOVOA X1,320(R12)
MOVOA X2,336(R12)
MOVOA X13,X1
PADDL X12,X1
MOVOA X1,X2
PSLLL $7,X1
PXOR X1,X14
PSRLL $25,X2
PXOR X2,X14
MOVOA X7,X1
PADDL X0,X1
MOVOA X1,X2
PSLLL $7,X1
PXOR X1,X11
PSRLL $25,X2
PXOR X2,X11
MOVOA X12,X1
PADDL X14,X1
MOVOA X1,X2
PSLLL $9,X1
PXOR X1,X15
PSRLL $23,X2
PXOR X2,X15
MOVOA X0,X1
PADDL X11,X1
MOVOA X1,X2
PSLLL $9,X1
PXOR X1,X9
PSRLL $23,X2
PXOR X2,X9
MOVOA X14,X1
PADDL X15,X1
MOVOA X1,X2
PSLLL $13,X1
PXOR X1,X13
PSRLL $19,X2
PXOR X2,X13
MOVOA X11,X1
PADDL X9,X1
MOVOA X1,X2
PSLLL $13,X1
PXOR X1,X7
PSRLL $19,X2
PXOR X2,X7
MOVOA X15,X1
PADDL X13,X1
MOVOA X1,X2
PSLLL $18,X1
PXOR X1,X12
PSRLL $14,X2
PXOR X2,X12
MOVOA 320(R12),X1
MOVOA X12,320(R12)
MOVOA X9,X2
PADDL X7,X2
MOVOA X2,X12
PSLLL $18,X2
PXOR X2,X0
PSRLL $14,X12
PXOR X12,X0
MOVOA X5,X2
PADDL X1,X2
MOVOA X2,X12
PSLLL $7,X2
PXOR X2,X3
PSRLL $25,X12
PXOR X12,X3
MOVOA 336(R12),X2
MOVOA X0,336(R12)
MOVOA X6,X0
PADDL X2,X0
MOVOA X0,X12
PSLLL $7,X0
PXOR X0,X4
PSRLL $25,X12
PXOR X12,X4
MOVOA X1,X0
PADDL X3,X0
MOVOA X0,X12
PSLLL $9,X0
PXOR X0,X10
PSRLL $23,X12
PXOR X12,X10
MOVOA X2,X0
PADDL X4,X0
MOVOA X0,X12
PSLLL $9,X0
PXOR X0,X8
PSRLL $23,X12
PXOR X12,X8
MOVOA X3,X0
PADDL X10,X0
MOVOA X0,X12
PSLLL $13,X0
PXOR X0,X5
PSRLL $19,X12
PXOR X12,X5
MOVOA X4,X0
PADDL X8,X0
MOVOA X0,X12
PSLLL $13,X0
PXOR X0,X6
PSRLL $19,X12
PXOR X12,X6
MOVOA X10,X0
PADDL X5,X0
MOVOA X0,X12
PSLLL $18,X0
PXOR X0,X1
PSRLL $14,X12
PXOR X12,X1
MOVOA 320(R12),X0
MOVOA X1,320(R12)
MOVOA X4,X1
PADDL X0,X1
MOVOA X1,X12
PSLLL $7,X1
PXOR X1,X7
PSRLL $25,X12
PXOR X12,X7
MOVOA X8,X1
PADDL X6,X1
MOVOA X1,X12
PSLLL $18,X1
PXOR X1,X2
PSRLL $14,X12
PXOR X12,X2
MOVOA 336(R12),X12
MOVOA X2,336(R12)
MOVOA X14,X1
PADDL X12,X1
MOVOA X1,X2
PSLLL $7,X1
PXOR X1,X5
PSRLL $25,X2
PXOR X2,X5
MOVOA X0,X1
PADDL X7,X1
MOVOA X1,X2
PSLLL $9,X1
PXOR X1,X10
PSRLL $23,X2
PXOR X2,X10
MOVOA X12,X1
PADDL X5,X1
MOVOA X1,X2
PSLLL $9,X1
PXOR X1,X8
PSRLL $23,X2
PXOR X2,X8
MOVOA X7,X1
PADDL X10,X1
MOVOA X1,X2
PSLLL $13,X1
PXOR X1,X4
PSRLL $19,X2
PXOR X2,X4
MOVOA X5,X1
PADDL X8,X1
MOVOA X1,X2
PSLLL $13,X1
PXOR X1,X14
PSRLL $19,X2
PXOR X2,X14
MOVOA X10,X1
PADDL X4,X1
MOVOA X1,X2
PSLLL $18,X1
PXOR X1,X0
PSRLL $14,X2
PXOR X2,X0
MOVOA 320(R12),X1
MOVOA X0,320(R12)
MOVOA X8,X0
PADDL X14,X0
MOVOA X0,X2
PSLLL $18,X0
PXOR X0,X12
PSRLL $14,X2
PXOR X2,X12
MOVOA X11,X0
PADDL X1,X0
MOVOA X0,X2
PSLLL $7,X0
PXOR X0,X6
PSRLL $25,X2
PXOR X2,X6
MOVOA 336(R12),X2
MOVOA X12,336(R12)
MOVOA X3,X0
PADDL X2,X0
MOVOA X0,X12
PSLLL $7,X0
PXOR X0,X13
PSRLL $25,X12
PXOR X12,X13
MOVOA X1,X0
PADDL X6,X0
MOVOA X0,X12
PSLLL $9,X0
PXOR X0,X15
PSRLL $23,X12
PXOR X12,X15
MOVOA X2,X0
PADDL X13,X0
MOVOA X0,X12
PSLLL $9,X0
PXOR X0,X9
PSRLL $23,X12
PXOR X12,X9
MOVOA X6,X0
PADDL X15,X0
MOVOA X0,X12
PSLLL $13,X0
PXOR X0,X11
PSRLL $19,X12
PXOR X12,X11
MOVOA X13,X0
PADDL X9,X0
MOVOA X0,X12
PSLLL $13,X0
PXOR X0,X3
PSRLL $19,X12
PXOR X12,X3
MOVOA X15,X0
PADDL X11,X0
MOVOA X0,X12
PSLLL $18,X0
PXOR X0,X1
PSRLL $14,X12
PXOR X12,X1
MOVOA X9,X0
PADDL X3,X0
MOVOA X0,X12
PSLLL $18,X0
PXOR X0,X2
PSRLL $14,X12
PXOR X12,X2
MOVOA 320(R12),X12
MOVOA 336(R12),X0
SUBQ $2,DX
JA MAINLOOP1
PADDL 112(R12),X12
PADDL 176(R12),X7
PADDL 224(R12),X10
PADDL 272(R12),X4
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
MOVD X4,R9
PSHUFL $0X39,X12,X12
PSHUFL $0X39,X7,X7
PSHUFL $0X39,X10,X10
PSHUFL $0X39,X4,X4
XORL 0(SI),DX
XORL 4(SI),CX
XORL 8(SI),R8
XORL 12(SI),R9
MOVL DX,0(DI)
MOVL CX,4(DI)
MOVL R8,8(DI)
MOVL R9,12(DI)
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
MOVD X4,R9
PSHUFL $0X39,X12,X12
PSHUFL $0X39,X7,X7
PSHUFL $0X39,X10,X10
PSHUFL $0X39,X4,X4
XORL 64(SI),DX
XORL 68(SI),CX
XORL 72(SI),R8
XORL 76(SI),R9
MOVL DX,64(DI)
MOVL CX,68(DI)
MOVL R8,72(DI)
MOVL R9,76(DI)
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
MOVD X4,R9
PSHUFL $0X39,X12,X12
PSHUFL $0X39,X7,X7
PSHUFL $0X39,X10,X10
PSHUFL $0X39,X4,X4
XORL 128(SI),DX
XORL 132(SI),CX
XORL 136(SI),R8
XORL 140(SI),R9
MOVL DX,128(DI)
MOVL CX,132(DI)
MOVL R8,136(DI)
MOVL R9,140(DI)
MOVD X12,DX
MOVD X7,CX
MOVD X10,R8
MOVD X4,R9
XORL 192(SI),DX
XORL 196(SI),CX
XORL 200(SI),R8
XORL 204(SI),R9
MOVL DX,192(DI)
MOVL CX,196(DI)
MOVL R8,200(DI)
MOVL R9,204(DI)
PADDL 240(R12),X14
PADDL 64(R12),X0
PADDL 128(R12),X5
PADDL 192(R12),X8
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
MOVD X8,R9
PSHUFL $0X39,X14,X14
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X5,X5
PSHUFL $0X39,X8,X8
XORL 16(SI),DX
XORL 20(SI),CX
XORL 24(SI),R8
XORL 28(SI),R9
MOVL DX,16(DI)
MOVL CX,20(DI)
MOVL R8,24(DI)
MOVL R9,28(DI)
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
MOVD X8,R9
PSHUFL $0X39,X14,X14
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X5,X5
PSHUFL $0X39,X8,X8
XORL 80(SI),DX
XORL 84(SI),CX
XORL 88(SI),R8
XORL 92(SI),R9
MOVL DX,80(DI)
MOVL CX,84(DI)
MOVL R8,88(DI)
MOVL R9,92(DI)
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
MOVD X8,R9
PSHUFL $0X39,X14,X14
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X5,X5
PSHUFL $0X39,X8,X8
XORL 144(SI),DX
XORL 148(SI),CX
XORL 152(SI),R8
XORL 156(SI),R9
MOVL DX,144(DI)
MOVL CX,148(DI)
MOVL R8,152(DI)
MOVL R9,156(DI)
MOVD X14,DX
MOVD X0,CX
MOVD X5,R8
MOVD X8,R9
XORL 208(SI),DX
XORL 212(SI),CX
XORL 216(SI),R8
XORL 220(SI),R9
MOVL DX,208(DI)
MOVL CX,212(DI)
MOVL R8,216(DI)
MOVL R9,220(DI)
PADDL 288(R12),X15
PADDL 304(R12),X11
PADDL 80(R12),X1
PADDL 144(R12),X6
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
MOVD X6,R9
PSHUFL $0X39,X15,X15
PSHUFL $0X39,X11,X11
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X6,X6
XORL 32(SI),DX
XORL 36(SI),CX
XORL 40(SI),R8
XORL 44(SI),R9
MOVL DX,32(DI)
MOVL CX,36(DI)
MOVL R8,40(DI)
MOVL R9,44(DI)
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
MOVD X6,R9
PSHUFL $0X39,X15,X15
PSHUFL $0X39,X11,X11
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X6,X6
XORL 96(SI),DX
XORL 100(SI),CX
XORL 104(SI),R8
XORL 108(SI),R9
MOVL DX,96(DI)
MOVL CX,100(DI)
MOVL R8,104(DI)
MOVL R9,108(DI)
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
MOVD X6,R9
PSHUFL $0X39,X15,X15
PSHUFL $0X39,X11,X11
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X6,X6
XORL 160(SI),DX
XORL 164(SI),CX
XORL 168(SI),R8
XORL 172(SI),R9
MOVL DX,160(DI)
MOVL CX,164(DI)
MOVL R8,168(DI)
MOVL R9,172(DI)
MOVD X15,DX
MOVD X11,CX
MOVD X1,R8
MOVD X6,R9
XORL 224(SI),DX
XORL 228(SI),CX
XORL 232(SI),R8
XORL 236(SI),R9
MOVL DX,224(DI)
MOVL CX,228(DI)
MOVL R8,232(DI)
MOVL R9,236(DI)
PADDL 160(R12),X13
PADDL 208(R12),X9
PADDL 256(R12),X3
PADDL 96(R12),X2
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
MOVD X2,R9
PSHUFL $0X39,X13,X13
PSHUFL $0X39,X9,X9
PSHUFL $0X39,X3,X3
PSHUFL $0X39,X2,X2
XORL 48(SI),DX
XORL 52(SI),CX
XORL 56(SI),R8
XORL 60(SI),R9
MOVL DX,48(DI)
MOVL CX,52(DI)
MOVL R8,56(DI)
MOVL R9,60(DI)
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
MOVD X2,R9
PSHUFL $0X39,X13,X13
PSHUFL $0X39,X9,X9
PSHUFL $0X39,X3,X3
PSHUFL $0X39,X2,X2
XORL 112(SI),DX
XORL 116(SI),CX
XORL 120(SI),R8
XORL 124(SI),R9
MOVL DX,112(DI)
MOVL CX,116(DI)
MOVL R8,120(DI)
MOVL R9,124(DI)
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
MOVD X2,R9
PSHUFL $0X39,X13,X13
PSHUFL $0X39,X9,X9
PSHUFL $0X39,X3,X3
PSHUFL $0X39,X2,X2
XORL 176(SI),DX
XORL 180(SI),CX
XORL 184(SI),R8
XORL 188(SI),R9
MOVL DX,176(DI)
MOVL CX,180(DI)
MOVL R8,184(DI)
MOVL R9,188(DI)
MOVD X13,DX
MOVD X9,CX
MOVD X3,R8
MOVD X2,R9
XORL 240(SI),DX
XORL 244(SI),CX
XORL 248(SI),R8
XORL 252(SI),R9
MOVL DX,240(DI)
MOVL CX,244(DI)
MOVL R8,248(DI)
MOVL R9,252(DI)
MOVQ 352(R12),R9
SUBQ $256,R9
ADDQ $256,SI
ADDQ $256,DI
CMPQ R9,$256
JAE BYTESATLEAST256
CMPQ R9,$0
JBE DONE
BYTESBETWEEN1AND255:
CMPQ R9,$64
JAE NOCOPY
MOVQ DI,DX
LEAQ 360(R12),DI
MOVQ R9,CX
REP; MOVSB
LEAQ 360(R12),DI
LEAQ 360(R12),SI
NOCOPY:
MOVQ R9,352(R12)
MOVOA 48(R12),X0
MOVOA 0(R12),X1
MOVOA 16(R12),X2
MOVOA 32(R12),X3
MOVOA X1,X4
MOVQ $20,CX
MAINLOOP2:
PADDL X0,X4
MOVOA X0,X5
MOVOA X4,X6
PSLLL $7,X4
PSRLL $25,X6
PXOR X4,X3
PXOR X6,X3
PADDL X3,X5
MOVOA X3,X4
MOVOA X5,X6
PSLLL $9,X5
PSRLL $23,X6
PXOR X5,X2
PSHUFL $0X93,X3,X3
PXOR X6,X2
PADDL X2,X4
MOVOA X2,X5
MOVOA X4,X6
PSLLL $13,X4
PSRLL $19,X6
PXOR X4,X1
PSHUFL $0X4E,X2,X2
PXOR X6,X1
PADDL X1,X5
MOVOA X3,X4
MOVOA X5,X6
PSLLL $18,X5
PSRLL $14,X6
PXOR X5,X0
PSHUFL $0X39,X1,X1
PXOR X6,X0
PADDL X0,X4
MOVOA X0,X5
MOVOA X4,X6
PSLLL $7,X4
PSRLL $25,X6
PXOR X4,X1
PXOR X6,X1
PADDL X1,X5
MOVOA X1,X4
MOVOA X5,X6
PSLLL $9,X5
PSRLL $23,X6
PXOR X5,X2
PSHUFL $0X93,X1,X1
PXOR X6,X2
PADDL X2,X4
MOVOA X2,X5
MOVOA X4,X6
PSLLL $13,X4
PSRLL $19,X6
PXOR X4,X3
PSHUFL $0X4E,X2,X2
PXOR X6,X3
PADDL X3,X5
MOVOA X1,X4
MOVOA X5,X6
PSLLL $18,X5
PSRLL $14,X6
PXOR X5,X0
PSHUFL $0X39,X3,X3
PXOR X6,X0
PADDL X0,X4
MOVOA X0,X5
MOVOA X4,X6
PSLLL $7,X4
PSRLL $25,X6
PXOR X4,X3
PXOR X6,X3
PADDL X3,X5
MOVOA X3,X4
MOVOA X5,X6
PSLLL $9,X5
PSRLL $23,X6
PXOR X5,X2
PSHUFL $0X93,X3,X3
PXOR X6,X2
PADDL X2,X4
MOVOA X2,X5
MOVOA X4,X6
PSLLL $13,X4
PSRLL $19,X6
PXOR X4,X1
PSHUFL $0X4E,X2,X2
PXOR X6,X1
PADDL X1,X5
MOVOA X3,X4
MOVOA X5,X6
PSLLL $18,X5
PSRLL $14,X6
PXOR X5,X0
PSHUFL $0X39,X1,X1
PXOR X6,X0
PADDL X0,X4
MOVOA X0,X5
MOVOA X4,X6
PSLLL $7,X4
PSRLL $25,X6
PXOR X4,X1
PXOR X6,X1
PADDL X1,X5
MOVOA X1,X4
MOVOA X5,X6
PSLLL $9,X5
PSRLL $23,X6
PXOR X5,X2
PSHUFL $0X93,X1,X1
PXOR X6,X2
PADDL X2,X4
MOVOA X2,X5
MOVOA X4,X6
PSLLL $13,X4
PSRLL $19,X6
PXOR X4,X3
PSHUFL $0X4E,X2,X2
PXOR X6,X3
SUBQ $4,CX
PADDL X3,X5
MOVOA X1,X4
MOVOA X5,X6
PSLLL $18,X5
PXOR X7,X7
PSRLL $14,X6
PXOR X5,X0
PSHUFL $0X39,X3,X3
PXOR X6,X0
JA MAINLOOP2
PADDL 48(R12),X0
PADDL 0(R12),X1
PADDL 16(R12),X2
PADDL 32(R12),X3
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
MOVD X3,AX
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X2,X2
PSHUFL $0X39,X3,X3
XORL 0(SI),CX
XORL 48(SI),R8
XORL 32(SI),R9
XORL 16(SI),AX
MOVL CX,0(DI)
MOVL R8,48(DI)
MOVL R9,32(DI)
MOVL AX,16(DI)
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
MOVD X3,AX
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X2,X2
PSHUFL $0X39,X3,X3
XORL 20(SI),CX
XORL 4(SI),R8
XORL 52(SI),R9
XORL 36(SI),AX
MOVL CX,20(DI)
MOVL R8,4(DI)
MOVL R9,52(DI)
MOVL AX,36(DI)
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
MOVD X3,AX
PSHUFL $0X39,X0,X0
PSHUFL $0X39,X1,X1
PSHUFL $0X39,X2,X2
PSHUFL $0X39,X3,X3
XORL 40(SI),CX
XORL 24(SI),R8
XORL 8(SI),R9
XORL 56(SI),AX
MOVL CX,40(DI)
MOVL R8,24(DI)
MOVL R9,8(DI)
MOVL AX,56(DI)
MOVD X0,CX
MOVD X1,R8
MOVD X2,R9
MOVD X3,AX
XORL 60(SI),CX
XORL 44(SI),R8
XORL 28(SI),R9
XORL 12(SI),AX
MOVL CX,60(DI)
MOVL R8,44(DI)
MOVL R9,28(DI)
MOVL AX,12(DI)
MOVQ 352(R12),R9
MOVL 16(R12),CX
MOVL 36 (R12),R8
ADDQ $1,CX
SHLQ $32,R8
ADDQ R8,CX
MOVQ CX,R8
SHRQ $32,R8
MOVL CX,16(R12)
MOVL R8, 36 (R12)
CMPQ R9,$64
JA BYTESATLEAST65
JAE BYTESATLEAST64
MOVQ DI,SI
MOVQ DX,DI
MOVQ R9,CX
REP; MOVSB
BYTESATLEAST64:
DONE:
RET
BYTESATLEAST65:
SUBQ $64,R9
ADDQ $64,DI
ADDQ $64,SI
JMP BYTESBETWEEN1AND255

View File

@@ -0,0 +1,14 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || purego || !gc
package salsa
// XORKeyStream crypts bytes from in to out using the given key and counters.
// In and out must overlap entirely or not at all. Counter
// contains the raw salsa20 counter bytes (both nonce and block counter).
func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
genericXORKeyStream(out, in, counter, key)
}

233
vendor/golang.org/x/crypto/salsa20/salsa/salsa20_ref.go generated vendored Normal file
View File

@@ -0,0 +1,233 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package salsa
import "math/bits"
const rounds = 20
// core applies the Salsa20 core function to 16-byte input in, 32-byte key k,
// and 16-byte constant c, and puts the result into 64-byte array out.
func core(out *[64]byte, in *[16]byte, k *[32]byte, c *[16]byte) {
j0 := uint32(c[0]) | uint32(c[1])<<8 | uint32(c[2])<<16 | uint32(c[3])<<24
j1 := uint32(k[0]) | uint32(k[1])<<8 | uint32(k[2])<<16 | uint32(k[3])<<24
j2 := uint32(k[4]) | uint32(k[5])<<8 | uint32(k[6])<<16 | uint32(k[7])<<24
j3 := uint32(k[8]) | uint32(k[9])<<8 | uint32(k[10])<<16 | uint32(k[11])<<24
j4 := uint32(k[12]) | uint32(k[13])<<8 | uint32(k[14])<<16 | uint32(k[15])<<24
j5 := uint32(c[4]) | uint32(c[5])<<8 | uint32(c[6])<<16 | uint32(c[7])<<24
j6 := uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
j7 := uint32(in[4]) | uint32(in[5])<<8 | uint32(in[6])<<16 | uint32(in[7])<<24
j8 := uint32(in[8]) | uint32(in[9])<<8 | uint32(in[10])<<16 | uint32(in[11])<<24
j9 := uint32(in[12]) | uint32(in[13])<<8 | uint32(in[14])<<16 | uint32(in[15])<<24
j10 := uint32(c[8]) | uint32(c[9])<<8 | uint32(c[10])<<16 | uint32(c[11])<<24
j11 := uint32(k[16]) | uint32(k[17])<<8 | uint32(k[18])<<16 | uint32(k[19])<<24
j12 := uint32(k[20]) | uint32(k[21])<<8 | uint32(k[22])<<16 | uint32(k[23])<<24
j13 := uint32(k[24]) | uint32(k[25])<<8 | uint32(k[26])<<16 | uint32(k[27])<<24
j14 := uint32(k[28]) | uint32(k[29])<<8 | uint32(k[30])<<16 | uint32(k[31])<<24
j15 := uint32(c[12]) | uint32(c[13])<<8 | uint32(c[14])<<16 | uint32(c[15])<<24
x0, x1, x2, x3, x4, x5, x6, x7, x8 := j0, j1, j2, j3, j4, j5, j6, j7, j8
x9, x10, x11, x12, x13, x14, x15 := j9, j10, j11, j12, j13, j14, j15
for i := 0; i < rounds; i += 2 {
u := x0 + x12
x4 ^= bits.RotateLeft32(u, 7)
u = x4 + x0
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x4
x12 ^= bits.RotateLeft32(u, 13)
u = x12 + x8
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x1
x9 ^= bits.RotateLeft32(u, 7)
u = x9 + x5
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x9
x1 ^= bits.RotateLeft32(u, 13)
u = x1 + x13
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x6
x14 ^= bits.RotateLeft32(u, 7)
u = x14 + x10
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x14
x6 ^= bits.RotateLeft32(u, 13)
u = x6 + x2
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x11
x3 ^= bits.RotateLeft32(u, 7)
u = x3 + x15
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x3
x11 ^= bits.RotateLeft32(u, 13)
u = x11 + x7
x15 ^= bits.RotateLeft32(u, 18)
u = x0 + x3
x1 ^= bits.RotateLeft32(u, 7)
u = x1 + x0
x2 ^= bits.RotateLeft32(u, 9)
u = x2 + x1
x3 ^= bits.RotateLeft32(u, 13)
u = x3 + x2
x0 ^= bits.RotateLeft32(u, 18)
u = x5 + x4
x6 ^= bits.RotateLeft32(u, 7)
u = x6 + x5
x7 ^= bits.RotateLeft32(u, 9)
u = x7 + x6
x4 ^= bits.RotateLeft32(u, 13)
u = x4 + x7
x5 ^= bits.RotateLeft32(u, 18)
u = x10 + x9
x11 ^= bits.RotateLeft32(u, 7)
u = x11 + x10
x8 ^= bits.RotateLeft32(u, 9)
u = x8 + x11
x9 ^= bits.RotateLeft32(u, 13)
u = x9 + x8
x10 ^= bits.RotateLeft32(u, 18)
u = x15 + x14
x12 ^= bits.RotateLeft32(u, 7)
u = x12 + x15
x13 ^= bits.RotateLeft32(u, 9)
u = x13 + x12
x14 ^= bits.RotateLeft32(u, 13)
u = x14 + x13
x15 ^= bits.RotateLeft32(u, 18)
}
x0 += j0
x1 += j1
x2 += j2
x3 += j3
x4 += j4
x5 += j5
x6 += j6
x7 += j7
x8 += j8
x9 += j9
x10 += j10
x11 += j11
x12 += j12
x13 += j13
x14 += j14
x15 += j15
out[0] = byte(x0)
out[1] = byte(x0 >> 8)
out[2] = byte(x0 >> 16)
out[3] = byte(x0 >> 24)
out[4] = byte(x1)
out[5] = byte(x1 >> 8)
out[6] = byte(x1 >> 16)
out[7] = byte(x1 >> 24)
out[8] = byte(x2)
out[9] = byte(x2 >> 8)
out[10] = byte(x2 >> 16)
out[11] = byte(x2 >> 24)
out[12] = byte(x3)
out[13] = byte(x3 >> 8)
out[14] = byte(x3 >> 16)
out[15] = byte(x3 >> 24)
out[16] = byte(x4)
out[17] = byte(x4 >> 8)
out[18] = byte(x4 >> 16)
out[19] = byte(x4 >> 24)
out[20] = byte(x5)
out[21] = byte(x5 >> 8)
out[22] = byte(x5 >> 16)
out[23] = byte(x5 >> 24)
out[24] = byte(x6)
out[25] = byte(x6 >> 8)
out[26] = byte(x6 >> 16)
out[27] = byte(x6 >> 24)
out[28] = byte(x7)
out[29] = byte(x7 >> 8)
out[30] = byte(x7 >> 16)
out[31] = byte(x7 >> 24)
out[32] = byte(x8)
out[33] = byte(x8 >> 8)
out[34] = byte(x8 >> 16)
out[35] = byte(x8 >> 24)
out[36] = byte(x9)
out[37] = byte(x9 >> 8)
out[38] = byte(x9 >> 16)
out[39] = byte(x9 >> 24)
out[40] = byte(x10)
out[41] = byte(x10 >> 8)
out[42] = byte(x10 >> 16)
out[43] = byte(x10 >> 24)
out[44] = byte(x11)
out[45] = byte(x11 >> 8)
out[46] = byte(x11 >> 16)
out[47] = byte(x11 >> 24)
out[48] = byte(x12)
out[49] = byte(x12 >> 8)
out[50] = byte(x12 >> 16)
out[51] = byte(x12 >> 24)
out[52] = byte(x13)
out[53] = byte(x13 >> 8)
out[54] = byte(x13 >> 16)
out[55] = byte(x13 >> 24)
out[56] = byte(x14)
out[57] = byte(x14 >> 8)
out[58] = byte(x14 >> 16)
out[59] = byte(x14 >> 24)
out[60] = byte(x15)
out[61] = byte(x15 >> 8)
out[62] = byte(x15 >> 16)
out[63] = byte(x15 >> 24)
}
// genericXORKeyStream is the generic implementation of XORKeyStream to be used
// when no assembly implementation is available.
func genericXORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
var block [64]byte
var counterCopy [16]byte
copy(counterCopy[:], counter[:])
for len(in) >= 64 {
core(&block, &counterCopy, key, &Sigma)
for i, x := range block {
out[i] = in[i] ^ x
}
u := uint32(1)
for i := 8; i < 16; i++ {
u += uint32(counterCopy[i])
counterCopy[i] = byte(u)
u >>= 8
}
in = in[64:]
out = out[64:]
}
if len(in) > 0 {
core(&block, &counterCopy, key, &Sigma)
for i, v := range in {
out[i] = v ^ block[i]
}
}
}