add packages

This commit is contained in:
fatedier
2017-10-24 22:53:20 +08:00
parent 0559865fe5
commit 9e0fd0c4ef
54 changed files with 7315 additions and 21 deletions

14
vendor/github.com/templexxx/cpufeat/.gitignore generated vendored Normal file

@@ -0,0 +1,14 @@
# Binaries for programs and plugins
*.exe
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/

27
vendor/github.com/templexxx/cpufeat/LICENSE generated vendored Normal file

@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

32
vendor/github.com/templexxx/cpufeat/cpu.go generated vendored Normal file

@@ -0,0 +1,32 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package cpu implements processor feature detection
// used by the Go standard library.
package cpufeat
var X86 x86
// The booleans in x86 contain the correspondingly named cpuid feature bit.
// HasAVX and HasAVX2 are only set if the OS does support XMM and YMM registers
// in addition to the cpuid feature bit being set.
// The struct is padded to avoid false sharing.
type x86 struct {
_ [CacheLineSize]byte
HasAES bool
HasAVX bool
HasAVX2 bool
HasBMI1 bool
HasBMI2 bool
HasERMS bool
HasOSXSAVE bool
HasPCLMULQDQ bool
HasPOPCNT bool
HasSSE2 bool
HasSSE3 bool
HasSSSE3 bool
HasSSE41 bool
HasSSE42 bool
_ [CacheLineSize]byte
}

7
vendor/github.com/templexxx/cpufeat/cpu_arm.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_arm64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_mips.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_mips64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_mips64le.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_mipsle.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_ppc64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 128

7
vendor/github.com/templexxx/cpufeat/cpu_ppc64le.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 128

7
vendor/github.com/templexxx/cpufeat/cpu_s390x.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 256

59
vendor/github.com/templexxx/cpufeat/cpu_x86.go generated vendored Normal file

@@ -0,0 +1,59 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32
package cpufeat
const CacheLineSize = 64
// cpuid is implemented in cpu_x86.s.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
// xgetbv with ecx = 0 is implemented in cpu_x86.s.
func xgetbv() (eax, edx uint32)
func init() {
maxId, _, _, _ := cpuid(0, 0)
if maxId < 1 {
return
}
_, _, ecx1, edx1 := cpuid(1, 0)
X86.HasSSE2 = isSet(26, edx1)
X86.HasSSE3 = isSet(0, ecx1)
X86.HasPCLMULQDQ = isSet(1, ecx1)
X86.HasSSSE3 = isSet(9, ecx1)
X86.HasSSE41 = isSet(19, ecx1)
X86.HasSSE42 = isSet(20, ecx1)
X86.HasPOPCNT = isSet(23, ecx1)
X86.HasAES = isSet(25, ecx1)
X86.HasOSXSAVE = isSet(27, ecx1)
osSupportsAVX := false
// For XGETBV, OSXSAVE bit is required and sufficient.
if X86.HasOSXSAVE {
eax, _ := xgetbv()
// Check if XMM and YMM registers have OS support.
osSupportsAVX = isSet(1, eax) && isSet(2, eax)
}
X86.HasAVX = isSet(28, ecx1) && osSupportsAVX
if maxId < 7 {
return
}
_, ebx7, _, _ := cpuid(7, 0)
X86.HasBMI1 = isSet(3, ebx7)
X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX
X86.HasBMI2 = isSet(8, ebx7)
X86.HasERMS = isSet(9, ebx7)
}
func isSet(bitpos uint, value uint32) bool {
return value&(1<<bitpos) != 0
}

32
vendor/github.com/templexxx/cpufeat/cpu_x86.s generated vendored Normal file

@@ -0,0 +1,32 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32
#include "textflag.h"
// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
TEXT ·cpuid(SB), NOSPLIT, $0-24
MOVL eaxArg+0(FP), AX
MOVL ecxArg+4(FP), CX
CPUID
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func xgetbv() (eax, edx uint32)
TEXT ·xgetbv(SB),NOSPLIT,$0-8
#ifdef GOOS_nacl
// nacl does not support XGETBV.
MOVL $0, eax+0(FP)
MOVL $0, edx+4(FP)
#else
MOVL $0, CX
WORD $0x010f; BYTE $0xd0 //XGETBV
MOVL AX, eax+0(FP)
MOVL DX, edx+4(FP)
#endif
RET

40
vendor/github.com/templexxx/reedsolomon/.gitignore generated vendored Normal file

@@ -0,0 +1,40 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
/.idea
/backup
/loopunroll/
cpu.out
mathtool/galois/
mathtool/matrix/
mem.out
/examples/
/.DS_Store
/mathtool/cntinverse
/invert
/bakcup
/buf.svg
*.svg
*.out
/escape

9
vendor/github.com/templexxx/reedsolomon/.travis.yml generated vendored Normal file

@@ -0,0 +1,9 @@
language: go
go:
- 1.9
install:
- go get github.com/templexxx/reedsolomon
script:
- go test -v

23
vendor/github.com/templexxx/reedsolomon/LICENSE generated vendored Normal file

@@ -0,0 +1,23 @@
MIT License
Copyright (c) 2017 Templexxx
Copyright (c) 2015 Klaus Post
Copyright (c) 2015 Backblaze
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

109
vendor/github.com/templexxx/reedsolomon/README.md generated vendored Normal file

@@ -0,0 +1,109 @@
# Reed-Solomon
[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8]
[1]: https://godoc.org/github.com/templexxx/reedsolomon?status.svg
[2]: https://godoc.org/github.com/templexxx/reedsolomon
[3]: https://img.shields.io/badge/license-MIT-blue.svg
[4]: LICENSE
[5]: https://travis-ci.org/templexxx/reedsolomon.svg?branch=master
[6]: https://travis-ci.org/templexxx/reedsolomon
[7]: https://goreportcard.com/badge/github.com/templexxx/reedsolomon
[8]: https://goreportcard.com/report/github.com/templexxx/reedsolomon
## Introduction:
1. Reed-Solomon Erasure Code engine in pure Go.
2. Super Fast: more than 10GB/s per physical core (10+4, 4KB per vector, MacBook Pro 2.8 GHz Intel Core i7)
## Installation
To get the package use the standard:
```bash
go get github.com/templexxx/reedsolomon
```
## Documentation
See the associated [GoDoc](http://godoc.org/github.com/templexxx/reedsolomon)
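A minimal usage sketch, based on the Encoder interface in rs.go (the shard counts and sizes are just examples; error handling is kept short):

```go
package main

import (
	"fmt"

	"github.com/templexxx/reedsolomon"
)

func main() {
	data, parity := 10, 4
	enc, err := reedsolomon.New(data, parity) // Vandermonde encoding matrix
	if err != nil {
		panic(err)
	}

	// vects holds data shards followed by parity shards, all the same size.
	vects := make([][]byte, data+parity)
	for i := range vects {
		vects[i] = make([]byte, 4096)
	}
	vects[0][0] = 42 // pretend the data shards hold something useful

	if err := enc.Encode(vects); err != nil {
		panic(err)
	}

	// Simulate losing one data shard and one parity shard, then repair them.
	vects[0], vects[data] = nil, nil
	if err := enc.Reconstruct(vects); err != nil {
		panic(err)
	}
	fmt.Println("recovered byte:", vects[0][0]) // 42 again
}
```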
## Specification
### GOARCH
1. All architectures are supported
2. 0.1.0 needs Go 1.9 (it uses sync.Map) on AMD64
### Math
1. Coding is done over GF(2^8)
2. Primitive Polynomial: x^8 + x^4 + x^3 + x^2 + 1 (0x1d); a minimal bit-by-bit multiply in this field is sketched after this list
3. mathtool/gentbls.go : generates the primitive polynomial's log table, exp table, multiply table, inverse table etc., and shows how the galois field works
4. mathtool/cntinverse.go : calculates how many inverse matrices there are for a given RS code config
5. Both Cauchy and Vandermonde matrices are supported. Vandermonde needs more operations to preserve the property that any square subset of rows is invertible
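For reference, here is a minimal, non-SIMD sketch of multiplication in this field, reducing by the primitive polynomial above. It is illustrative only; the engine itself uses the precomputed tables in tbl.go.

```go
// gfMul8 multiplies a and b in GF(2^8) with the primitive polynomial
// x^8 + x^4 + x^3 + x^2 + 1 (the 0x1d reduction constant above).
// Illustrative sketch only; the package uses lookup tables instead.
func gfMul8(a, b byte) byte {
	var r byte
	for b != 0 {
		if b&1 != 0 {
			r ^= a // add (XOR) a when the low bit of b is set
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1d // reduce: x^8 ≡ x^4 + x^3 + x^2 + 1
		}
		b >>= 1
	}
	return r
}
```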
### Why so fast?
These three parts cost the most time:
1. looking up galois-field tables
2. reading/writing memory
3. calculating the inverse matrix during reconstruction
SIMD solves no.1.
Cache-friendly code helps with no.2 & no.3; in addition, a sync.Map caches inverse matrices, which saves about 1000ns whenever the same matrix is needed again.
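A minimal sketch of the cache-key idea used by makeGen in rs_amd64.go: the surviving vect positions are folded into one small integer key, which is why data+parity is limited to 32 there. The helper name below is made up for illustration.

```go
// inverseKey builds a compact cache key from the surviving vect positions.
// Positions are distinct, so setting bits is equivalent to the sum used in makeGen.
func inverseKey(has []int) uint32 {
	var key uint32
	for _, p := range has {
		key |= 1 << uint(p) // one bit per surviving position (requires positions < 32)
	}
	return key
}
```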
## Performance
Performance depends mainly on:
1. CPU instruction extension (AVX2, SSSE3 or none)
2. number of data/parity vects
3. unit size of calculation (see rs_amd64.go)
4. size of shards
5. speed of memory (much of the time goes to memory reads/writes, :D)
6. performance of CPU
7. the way it is used (reusing memory)
Also note that benchmark results are quite different from encoding/decoding in practice:
in benchmark loops the CPU cache helps a lot, so in practice you must reuse memory to get performance as good as the benchmarks.
Example performance on my MacBook 2017 i7 2.8GHz, 10+4 (with 0.1.0).
### Encoding:
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 7655.02 |
| 4KB | 10551.37 |
| 64KB | 9297.25 |
| 1MB | 6829.89 |
| 16MB | 6312.83 |
### Reconstruct (set a vect to nil to mark it as needing repair):
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 4124.85 |
| 4KB | 5715.45 |
| 64KB | 6050.06 |
| 1MB | 5001.21 |
| 16MB | 5043.04 |
### ReconstructWithPos (use position lists to mark which vects need repair; reuses memory):
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 6170.24 |
| 4KB | 9444.86 |
| 64KB | 9311.30 |
| 1MB | 6781.06 |
| 16MB | 6285.34 |
**The reconstruct benchmarks here run with the inverse-matrix cache; without the cache, each call costs about 1000ns more.**
## Who is using this?
1. https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang
## Links & Thanks
* [Klauspost ReedSolomon](https://github.com/klauspost/reedsolomon)
* [intel ISA-L](https://github.com/01org/isa-l)
* [GF SIMD](http://www.ssrc.ucsc.edu/papers/plank-fast13.pdf)
* [asm2plan9s](https://github.com/fwessels/asm2plan9s)

156
vendor/github.com/templexxx/reedsolomon/matrix.go generated vendored Normal file

@@ -0,0 +1,156 @@
package reedsolomon
import "errors"
type matrix []byte
func genEncMatrixCauchy(d, p int) matrix {
t := d + p
m := make([]byte, t*d)
for i := 0; i < d; i++ {
m[i*d+i] = byte(1)
}
d2 := d * d
for i := d; i < t; i++ {
for j := 0; j < d; j++ {
d := i ^ j
a := inverseTbl[d]
m[d2] = byte(a)
d2++
}
}
return m
}
func gfExp(b byte, n int) byte {
if n == 0 {
return 1
}
if b == 0 {
return 0
}
a := logTbl[b]
ret := int(a) * n
for ret >= 255 {
ret -= 255
}
return byte(expTbl[ret])
}
func genVandMatrix(vm []byte, t, d int) {
for i := 0; i < t; i++ {
for j := 0; j < d; j++ {
vm[i*d+j] = gfExp(byte(i), j)
}
}
}
func (m matrix) mul(right matrix, rows, cols int, r []byte) {
for i := 0; i < rows; i++ {
for j := 0; j < cols; j++ {
var v byte
for k := 0; k < cols; k++ {
v ^= gfMul(m[i*cols+k], right[k*cols+j])
}
r[i*cols+j] = v
}
}
}
func genEncMatrixVand(d, p int) (matrix, error) {
t := d + p
buf := make([]byte, (2*t+4*d)*d)
vm := buf[:t*d]
genVandMatrix(vm, t, d)
top := buf[t*d : (t+d)*d]
copy(top, vm[:d*d])
raw := buf[(t+d)*d : (t+3*d)*d]
im := buf[(t+3*d)*d : (t+4*d)*d]
err := matrix(top).invert(raw, d, im)
if err != nil {
return nil, err
}
r := buf[(t+4*d)*d : (2*t+4*d)*d]
matrix(vm).mul(im, t, d, r)
return matrix(r), nil
}
// [I|m'] -> [m']
func (m matrix) subMatrix(n int, r []byte) {
for i := 0; i < n; i++ {
off := i * n
copy(r[off:off+n], m[2*off+n:2*(off+n)])
}
}
func (m matrix) invert(raw matrix, n int, im []byte) error {
// [m] -> [m|I]
for i := 0; i < n; i++ {
t := i * n
copy(raw[2*t:2*t+n], m[t:t+n])
raw[2*t+i+n] = byte(1)
}
err := gauss(raw, n)
if err != nil {
return err
}
raw.subMatrix(n, im)
return nil
}
func (m matrix) swap(i, j, n int) {
for k := 0; k < n; k++ {
m[i*n+k], m[j*n+k] = m[j*n+k], m[i*n+k]
}
}
func gfMul(a, b byte) byte {
return mulTbl[a][b]
}
var errSingular = errors.New("rs.invert: matrix is singular")
// [m|I] -> [I|m']
func gauss(m matrix, n int) error {
n2 := 2 * n
for i := 0; i < n; i++ {
if m[i*n2+i] == 0 {
for j := i + 1; j < n; j++ {
if m[j*n2+i] != 0 {
m.swap(i, j, n2)
break
}
}
}
if m[i*n2+i] == 0 {
return errSingular
}
if m[i*n2+i] != 1 {
d := m[i*n2+i]
scale := inverseTbl[d]
for c := 0; c < n2; c++ {
m[i*n2+c] = gfMul(m[i*n2+c], scale)
}
}
for j := i + 1; j < n; j++ {
if m[j*n2+i] != 0 {
scale := m[j*n2+i]
for c := 0; c < n2; c++ {
m[j*n2+c] ^= gfMul(scale, m[i*n2+c])
}
}
}
}
for k := 0; k < n; k++ {
for j := 0; j < k; j++ {
if m[j*n2+k] != 0 {
scale := m[j*n2+k]
for c := 0; c < n2; c++ {
m[j*n2+c] ^= gfMul(scale, m[k*n2+c])
}
}
}
}
return nil
}

280
vendor/github.com/templexxx/reedsolomon/rs.go generated vendored Normal file

@@ -0,0 +1,280 @@
/*
Reed-Solomon Codes over GF(2^8)
Primitive Polynomial: x^8+x^4+x^3+x^2+1
Galois Field arithmetic using Intel SIMD instructions (AVX2 or SSSE3)
*/
package reedsolomon
import "errors"
// Encoder implements Reed-Solomon encoding/reconstructing.
type Encoder interface {
// Encode multiplies the generator matrix with the data vects.
// len(vects) must equal the number of data+parity vects.
Encode(vects [][]byte) error
// Reconstruct repairs lost data & parity.
// Set a vect to nil if it is lost.
// The result is written back into the original position of vects:
// if you lost vects[0], after reconstructing its data will be back in vects[0].
Reconstruct(vects [][]byte) error
// ReconstructData repairs lost data only.
// Set a vect to nil if it is lost.
ReconstructData(vects [][]byte) error
// ReconstWithPos repairs lost data & parity given the positions of the surviving and lost vects.
// It saves bandwidth & disk I/O compared with Reconstruct when fewer than parity vects are lost.
// As with any erasure code, we must know which vects are broken, so such APIs are necessary.
// len(has) must equal the number of data vects.
// Example:
// in 3+2, the whole position list is [0,1,2,3,4];
// if vects[0] is lost, "has" could be [1,2,3] or [1,2,4] or ...
// (and you must be sure that vects[1], vects[2], vects[3] hold correct data if "has" is [1,2,3]);
// the "dLost" will be [0].
// ps:
// 1. the above lists are in increasing order (TODO: support out-of-order)
// 2. each vect must have the same length; don't set any of them nil,
// so we don't need to allocate new slices
ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error
// ReconstDataWithPos repairs lost data given the positions of the surviving and lost data vects.
// There is no need to append lost parity positions to the lost list.
ReconstDataWithPos(vects [][]byte, has, dLost []int) error
}
func checkCfg(d, p int) error {
if (d <= 0) || (p <= 0) {
return errors.New("rs.New: data or parity <= 0")
}
if d+p >= 256 {
return errors.New("rs.New: data+parity >= 256")
}
return nil
}
// New creates an Encoder (Vandermonde matrix as the encoding matrix)
func New(data, parity int) (enc Encoder, err error) {
err = checkCfg(data, parity)
if err != nil {
return
}
e, err := genEncMatrixVand(data, parity)
if err != nil {
return
}
return newRS(data, parity, e), nil
}
// NewCauchy creates an Encoder (Cauchy matrix as the generator matrix)
func NewCauchy(data, parity int) (enc Encoder, err error) {
err = checkCfg(data, parity)
if err != nil {
return
}
e := genEncMatrixCauchy(data, parity)
return newRS(data, parity, e), nil
}
type encBase struct {
data int
parity int
encode []byte
gen []byte
}
func checkEnc(d, p int, vs [][]byte) (size int, err error) {
total := len(vs)
if d+p != total {
err = errors.New("rs.checkER: vects not match rs args")
return
}
size = len(vs[0])
if size == 0 {
err = errors.New("rs.checkER: vects size = 0")
return
}
for i := 1; i < total; i++ {
if len(vs[i]) != size {
err = errors.New("rs.checkER: vects size mismatch")
return
}
}
return
}
func (e *encBase) Encode(vects [][]byte) (err error) {
d := e.data
p := e.parity
_, err = checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i], pv[j])
} else {
mulVect(g[j*d], dv[0], pv[j])
}
}
}
return
}
func mulVect(c byte, a, b []byte) {
t := mulTbl[c]
for i := 0; i < len(a); i++ {
b[i] = t[a[i]]
}
}
func mulVectAdd(c byte, a, b []byte) {
t := mulTbl[c]
for i := 0; i < len(a); i++ {
b[i] ^= t[a[i]]
}
}
func (e *encBase) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
func (e *encBase) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
func (e *encBase) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
func (e *encBase) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
func (e *encBase) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
em := e.encode
dCnt := len(dLost)
size := len(vects[has[0]])
if dCnt != 0 {
vtmp := make([][]byte, d+dCnt)
for i, p := range has {
vtmp[i] = vects[p]
}
for i, p := range dLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
matrixbuf := make([]byte, 4*d*d+dCnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return err2
}
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
etmp := &encBase{data: d, parity: dCnt, gen: g}
err2 = etmp.Encode(vtmp[:d+dCnt])
if err2 != nil {
return err2
}
}
if dataOnly {
return
}
pCnt := len(pLost)
if pCnt != 0 {
vtmp := make([][]byte, d+pCnt)
g := make([]byte, pCnt*d)
for i, l := range pLost {
copy(g[i*d:i*d+d], em[l*d:l*d+d])
}
for i := 0; i < d; i++ {
vtmp[i] = vects[i]
}
for i, p := range pLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
etmp := &encBase{data: d, parity: pCnt, gen: g}
err2 := etmp.Encode(vtmp[:d+pCnt])
if err2 != nil {
return err2
}
}
return
}
func (e *encBase) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
p := e.parity
// TODO check more, maybe element in has show in lost & deal with len(has) > d
if len(has) != d {
return errors.New("rs.Reconst: not enough vects")
}
dCnt := len(dLost)
if dCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
pCnt := len(pLost)
if pCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
return e.reconst(vects, has, dLost, pLost, dataOnly)
}
func (e *encBase) reconstruct(vects [][]byte, dataOnly bool) (err error) {
d := e.data
p := e.parity
t := d + p
listBuf := make([]int, t+p)
has := listBuf[:d]
dLost := listBuf[d:t]
pLost := listBuf[t : t+p]
hasCnt, dCnt, pCnt := 0, 0, 0
for i := 0; i < t; i++ {
if vects[i] != nil {
if hasCnt < d {
has[hasCnt] = i
hasCnt++
}
} else {
if i < d {
if dCnt < p {
dLost[dCnt] = i
dCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
} else {
if pCnt < p {
pLost[pCnt] = i
pCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
}
}
}
if hasCnt != d {
return errors.New("rs.Reconst: not enough vects")
}
dLost = dLost[:dCnt]
pLost = pLost[:pCnt]
return e.reconst(vects, has, dLost, pLost, dataOnly)
}

868
vendor/github.com/templexxx/reedsolomon/rs_amd64.go generated vendored Normal file

@@ -0,0 +1,868 @@
package reedsolomon
import (
"errors"
"sync"
"github.com/templexxx/cpufeat"
)
// SIMD Instruction Extensions
const (
none = iota
avx2
ssse3
)
var extension = none
func init() {
getEXT()
}
func getEXT() {
if cpufeat.X86.HasAVX2 {
extension = avx2
return
} else if cpufeat.X86.HasSSSE3 {
extension = ssse3
return
} else {
extension = none
return
}
}
//go:noescape
func copy32B(dst, src []byte) // needs SSE2 (introduced in 2001)
func initTbl(g matrix, rows, cols int, tbl []byte) {
off := 0
for i := 0; i < cols; i++ {
for j := 0; j < rows; j++ {
c := g[j*cols+i]
t := lowhighTbl[c][:]
copy32B(tbl[off:off+32], t)
off += 32
}
}
}
// At most 3060 inverse matrices (when data=14, parity=4; counted by mathtool/cntinverse)
// In practice, data is usually below 12 and parity below 5
func okCache(data, parity int) bool {
if data < 15 && parity < 5 { // you can change these limits, but data+parity can't be bigger than 32 (see how the inverse-matrix cache key is built)
return true
}
return false
}
type (
encSSSE3 encSIMD
encAVX2 encSIMD
encSIMD struct {
data int
parity int
encode matrix
gen matrix
tbl []byte
// the inverse matrix cache is designed for small vect sizes ( < 4KB )
// it saves the time spent calculating the inverse matrix
// but it matters less for big vect sizes
enableCache bool
inverseCache iCache
}
iCache struct {
sync.RWMutex
data map[uint32][]byte
}
)
func newRS(d, p int, em matrix) (enc Encoder) {
g := em[d*d:]
if extension == none {
return &encBase{data: d, parity: p, encode: em, gen: g}
}
t := make([]byte, d*p*32)
initTbl(g, p, d, t)
ok := okCache(d, p)
if extension == avx2 {
e := &encAVX2{data: d, parity: p, encode: em, gen: g, tbl: t, enableCache: ok,
inverseCache: iCache{data: make(map[uint32][]byte)}}
return e
}
e := &encSSSE3{data: d, parity: p, encode: em, gen: g, tbl: t, enableCache: ok,
inverseCache: iCache{data: make(map[uint32][]byte)}}
return e
}
// Size of sub-vector
const unit int = 16 * 1024
func getDo(n int) int {
if n < unit {
c := n >> 4
if c == 0 {
return unit
}
return c << 4
}
return unit
}
func (e *encAVX2) Encode(vects [][]byte) (err error) {
d := e.data
p := e.parity
size, err := checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
start, end := 0, 0
do := getDo(size)
for start < size {
end = start + do
if end <= size {
e.matrixMul(start, end, dv, pv)
start = end
} else {
e.matrixMulRemain(start, size, dv, pv)
start = size
}
}
return
}
//go:noescape
func mulVectAVX2(tbl, d, p []byte)
//go:noescape
func mulVectAddAVX2(tbl, d, p []byte)
func (e *encAVX2) matrixMul(start, end int, dv, pv [][]byte) {
d := e.data
p := e.parity
tbl := e.tbl
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
} else {
mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
}
off += 32
}
}
}
func (e *encAVX2) matrixMulRemain(start, end int, dv, pv [][]byte) {
undone := end - start
do := (undone >> 4) << 4
d := e.data
p := e.parity
tbl := e.tbl
if do >= 16 {
end2 := start + do
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
}
off += 32
}
}
start = end
}
if undone > do {
// may recalculate some data, but still improves performance a lot
start2 := end - 16
if start2 >= 0 {
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
}
off += 32
}
}
} else {
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
// use the generator matrix directly (not the precomputed tbls) for encoding
// it's designed for reconstructing:
// for small vects, initTbl costs too much time, so skip it
// and for big vects the tbls don't help much, because the cache will be filled with the vects' data
func (e *encAVX2) encodeGen(vects [][]byte) (err error) {
d := e.data
p := e.parity
size, err := checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
start, end := 0, 0
do := getDo(size)
for start < size {
end = start + do
if end <= size {
e.matrixMulGen(start, end, dv, pv)
start = end
} else {
e.matrixMulRemainGen(start, size, dv, pv)
start = size
}
}
return
}
func (e *encAVX2) matrixMulGen(start, end int, dv, pv [][]byte) {
d := e.data
p := e.parity
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
} else {
mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
}
}
}
}
func (e *encAVX2) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
undone := end - start
do := (undone >> 4) << 4
d := e.data
p := e.parity
g := e.gen
if do >= 16 {
end2 := start + do
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
}
}
}
start = end
}
if undone > do {
start2 := end - 16
if start2 >= 0 {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
}
}
}
} else {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
func (e *encAVX2) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
func (e *encAVX2) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
func (e *encAVX2) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
func (e *encAVX2) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
func (e *encAVX2) makeGen(has, dLost []int) (gen []byte, err error) {
d := e.data
em := e.encode
cnt := len(dLost)
if !e.enableCache {
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
var ikey uint32
for _, p := range has {
ikey += 1 << uint8(p)
}
e.inverseCache.RLock()
v, ok := e.inverseCache.data[ikey]
if ok {
im := v
g := make([]byte, cnt*d)
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
e.inverseCache.RUnlock()
return g, nil
}
e.inverseCache.RUnlock()
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
e.inverseCache.Lock()
e.inverseCache.data[ikey] = im
e.inverseCache.Unlock()
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
func (e *encAVX2) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
em := e.encode
dCnt := len(dLost)
size := len(vects[has[0]])
if dCnt != 0 {
vtmp := make([][]byte, d+dCnt)
for i, p := range has {
vtmp[i] = vects[p]
}
for i, p := range dLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
g, err2 := e.makeGen(has, dLost)
if err2 != nil {
return
}
etmp := &encAVX2{data: d, parity: dCnt, gen: g}
err2 = etmp.encodeGen(vtmp)
if err2 != nil {
return err2
}
}
if dataOnly {
return
}
pCnt := len(pLost)
if pCnt != 0 {
g := make([]byte, pCnt*d)
for i, l := range pLost {
copy(g[i*d:i*d+d], em[l*d:l*d+d])
}
vtmp := make([][]byte, d+pCnt)
for i := 0; i < d; i++ {
vtmp[i] = vects[i]
}
for i, p := range pLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
etmp := &encAVX2{data: d, parity: pCnt, gen: g}
err2 := etmp.encodeGen(vtmp)
if err2 != nil {
return err2
}
}
return
}
func (e *encAVX2) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
p := e.parity
if len(has) != d {
return errors.New("rs.Reconst: not enough vects")
}
dCnt := len(dLost)
if dCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
pCnt := len(pLost)
if pCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
return e.reconst(vects, has, dLost, pLost, dataOnly)
}
func (e *encAVX2) reconstruct(vects [][]byte, dataOnly bool) (err error) {
d := e.data
p := e.parity
t := d + p
listBuf := make([]int, t+p)
has := listBuf[:d]
dLost := listBuf[d:t]
pLost := listBuf[t : t+p]
hasCnt, dCnt, pCnt := 0, 0, 0
for i := 0; i < t; i++ {
if vects[i] != nil {
if hasCnt < d {
has[hasCnt] = i
hasCnt++
}
} else {
if i < d {
if dCnt < p {
dLost[dCnt] = i
dCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
} else {
if pCnt < p {
pLost[pCnt] = i
pCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
}
}
}
if hasCnt != d {
return errors.New("rs.Reconst: not enough vects")
}
dLost = dLost[:dCnt]
pLost = pLost[:pCnt]
return e.reconst(vects, has, dLost, pLost, dataOnly)
}
func (e *encSSSE3) Encode(vects [][]byte) (err error) {
d := e.data
p := e.parity
size, err := checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
start, end := 0, 0
do := getDo(size)
for start < size {
end = start + do
if end <= size {
e.matrixMul(start, end, dv, pv)
start = end
} else {
e.matrixMulRemain(start, size, dv, pv)
start = size
}
}
return
}
//go:noescape
func mulVectSSSE3(tbl, d, p []byte)
//go:noescape
func mulVectAddSSSE3(tbl, d, p []byte)
func (e *encSSSE3) matrixMul(start, end int, dv, pv [][]byte) {
d := e.data
p := e.parity
tbl := e.tbl
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
} else {
mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
}
off += 32
}
}
}
func (e *encSSSE3) matrixMulRemain(start, end int, dv, pv [][]byte) {
undone := end - start
do := (undone >> 4) << 4
d := e.data
p := e.parity
tbl := e.tbl
if do >= 16 {
end2 := start + do
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
}
off += 32
}
}
start = end
}
if undone > do {
start2 := end - 16
if start2 >= 0 {
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
}
off += 32
}
}
} else {
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
// use the generator matrix directly (not the precomputed tbls) for encoding
// it's designed for reconstructing:
// for small vects, initTbl costs too much time, so skip it
// and for big vects the tbls don't help much, because the cache will be filled with the vects' data
func (e *encSSSE3) encodeGen(vects [][]byte) (err error) {
d := e.data
p := e.parity
size, err := checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
start, end := 0, 0
do := getDo(size)
for start < size {
end = start + do
if end <= size {
e.matrixMulGen(start, end, dv, pv)
start = end
} else {
e.matrixMulRemainGen(start, size, dv, pv)
start = size
}
}
return
}
func (e *encSSSE3) matrixMulGen(start, end int, dv, pv [][]byte) {
d := e.data
p := e.parity
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
} else {
mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
}
}
}
}
func (e *encSSSE3) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
undone := end - start
do := (undone >> 4) << 4
d := e.data
p := e.parity
g := e.gen
if do >= 16 {
end2 := start + do
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
}
}
}
start = end
}
if undone > do {
start2 := end - 16
if start2 >= 0 {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
}
}
}
} else {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
func (e *encSSSE3) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
func (e *encSSSE3) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
func (e *encSSSE3) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
func (e *encSSSE3) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
func (e *encSSSE3) makeGen(has, dLost []int) (gen []byte, err error) {
d := e.data
em := e.encode
cnt := len(dLost)
if !e.enableCache {
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
var ikey uint32
for _, p := range has {
ikey += 1 << uint8(p)
}
e.inverseCache.RLock()
v, ok := e.inverseCache.data[ikey]
if ok {
im := v
g := make([]byte, cnt*d)
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
e.inverseCache.RUnlock()
return g, nil
}
e.inverseCache.RUnlock()
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
e.inverseCache.Lock()
e.inverseCache.data[ikey] = im
e.inverseCache.Unlock()
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
func (e *encSSSE3) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
em := e.encode
dCnt := len(dLost)
size := len(vects[has[0]])
if dCnt != 0 {
vtmp := make([][]byte, d+dCnt)
for i, p := range has {
vtmp[i] = vects[p]
}
for i, p := range dLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
g, err2 := e.makeGen(has, dLost)
if err2 != nil {
return
}
etmp := &encSSSE3{data: d, parity: dCnt, gen: g}
err2 = etmp.encodeGen(vtmp)
if err2 != nil {
return err2
}
}
if dataOnly {
return
}
pCnt := len(pLost)
if pCnt != 0 {
g := make([]byte, pCnt*d)
for i, l := range pLost {
copy(g[i*d:i*d+d], em[l*d:l*d+d])
}
vtmp := make([][]byte, d+pCnt)
for i := 0; i < d; i++ {
vtmp[i] = vects[i]
}
for i, p := range pLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
etmp := &encSSSE3{data: d, parity: pCnt, gen: g}
err2 := etmp.encodeGen(vtmp)
if err2 != nil {
return err2
}
}
return
}
func (e *encSSSE3) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
p := e.parity
if len(has) != d {
return errors.New("rs.Reconst: not enough vects")
}
dCnt := len(dLost)
if dCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
pCnt := len(pLost)
if pCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
return e.reconst(vects, has, dLost, pLost, dataOnly)
}
func (e *encSSSE3) reconstruct(vects [][]byte, dataOnly bool) (err error) {
d := e.data
p := e.parity
t := d + p
listBuf := make([]int, t+p)
has := listBuf[:d]
dLost := listBuf[d:t]
pLost := listBuf[t : t+p]
hasCnt, dCnt, pCnt := 0, 0, 0
for i := 0; i < t; i++ {
if vects[i] != nil {
if hasCnt < d {
has[hasCnt] = i
hasCnt++
}
} else {
if i < d {
if dCnt < p {
dLost[dCnt] = i
dCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
} else {
if pCnt < p {
pLost[pCnt] = i
pCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
}
}
}
if hasCnt != d {
return errors.New("rs.Reconst: not enough vects")
}
dLost = dLost[:dCnt]
pLost = pLost[:pCnt]
return e.reconst(vects, has, dLost, pLost, dataOnly)
}

401
vendor/github.com/templexxx/reedsolomon/rs_amd64.s generated vendored Normal file

@@ -0,0 +1,401 @@
// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf
#include "textflag.h"
#define low_tbl Y0
#define high_tbl Y1
#define mask Y2
#define in0 Y3
#define in1 Y4
#define in2 Y5
#define in3 Y6
#define in4 Y7
#define in5 Y8
#define in0_h Y10
#define in1_h Y11
#define in2_h Y12
#define in3_h Y13
#define in4_h Y14
#define in5_h Y15
#define in BX
#define out DI
#define len R8
#define pos R9
#define tmp0 R10
#define low_tblx X0
#define high_tblx X1
#define maskx X2
#define in0x X3
#define in0_hx X10
#define tmp0x X9
#define tmp1x X11
#define tmp2x X12
#define tmp3x X13
// func mulVectAVX2(tbl, d, p []byte)
TEXT ·mulVectAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
TESTQ $31, len
JNZ one16b
ymm:
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
TESTQ $255, len
JNZ not_aligned
// 256bytes/loop
aligned:
MOVQ $0, pos
loop256b:
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, (out)(pos*1)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 32(out)(pos*1)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VMOVDQU in2, 64(out)(pos*1)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VMOVDQU in3, 96(out)(pos*1)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VMOVDQU in4, 128(out)(pos*1)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VMOVDQU in5, 160(out)(pos*1)
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, 192(out)(pos*1)
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 224(out)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
not_aligned:
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
CMPQ len, $0
JNE ymm
RET
// func mulVectAddAVX2(tbl, d, p []byte)
TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
TESTQ $31, len
JNZ one16b
ymm:
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
TESTQ $255, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop256b:
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR (out)(pos*1), in0, in0
VMOVDQU in0, (out)(pos*1)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 32(out)(pos*1), in1, in1
VMOVDQU in1, 32(out)(pos*1)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VPXOR 64(out)(pos*1), in2, in2
VMOVDQU in2, 64(out)(pos*1)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VPXOR 96(out)(pos*1), in3, in3
VMOVDQU in3, 96(out)(pos*1)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VPXOR 128(out)(pos*1), in4, in4
VMOVDQU in4, 128(out)(pos*1)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VPXOR 160(out)(pos*1), in5, in5
VMOVDQU in5, 160(out)(pos*1)
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR 192(out)(pos*1), in0, in0
VMOVDQU in0, 192(out)(pos*1)
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 224(out)(pos*1), in1, in1
VMOVDQU in1, 224(out)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
not_aligned:
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR -32(out)(len*1), in0, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VPXOR -16(out)(len*1), in0x, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
CMPQ len, $0
JNE ymm
RET
// func mulVectSSSE3(tbl, d, p []byte)
TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
SHRQ $4, len
loop:
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
MOVOU tmp2x, (out)
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func mulVectAddSSSE3(tbl, d, p []byte)
TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
SHRQ $4, len
loop:
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
MOVOU (out), tmp3x
PXOR tmp3x, tmp2x
MOVOU tmp2x, (out)
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func copy32B(dst, src []byte)
TEXT ·copy32B(SB), NOSPLIT, $0
MOVQ dst+0(FP), SI
MOVQ src+24(FP), DX
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU X0, (SI)
MOVOU X1, 16(SI)
RET

8
vendor/github.com/templexxx/reedsolomon/rs_other.go generated vendored Normal file

@@ -0,0 +1,8 @@
// +build !amd64
package reedsolomon
func newRS(d, p int, em matrix) (enc Encoder) {
g := em[d*d:]
return &encBase{data: d, parity: p, encode: em, gen: g}
}

44
vendor/github.com/templexxx/reedsolomon/tbl.go generated vendored Normal file

File diff suppressed because one or more lines are too long

1
vendor/github.com/templexxx/xor/.gitattributes generated vendored Normal file

@@ -0,0 +1 @@
*.s linguist-language=go

18
vendor/github.com/templexxx/xor/.gitignore generated vendored Normal file

@@ -0,0 +1,18 @@
# Binaries for programs and plugins
*.exe
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/
/backup/
/backup2/
/.idea
/backup3/

21
vendor/github.com/templexxx/xor/LICENSE generated vendored Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2017 Temple3x
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

48
vendor/github.com/templexxx/xor/README.md generated vendored Normal file

@@ -0,0 +1,48 @@
# XOR
XOR code engine in pure Go
more than 10 GB/s per core
## Introduction:
1. Uses SIMD (SSE2 or AVX2) for speed-up
2. ...
## Installation
To get the package use the standard:
```bash
go get github.com/templexxx/xor
```
## Documentation
See the associated [GoDoc](http://godoc.org/github.com/templexxx/xor)
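A small usage sketch. The exported helper names here (`xor.Bytes`, `xor.Matrix`) are an assumption based on the internal `bytesNoSIMD`/`matrixNoSIMD` fallbacks below — check the GoDoc above for the real exported API before relying on them:

```go
package main

import (
	"fmt"

	"github.com/templexxx/xor"
)

func main() {
	a := []byte{0x01, 0x02, 0x03, 0x04}
	b := []byte{0xff, 0x00, 0xff, 0x00}
	c := []byte{0x10, 0x20, 0x30, 0x40}
	dst := make([]byte, len(a))

	// XOR two equal-length slices into dst (function name assumed, see lead-in).
	xor.Bytes(dst, a, b)
	fmt.Printf("a^b   = % x\n", dst)

	// XOR several equal-length slices into dst (function name assumed).
	xor.Matrix(dst, [][]byte{a, b, c})
	fmt.Printf("a^b^c = % x\n", dst)
}
```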
## Performance
Performance depends mainly on:
1. SIMD extension
2. unit size of worker
3. hardware (CPU, RAM, etc.)
Example of performance on my MacBook 2014-mid (i5-4278U 2.6GHz, 2 physical cores); shard sizes are given in the table below.
```
speed = ( shards * size ) / cost
```
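For example, reading the 5-shard / 16MB row in the table below with this formula: cost ≈ (5 × 16 MB) / 12750.41 MB/s ≈ 6.3 ms per call.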
| data_shards | shard_size | speed (MB/S) |
|-------------|------------|--------------|
| 2           | 1KB        | 64127.95     |
| 2           | 1400B      | 59657.55     |
| 2           | 16KB       | 35370.84     |
| 2           | 16MB       | 12128.95     |
| 5           | 1KB        | 78837.33     |
| 5           | 1400B      | 58054.89     |
| 5           | 16KB       | 50161.19     |
| 5           | 16MB       | 12750.41     |
## Who is using this?
1. https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang

438
vendor/github.com/templexxx/xor/avx2_amd64.s generated vendored Normal file

@@ -0,0 +1,438 @@
#include "textflag.h"
// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5
// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8
// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
// func bytesAVX2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $31, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop32b:
VMOVDQU (SRC0)(POS*1), Y0
VPXOR (SRC1)(POS*1), Y0, Y0
VMOVDQU Y0, (DST)(POS*1)
ADDQ $32, POS
CMPQ LEN, POS
JNE loop32b
VZEROUPPER
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $31, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $31, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $32
JGE aligned
RET
ret:
RET
// func bytesAVX2small(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2small(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
VMOVDQU (SRC0)(POS*1), Y0
VMOVDQU 32(SRC0)(POS*1), Y1
VMOVDQU 64(SRC0)(POS*1), Y2
VMOVDQU 96(SRC0)(POS*1), Y3
VPXOR (SRC1)(POS*1), Y0, Y0
VPXOR 32(SRC1)(POS*1), Y1, Y1
VPXOR 64(SRC1)(POS*1), Y2, Y2
VPXOR 96(SRC1)(POS*1), Y3, Y3
VMOVDQU Y0, (DST)(POS*1)
VMOVDQU Y1, 32(DST)(POS*1)
VMOVDQU Y2, 64(DST)(POS*1)
VMOVDQU Y3, 96(DST)(POS*1)
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
VZEROUPPER
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $127, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func bytesAVX2big(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2big(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
VMOVDQU (SRC0)(POS*1), Y0
VMOVDQU 32(SRC0)(POS*1), Y1
VMOVDQU 64(SRC0)(POS*1), Y2
VMOVDQU 96(SRC0)(POS*1), Y3
VPXOR (SRC1)(POS*1), Y0, Y0
VPXOR 32(SRC1)(POS*1), Y1, Y1
VPXOR 64(SRC1)(POS*1), Y2, Y2
VPXOR 96(SRC1)(POS*1), Y3, Y3
LONG $0xe77da1c4; WORD $0x0304
LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
SFENCE
VZEROUPPER
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $127, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func matrixAVX2small(dst []byte, src [][]byte)
TEXT ·matrixAVX2small(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y0
VMOVDQU 32(TMP4)(POS*1), Y1
VMOVDQU 64(TMP3)(POS*1), Y2
VMOVDQU 96(TMP4)(POS*1), Y3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y4
VMOVDQU 32(TMP4)(POS*1), Y5
VMOVDQU 64(TMP3)(POS*1), Y6
VMOVDQU 96(TMP4)(POS*1), Y7
VPXOR Y4, Y0, Y0
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
SUBQ $1, TMP1
JGE next_vect
VMOVDQU Y0, (DST)(POS*1)
VMOVDQU Y1, 32(DST)(POS*1)
VMOVDQU Y2, 64(DST)(POS*1)
VMOVDQU Y3, 96(DST)(POS*1)
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
VZEROUPPER
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $127, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func matrixAVX2big(dst []byte, src [][]byte)
TEXT ·matrixAVX2big(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y0
VMOVDQU 32(TMP4)(POS*1), Y1
VMOVDQU 64(TMP3)(POS*1), Y2
VMOVDQU 96(TMP4)(POS*1), Y3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y4
VMOVDQU 32(TMP4)(POS*1), Y5
VMOVDQU 64(TMP3)(POS*1), Y6
VMOVDQU 96(TMP4)(POS*1), Y7
VPXOR Y4, Y0, Y0
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
SUBQ $1, TMP1
JGE next_vect
LONG $0xe77da1c4; WORD $0x0304 // VMOVNTDQ (the mnemonic is supported by the go1.8+ assembler)
LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
VZEROUPPER
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $127, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET

116
vendor/github.com/templexxx/xor/nosimd.go generated vendored Normal file
View File

@@ -0,0 +1,116 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package xor
import (
"runtime"
"unsafe"
)
const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
// xor the bytes in a and b. The destination is assumed to have enough space.
func bytesNoSIMD(dst, a, b []byte, size int) {
if supportsUnaligned {
fastXORBytes(dst, a, b, size)
} else {
// TODO(hanwen): if (dst, a, b) have common alignment
// we could still try fastXORBytes. It is not clear
// how often this happens, and it's only worth it if
// the block encryption itself is hardware
// accelerated.
safeXORBytes(dst, a, b, size)
}
}
// split the slice into units to stay cache-friendly
const unitSize = 16 * 1024
func matrixNoSIMD(dst []byte, src [][]byte) {
size := len(src[0])
start := 0
do := unitSize
for start < size {
end := start + do
if end <= size {
partNoSIMD(start, end, dst, src)
start = start + do
} else {
partNoSIMD(start, size, dst, src)
start = size
}
}
}
// splitting the vectors improves performance on big data by reducing cache pollution
func partNoSIMD(start, end int, dst []byte, src [][]byte) {
bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start)
for i := 2; i < len(src); i++ {
bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start)
}
}
// fastXORBytes XORs in bulk. It only works on architectures that
// support unaligned reads/writes.
func fastXORBytes(dst, a, b []byte, n int) {
w := n / wordSize
if w > 0 {
wordBytes := w * wordSize
fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes])
}
for i := n - n%wordSize; i < n; i++ {
dst[i] = a[i] ^ b[i]
}
}
func safeXORBytes(dst, a, b []byte, n int) {
ex := n % 8
for i := 0; i < ex; i++ {
dst[i] = a[i] ^ b[i]
}
for i := ex; i < n; i += 8 {
_dst := dst[i : i+8]
_a := a[i : i+8]
_b := b[i : i+8]
_dst[0] = _a[0] ^ _b[0]
_dst[1] = _a[1] ^ _b[1]
_dst[2] = _a[2] ^ _b[2]
_dst[3] = _a[3] ^ _b[3]
_dst[4] = _a[4] ^ _b[4]
_dst[5] = _a[5] ^ _b[5]
_dst[6] = _a[6] ^ _b[6]
_dst[7] = _a[7] ^ _b[7]
}
}
// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture).
// The arguments are assumed to be of equal length.
func fastXORWords(dst, a, b []byte) {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
n := len(b) / wordSize
ex := n % 8
for i := 0; i < ex; i++ {
dw[i] = aw[i] ^ bw[i]
}
for i := ex; i < n; i += 8 {
_dw := dw[i : i+8]
_aw := aw[i : i+8]
_bw := bw[i : i+8]
_dw[0] = _aw[0] ^ _bw[0]
_dw[1] = _aw[1] ^ _bw[1]
_dw[2] = _aw[2] ^ _bw[2]
_dw[3] = _aw[3] ^ _bw[3]
_dw[4] = _aw[4] ^ _bw[4]
_dw[5] = _aw[5] ^ _bw[5]
_dw[6] = _aw[6] ^ _bw[6]
_dw[7] = _aw[7] ^ _bw[7]
}
}
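The fastXORWords helper above relies on reinterpreting byte slices as []uintptr via unsafe, which is why it is gated behind supportsUnaligned. As a rough illustration of the same word-at-a-time idea without unsafe, here is a minimal sketch using encoding/binary; the helper name xorWords64 is hypothetical and is not part of the vendored package.

// Illustrative sketch only: word-at-a-time XOR without unsafe.
package main

import (
	"encoding/binary"
	"fmt"
)

// xorWords64 XORs a and b into dst eight bytes at a time, then
// finishes the tail byte by byte. All slices must have the same length.
func xorWords64(dst, a, b []byte) {
	n := len(dst) / 8 * 8
	for i := 0; i < n; i += 8 {
		x := binary.LittleEndian.Uint64(a[i:]) ^ binary.LittleEndian.Uint64(b[i:])
		binary.LittleEndian.PutUint64(dst[i:], x)
	}
	for i := n; i < len(dst); i++ {
		dst[i] = a[i] ^ b[i]
	}
}

func main() {
	a := []byte("hello, xor world!")
	b := []byte("0123456789abcdefg")
	dst := make([]byte, len(a))
	xorWords64(dst, a, b)
	fmt.Printf("%x\n", dst)
}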

574
vendor/github.com/templexxx/xor/sse2_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,574 @@
#include "textflag.h"
// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5
// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8
// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
// func bytesSrc0(dst, src0, src1 []byte)
TEXT ·xorSrc0(SB), NOSPLIT, $0
MOVQ len+32(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSrc1(dst, src0, src1 []byte)
TEXT ·xorSrc1(SB), NOSPLIT, $0
MOVQ len+56(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSSE2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2mini(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
// MOVOU (SRC1)(POS*1), X4
// PXOR X4, X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSSE2small(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2small(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVOU (SRC0)(POS*1), X0
MOVOU 16(SRC0)(POS*1), X1
MOVOU 32(SRC0)(POS*1), X2
MOVOU 48(SRC0)(POS*1), X3
MOVOU (SRC1)(POS*1), X4
MOVOU 16(SRC1)(POS*1), X5
MOVOU 32(SRC1)(POS*1), X6
MOVOU 48(SRC1)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
MOVOU X0, (DST)(POS*1)
MOVOU X1, 16(DST)(POS*1)
MOVOU X2, 32(DST)(POS*1)
MOVOU X3, 48(DST)(POS*1)
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $63, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func bytesSSE2big(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2big(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVOU (SRC0)(POS*1), X0
MOVOU 16(SRC0)(POS*1), X1
MOVOU 32(SRC0)(POS*1), X2
MOVOU 48(SRC0)(POS*1), X3
MOVOU (SRC1)(POS*1), X4
MOVOU 16(SRC1)(POS*1), X5
MOVOU 32(SRC1)(POS*1), X6
MOVOU 48(SRC1)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ
LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $63, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func matrixSSE2small(dst []byte, src [][]byte)
TEXT ·matrixSSE2small(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X0
MOVOU 16(TMP4)(POS*1), X1
MOVOU 32(TMP3)(POS*1), X2
MOVOU 48(TMP4)(POS*1), X3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X4
MOVOU 16(TMP4)(POS*1), X5
MOVOU 32(TMP3)(POS*1), X6
MOVOU 48(TMP4)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
SUBQ $1, TMP1
JGE next_vect
MOVOU X0, (DST)(POS*1)
MOVOU X1, 16(DST)(POS*1)
MOVOU X2, 32(DST)(POS*1)
MOVOU X3, 48(DST)(POS*1)
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $63, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func matrixSSE2big(dst []byte, src [][]byte)
TEXT ·matrixSSE2big(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X0
MOVOU 16(TMP4)(POS*1), X1
MOVOU 32(TMP3)(POS*1), X2
MOVOU 48(TMP4)(POS*1), X3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X4
MOVOU 16(TMP4)(POS*1), X5
MOVOU 32(TMP3)(POS*1), X6
MOVOU 48(TMP4)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
SUBQ $1, TMP1
JGE next_vect
LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ (non-temporal store)
LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $63, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
TEXT ·hasSSE2(SB), NOSPLIT, $0
XORQ AX, AX
INCL AX // CPUID leaf 1 (processor info and feature bits)
CPUID
SHRQ $26, DX // the SSE2 flag is bit 26 of EDX
ANDQ $1, DX
MOVB DX, ret+0(FP)
RET

49
vendor/github.com/templexxx/xor/xor.go generated vendored Normal file
View File

@@ -0,0 +1,49 @@
package xor
// SIMD Extensions
const (
none = iota
avx2
// SSE2 was first introduced by Intel with the initial Pentium 4 in 2001,
// so it is safe to assume every amd64 CPU has SSE2
sse2
)
var extension = none
// Bytes : chooses the shortest slice length as the XOR size.
// It's better to use it for big data (> 64 bytes).
func Bytes(dst, src0, src1 []byte) {
size := len(dst)
if size > len(src0) {
size = len(src0)
}
if size > len(src1) {
size = len(src1)
}
xorBytes(dst, src0, src1, size)
}
// BytesSameLen : all slices' lengths must be equal.
// Skips the size checks, saving time for small data.
func BytesSameLen(dst, src0, src1 []byte) {
xorSrc1(dst, src0, src1)
}
// BytesSrc0 : len(src1) >= len(src0) and len(dst) >= len(src0).
// XORs len(src0) bytes.
func BytesSrc0(dst, src0, src1 []byte) {
xorSrc0(dst, src0, src1)
}
// BytesSrc1 : len(src0) >= len(src1) and len(dst) >= len(src1).
// XORs len(src1) bytes.
func BytesSrc1(dst, src0, src1 []byte) {
xorSrc1(dst, src0, src1)
}
// Matrix : all slices' lengths must be equal and non-zero.
// len(src) must be >= 2.
func Matrix(dst []byte, src [][]byte) {
xorMatrix(dst, src)
}
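For orientation, a minimal usage sketch of the public API declared above; it assumes the package is imported as github.com/templexxx/xor and is not part of the vendored file.

package main

import (
	"fmt"

	"github.com/templexxx/xor"
)

func main() {
	a := []byte{0x0f, 0xf0, 0xaa, 0x55}
	b := []byte{0xff, 0x0f, 0x55, 0xaa}
	dst := make([]byte, 4)

	// dst = a XOR b, using the shortest slice length as the XOR size.
	xor.Bytes(dst, a, b)

	// out = XOR of every row; all rows must have equal, non-zero length.
	rows := [][]byte{a, b, {0x01, 0x02, 0x03, 0x04}}
	out := make([]byte, 4)
	xor.Matrix(out, rows)

	fmt.Printf("%x %x\n", dst, out)
}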

120
vendor/github.com/templexxx/xor/xor_amd64.go generated vendored Normal file
View File

@@ -0,0 +1,120 @@
package xor
import "github.com/templexxx/cpufeat"
func init() {
getEXT()
}
func getEXT() {
if cpufeat.X86.HasAVX2 {
extension = avx2
} else {
extension = sse2
}
return
}
func xorBytes(dst, src0, src1 []byte, size int) {
switch extension {
case avx2:
bytesAVX2(dst, src0, src1, size)
default:
bytesSSE2(dst, src0, src1, size)
}
}
// size threshold above which the non-temporal (streaming) store kernels are used
const nontmp = 8 * 1024
const avx2loopsize = 128
func bytesAVX2(dst, src0, src1 []byte, size int) {
if size < avx2loopsize {
bytesAVX2mini(dst, src0, src1, size)
} else if size >= avx2loopsize && size <= nontmp {
bytesAVX2small(dst, src0, src1, size)
} else {
bytesAVX2big(dst, src0, src1, size)
}
}
const sse2loopsize = 64
func bytesSSE2(dst, src0, src1 []byte, size int) {
if size < sse2loopsize {
bytesSSE2mini(dst, src0, src1, size)
} else if size >= sse2loopsize && size <= nontmp {
bytesSSE2small(dst, src0, src1, size)
} else {
bytesSSE2big(dst, src0, src1, size)
}
}
func xorMatrix(dst []byte, src [][]byte) {
switch extension {
case avx2:
matrixAVX2(dst, src)
default:
matrixSSE2(dst, src)
}
}
func matrixAVX2(dst []byte, src [][]byte) {
size := len(dst)
if size > nontmp {
matrixAVX2big(dst, src)
} else {
matrixAVX2small(dst, src)
}
}
func matrixSSE2(dst []byte, src [][]byte) {
size := len(dst)
if size > nontmp {
matrixSSE2big(dst, src)
} else {
matrixSSE2small(dst, src)
}
}
//go:noescape
func xorSrc0(dst, src0, src1 []byte)
//go:noescape
func xorSrc1(dst, src0, src1 []byte)
//go:noescape
func bytesAVX2mini(dst, src0, src1 []byte, size int)
//go:noescape
func bytesAVX2big(dst, src0, src1 []byte, size int)
//go:noescape
func bytesAVX2small(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2mini(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2small(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2big(dst, src0, src1 []byte, size int)
//go:noescape
func matrixAVX2small(dst []byte, src [][]byte)
//go:noescape
func matrixAVX2big(dst []byte, src [][]byte)
//go:noescape
func matrixSSE2small(dst []byte, src [][]byte)
//go:noescape
func matrixSSE2big(dst []byte, src [][]byte)
//go:noescape
func hasAVX2() bool
//go:noescape
func hasSSE2() bool
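The dispatchers above select a kernel by buffer size, using the avx2loopsize/sse2loopsize constants and the nontmp threshold (8 KiB) beyond which the big variants switch to non-temporal stores. The sketch below only mirrors that selection logic for illustration; kernelForSize is a hypothetical helper and is not part of the package.

package main

import "fmt"

// kernelForSize mirrors the size-based dispatch in xor_amd64.go:
// mini below the vector loop size, small up to the nontmp threshold,
// big (non-temporal stores) beyond it.
func kernelForSize(size int, hasAVX2 bool) string {
	const nontmp = 8 * 1024
	if hasAVX2 {
		switch {
		case size < 128:
			return "bytesAVX2mini"
		case size <= nontmp:
			return "bytesAVX2small"
		default:
			return "bytesAVX2big"
		}
	}
	switch {
	case size < 64:
		return "bytesSSE2mini"
	case size <= nontmp:
		return "bytesSSE2small"
	default:
		return "bytesSSE2big"
	}
}

func main() {
	for _, n := range []int{32, 4096, 1 << 20} {
		fmt.Println(n, kernelForSize(n, true))
	}
}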

19
vendor/github.com/templexxx/xor/xor_other.go generated vendored Normal file
View File

@@ -0,0 +1,19 @@
// +build !amd64 noasm
package xor
func xorBytes(dst, src0, src1 []byte, size int) {
bytesNoSIMD(dst, src0, src1, size)
}
func xorMatrix(dst []byte, src [][]byte) {
matrixNoSIMD(dst, src)
}
func xorSrc0(dst, src0, src1 []byte) {
bytesNoSIMD(dst, src0, src1, len(src0))
}
func xorSrc1(dst, src0, src1 []byte) {
bytesNoSIMD(dst, src0, src1, len(src1))
}
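This fallback file is built on non-amd64 platforms or when the noasm build tag is set, so the pure-Go path can also be forced on amd64 for testing, for example:

go build -tags noasm
go test -tags noasm ./...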