mirror of https://github.com/fatedier/frp.git (synced 2026-01-11 22:23:12 +00:00)

Commit: add packages
14  vendor/github.com/templexxx/cpufeat/.gitignore  generated vendored  Normal file
@@ -0,0 +1,14 @@
# Binaries for programs and plugins
*.exe
*.dll
*.so
*.dylib

# Test binary, build with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/
27  vendor/github.com/templexxx/cpufeat/LICENSE  generated vendored  Normal file
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

   * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
   * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  vendor/github.com/templexxx/cpufeat/cpu.go  generated vendored  Normal file
@@ -0,0 +1,32 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package cpufeat implements processor feature detection
// used by the Go standard library.
package cpufeat

var X86 x86

// The booleans in x86 contain the correspondingly named cpuid feature bit.
// HasAVX and HasAVX2 are only set if the OS supports XMM and YMM registers
// in addition to the cpuid feature bit being set.
// The struct is padded to avoid false sharing.
type x86 struct {
    _            [CacheLineSize]byte
    HasAES       bool
    HasAVX       bool
    HasAVX2      bool
    HasBMI1      bool
    HasBMI2      bool
    HasERMS      bool
    HasOSXSAVE   bool
    HasPCLMULQDQ bool
    HasPOPCNT    bool
    HasSSE2      bool
    HasSSE3      bool
    HasSSSE3     bool
    HasSSE41     bool
    HasSSE42     bool
    _            [CacheLineSize]byte
}
7  vendor/github.com/templexxx/cpufeat/cpu_arm.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 32
7  vendor/github.com/templexxx/cpufeat/cpu_arm64.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 32
7  vendor/github.com/templexxx/cpufeat/cpu_mips.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 32
7  vendor/github.com/templexxx/cpufeat/cpu_mips64.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 32
7  vendor/github.com/templexxx/cpufeat/cpu_mips64le.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 32
7  vendor/github.com/templexxx/cpufeat/cpu_mipsle.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 32
7  vendor/github.com/templexxx/cpufeat/cpu_ppc64.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 128
7  vendor/github.com/templexxx/cpufeat/cpu_ppc64le.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 128
7  vendor/github.com/templexxx/cpufeat/cpu_s390x.go  generated vendored  Normal file
@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package cpufeat

const CacheLineSize = 256
59  vendor/github.com/templexxx/cpufeat/cpu_x86.go  generated vendored  Normal file
@@ -0,0 +1,59 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build 386 amd64 amd64p32

package cpufeat

const CacheLineSize = 64

// cpuid is implemented in cpu_x86.s.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)

// xgetbv with ecx = 0 is implemented in cpu_x86.s.
func xgetbv() (eax, edx uint32)

func init() {
    maxId, _, _, _ := cpuid(0, 0)

    if maxId < 1 {
        return
    }

    _, _, ecx1, edx1 := cpuid(1, 0)
    X86.HasSSE2 = isSet(26, edx1)

    X86.HasSSE3 = isSet(0, ecx1)
    X86.HasPCLMULQDQ = isSet(1, ecx1)
    X86.HasSSSE3 = isSet(9, ecx1)
    X86.HasSSE41 = isSet(19, ecx1)
    X86.HasSSE42 = isSet(20, ecx1)
    X86.HasPOPCNT = isSet(23, ecx1)
    X86.HasAES = isSet(25, ecx1)
    X86.HasOSXSAVE = isSet(27, ecx1)

    osSupportsAVX := false
    // For XGETBV, OSXSAVE bit is required and sufficient.
    if X86.HasOSXSAVE {
        eax, _ := xgetbv()
        // Check if XMM and YMM registers have OS support.
        osSupportsAVX = isSet(1, eax) && isSet(2, eax)
    }

    X86.HasAVX = isSet(28, ecx1) && osSupportsAVX

    if maxId < 7 {
        return
    }

    _, ebx7, _, _ := cpuid(7, 0)
    X86.HasBMI1 = isSet(3, ebx7)
    X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX
    X86.HasBMI2 = isSet(8, ebx7)
    X86.HasERMS = isSet(9, ebx7)
}

func isSet(bitpos uint, value uint32) bool {
    return value&(1<<bitpos) != 0
}
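Downstream packages branch on these flags once at startup to pick a kernel; this is exactly how reedsolomon's getEXT() (below) selects avx2/ssse3/none. A minimal sketch of the dispatch pattern, where the kernel functions are hypothetical placeholders, not part of either package:

```go
package kernels

import "github.com/templexxx/cpufeat"

// mulSlice picks the widest SIMD path the CPU and OS both support.
// HasAVX2 is only true when the OS saves YMM state, so this check is enough.
func mulSlice(c byte, d, p []byte) {
    switch {
    case cpufeat.X86.HasAVX2:
        mulSliceAVX2(c, d, p) // hypothetical AVX2 kernel
    case cpufeat.X86.HasSSSE3:
        mulSliceSSSE3(c, d, p) // hypothetical SSSE3 kernel
    default:
        mulSliceGeneric(c, d, p) // hypothetical pure-Go fallback
    }
}
```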
32  vendor/github.com/templexxx/cpufeat/cpu_x86.s  generated vendored  Normal file
@@ -0,0 +1,32 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build 386 amd64 amd64p32

#include "textflag.h"

// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
TEXT ·cpuid(SB), NOSPLIT, $0-24
    MOVL eaxArg+0(FP), AX
    MOVL ecxArg+4(FP), CX
    CPUID
    MOVL AX, eax+8(FP)
    MOVL BX, ebx+12(FP)
    MOVL CX, ecx+16(FP)
    MOVL DX, edx+20(FP)
    RET

// func xgetbv() (eax, edx uint32)
TEXT ·xgetbv(SB), NOSPLIT, $0-8
#ifdef GOOS_nacl
    // nacl does not support XGETBV.
    MOVL $0, eax+0(FP)
    MOVL $0, edx+4(FP)
#else
    MOVL $0, CX
    WORD $0x010f; BYTE $0xd0 // XGETBV
    MOVL AX, eax+0(FP)
    MOVL DX, edx+4(FP)
#endif
    RET
40  vendor/github.com/templexxx/reedsolomon/.gitignore  generated vendored  Normal file
@@ -0,0 +1,40 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof
/.idea
/backup
/loopunroll/
cpu.out
mathtool/galois/
mathtool/matrix/
mem.out
/examples/
/.DS_Store
/mathtool/cntinverse
/invert
/bakcup
/buf.svg
*.svg
*.out
/escape
9  vendor/github.com/templexxx/reedsolomon/.travis.yml  generated vendored  Normal file
@@ -0,0 +1,9 @@
language: go
go:
  - 1.9

install:
  - go get github.com/templexxx/reedsolomon

script:
  - go test -v
23  vendor/github.com/templexxx/reedsolomon/LICENSE  generated vendored  Normal file
@@ -0,0 +1,23 @@
MIT License

Copyright (c) 2017 Templexxx
Copyright (c) 2015 Klaus Post
Copyright (c) 2015 Backblaze

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
109  vendor/github.com/templexxx/reedsolomon/README.md  generated vendored  Normal file
@@ -0,0 +1,109 @@
# Reed-Solomon

[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8]

[1]: https://godoc.org/github.com/templexxx/reedsolomon?status.svg
[2]: https://godoc.org/github.com/templexxx/reedsolomon
[3]: https://img.shields.io/badge/license-MIT-blue.svg
[4]: LICENSE
[5]: https://travis-ci.org/templexxx/reedsolomon.svg?branch=master
[6]: https://travis-ci.org/templexxx/reedsolomon
[7]: https://goreportcard.com/badge/github.com/templexxx/reedsolomon
[8]: https://goreportcard.com/report/github.com/templexxx/reedsolomon


## Introduction
1. Reed-Solomon Erasure Code engine in pure Go.
2. Super fast: more than 10 GB/s per physical core (10+4, 4KB per vector, MacBook Pro 2.8 GHz Intel Core i7)

## Installation
To get the package, use the standard:
```bash
go get github.com/templexxx/reedsolomon
```
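A basic encode/reconstruct round trip looks roughly like this (a sketch, using the 10+4 layout with 4KB shards from the benchmarks below; error handling via panic is for brevity only):

```go
package main

import (
    "math/rand"

    "github.com/templexxx/reedsolomon"
)

func main() {
    enc, err := reedsolomon.New(10, 4) // 10 data vects, 4 parity vects
    if err != nil {
        panic(err)
    }
    vects := make([][]byte, 14)
    for i := range vects {
        vects[i] = make([]byte, 4096)
    }
    for i := 0; i < 10; i++ {
        rand.Read(vects[i]) // fill the data vects
    }
    if err = enc.Encode(vects); err != nil { // parity is written into vects[10:]
        panic(err)
    }
    vects[3] = nil // mark a lost vect with nil
    if err = enc.Reconstruct(vects); err != nil {
        panic(err)
    }
}
```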
## Documentation
See the associated [GoDoc](http://godoc.org/github.com/templexxx/reedsolomon)

## Specification
### GOARCH
1. All architectures are supported
2. 0.1.0 needs Go 1.9 for sync.Map on AMD64

### Math
1. Coding over GF(2^8) (a small multiplication sketch follows this list)
2. Primitive Polynomial: x^8 + x^4 + x^3 + x^2 + 1 (0x1d)
3. mathtool/gentbls.go: generates the primitive polynomial and its log, exp, multiply, and inverse tables, with more detail on how the Galois field works
4. mathtool/cntinverse.go: calculates how many inverse matrices exist for a given RS code configuration
5. Both Cauchy and Vandermonde matrices are supported. Vandermonde needs extra operations to preserve the property that any square subset of rows is invertible
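For a feel of the arithmetic: multiplication in GF(2^8) reduces to log/exp-table lookups, which is what the generated tables encode. A hand-rolled sketch (logTbl and expTbl stand for the tables gentbls.go produces; the package's own gfMul uses a full 256x256 multiplication table instead):

```go
// gfMul multiplies a and b in GF(2^8) via log/exp tables:
// log(a*b) = (log(a) + log(b)) mod 255, and exp undoes the log.
func gfMul(a, b byte) byte {
    if a == 0 || b == 0 {
        return 0 // zero has no logarithm
    }
    return expTbl[(int(logTbl[a])+int(logTbl[b]))%255]
}
```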
### Why so fast?
Three things cost the most time:

1. looking up Galois-field tables
2. reading/writing memory
3. calculating the inverse matrix during reconstruction

SIMD solves no. 1.

Cache-friendly code helps with no. 2 and no. 3; on top of that, a sync.Map caches inverse matrices, which saves about 1000 ns whenever the same matrix is needed again.

## Performance

Performance depends mainly on:

1. CPU instruction extension (AVX2, SSSE3, or none)
2. number of data/parity vects
3. unit size of calculation (see rs_amd64.go)
4. size of shards
5. speed of memory (much time is spent on memory reads/writes, :D)
6. performance of the CPU
7. usage pattern (memory reuse)

Keep in mind that benchmark tests are quite different from encoding/decoding in practice.

In benchmark loops the CPU cache helps a lot; in practice, memory must be reused to make performance as good as in the benchmarks.

Example performance on my MacBook 2017 (i7 2.8 GHz), 10+4 (with 0.1.0).

### Encoding:

| Vector size | Speed (MB/s) |
|-------------|--------------|
| 1400B       | 7655.02      |
| 4KB         | 10551.37     |
| 64KB        | 9297.25      |
| 1MB         | 6829.89      |
| 16MB        | 6312.83      |

### Reconstruct (set a vector to nil to mark it for repair):

| Vector size | Speed (MB/s) |
|-------------|--------------|
| 1400B       | 4124.85      |
| 4KB         | 5715.45      |
| 64KB        | 6050.06      |
| 1MB         | 5001.21      |
| 16MB        | 5043.04      |

### ReconstructWithPos (use a position list to mark what needs repair; reuses memory):

| Vector size | Speed (MB/s) |
|-------------|--------------|
| 1400B       | 6170.24      |
| 4KB         | 9444.86      |
| 64KB        | 9311.30      |
| 1MB         | 6781.06      |
| 16MB        | 6285.34      |

**The reconstruct benchmarks here run with the inverse matrix cache; without the cache, each run costs more time (about 1000 ns).**

## Who is using this?

1. https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang

## Links & Thanks
* [Klauspost ReedSolomon](https://github.com/klauspost/reedsolomon)
* [intel ISA-L](https://github.com/01org/isa-l)
* [GF SIMD](http://www.ssrc.ucsc.edu/papers/plank-fast13.pdf)
* [asm2plan9s](https://github.com/fwessels/asm2plan9s)
156  vendor/github.com/templexxx/reedsolomon/matrix.go  generated vendored  Normal file
@@ -0,0 +1,156 @@
package reedsolomon

import "errors"

type matrix []byte

func genEncMatrixCauchy(d, p int) matrix {
    t := d + p
    m := make([]byte, t*d)
    for i := 0; i < d; i++ {
        m[i*d+i] = byte(1)
    }

    d2 := d * d
    for i := d; i < t; i++ {
        for j := 0; j < d; j++ {
            d := i ^ j
            a := inverseTbl[d]
            m[d2] = byte(a)
            d2++
        }
    }
    return m
}

func gfExp(b byte, n int) byte {
    if n == 0 {
        return 1
    }
    if b == 0 {
        return 0
    }
    a := logTbl[b]
    ret := int(a) * n
    for ret >= 255 {
        ret -= 255
    }
    return byte(expTbl[ret])
}

func genVandMatrix(vm []byte, t, d int) {
    for i := 0; i < t; i++ {
        for j := 0; j < d; j++ {
            vm[i*d+j] = gfExp(byte(i), j)
        }
    }
}

func (m matrix) mul(right matrix, rows, cols int, r []byte) {
    for i := 0; i < rows; i++ {
        for j := 0; j < cols; j++ {
            var v byte
            for k := 0; k < cols; k++ {
                v ^= gfMul(m[i*cols+k], right[k*cols+j])
            }
            r[i*cols+j] = v
        }
    }
}

func genEncMatrixVand(d, p int) (matrix, error) {
    t := d + p
    buf := make([]byte, (2*t+4*d)*d)
    vm := buf[:t*d]
    genVandMatrix(vm, t, d)
    top := buf[t*d : (t+d)*d]
    copy(top, vm[:d*d])
    raw := buf[(t+d)*d : (t+3*d)*d]
    im := buf[(t+3*d)*d : (t+4*d)*d]
    err := matrix(top).invert(raw, d, im)
    if err != nil {
        return nil, err
    }
    r := buf[(t+4*d)*d : (2*t+4*d)*d]
    matrix(vm).mul(im, t, d, r)
    return matrix(r), nil
}

// [I|m'] -> [m']
func (m matrix) subMatrix(n int, r []byte) {
    for i := 0; i < n; i++ {
        off := i * n
        copy(r[off:off+n], m[2*off+n:2*(off+n)])
    }
}

func (m matrix) invert(raw matrix, n int, im []byte) error {
    // [m] -> [m|I]
    for i := 0; i < n; i++ {
        t := i * n
        copy(raw[2*t:2*t+n], m[t:t+n])
        raw[2*t+i+n] = byte(1)
    }
    err := gauss(raw, n)
    if err != nil {
        return err
    }
    raw.subMatrix(n, im)
    return nil
}

func (m matrix) swap(i, j, n int) {
    for k := 0; k < n; k++ {
        m[i*n+k], m[j*n+k] = m[j*n+k], m[i*n+k]
    }
}

func gfMul(a, b byte) byte {
    return mulTbl[a][b]
}

var errSingular = errors.New("rs.invert: matrix is singular")

// [m|I] -> [I|m']
func gauss(m matrix, n int) error {
    n2 := 2 * n
    for i := 0; i < n; i++ {
        if m[i*n2+i] == 0 {
            for j := i + 1; j < n; j++ {
                if m[j*n2+i] != 0 {
                    m.swap(i, j, n2)
                    break
                }
            }
        }
        if m[i*n2+i] == 0 {
            return errSingular
        }
        if m[i*n2+i] != 1 {
            d := m[i*n2+i]
            scale := inverseTbl[d]
            for c := 0; c < n2; c++ {
                m[i*n2+c] = gfMul(m[i*n2+c], scale)
            }
        }
        for j := i + 1; j < n; j++ {
            if m[j*n2+i] != 0 {
                scale := m[j*n2+i]
                for c := 0; c < n2; c++ {
                    m[j*n2+c] ^= gfMul(scale, m[i*n2+c])
                }
            }
        }
    }
    for k := 0; k < n; k++ {
        for j := 0; j < k; j++ {
            if m[j*n2+k] != 0 {
                scale := m[j*n2+k]
                for c := 0; c < n2; c++ {
                    m[j*n2+c] ^= gfMul(scale, m[k*n2+c])
                }
            }
        }
    }
    return nil
}
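Since invert writes the inverse into im, a quick sanity check is to multiply back and compare with the identity. A test-style sketch, assuming it sits next to matrix.go so the unexported helpers and the generated tables are visible:

```go
package reedsolomon

import (
    "bytes"
    "testing"
)

// TestInvertRoundTrip checks that m * inv(m) == I for a small
// invertible matrix over GF(2^8).
func TestInvertRoundTrip(t *testing.T) {
    n := 2
    m := matrix{1, 1, 1, 2}        // det = 1*2 ^ 1*1 = 3, so invertible
    raw := make([]byte, 2*n*n)     // scratch for the [m|I] augmentation
    im := make([]byte, n*n)        // receives the inverse
    if err := m.invert(raw, n, im); err != nil {
        t.Fatal(err)
    }
    r := make([]byte, n*n)
    m.mul(im, n, n, r) // multiply back
    if !bytes.Equal(r, []byte{1, 0, 0, 1}) {
        t.Fatalf("m*inv(m) = %v, want identity", r)
    }
}
```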
280  vendor/github.com/templexxx/reedsolomon/rs.go  generated vendored  Normal file
@@ -0,0 +1,280 @@
/*
    Reed-Solomon Codes over GF(2^8)
    Primitive Polynomial: x^8+x^4+x^3+x^2+1
    Galois Field arithmetic using Intel SIMD instructions (AVX2 or SSSE3)
*/

package reedsolomon

import "errors"

// Encoder implements Reed-Solomon encoding and reconstructing.
type Encoder interface {
    // Encode multiplies the generator matrix by the data.
    // len(vects) must equal the number of data vects plus parity vects.
    Encode(vects [][]byte) error
    // Reconstruction results are written back to the original positions
    // in vects: if vects[0] was lost, its data is restored into vects[0].

    // Reconstruct repairs lost data & parity.
    // Set a vect to nil if it is lost.
    Reconstruct(vects [][]byte) error
    // ReconstructData repairs lost data only.
    // Set a vect to nil if it is lost.
    ReconstructData(vects [][]byte) error
    // ReconstWithPos repairs lost data & parity given the positions of the
    // surviving ("has") and lost vects.
    // It saves bandwidth & disk I/O compared with Reconstruct when fewer
    // vects than the parity count are lost.
    // As with any erasure code, we must know which vects are broken,
    // so it's necessary to provide such APIs.
    // len(has) must equal the number of data vects.
    // Example:
    // in 3+2, the whole position list is [0,1,2,3,4];
    // if vects[0] is lost,
    // "has" could be [1,2,3] or [1,2,4] or ...
    // (you must be sure vects[1], vects[2], vects[3] hold correct data if "has" is [1,2,3])
    // and "dLost" will be [0].
    // ps:
    // 1. the lists above must be in increasing order (TODO: support out-of-order)
    // 2. each vect has the same length; don't set any of them nil,
    //    so we don't need to allocate slices
    ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error
    // ReconstDataWithPos repairs lost data given the positions of the
    // surviving and lost data vects.
    // No need to append positions of lost parity vects to "dLost".
    ReconstDataWithPos(vects [][]byte, has, dLost []int) error
}
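The 3+2 example in the comment translates to code along these lines (a minimal sketch; the 1KB shard size is an arbitrary placeholder):

```go
package main

import (
    "math/rand"

    "github.com/templexxx/reedsolomon"
)

func main() {
    enc, err := reedsolomon.New(3, 2) // 3 data vects, 2 parity vects
    if err != nil {
        panic(err)
    }
    vects := make([][]byte, 5)
    for i := range vects {
        vects[i] = make([]byte, 1024)
    }
    for i := 0; i < 3; i++ {
        rand.Read(vects[i])
    }
    if err = enc.Encode(vects); err != nil {
        panic(err)
    }
    // Pretend vects[0] was lost; vects[1..3] are known good.
    // Its buffer is kept (not set to nil), so the memory is reused.
    if err = enc.ReconstWithPos(vects, []int{1, 2, 3}, []int{0}, nil); err != nil {
        panic(err)
    }
}
```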
func checkCfg(d, p int) error {
    if (d <= 0) || (p <= 0) {
        return errors.New("rs.New: data or parity <= 0")
    }
    if d+p >= 256 {
        return errors.New("rs.New: data+parity >= 256")
    }
    return nil
}

// New creates an Encoder (Vandermonde matrix as the encoding matrix).
func New(data, parity int) (enc Encoder, err error) {
    err = checkCfg(data, parity)
    if err != nil {
        return
    }
    e, err := genEncMatrixVand(data, parity)
    if err != nil {
        return
    }
    return newRS(data, parity, e), nil
}

// NewCauchy creates an Encoder (Cauchy matrix as the generator matrix).
func NewCauchy(data, parity int) (enc Encoder, err error) {
    err = checkCfg(data, parity)
    if err != nil {
        return
    }
    e := genEncMatrixCauchy(data, parity)
    return newRS(data, parity, e), nil
}

type encBase struct {
    data   int
    parity int
    encode []byte
    gen    []byte
}

func checkEnc(d, p int, vs [][]byte) (size int, err error) {
    total := len(vs)
    if d+p != total {
        err = errors.New("rs.checkER: vects not match rs args")
        return
    }
    size = len(vs[0])
    if size == 0 {
        err = errors.New("rs.checkER: vects size = 0")
        return
    }
    for i := 1; i < total; i++ {
        if len(vs[i]) != size {
            err = errors.New("rs.checkER: vects size mismatch")
            return
        }
    }
    return
}

func (e *encBase) Encode(vects [][]byte) (err error) {
    d := e.data
    p := e.parity
    _, err = checkEnc(d, p, vects)
    if err != nil {
        return
    }
    dv := vects[:d]
    pv := vects[d:]
    g := e.gen
    for i := 0; i < d; i++ {
        for j := 0; j < p; j++ {
            if i != 0 {
                mulVectAdd(g[j*d+i], dv[i], pv[j])
            } else {
                mulVect(g[j*d], dv[0], pv[j])
            }
        }
    }
    return
}

func mulVect(c byte, a, b []byte) {
    t := mulTbl[c]
    for i := 0; i < len(a); i++ {
        b[i] = t[a[i]]
    }
}

func mulVectAdd(c byte, a, b []byte) {
    t := mulTbl[c]
    for i := 0; i < len(a); i++ {
        b[i] ^= t[a[i]]
    }
}
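Per byte position k, Encode therefore computes each parity vect as a GF(2^8) dot product of the data bytes with one row of the generator matrix. A scalar sketch of what the mulVect/mulVectAdd loop accumulates (gfMul stands in for the package's mulTbl lookup):

```go
// parityByte computes pv[j][k] = XOR over i of g[j*d+i] (x) dv[i][k]
// in GF(2^8), i.e. one output byte of the matrix-vector product.
func parityByte(g []byte, dv [][]byte, d, j, k int) byte {
    var v byte
    for i := 0; i < d; i++ {
        v ^= gfMul(g[j*d+i], dv[i][k])
    }
    return v
}
```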
func (e *encBase) Reconstruct(vects [][]byte) (err error) {
    return e.reconstruct(vects, false)
}

func (e *encBase) ReconstructData(vects [][]byte) (err error) {
    return e.reconstruct(vects, true)
}

func (e *encBase) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
    return e.reconstWithPos(vects, has, dLost, pLost, false)
}

func (e *encBase) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
    return e.reconstWithPos(vects, has, dLost, nil, true)
}

func (e *encBase) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
    d := e.data
    em := e.encode
    dCnt := len(dLost)
    size := len(vects[has[0]])
    if dCnt != 0 {
        vtmp := make([][]byte, d+dCnt)
        for i, p := range has {
            vtmp[i] = vects[p]
        }
        for i, p := range dLost {
            if len(vects[p]) == 0 {
                vects[p] = make([]byte, size)
            }
            vtmp[i+d] = vects[p]
        }
        matrixbuf := make([]byte, 4*d*d+dCnt*d)
        m := matrixbuf[:d*d]
        for i, l := range has {
            copy(m[i*d:i*d+d], em[l*d:l*d+d])
        }
        raw := matrixbuf[d*d : 3*d*d]
        im := matrixbuf[3*d*d : 4*d*d]
        err2 := matrix(m).invert(raw, d, im)
        if err2 != nil {
            return err2
        }
        g := matrixbuf[4*d*d:]
        for i, l := range dLost {
            copy(g[i*d:i*d+d], im[l*d:l*d+d])
        }
        etmp := &encBase{data: d, parity: dCnt, gen: g}
        err2 = etmp.Encode(vtmp[:d+dCnt])
        if err2 != nil {
            return err2
        }
    }
    if dataOnly {
        return
    }
    pCnt := len(pLost)
    if pCnt != 0 {
        vtmp := make([][]byte, d+pCnt)
        g := make([]byte, pCnt*d)
        for i, l := range pLost {
            copy(g[i*d:i*d+d], em[l*d:l*d+d])
        }
        for i := 0; i < d; i++ {
            vtmp[i] = vects[i]
        }
        for i, p := range pLost {
            if len(vects[p]) == 0 {
                vects[p] = make([]byte, size)
            }
            vtmp[i+d] = vects[p]
        }
        etmp := &encBase{data: d, parity: pCnt, gen: g}
        err2 := etmp.Encode(vtmp[:d+pCnt])
        if err2 != nil {
            return err2
        }
    }
    return
}

func (e *encBase) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
    d := e.data
    p := e.parity
    // TODO: more checks, e.g. an element of "has" also appearing in a lost list; handle len(has) > d
    if len(has) != d {
        return errors.New("rs.Reconst: not enough vects")
    }
    dCnt := len(dLost)
    if dCnt > p {
        return errors.New("rs.Reconst: not enough vects")
    }
    pCnt := len(pLost)
    if pCnt > p {
        return errors.New("rs.Reconst: not enough vects")
    }
    return e.reconst(vects, has, dLost, pLost, dataOnly)
}

func (e *encBase) reconstruct(vects [][]byte, dataOnly bool) (err error) {
    d := e.data
    p := e.parity
    t := d + p
    listBuf := make([]int, t+p)
    has := listBuf[:d]
    dLost := listBuf[d:t]
    pLost := listBuf[t : t+p]
    hasCnt, dCnt, pCnt := 0, 0, 0
    for i := 0; i < t; i++ {
        if vects[i] != nil {
            if hasCnt < d {
                has[hasCnt] = i
                hasCnt++
            }
        } else {
            if i < d {
                if dCnt < p {
                    dLost[dCnt] = i
                    dCnt++
                } else {
                    return errors.New("rs.Reconst: not enough vects")
                }
            } else {
                if pCnt < p {
                    pLost[pCnt] = i
                    pCnt++
                } else {
                    return errors.New("rs.Reconst: not enough vects")
                }
            }
        }
    }
    if hasCnt != d {
        return errors.New("rs.Reconst: not enough vects")
    }
    dLost = dLost[:dCnt]
    pLost = pLost[:pCnt]
    return e.reconst(vects, has, dLost, pLost, dataOnly)
}
868  vendor/github.com/templexxx/reedsolomon/rs_amd64.go  generated vendored  Normal file
@@ -0,0 +1,868 @@
package reedsolomon

import (
    "errors"
    "sync"

    "github.com/templexxx/cpufeat"
)

// SIMD Instruction Extensions
const (
    none = iota
    avx2
    ssse3
)

var extension = none

func init() {
    getEXT()
}

func getEXT() {
    if cpufeat.X86.HasAVX2 {
        extension = avx2
        return
    } else if cpufeat.X86.HasSSSE3 {
        extension = ssse3
        return
    } else {
        extension = none
        return
    }
}

//go:noescape
func copy32B(dst, src []byte) // needs SSE2 (introduced in 2001)

func initTbl(g matrix, rows, cols int, tbl []byte) {
    off := 0
    for i := 0; i < cols; i++ {
        for j := 0; j < rows; j++ {
            c := g[j*cols+i]
            t := lowhighTbl[c][:]
            copy32B(tbl[off:off+32], t)
            off += 32
        }
    }
}
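initTbl lays out a 32-byte entry per generator-matrix element c: 16 products for every low nibble followed by 16 products for every high nibble, which is exactly what the PSHUFB-based kernels consume (see plank-fast13.pdf, referenced in rs_amd64.s). A scalar sketch of the lookup those kernels perform, where lowhigh stands for one precomputed lowhighTbl entry:

```go
// mulByteSplitNibble multiplies b by a fixed coefficient c in GF(2^8)
// using the two 16-entry tables the SIMD kernels feed to PSHUFB:
//   lowhigh[0:16]  holds c (x) 0x00..0x0f        (low-nibble products)
//   lowhigh[16:32] holds c (x) 0x00,0x10..0xf0   (high-nibble products)
// Because multiplication is linear over XOR, the two halves combine with ^.
func mulByteSplitNibble(lowhigh *[32]byte, b byte) byte {
    return lowhigh[b&0x0f] ^ lowhigh[16+(b>>4)]
}
```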
// At most 3060 inverse matrices (with data=14, parity=4; calculated by mathtool/cntinverse).
// In practice, data is usually below 12 and parity below 5.
func okCache(data, parity int) bool {
    if data < 15 && parity < 5 { // you can change these limits, but data+parity can't exceed 32 (tip: see the code that builds the inverse matrix cache key)
        return true
    }
    return false
}

type (
    encSSSE3 encSIMD
    encAVX2  encSIMD
    encSIMD  struct {
        data   int
        parity int
        encode matrix
        gen    matrix
        tbl    []byte
        // The inverse matrix cache is designed for small vect sizes (< 4KB):
        // it saves the time of calculating the inverse matrix,
        // which matters less for big vect sizes.
        enableCache  bool
        inverseCache iCache
    }
    iCache struct {
        sync.RWMutex
        data map[uint32][]byte
    }
)

func newRS(d, p int, em matrix) (enc Encoder) {
    g := em[d*d:]
    if extension == none {
        return &encBase{data: d, parity: p, encode: em, gen: g}
    }
    t := make([]byte, d*p*32)
    initTbl(g, p, d, t)
    ok := okCache(d, p)
    if extension == avx2 {
        e := &encAVX2{data: d, parity: p, encode: em, gen: g, tbl: t, enableCache: ok,
            inverseCache: iCache{data: make(map[uint32][]byte)}}
        return e
    }
    e := &encSSSE3{data: d, parity: p, encode: em, gen: g, tbl: t, enableCache: ok,
        inverseCache: iCache{data: make(map[uint32][]byte)}}
    return e
}

// Size of sub-vector
const unit int = 16 * 1024

func getDo(n int) int {
    if n < unit {
        c := n >> 4
        if c == 0 {
            return unit
        }
        return c << 4
    }
    return unit
}
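getDo picks the per-pass chunk size: full 16KB units for large vects, otherwise the vect size rounded down to a multiple of 16, with tiny vects falling through to the remainder path. A few sample values (unit = 16 * 1024):

```go
// getDo(64*1024) == 16384 // large vects are processed in 16KB sub-vectors
// getDo(1000)    == 992   // rounded down to a multiple of 16
// getDo(8)       == 16384 // n>>4 is 0, so unit is returned and the Encode
//                         // loop takes the matrixMulRemain path immediately
```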
func (e *encAVX2) Encode(vects [][]byte) (err error) {
    d := e.data
    p := e.parity
    size, err := checkEnc(d, p, vects)
    if err != nil {
        return
    }
    dv := vects[:d]
    pv := vects[d:]
    start, end := 0, 0
    do := getDo(size)
    for start < size {
        end = start + do
        if end <= size {
            e.matrixMul(start, end, dv, pv)
            start = end
        } else {
            e.matrixMulRemain(start, size, dv, pv)
            start = size
        }
    }
    return
}

//go:noescape
func mulVectAVX2(tbl, d, p []byte)

//go:noescape
func mulVectAddAVX2(tbl, d, p []byte)

func (e *encAVX2) matrixMul(start, end int, dv, pv [][]byte) {
    d := e.data
    p := e.parity
    tbl := e.tbl
    off := 0
    for i := 0; i < d; i++ {
        for j := 0; j < p; j++ {
            t := tbl[off : off+32]
            if i != 0 {
                mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
            } else {
                mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
            }
            off += 32
        }
    }
}

func (e *encAVX2) matrixMulRemain(start, end int, dv, pv [][]byte) {
    undone := end - start
    do := (undone >> 4) << 4
    d := e.data
    p := e.parity
    tbl := e.tbl
    if do >= 16 {
        end2 := start + do
        off := 0
        for i := 0; i < d; i++ {
            for j := 0; j < p; j++ {
                t := tbl[off : off+32]
                if i != 0 {
                    mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
                } else {
                    mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
                }
                off += 32
            }
        }
        start = end
    }
    if undone > do {
        // may recompute a few bytes, but still a big improvement overall
        start2 := end - 16
        if start2 >= 0 {
            off := 0
            for i := 0; i < d; i++ {
                for j := 0; j < p; j++ {
                    t := tbl[off : off+32]
                    if i != 0 {
                        mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
                    } else {
                        mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
                    }
                    off += 32
                }
            }
        } else {
            g := e.gen
            for i := 0; i < d; i++ {
                for j := 0; j < p; j++ {
                    if i != 0 {
                        mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
                    } else {
                        mulVect(g[j*d], dv[0][start:], pv[j][start:])
                    }
                }
            }
        }
    }
}

// encodeGen encodes with the generator matrix instead of the precomputed tbls.
// It is designed for reconstruction: for small vects, initTbl costs too much time,
// and for big vects the tbls barely matter because the cache is filled with the vects' data.
func (e *encAVX2) encodeGen(vects [][]byte) (err error) {
    d := e.data
    p := e.parity
    size, err := checkEnc(d, p, vects)
    if err != nil {
        return
    }
    dv := vects[:d]
    pv := vects[d:]
    start, end := 0, 0
    do := getDo(size)
    for start < size {
        end = start + do
        if end <= size {
            e.matrixMulGen(start, end, dv, pv)
            start = end
        } else {
            e.matrixMulRemainGen(start, size, dv, pv)
            start = size
        }
    }
    return
}

func (e *encAVX2) matrixMulGen(start, end int, dv, pv [][]byte) {
    d := e.data
    p := e.parity
    g := e.gen
    for i := 0; i < d; i++ {
        for j := 0; j < p; j++ {
            t := lowhighTbl[g[j*d+i]][:]
            if i != 0 {
                mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
            } else {
                mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
            }
        }
    }
}

func (e *encAVX2) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
    undone := end - start
    do := (undone >> 4) << 4
    d := e.data
    p := e.parity
    g := e.gen
    if do >= 16 {
        end2 := start + do
        for i := 0; i < d; i++ {
            for j := 0; j < p; j++ {
                t := lowhighTbl[g[j*d+i]][:]
                if i != 0 {
                    mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
                } else {
                    mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
                }
            }
        }
        start = end
    }
    if undone > do {
        start2 := end - 16
        if start2 >= 0 {
            for i := 0; i < d; i++ {
                for j := 0; j < p; j++ {
                    t := lowhighTbl[g[j*d+i]][:]
                    if i != 0 {
                        mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
                    } else {
                        mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
                    }
                }
            }
        } else {
            for i := 0; i < d; i++ {
                for j := 0; j < p; j++ {
                    if i != 0 {
                        mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
                    } else {
                        mulVect(g[j*d], dv[0][start:], pv[j][start:])
                    }
                }
            }
        }
    }
}

func (e *encAVX2) Reconstruct(vects [][]byte) (err error) {
    return e.reconstruct(vects, false)
}

func (e *encAVX2) ReconstructData(vects [][]byte) (err error) {
    return e.reconstruct(vects, true)
}

func (e *encAVX2) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
    return e.reconstWithPos(vects, has, dLost, pLost, false)
}

func (e *encAVX2) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
    return e.reconstWithPos(vects, has, dLost, nil, true)
}

func (e *encAVX2) makeGen(has, dLost []int) (gen []byte, err error) {
    d := e.data
    em := e.encode
    cnt := len(dLost)
    if !e.enableCache {
        matrixbuf := make([]byte, 4*d*d+cnt*d)
        m := matrixbuf[:d*d]
        for i, l := range has {
            copy(m[i*d:i*d+d], em[l*d:l*d+d])
        }
        raw := matrixbuf[d*d : 3*d*d]
        im := matrixbuf[3*d*d : 4*d*d]
        err2 := matrix(m).invert(raw, d, im)
        if err2 != nil {
            return nil, err2
        }
        g := matrixbuf[4*d*d:]
        for i, l := range dLost {
            copy(g[i*d:i*d+d], im[l*d:l*d+d])
        }
        return g, nil
    }
    var ikey uint32
    for _, p := range has {
        ikey += 1 << uint8(p)
    }
    e.inverseCache.RLock()
    v, ok := e.inverseCache.data[ikey]
    if ok {
        im := v
        g := make([]byte, cnt*d)
        for i, l := range dLost {
            copy(g[i*d:i*d+d], im[l*d:l*d+d])
        }
        e.inverseCache.RUnlock()
        return g, nil
    }
    e.inverseCache.RUnlock()
    matrixbuf := make([]byte, 4*d*d+cnt*d)
    m := matrixbuf[:d*d]
    for i, l := range has {
        copy(m[i*d:i*d+d], em[l*d:l*d+d])
    }
    raw := matrixbuf[d*d : 3*d*d]
    im := matrixbuf[3*d*d : 4*d*d]
    err2 := matrix(m).invert(raw, d, im)
    if err2 != nil {
        return nil, err2
    }
    e.inverseCache.Lock()
    e.inverseCache.data[ikey] = im
    e.inverseCache.Unlock()
    g := matrixbuf[4*d*d:]
    for i, l := range dLost {
        copy(g[i*d:i*d+d], im[l*d:l*d+d])
    }
    return g, nil
}
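makeGen keys the inverse-matrix cache with a bitmap of the surviving positions, which is why okCache caps data+parity at 32: one bit per vect in a uint32. Worked through for a 3+2 setup with has = [1,2,4]:

```go
var ikey uint32
for _, p := range []int{1, 2, 4} {
    ikey += 1 << uint8(p)
}
// ikey == 0b10110 == 22: each set bit marks a surviving vect, so every
// distinct survivor set maps to its own cached inverse matrix.
```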
func (e *encAVX2) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
    d := e.data
    em := e.encode
    dCnt := len(dLost)
    size := len(vects[has[0]])
    if dCnt != 0 {
        vtmp := make([][]byte, d+dCnt)
        for i, p := range has {
            vtmp[i] = vects[p]
        }
        for i, p := range dLost {
            if len(vects[p]) == 0 {
                vects[p] = make([]byte, size)
            }
            vtmp[i+d] = vects[p]
        }
        g, err2 := e.makeGen(has, dLost)
        if err2 != nil {
            return err2
        }
        etmp := &encAVX2{data: d, parity: dCnt, gen: g}
        err2 = etmp.encodeGen(vtmp)
        if err2 != nil {
            return err2
        }
    }
    if dataOnly {
        return
    }
    pCnt := len(pLost)
    if pCnt != 0 {
        g := make([]byte, pCnt*d)
        for i, l := range pLost {
            copy(g[i*d:i*d+d], em[l*d:l*d+d])
        }
        vtmp := make([][]byte, d+pCnt)
        for i := 0; i < d; i++ {
            vtmp[i] = vects[i]
        }
        for i, p := range pLost {
            if len(vects[p]) == 0 {
                vects[p] = make([]byte, size)
            }
            vtmp[i+d] = vects[p]
        }
        etmp := &encAVX2{data: d, parity: pCnt, gen: g}
        err2 := etmp.encodeGen(vtmp)
        if err2 != nil {
            return err2
        }
    }
    return
}

func (e *encAVX2) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
    d := e.data
    p := e.parity
    if len(has) != d {
        return errors.New("rs.Reconst: not enough vects")
    }
    dCnt := len(dLost)
    if dCnt > p {
        return errors.New("rs.Reconst: not enough vects")
    }
    pCnt := len(pLost)
    if pCnt > p {
        return errors.New("rs.Reconst: not enough vects")
    }
    return e.reconst(vects, has, dLost, pLost, dataOnly)
}

func (e *encAVX2) reconstruct(vects [][]byte, dataOnly bool) (err error) {
    d := e.data
    p := e.parity
    t := d + p
    listBuf := make([]int, t+p)
    has := listBuf[:d]
    dLost := listBuf[d:t]
    pLost := listBuf[t : t+p]
    hasCnt, dCnt, pCnt := 0, 0, 0
    for i := 0; i < t; i++ {
        if vects[i] != nil {
            if hasCnt < d {
                has[hasCnt] = i
                hasCnt++
            }
        } else {
            if i < d {
                if dCnt < p {
                    dLost[dCnt] = i
                    dCnt++
                } else {
                    return errors.New("rs.Reconst: not enough vects")
                }
            } else {
                if pCnt < p {
                    pLost[pCnt] = i
                    pCnt++
                } else {
                    return errors.New("rs.Reconst: not enough vects")
                }
            }
        }
    }
    if hasCnt != d {
        return errors.New("rs.Reconst: not enough vects")
    }
    dLost = dLost[:dCnt]
    pLost = pLost[:pCnt]
    return e.reconst(vects, has, dLost, pLost, dataOnly)
}

func (e *encSSSE3) Encode(vects [][]byte) (err error) {
    d := e.data
    p := e.parity
    size, err := checkEnc(d, p, vects)
    if err != nil {
        return
    }
    dv := vects[:d]
    pv := vects[d:]
    start, end := 0, 0
    do := getDo(size)
    for start < size {
        end = start + do
        if end <= size {
            e.matrixMul(start, end, dv, pv)
            start = end
        } else {
            e.matrixMulRemain(start, size, dv, pv)
            start = size
        }
    }
    return
}

//go:noescape
func mulVectSSSE3(tbl, d, p []byte)

//go:noescape
func mulVectAddSSSE3(tbl, d, p []byte)

func (e *encSSSE3) matrixMul(start, end int, dv, pv [][]byte) {
    d := e.data
    p := e.parity
    tbl := e.tbl
    off := 0
    for i := 0; i < d; i++ {
        for j := 0; j < p; j++ {
            t := tbl[off : off+32]
            if i != 0 {
                mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
            } else {
                mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
            }
            off += 32
        }
    }
}

func (e *encSSSE3) matrixMulRemain(start, end int, dv, pv [][]byte) {
    undone := end - start
    do := (undone >> 4) << 4
    d := e.data
    p := e.parity
    tbl := e.tbl
    if do >= 16 {
        end2 := start + do
        off := 0
        for i := 0; i < d; i++ {
            for j := 0; j < p; j++ {
                t := tbl[off : off+32]
                if i != 0 {
                    mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
                } else {
                    mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
                }
                off += 32
            }
        }
        start = end
    }
    if undone > do {
        start2 := end - 16
        if start2 >= 0 {
            off := 0
            for i := 0; i < d; i++ {
                for j := 0; j < p; j++ {
                    t := tbl[off : off+32]
                    if i != 0 {
                        mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
                    } else {
                        mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
                    }
                    off += 32
                }
            }
        } else {
            g := e.gen
            for i := 0; i < d; i++ {
                for j := 0; j < p; j++ {
                    if i != 0 {
                        mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
                    } else {
                        mulVect(g[j*d], dv[0][start:], pv[j][start:])
                    }
                }
            }
        }
    }
}

// encodeGen encodes with the generator matrix instead of the precomputed tbls.
// It is designed for reconstruction: for small vects, initTbl costs too much time,
// and for big vects the tbls barely matter because the cache is filled with the vects' data.
func (e *encSSSE3) encodeGen(vects [][]byte) (err error) {
    d := e.data
    p := e.parity
    size, err := checkEnc(d, p, vects)
    if err != nil {
        return
    }
    dv := vects[:d]
    pv := vects[d:]
    start, end := 0, 0
    do := getDo(size)
    for start < size {
        end = start + do
        if end <= size {
            e.matrixMulGen(start, end, dv, pv)
            start = end
        } else {
            e.matrixMulRemainGen(start, size, dv, pv)
            start = size
        }
    }
    return
}

func (e *encSSSE3) matrixMulGen(start, end int, dv, pv [][]byte) {
    d := e.data
    p := e.parity
    g := e.gen
    for i := 0; i < d; i++ {
        for j := 0; j < p; j++ {
            t := lowhighTbl[g[j*d+i]][:]
            if i != 0 {
                mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
            } else {
                mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
            }
        }
    }
}

func (e *encSSSE3) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
    undone := end - start
    do := (undone >> 4) << 4
    d := e.data
    p := e.parity
    g := e.gen
    if do >= 16 {
        end2 := start + do
        for i := 0; i < d; i++ {
            for j := 0; j < p; j++ {
                t := lowhighTbl[g[j*d+i]][:]
                if i != 0 {
                    mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
                } else {
                    mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
                }
            }
        }
        start = end
    }
    if undone > do {
        start2 := end - 16
        if start2 >= 0 {
            for i := 0; i < d; i++ {
                for j := 0; j < p; j++ {
                    t := lowhighTbl[g[j*d+i]][:]
                    if i != 0 {
                        mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
                    } else {
                        mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
                    }
                }
            }
        } else {
            for i := 0; i < d; i++ {
                for j := 0; j < p; j++ {
                    if i != 0 {
                        mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
                    } else {
                        mulVect(g[j*d], dv[0][start:], pv[j][start:])
                    }
                }
            }
        }
    }
}

func (e *encSSSE3) Reconstruct(vects [][]byte) (err error) {
    return e.reconstruct(vects, false)
}

func (e *encSSSE3) ReconstructData(vects [][]byte) (err error) {
    return e.reconstruct(vects, true)
}

func (e *encSSSE3) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
    return e.reconstWithPos(vects, has, dLost, pLost, false)
}

func (e *encSSSE3) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
    return e.reconstWithPos(vects, has, dLost, nil, true)
}

func (e *encSSSE3) makeGen(has, dLost []int) (gen []byte, err error) {
    d := e.data
    em := e.encode
    cnt := len(dLost)
    if !e.enableCache {
        matrixbuf := make([]byte, 4*d*d+cnt*d)
        m := matrixbuf[:d*d]
        for i, l := range has {
            copy(m[i*d:i*d+d], em[l*d:l*d+d])
        }
        raw := matrixbuf[d*d : 3*d*d]
        im := matrixbuf[3*d*d : 4*d*d]
        err2 := matrix(m).invert(raw, d, im)
        if err2 != nil {
            return nil, err2
        }
        g := matrixbuf[4*d*d:]
        for i, l := range dLost {
            copy(g[i*d:i*d+d], im[l*d:l*d+d])
        }
        return g, nil
    }
    var ikey uint32
    for _, p := range has {
        ikey += 1 << uint8(p)
    }
    e.inverseCache.RLock()
    v, ok := e.inverseCache.data[ikey]
    if ok {
        im := v
        g := make([]byte, cnt*d)
        for i, l := range dLost {
            copy(g[i*d:i*d+d], im[l*d:l*d+d])
        }
        e.inverseCache.RUnlock()
        return g, nil
    }
    e.inverseCache.RUnlock()
    matrixbuf := make([]byte, 4*d*d+cnt*d)
    m := matrixbuf[:d*d]
    for i, l := range has {
        copy(m[i*d:i*d+d], em[l*d:l*d+d])
    }
    raw := matrixbuf[d*d : 3*d*d]
    im := matrixbuf[3*d*d : 4*d*d]
    err2 := matrix(m).invert(raw, d, im)
    if err2 != nil {
        return nil, err2
    }
    e.inverseCache.Lock()
    e.inverseCache.data[ikey] = im
    e.inverseCache.Unlock()
    g := matrixbuf[4*d*d:]
    for i, l := range dLost {
        copy(g[i*d:i*d+d], im[l*d:l*d+d])
    }
    return g, nil
}

func (e *encSSSE3) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
    d := e.data
    em := e.encode
    dCnt := len(dLost)
    size := len(vects[has[0]])
    if dCnt != 0 {
        vtmp := make([][]byte, d+dCnt)
        for i, p := range has {
            vtmp[i] = vects[p]
        }
        for i, p := range dLost {
            if len(vects[p]) == 0 {
                vects[p] = make([]byte, size)
            }
            vtmp[i+d] = vects[p]
        }
        g, err2 := e.makeGen(has, dLost)
        if err2 != nil {
            return err2
        }
        etmp := &encSSSE3{data: d, parity: dCnt, gen: g}
        err2 = etmp.encodeGen(vtmp)
        if err2 != nil {
            return err2
        }
    }
    if dataOnly {
        return
    }
    pCnt := len(pLost)
    if pCnt != 0 {
        g := make([]byte, pCnt*d)
        for i, l := range pLost {
            copy(g[i*d:i*d+d], em[l*d:l*d+d])
        }
        vtmp := make([][]byte, d+pCnt)
        for i := 0; i < d; i++ {
            vtmp[i] = vects[i]
        }
        for i, p := range pLost {
            if len(vects[p]) == 0 {
                vects[p] = make([]byte, size)
            }
            vtmp[i+d] = vects[p]
        }
        etmp := &encSSSE3{data: d, parity: pCnt, gen: g}
        err2 := etmp.encodeGen(vtmp)
        if err2 != nil {
            return err2
        }
    }
    return
}

func (e *encSSSE3) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
    d := e.data
    p := e.parity
    if len(has) != d {
        return errors.New("rs.Reconst: not enough vects")
    }
    dCnt := len(dLost)
    if dCnt > p {
        return errors.New("rs.Reconst: not enough vects")
    }
    pCnt := len(pLost)
    if pCnt > p {
        return errors.New("rs.Reconst: not enough vects")
    }
    return e.reconst(vects, has, dLost, pLost, dataOnly)
}

func (e *encSSSE3) reconstruct(vects [][]byte, dataOnly bool) (err error) {
    d := e.data
    p := e.parity
    t := d + p
    listBuf := make([]int, t+p)
    has := listBuf[:d]
    dLost := listBuf[d:t]
    pLost := listBuf[t : t+p]
    hasCnt, dCnt, pCnt := 0, 0, 0
    for i := 0; i < t; i++ {
        if vects[i] != nil {
            if hasCnt < d {
                has[hasCnt] = i
                hasCnt++
            }
        } else {
            if i < d {
                if dCnt < p {
                    dLost[dCnt] = i
                    dCnt++
                } else {
                    return errors.New("rs.Reconst: not enough vects")
                }
            } else {
                if pCnt < p {
                    pLost[pCnt] = i
                    pCnt++
                } else {
                    return errors.New("rs.Reconst: not enough vects")
                }
            }
        }
    }
    if hasCnt != d {
        return errors.New("rs.Reconst: not enough vects")
    }
    dLost = dLost[:dCnt]
    pLost = pLost[:pCnt]
    return e.reconst(vects, has, dLost, pLost, dataOnly)
}
401
vendor/github.com/templexxx/reedsolomon/rs_amd64.s
generated
vendored
Normal file
401
vendor/github.com/templexxx/reedsolomon/rs_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,401 @@
|
||||
// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf

#include "textflag.h"

#define low_tbl  Y0
#define high_tbl Y1
#define mask     Y2
#define in0      Y3
#define in1      Y4
#define in2      Y5
#define in3      Y6
#define in4      Y7
#define in5      Y8
#define in0_h    Y10
#define in1_h    Y11
#define in2_h    Y12
#define in3_h    Y13
#define in4_h    Y14
#define in5_h    Y15

#define in  BX
#define out DI
#define len R8
#define pos R9

#define tmp0 R10

#define low_tblx  X0
#define high_tblx X1
#define maskx     X2
#define in0x      X3
#define in0_hx    X10
#define tmp0x     X9
#define tmp1x     X11
#define tmp2x     X12
#define tmp3x     X13


// func mulVectAVX2(tbl, d, p []byte)
TEXT ·mulVectAVX2(SB), NOSPLIT, $0
	MOVQ    i+24(FP), in
	MOVQ    o+48(FP), out
	MOVQ    tbl+0(FP), tmp0
	VMOVDQU (tmp0), low_tblx
	VMOVDQU 16(tmp0), high_tblx
	MOVB    $0x0f, DX
	LONG    $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
	VPBROADCASTB maskx, maskx
	MOVQ    in_len+32(FP), len
	TESTQ   $31, len
	JNZ     one16b

ymm:
	VINSERTI128 $1, low_tblx, low_tbl, low_tbl
	VINSERTI128 $1, high_tblx, high_tbl, high_tbl
	VINSERTI128 $1, maskx, mask, mask
	TESTQ       $255, len
	JNZ         not_aligned

// 256bytes/loop
aligned:
	MOVQ $0, pos

loop256b:
	VMOVDQU (in)(pos*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VMOVDQU in0, (out)(pos*1)

	VMOVDQU 32(in)(pos*1), in1
	VPSRLQ  $4, in1, in1_h
	VPAND   mask, in1_h, in1_h
	VPAND   mask, in1, in1
	VPSHUFB in1_h, high_tbl, in1_h
	VPSHUFB in1, low_tbl, in1
	VPXOR   in1, in1_h, in1
	VMOVDQU in1, 32(out)(pos*1)

	VMOVDQU 64(in)(pos*1), in2
	VPSRLQ  $4, in2, in2_h
	VPAND   mask, in2_h, in2_h
	VPAND   mask, in2, in2
	VPSHUFB in2_h, high_tbl, in2_h
	VPSHUFB in2, low_tbl, in2
	VPXOR   in2, in2_h, in2
	VMOVDQU in2, 64(out)(pos*1)

	VMOVDQU 96(in)(pos*1), in3
	VPSRLQ  $4, in3, in3_h
	VPAND   mask, in3_h, in3_h
	VPAND   mask, in3, in3
	VPSHUFB in3_h, high_tbl, in3_h
	VPSHUFB in3, low_tbl, in3
	VPXOR   in3, in3_h, in3
	VMOVDQU in3, 96(out)(pos*1)

	VMOVDQU 128(in)(pos*1), in4
	VPSRLQ  $4, in4, in4_h
	VPAND   mask, in4_h, in4_h
	VPAND   mask, in4, in4
	VPSHUFB in4_h, high_tbl, in4_h
	VPSHUFB in4, low_tbl, in4
	VPXOR   in4, in4_h, in4
	VMOVDQU in4, 128(out)(pos*1)

	VMOVDQU 160(in)(pos*1), in5
	VPSRLQ  $4, in5, in5_h
	VPAND   mask, in5_h, in5_h
	VPAND   mask, in5, in5
	VPSHUFB in5_h, high_tbl, in5_h
	VPSHUFB in5, low_tbl, in5
	VPXOR   in5, in5_h, in5
	VMOVDQU in5, 160(out)(pos*1)

	VMOVDQU 192(in)(pos*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VMOVDQU in0, 192(out)(pos*1)

	VMOVDQU 224(in)(pos*1), in1
	VPSRLQ  $4, in1, in1_h
	VPAND   mask, in1_h, in1_h
	VPAND   mask, in1, in1
	VPSHUFB in1_h, high_tbl, in1_h
	VPSHUFB in1, low_tbl, in1
	VPXOR   in1, in1_h, in1
	VMOVDQU in1, 224(out)(pos*1)

	ADDQ $256, pos
	CMPQ len, pos
	JNE  loop256b
	VZEROUPPER
	RET

not_aligned:
	MOVQ len, tmp0
	ANDQ $255, tmp0

loop32b:
	VMOVDQU -32(in)(len*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VMOVDQU in0, -32(out)(len*1)
	SUBQ    $32, len
	SUBQ    $32, tmp0
	JG      loop32b
	CMPQ    len, $256
	JGE     aligned
	VZEROUPPER
	RET

one16b:
	VMOVDQU -16(in)(len*1), in0x
	VPSRLQ  $4, in0x, in0_hx
	VPAND   maskx, in0x, in0x
	VPAND   maskx, in0_hx, in0_hx
	VPSHUFB in0_hx, high_tblx, in0_hx
	VPSHUFB in0x, low_tblx, in0x
	VPXOR   in0x, in0_hx, in0x
	VMOVDQU in0x, -16(out)(len*1)
	SUBQ    $16, len
	CMPQ    len, $0
	JNE     ymm
	RET

// func mulVectAddAVX2(tbl, d, p []byte)
TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
	MOVQ    i+24(FP), in
	MOVQ    o+48(FP), out
	MOVQ    tbl+0(FP), tmp0
	VMOVDQU (tmp0), low_tblx
	VMOVDQU 16(tmp0), high_tblx
	MOVB    $0x0f, DX
	LONG    $0x2069e3c4; WORD $0x00d2
	VPBROADCASTB maskx, maskx
	MOVQ    in_len+32(FP), len
	TESTQ   $31, len
	JNZ     one16b

ymm:
	VINSERTI128 $1, low_tblx, low_tbl, low_tbl
	VINSERTI128 $1, high_tblx, high_tbl, high_tbl
	VINSERTI128 $1, maskx, mask, mask
	TESTQ       $255, len
	JNZ         not_aligned

aligned:
	MOVQ $0, pos

loop256b:
	VMOVDQU (in)(pos*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VPXOR   (out)(pos*1), in0, in0
	VMOVDQU in0, (out)(pos*1)

	VMOVDQU 32(in)(pos*1), in1
	VPSRLQ  $4, in1, in1_h
	VPAND   mask, in1_h, in1_h
	VPAND   mask, in1, in1
	VPSHUFB in1_h, high_tbl, in1_h
	VPSHUFB in1, low_tbl, in1
	VPXOR   in1, in1_h, in1
	VPXOR   32(out)(pos*1), in1, in1
	VMOVDQU in1, 32(out)(pos*1)

	VMOVDQU 64(in)(pos*1), in2
	VPSRLQ  $4, in2, in2_h
	VPAND   mask, in2_h, in2_h
	VPAND   mask, in2, in2
	VPSHUFB in2_h, high_tbl, in2_h
	VPSHUFB in2, low_tbl, in2
	VPXOR   in2, in2_h, in2
	VPXOR   64(out)(pos*1), in2, in2
	VMOVDQU in2, 64(out)(pos*1)

	VMOVDQU 96(in)(pos*1), in3
	VPSRLQ  $4, in3, in3_h
	VPAND   mask, in3_h, in3_h
	VPAND   mask, in3, in3
	VPSHUFB in3_h, high_tbl, in3_h
	VPSHUFB in3, low_tbl, in3
	VPXOR   in3, in3_h, in3
	VPXOR   96(out)(pos*1), in3, in3
	VMOVDQU in3, 96(out)(pos*1)

	VMOVDQU 128(in)(pos*1), in4
	VPSRLQ  $4, in4, in4_h
	VPAND   mask, in4_h, in4_h
	VPAND   mask, in4, in4
	VPSHUFB in4_h, high_tbl, in4_h
	VPSHUFB in4, low_tbl, in4
	VPXOR   in4, in4_h, in4
	VPXOR   128(out)(pos*1), in4, in4
	VMOVDQU in4, 128(out)(pos*1)

	VMOVDQU 160(in)(pos*1), in5
	VPSRLQ  $4, in5, in5_h
	VPAND   mask, in5_h, in5_h
	VPAND   mask, in5, in5
	VPSHUFB in5_h, high_tbl, in5_h
	VPSHUFB in5, low_tbl, in5
	VPXOR   in5, in5_h, in5
	VPXOR   160(out)(pos*1), in5, in5
	VMOVDQU in5, 160(out)(pos*1)

	VMOVDQU 192(in)(pos*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VPXOR   192(out)(pos*1), in0, in0
	VMOVDQU in0, 192(out)(pos*1)

	VMOVDQU 224(in)(pos*1), in1
	VPSRLQ  $4, in1, in1_h
	VPAND   mask, in1_h, in1_h
	VPAND   mask, in1, in1
	VPSHUFB in1_h, high_tbl, in1_h
	VPSHUFB in1, low_tbl, in1
	VPXOR   in1, in1_h, in1
	VPXOR   224(out)(pos*1), in1, in1
	VMOVDQU in1, 224(out)(pos*1)

	ADDQ $256, pos
	CMPQ len, pos
	JNE  loop256b
	VZEROUPPER
	RET

not_aligned:
	MOVQ len, tmp0
	ANDQ $255, tmp0

loop32b:
	VMOVDQU -32(in)(len*1), in0
	VPSRLQ  $4, in0, in0_h
	VPAND   mask, in0_h, in0_h
	VPAND   mask, in0, in0
	VPSHUFB in0_h, high_tbl, in0_h
	VPSHUFB in0, low_tbl, in0
	VPXOR   in0, in0_h, in0
	VPXOR   -32(out)(len*1), in0, in0
	VMOVDQU in0, -32(out)(len*1)
	SUBQ    $32, len
	SUBQ    $32, tmp0
	JG      loop32b
	CMPQ    len, $256
	JGE     aligned
	VZEROUPPER
	RET

one16b:
	VMOVDQU -16(in)(len*1), in0x
	VPSRLQ  $4, in0x, in0_hx
	VPAND   maskx, in0x, in0x
	VPAND   maskx, in0_hx, in0_hx
	VPSHUFB in0_hx, high_tblx, in0_hx
	VPSHUFB in0x, low_tblx, in0x
	VPXOR   in0x, in0_hx, in0x
	VPXOR   -16(out)(len*1), in0x, in0x
	VMOVDQU in0x, -16(out)(len*1)
	SUBQ    $16, len
	CMPQ    len, $0
	JNE     ymm
	RET

// func mulVectSSSE3(tbl, d, p []byte)
TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
	MOVQ   i+24(FP), in
	MOVQ   o+48(FP), out
	MOVQ   tbl+0(FP), tmp0
	MOVOU  (tmp0), low_tblx
	MOVOU  16(tmp0), high_tblx
	MOVB   $15, tmp0
	MOVQ   tmp0, maskx
	PXOR   tmp0x, tmp0x
	PSHUFB tmp0x, maskx
	MOVQ   in_len+32(FP), len
	SHRQ   $4, len

loop:
	MOVOU  (in), in0x
	MOVOU  in0x, in0_hx
	PSRLQ  $4, in0_hx
	PAND   maskx, in0x
	PAND   maskx, in0_hx
	MOVOU  low_tblx, tmp1x
	MOVOU  high_tblx, tmp2x
	PSHUFB in0x, tmp1x
	PSHUFB in0_hx, tmp2x
	PXOR   tmp1x, tmp2x
	MOVOU  tmp2x, (out)
	ADDQ   $16, in
	ADDQ   $16, out
	SUBQ   $1, len
	JNZ    loop
	RET

// func mulVectAddSSSE3(tbl, d, p []byte)
TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
	MOVQ   i+24(FP), in
	MOVQ   o+48(FP), out
	MOVQ   tbl+0(FP), tmp0
	MOVOU  (tmp0), low_tblx
	MOVOU  16(tmp0), high_tblx
	MOVB   $15, tmp0
	MOVQ   tmp0, maskx
	PXOR   tmp0x, tmp0x
	PSHUFB tmp0x, maskx
	MOVQ   in_len+32(FP), len
	SHRQ   $4, len

loop:
	MOVOU  (in), in0x
	MOVOU  in0x, in0_hx
	PSRLQ  $4, in0_hx
	PAND   maskx, in0x
	PAND   maskx, in0_hx
	MOVOU  low_tblx, tmp1x
	MOVOU  high_tblx, tmp2x
	PSHUFB in0x, tmp1x
	PSHUFB in0_hx, tmp2x
	PXOR   tmp1x, tmp2x
	MOVOU  (out), tmp3x
	PXOR   tmp3x, tmp2x
	MOVOU  tmp2x, (out)
	ADDQ   $16, in
	ADDQ   $16, out
	SUBQ   $1, len
	JNZ    loop
	RET

// func copy32B(dst, src []byte)
TEXT ·copy32B(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), SI
	MOVQ  src+24(FP), DX
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU X0, (SI)
	MOVOU X1, 16(SI)
	RET

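For orientation (this note is not part of the vendored file): the mulVect* routines above vectorize a split-table GF(2^8) multiply with PSHUFB. tbl packs two 16-entry nibble tables, and each product byte is the XOR of a low-nibble and a high-nibble lookup. A scalar Go sketch, with hypothetical names:

	// mulVect is a scalar sketch of what mulVectAVX2/mulVectSSSE3 compute:
	// p[i] = low[d[i] & 0x0f] ^ high[d[i] >> 4]
	func mulVect(tbl [32]byte, d, p []byte) {
		low, high := tbl[:16], tbl[16:]
		for i, b := range d {
			p[i] = low[b&0x0f] ^ high[b>>4]
		}
	}

The mulVectAdd* variants XOR the product into the existing parity bytes (p[i] ^= ...) instead of overwriting them, which is what the extra VPXOR against (out) implements.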
8
vendor/github.com/templexxx/reedsolomon/rs_other.go
generated
vendored
Normal file
@@ -0,0 +1,8 @@
// +build !amd64

package reedsolomon

func newRS(d, p int, em matrix) (enc Encoder) {
	g := em[d*d:]
	return &encBase{data: d, parity: p, encode: em, gen: g}
}
44
vendor/github.com/templexxx/reedsolomon/tbl.go
generated
vendored
Normal file
File diff suppressed because one or more lines are too long
1
vendor/github.com/templexxx/xor/.gitattributes
generated
vendored
Normal file
@@ -0,0 +1 @@
*.s linguist-language=go
18
vendor/github.com/templexxx/xor/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,18 @@
# Binaries for programs and plugins
*.exe
*.dll
*.so
*.dylib

# Test binary, build with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/
/backup/
/backup2/
/.idea
/backup3/
21
vendor/github.com/templexxx/xor/LICENSE
generated
vendored
Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2017 Temple3x

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
48
vendor/github.com/templexxx/xor/README.md
generated
vendored
Normal file
@@ -0,0 +1,48 @@
# XOR

XOR code engine in pure Go

More than 10 GB/s per core

## Introduction

1. Uses SIMD (SSE2 or AVX2) for speed
2. ...

## Installation
To get the package, use the standard:
```bash
go get github.com/templexxx/xor
```

## Documentation

See the associated [GoDoc](http://godoc.org/github.com/templexxx/xor)


## Performance

Performance depends mainly on:

1. the SIMD extension
2. the unit size of each worker
3. hardware (CPU, RAM, etc.)

Example of performance on my mid-2014 MacBook (i5-4278U 2.6GHz, 2 physical cores), with shard sizes from 1KB to 16MB:
```
speed = ( shards * size ) / cost
```
| data_shards | shard_size | speed (MB/s) |
|-------------|------------|--------------|
| 2           | 1KB        | 64127.95     |
| 2           | 1400B      | 59657.55     |
| 2           | 16KB       | 35370.84     |
| 2           | 16MB       | 12128.95     |
| 5           | 1KB        | 78837.33     |
| 5           | 1400B      | 58054.89     |
| 5           | 16KB       | 50161.19     |
| 5           | 16MB       | 12750.41     |

## Who is using this?

1. https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang
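A minimal usage sketch (based on the exported API in xor.go below; shard allocation and length checks are up to the caller):

```go
package main

import (
	"fmt"

	"github.com/templexxx/xor"
)

func main() {
	a := []byte{1, 2, 3, 4}
	b := []byte{5, 6, 7, 8}
	dst := make([]byte, 4)

	// dst[i] = a[i] ^ b[i]; the shortest slice bounds the xor size.
	xor.Bytes(dst, a, b)
	fmt.Println(dst) // [4 4 4 12]

	// XOR several equal-length shards into dst at once.
	shards := [][]byte{a, b, {9, 9, 9, 9}}
	xor.Matrix(dst, shards)
	fmt.Println(dst) // [13 13 13 5]
}
```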
438
vendor/github.com/templexxx/xor/avx2_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,438 @@
#include "textflag.h"

// addr of mem
#define DST  BX
#define SRC  SI
#define SRC0 TMP4
#define SRC1 TMP5

// loop args
// num of vect
#define VECT CX
#define LEN  DX
// pos of matrix
#define POS R8

// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14

// func bytesAVX2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $31, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop32b:
	VMOVDQU (SRC0)(POS*1), Y0
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VMOVDQU Y0, (DST)(POS*1)
	ADDQ    $32, POS
	CMPQ    LEN, POS
	JNE     loop32b
	VZEROUPPER
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $31, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $31, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $32
	JGE  aligned
	RET

ret:
	RET

// func bytesAVX2small(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2small(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop128b:
	VMOVDQU (SRC0)(POS*1), Y0
	VMOVDQU 32(SRC0)(POS*1), Y1
	VMOVDQU 64(SRC0)(POS*1), Y2
	VMOVDQU 96(SRC0)(POS*1), Y3
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VPXOR   32(SRC1)(POS*1), Y1, Y1
	VPXOR   64(SRC1)(POS*1), Y2, Y2
	VPXOR   96(SRC1)(POS*1), Y3, Y3
	VMOVDQU Y0, (DST)(POS*1)
	VMOVDQU Y1, 32(DST)(POS*1)
	VMOVDQU Y2, 64(DST)(POS*1)
	VMOVDQU Y3, 96(DST)(POS*1)

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	VZEROUPPER
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $127, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET

// func bytesAVX2big(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2big(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop128b:
	VMOVDQU (SRC0)(POS*1), Y0
	VMOVDQU 32(SRC0)(POS*1), Y1
	VMOVDQU 64(SRC0)(POS*1), Y2
	VMOVDQU 96(SRC0)(POS*1), Y3
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VPXOR   32(SRC1)(POS*1), Y1, Y1
	VPXOR   64(SRC1)(POS*1), Y2, Y2
	VPXOR   96(SRC1)(POS*1), Y3, Y3
	LONG    $0xe77da1c4; WORD $0x0304
	LONG    $0xe77da1c4; WORD $0x034c; BYTE $0x20
	LONG    $0xe77da1c4; WORD $0x0354; BYTE $0x40
	LONG    $0xe77da1c4; WORD $0x035c; BYTE $0x60

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	SFENCE
	VZEROUPPER
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $127, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET

// func matrixAVX2small(dst []byte, src [][]byte)
TEXT ·matrixAVX2small(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  vec+32(FP), VECT
	MOVQ  len+8(FP), LEN
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop128b:
	MOVQ    VECT, TMP1
	SUBQ    $2, TMP1
	MOVQ    $0, TMP2
	MOVQ    (SRC)(TMP2*1), TMP3
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y0
	VMOVDQU 32(TMP4)(POS*1), Y1
	VMOVDQU 64(TMP3)(POS*1), Y2
	VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
	ADDQ    $24, TMP2
	MOVQ    (SRC)(TMP2*1), TMP3
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y4
	VMOVDQU 32(TMP4)(POS*1), Y5
	VMOVDQU 64(TMP3)(POS*1), Y6
	VMOVDQU 96(TMP4)(POS*1), Y7
	VPXOR   Y4, Y0, Y0
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	SUBQ    $1, TMP1
	JGE     next_vect

	VMOVDQU Y0, (DST)(POS*1)
	VMOVDQU Y1, 32(DST)(POS*1)
	VMOVDQU Y2, 64(DST)(POS*1)
	VMOVDQU Y3, 96(DST)(POS*1)

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	VZEROUPPER
	RET

loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVB -1(TMP3)(LEN*1), TMP6
	XORB TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_1b

	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b

	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $127, TMP4

loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b

	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET

// func matrixAVX2big(dst []byte, src [][]byte)
TEXT ·matrixAVX2big(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  vec+32(FP), VECT
	MOVQ  len+8(FP), LEN
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop128b:
	MOVQ    VECT, TMP1
	SUBQ    $2, TMP1
	MOVQ    $0, TMP2
	MOVQ    (SRC)(TMP2*1), TMP3
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y0
	VMOVDQU 32(TMP4)(POS*1), Y1
	VMOVDQU 64(TMP3)(POS*1), Y2
	VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
	ADDQ    $24, TMP2
	MOVQ    (SRC)(TMP2*1), TMP3
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y4
	VMOVDQU 32(TMP4)(POS*1), Y5
	VMOVDQU 64(TMP3)(POS*1), Y6
	VMOVDQU 96(TMP4)(POS*1), Y7
	VPXOR   Y4, Y0, Y0
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	SUBQ    $1, TMP1
	JGE     next_vect

	LONG $0xe77da1c4; WORD $0x0304 // VMOVNTDQ go1.8 has
	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	VZEROUPPER
	RET

loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVB -1(TMP3)(LEN*1), TMP6
	XORB TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_1b

	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b

	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $127, TMP4

loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b

	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET

116
vendor/github.com/templexxx/xor/nosimd.go
generated
vendored
Normal file
@@ -0,0 +1,116 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package xor

import (
	"runtime"
	"unsafe"
)

const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"

// xor the bytes in a and b. The destination is assumed to have enough space.
func bytesNoSIMD(dst, a, b []byte, size int) {
	if supportsUnaligned {
		fastXORBytes(dst, a, b, size)
	} else {
		// TODO(hanwen): if (dst, a, b) have common alignment
		// we could still try fastXORBytes. It is not clear
		// how often this happens, and it's only worth it if
		// the block encryption itself is hardware
		// accelerated.
		safeXORBytes(dst, a, b, size)
	}
}

// split slice for cache-friendly
const unitSize = 16 * 1024

func matrixNoSIMD(dst []byte, src [][]byte) {
	size := len(src[0])
	start := 0
	do := unitSize
	for start < size {
		end := start + do
		if end <= size {
			partNoSIMD(start, end, dst, src)
			start = start + do
		} else {
			partNoSIMD(start, size, dst, src)
			start = size
		}
	}
}

// split vect will improve performance with big data by reducing cache pollution
func partNoSIMD(start, end int, dst []byte, src [][]byte) {
	bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start)
	for i := 2; i < len(src); i++ {
		bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start)
	}
}

// fastXORBytes xor in bulk. It only works on architectures that
// support unaligned read/writes.
func fastXORBytes(dst, a, b []byte, n int) {
	w := n / wordSize
	if w > 0 {
		wordBytes := w * wordSize
		fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes])
	}
	for i := n - n%wordSize; i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}
}

func safeXORBytes(dst, a, b []byte, n int) {
	ex := n % 8
	for i := 0; i < ex; i++ {
		dst[i] = a[i] ^ b[i]
	}

	for i := ex; i < n; i += 8 {
		_dst := dst[i : i+8]
		_a := a[i : i+8]
		_b := b[i : i+8]
		_dst[0] = _a[0] ^ _b[0]
		_dst[1] = _a[1] ^ _b[1]
		_dst[2] = _a[2] ^ _b[2]
		_dst[3] = _a[3] ^ _b[3]

		_dst[4] = _a[4] ^ _b[4]
		_dst[5] = _a[5] ^ _b[5]
		_dst[6] = _a[6] ^ _b[6]
		_dst[7] = _a[7] ^ _b[7]
	}
}

// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
// The arguments are assumed to be of equal length.
func fastXORWords(dst, a, b []byte) {
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	n := len(b) / wordSize
	ex := n % 8
	for i := 0; i < ex; i++ {
		dw[i] = aw[i] ^ bw[i]
	}

	for i := ex; i < n; i += 8 {
		_dw := dw[i : i+8]
		_aw := aw[i : i+8]
		_bw := bw[i : i+8]
		_dw[0] = _aw[0] ^ _bw[0]
		_dw[1] = _aw[1] ^ _bw[1]
		_dw[2] = _aw[2] ^ _bw[2]
		_dw[3] = _aw[3] ^ _bw[3]
		_dw[4] = _aw[4] ^ _bw[4]
		_dw[5] = _aw[5] ^ _bw[5]
		_dw[6] = _aw[6] ^ _bw[6]
		_dw[7] = _aw[7] ^ _bw[7]
	}
}
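A quick equivalence check for the file above (a sketch only, not part of the package; it would live in a hypothetical test file in package xor) confirming the word-at-a-time fast path agrees with the plain byte loop:

	package xor

	import (
		"bytes"
		"math/rand"
		"testing"
	)

	// TestFastMatchesSafe checks the unrolled word path against the byte loop.
	func TestFastMatchesSafe(t *testing.T) {
		a, b := make([]byte, 1000), make([]byte, 1000)
		rand.Read(a)
		rand.Read(b)
		want, got := make([]byte, 1000), make([]byte, 1000)
		safeXORBytes(want, a, b, 1000)
		fastXORBytes(got, a, b, 1000)
		if !bytes.Equal(got, want) {
			t.Fatal("fastXORBytes and safeXORBytes disagree")
		}
	}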
574
vendor/github.com/templexxx/xor/sse2_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,574 @@
#include "textflag.h"

// addr of mem
#define DST  BX
#define SRC  SI
#define SRC0 TMP4
#define SRC1 TMP5

// loop args
// num of vect
#define VECT CX
#define LEN  DX
// pos of matrix
#define POS R8

// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14

// func bytesSrc0(dst, src0, src1 []byte)
TEXT ·xorSrc0(SB), NOSPLIT, $0
	MOVQ  len+32(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $15, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop16b:
	MOVOU (SRC0)(POS*1), X0
	XORPD (SRC1)(POS*1), X0
	MOVOU X0, (DST)(POS*1)
	ADDQ  $16, POS
	CMPQ  LEN, POS
	JNE   loop16b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $15, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $15, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $16
	JGE  aligned
	RET

ret:
	RET

// func bytesSrc1(dst, src0, src1 []byte)
TEXT ·xorSrc1(SB), NOSPLIT, $0
	MOVQ  len+56(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $15, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop16b:
	MOVOU (SRC0)(POS*1), X0
	XORPD (SRC1)(POS*1), X0
	MOVOU X0, (DST)(POS*1)
	ADDQ  $16, POS
	CMPQ  LEN, POS
	JNE   loop16b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $15, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $15, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $16
	JGE  aligned
	RET

ret:
	RET

// func bytesSSE2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2mini(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $15, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop16b:
	MOVOU (SRC0)(POS*1), X0
	XORPD (SRC1)(POS*1), X0

	// MOVOU (SRC1)(POS*1), X4
	// PXOR  X4, X0
	MOVOU X0, (DST)(POS*1)
	ADDQ  $16, POS
	CMPQ  LEN, POS
	JNE   loop16b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $15, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $15, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $16
	JGE  aligned
	RET

ret:
	RET

// func bytesSSE2small(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2small(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $63, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop64b:
	MOVOU (SRC0)(POS*1), X0
	MOVOU 16(SRC0)(POS*1), X1
	MOVOU 32(SRC0)(POS*1), X2
	MOVOU 48(SRC0)(POS*1), X3

	MOVOU (SRC1)(POS*1), X4
	MOVOU 16(SRC1)(POS*1), X5
	MOVOU 32(SRC1)(POS*1), X6
	MOVOU 48(SRC1)(POS*1), X7

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	MOVOU X0, (DST)(POS*1)
	MOVOU X1, 16(DST)(POS*1)
	MOVOU X2, 32(DST)(POS*1)
	MOVOU X3, 48(DST)(POS*1)

	ADDQ $64, POS
	CMPQ LEN, POS
	JNE  loop64b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $63, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $63, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $64
	JGE  aligned
	RET

ret:
	RET

// func bytesSSE2big(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2big(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $63, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop64b:
	MOVOU (SRC0)(POS*1), X0
	MOVOU 16(SRC0)(POS*1), X1
	MOVOU 32(SRC0)(POS*1), X2
	MOVOU 48(SRC0)(POS*1), X3

	MOVOU (SRC1)(POS*1), X4
	MOVOU 16(SRC1)(POS*1), X5
	MOVOU 32(SRC1)(POS*1), X6
	MOVOU 48(SRC1)(POS*1), X7

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ
	LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
	LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
	LONG $0xe70f4266; WORD $0x035c; BYTE $0x30

	ADDQ $64, POS
	CMPQ LEN, POS
	JNE  loop64b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $63, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $63, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $64
	JGE  aligned
	RET

ret:
	RET

// func matrixSSE2small(dst []byte, src [][]byte)
TEXT ·matrixSSE2small(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  vec+32(FP), VECT
	MOVQ  len+8(FP), LEN
	TESTQ $63, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop64b:
	MOVQ  VECT, TMP1
	SUBQ  $2, TMP1
	MOVQ  $0, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVQ  TMP3, TMP4
	MOVOU (TMP3)(POS*1), X0
	MOVOU 16(TMP4)(POS*1), X1
	MOVOU 32(TMP3)(POS*1), X2
	MOVOU 48(TMP4)(POS*1), X3

next_vect:
	ADDQ  $24, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVQ  TMP3, TMP4
	MOVOU (TMP3)(POS*1), X4
	MOVOU 16(TMP4)(POS*1), X5
	MOVOU 32(TMP3)(POS*1), X6
	MOVOU 48(TMP4)(POS*1), X7
	PXOR  X4, X0
	PXOR  X5, X1
	PXOR  X6, X2
	PXOR  X7, X3
	SUBQ  $1, TMP1
	JGE   next_vect

	MOVOU X0, (DST)(POS*1)
	MOVOU X1, 16(DST)(POS*1)
	MOVOU X2, 32(DST)(POS*1)
	MOVOU X3, 48(DST)(POS*1)

	ADDQ $64, POS
	CMPQ LEN, POS
	JNE  loop64b
	RET

loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVB -1(TMP3)(LEN*1), TMP6
	XORB TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_1b

	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b

	CMPQ  LEN, $0
	JE    ret
	TESTQ $63, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $63, TMP4

loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b

	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b

	CMPQ LEN, $64
	JGE  aligned
	RET

ret:
	RET

// func matrixSSE2big(dst []byte, src [][]byte)
TEXT ·matrixSSE2big(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  vec+32(FP), VECT
	MOVQ  len+8(FP), LEN
	TESTQ $63, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop64b:
	MOVQ  VECT, TMP1
	SUBQ  $2, TMP1
	MOVQ  $0, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVQ  TMP3, TMP4
	MOVOU (TMP3)(POS*1), X0
	MOVOU 16(TMP4)(POS*1), X1
	MOVOU 32(TMP3)(POS*1), X2
	MOVOU 48(TMP4)(POS*1), X3

next_vect:
	ADDQ  $24, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVQ  TMP3, TMP4
	MOVOU (TMP3)(POS*1), X4
	MOVOU 16(TMP4)(POS*1), X5
	MOVOU 32(TMP3)(POS*1), X6
	MOVOU 48(TMP4)(POS*1), X7
	PXOR  X4, X0
	PXOR  X5, X1
	PXOR  X6, X2
	PXOR  X7, X3
	SUBQ  $1, TMP1
	JGE   next_vect

	LONG $0xe70f4266; WORD $0x0304
	LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
	LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
	LONG $0xe70f4266; WORD $0x035c; BYTE $0x30

	ADDQ $64, POS
	CMPQ LEN, POS
	JNE  loop64b
	RET

loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVB -1(TMP3)(LEN*1), TMP6
	XORB TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_1b

	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b

	CMPQ  LEN, $0
	JE    ret
	TESTQ $63, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $63, TMP4

loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b

	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b

	CMPQ LEN, $64
	JGE  aligned
	RET

ret:
	RET

TEXT ·hasSSE2(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX
	CPUID
	SHRQ $26, DX
	ANDQ $1, DX
	MOVB DX, ret+0(FP)
	RET

49
vendor/github.com/templexxx/xor/xor.go
generated
vendored
Normal file
@@ -0,0 +1,49 @@
package xor

// SIMD Extensions
const (
	none = iota
	avx2
	// SSE2 was first introduced by Intel with the initial version of the Pentium 4 in 2001,
	// so we can safely assume all amd64 CPUs have it
	sse2
)

var extension = none

// Bytes : chooses the shortest slice as the xor size;
// it's better to use it for big data ( > 64 bytes )
func Bytes(dst, src0, src1 []byte) {
	size := len(dst)
	if size > len(src0) {
		size = len(src0)
	}
	if size > len(src1) {
		size = len(src1)
	}
	xorBytes(dst, src0, src1, size)
}

// BytesSameLen : all slices' lengths must be equal;
// cutting the size branch saves time for small data
func BytesSameLen(dst, src0, src1 []byte) {
	xorSrc1(dst, src0, src1)
}

// BytesSrc0 : len(src1) >= len(src0), len(dst) >= len(src0);
// xors len(src0) bytes
func BytesSrc0(dst, src0, src1 []byte) {
	xorSrc0(dst, src0, src1)
}

// BytesSrc1 : len(src0) >= len(src1), len(dst) >= len(src1);
// xors len(src1) bytes
func BytesSrc1(dst, src0, src1 []byte) {
	xorSrc1(dst, src0, src1)
}

// Matrix : all slices' lengths must be equal && != 0;
// len(src) must be >= 2
func Matrix(dst []byte, src [][]byte) {
	xorMatrix(dst, src)
}
120
vendor/github.com/templexxx/xor/xor_amd64.go
generated
vendored
Normal file
@@ -0,0 +1,120 @@
package xor

import "github.com/templexxx/cpufeat"

func init() {
	getEXT()
}

func getEXT() {
	if cpufeat.X86.HasAVX2 {
		extension = avx2
	} else {
		extension = sse2
	}
	return
}

func xorBytes(dst, src0, src1 []byte, size int) {
	switch extension {
	case avx2:
		bytesAVX2(dst, src0, src1, size)
	default:
		bytesSSE2(dst, src0, src1, size)
	}
}

// non-temporal hint store
const nontmp = 8 * 1024
const avx2loopsize = 128

func bytesAVX2(dst, src0, src1 []byte, size int) {
	if size < avx2loopsize {
		bytesAVX2mini(dst, src0, src1, size)
	} else if size >= avx2loopsize && size <= nontmp {
		bytesAVX2small(dst, src0, src1, size)
	} else {
		bytesAVX2big(dst, src0, src1, size)
	}
}

const sse2loopsize = 64

func bytesSSE2(dst, src0, src1 []byte, size int) {
	if size < sse2loopsize {
		bytesSSE2mini(dst, src0, src1, size)
	} else if size >= sse2loopsize && size <= nontmp {
		bytesSSE2small(dst, src0, src1, size)
	} else {
		bytesSSE2big(dst, src0, src1, size)
	}
}

func xorMatrix(dst []byte, src [][]byte) {
	switch extension {
	case avx2:
		matrixAVX2(dst, src)
	default:
		matrixSSE2(dst, src)
	}
}

func matrixAVX2(dst []byte, src [][]byte) {
	size := len(dst)
	if size > nontmp {
		matrixAVX2big(dst, src)
	} else {
		matrixAVX2small(dst, src)
	}
}

func matrixSSE2(dst []byte, src [][]byte) {
	size := len(dst)
	if size > nontmp {
		matrixSSE2big(dst, src)
	} else {
		matrixSSE2small(dst, src)
	}
}

//go:noescape
func xorSrc0(dst, src0, src1 []byte)

//go:noescape
func xorSrc1(dst, src0, src1 []byte)

//go:noescape
func bytesAVX2mini(dst, src0, src1 []byte, size int)

//go:noescape
func bytesAVX2big(dst, src0, src1 []byte, size int)

//go:noescape
func bytesAVX2small(dst, src0, src1 []byte, size int)

//go:noescape
func bytesSSE2mini(dst, src0, src1 []byte, size int)

//go:noescape
func bytesSSE2small(dst, src0, src1 []byte, size int)

//go:noescape
func bytesSSE2big(dst, src0, src1 []byte, size int)

//go:noescape
func matrixAVX2small(dst []byte, src [][]byte)

//go:noescape
func matrixAVX2big(dst []byte, src [][]byte)

//go:noescape
func matrixSSE2small(dst []byte, src [][]byte)

//go:noescape
func matrixSSE2big(dst []byte, src [][]byte)

//go:noescape
func hasAVX2() bool

//go:noescape
func hasSSE2() bool
19
vendor/github.com/templexxx/xor/xor_other.go
generated
vendored
Normal file
@@ -0,0 +1,19 @@
// +build !amd64 noasm

package xor

func xorBytes(dst, src0, src1 []byte, size int) {
	bytesNoSIMD(dst, src0, src1, size)
}

func xorMatrix(dst []byte, src [][]byte) {
	matrixNoSIMD(dst, src)
}

func xorSrc0(dst, src0, src1 []byte) {
	bytesNoSIMD(dst, src0, src1, len(src0))
}

func xorSrc1(dst, src0, src1 []byte) {
	bytesNoSIMD(dst, src0, src1, len(src1))
}