add packages

This commit is contained in:
fatedier
2017-10-24 22:53:20 +08:00
parent 0559865fe5
commit 9e0fd0c4ef
54 changed files with 7315 additions and 21 deletions

14
vendor/github.com/templexxx/cpufeat/.gitignore generated vendored Normal file

@@ -0,0 +1,14 @@
# Binaries for programs and plugins
*.exe
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/

27
vendor/github.com/templexxx/cpufeat/LICENSE generated vendored Normal file

@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

32
vendor/github.com/templexxx/cpufeat/cpu.go generated vendored Normal file

@@ -0,0 +1,32 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package cpu implements processor feature detection
// used by the Go standard library.
package cpufeat
var X86 x86
// The booleans in x86 contain the correspondingly named cpuid feature bit.
// HasAVX and HasAVX2 are only set if the OS does support XMM and YMM registers
// in addition to the cpuid feature bit being set.
// The struct is padded to avoid false sharing.
type x86 struct {
_ [CacheLineSize]byte
HasAES bool
HasAVX bool
HasAVX2 bool
HasBMI1 bool
HasBMI2 bool
HasERMS bool
HasOSXSAVE bool
HasPCLMULQDQ bool
HasPOPCNT bool
HasSSE2 bool
HasSSE3 bool
HasSSSE3 bool
HasSSE41 bool
HasSSE42 bool
_ [CacheLineSize]byte
}

7
vendor/github.com/templexxx/cpufeat/cpu_arm.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_arm64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_mips.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_mips64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_mips64le.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_mipsle.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpufeat/cpu_ppc64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 128

7
vendor/github.com/templexxx/cpufeat/cpu_ppc64le.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 128

7
vendor/github.com/templexxx/cpufeat/cpu_s390x.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpufeat
const CacheLineSize = 256

59
vendor/github.com/templexxx/cpufeat/cpu_x86.go generated vendored Normal file

@@ -0,0 +1,59 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32
package cpufeat
const CacheLineSize = 64
// cpuid is implemented in cpu_x86.s.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
// xgetbv with ecx = 0 is implemented in cpu_x86.s.
func xgetbv() (eax, edx uint32)
func init() {
maxId, _, _, _ := cpuid(0, 0)
if maxId < 1 {
return
}
_, _, ecx1, edx1 := cpuid(1, 0)
X86.HasSSE2 = isSet(26, edx1)
X86.HasSSE3 = isSet(0, ecx1)
X86.HasPCLMULQDQ = isSet(1, ecx1)
X86.HasSSSE3 = isSet(9, ecx1)
X86.HasSSE41 = isSet(19, ecx1)
X86.HasSSE42 = isSet(20, ecx1)
X86.HasPOPCNT = isSet(23, ecx1)
X86.HasAES = isSet(25, ecx1)
X86.HasOSXSAVE = isSet(27, ecx1)
osSupportsAVX := false
// For XGETBV, OSXSAVE bit is required and sufficient.
if X86.HasOSXSAVE {
eax, _ := xgetbv()
// Check if XMM and YMM registers have OS support.
osSupportsAVX = isSet(1, eax) && isSet(2, eax)
}
X86.HasAVX = isSet(28, ecx1) && osSupportsAVX
if maxId < 7 {
return
}
_, ebx7, _, _ := cpuid(7, 0)
X86.HasBMI1 = isSet(3, ebx7)
X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX
X86.HasBMI2 = isSet(8, ebx7)
X86.HasERMS = isSet(9, ebx7)
}
func isSet(bitpos uint, value uint32) bool {
return value&(1<<bitpos) != 0
}

32
vendor/github.com/templexxx/cpufeat/cpu_x86.s generated vendored Normal file

@@ -0,0 +1,32 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32
#include "textflag.h"
// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
TEXT ·cpuid(SB), NOSPLIT, $0-24
MOVL eaxArg+0(FP), AX
MOVL ecxArg+4(FP), CX
CPUID
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func xgetbv() (eax, edx uint32)
TEXT ·xgetbv(SB),NOSPLIT,$0-8
#ifdef GOOS_nacl
// nacl does not support XGETBV.
MOVL $0, eax+0(FP)
MOVL $0, edx+4(FP)
#else
MOVL $0, CX
WORD $0x010f; BYTE $0xd0 //XGETBV
MOVL AX, eax+0(FP)
MOVL DX, edx+4(FP)
#endif
RET

40
vendor/github.com/templexxx/reedsolomon/.gitignore generated vendored Normal file

@@ -0,0 +1,40 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
/.idea
/backup
/loopunroll/
cpu.out
mathtool/galois/
mathtool/matrix/
mem.out
/examples/
/.DS_Store
/mathtool/cntinverse
/invert
/bakcup
/buf.svg
*.svg
*.out
/escape

9
vendor/github.com/templexxx/reedsolomon/.travis.yml generated vendored Normal file

@@ -0,0 +1,9 @@
language: go
go:
- 1.9
install:
- go get github.com/templexxx/reedsolomon
script:
- go test -v

23
vendor/github.com/templexxx/reedsolomon/LICENSE generated vendored Normal file

@@ -0,0 +1,23 @@
MIT License
Copyright (c) 2017 Templexxx
Copyright (c) 2015 Klaus Post
Copyright (c) 2015 Backblaze
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

109
vendor/github.com/templexxx/reedsolomon/README.md generated vendored Normal file

@@ -0,0 +1,109 @@
# Reed-Solomon
[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8]
[1]: https://godoc.org/github.com/templexxx/reedsolomon?status.svg
[2]: https://godoc.org/github.com/templexxx/reedsolomon
[3]: https://img.shields.io/badge/license-MIT-blue.svg
[4]: LICENSE
[5]: https://travis-ci.org/templexxx/reedsolomon.svg?branch=master
[6]: https://travis-ci.org/templexxx/reedsolomon
[7]: https://goreportcard.com/badge/github.com/templexxx/reedsolomon
[8]: https://goreportcard.com/report/github.com/templexxx/reedsolomon
## Introduction:
1. Reed-Solomon Erasure Code engine in pure Go.
2. Super Fast: more than 10GB/s per physical core (10+4, 4KB per vector, MacBook Pro 2.8 GHz Intel Core i7)
## Installation
To get the package use the standard:
```bash
go get github.com/templexxx/reedsolomon
```
## Documentation
See the associated [GoDoc](http://godoc.org/github.com/templexxx/reedsolomon)
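A minimal usage sketch, based on the Encoder interface in rs.go (the shard counts and sizes are just examples; error handling is kept short):

```go
package main

import (
	"fmt"

	"github.com/templexxx/reedsolomon"
)

func main() {
	data, parity := 10, 4
	enc, err := reedsolomon.New(data, parity) // Vandermonde encoding matrix
	if err != nil {
		panic(err)
	}

	// vects holds data shards followed by parity shards, all the same size.
	vects := make([][]byte, data+parity)
	for i := range vects {
		vects[i] = make([]byte, 4096)
	}
	vects[0][0] = 42 // pretend the data shards hold something useful

	if err := enc.Encode(vects); err != nil {
		panic(err)
	}

	// Simulate losing one data shard and one parity shard, then repair them.
	vects[0], vects[data] = nil, nil
	if err := enc.Reconstruct(vects); err != nil {
		panic(err)
	}
	fmt.Println("recovered byte:", vects[0][0]) // 42 again
}
```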
## Specification
### GOARCH
1. All architectures are supported
2. 0.1.0 needs Go 1.9 (it uses sync.Map) on AMD64
### Math
1. Coding is done over GF(2^8)
2. Primitive Polynomial: x^8 + x^4 + x^3 + x^2 + 1 (0x1d); a minimal bit-by-bit multiply in this field is sketched after this list
3. mathtool/gentbls.go : generates the primitive polynomial's log table, exp table, multiply table, inverse table etc., and shows how the galois field works
4. mathtool/cntinverse.go : calculates how many inverse matrices there are for a given RS code config
5. Both Cauchy and Vandermonde matrices are supported. Vandermonde needs more operations to preserve the property that any square subset of rows is invertible
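For reference, here is a minimal, non-SIMD sketch of multiplication in this field, reducing by the primitive polynomial above. It is illustrative only; the engine itself uses the precomputed tables in tbl.go.

```go
// gfMul8 multiplies a and b in GF(2^8) with the primitive polynomial
// x^8 + x^4 + x^3 + x^2 + 1 (the 0x1d reduction constant above).
// Illustrative sketch only; the package uses lookup tables instead.
func gfMul8(a, b byte) byte {
	var r byte
	for b != 0 {
		if b&1 != 0 {
			r ^= a // add (XOR) a when the low bit of b is set
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1d // reduce: x^8 ≡ x^4 + x^3 + x^2 + 1
		}
		b >>= 1
	}
	return r
}
```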
### Why so fast?
These three parts cost the most time:
1. looking up galois-field tables
2. reading/writing memory
3. calculating the inverse matrix during reconstruction
SIMD solves no.1.
Cache-friendly code helps with no.2 & no.3; in addition, a sync.Map caches inverse matrices, which saves about 1000ns whenever the same matrix is needed again.
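A minimal sketch of the cache-key idea used by makeGen in rs_amd64.go: the surviving vect positions are folded into one small integer key, which is why data+parity is limited to 32 there. The helper name below is made up for illustration.

```go
// inverseKey builds a compact cache key from the surviving vect positions.
// Positions are distinct, so setting bits is equivalent to the sum used in makeGen.
func inverseKey(has []int) uint32 {
	var key uint32
	for _, p := range has {
		key |= 1 << uint(p) // one bit per surviving position (requires positions < 32)
	}
	return key
}
```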
## Performance
Performance depends mainly on:
1. CPU instruction extension (AVX2, SSSE3 or none)
2. number of data/parity vects
3. unit size of calculation (see rs_amd64.go)
4. size of shards
5. speed of memory (much of the time goes to memory reads/writes, :D)
6. performance of CPU
7. the way it is used (reusing memory)
Also note that benchmark results are quite different from encoding/decoding in practice:
in benchmark loops the CPU cache helps a lot, so in practice you must reuse memory to get performance as good as the benchmarks.
Example performance on my MacBook 2017 i7 2.8GHz, 10+4 (with 0.1.0).
### Encoding:
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 7655.02 |
| 4KB | 10551.37 |
| 64KB | 9297.25 |
| 1MB | 6829.89 |
| 16MB | 6312.83 |
### Reconstruct (set a vect to nil to mark it as needing repair):
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 4124.85 |
| 4KB | 5715.45 |
| 64KB | 6050.06 |
| 1MB | 5001.21 |
| 16MB | 5043.04 |
### ReconstructWithPos (use position lists to mark which vects need repair; reuses memory):
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 6170.24 |
| 4KB | 9444.86 |
| 64KB | 9311.30 |
| 1MB | 6781.06 |
| 16MB | 6285.34 |
**The reconstruct benchmarks here run with the inverse-matrix cache; without the cache, each call costs about 1000ns more.**
## Who is using this?
1. https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang
## Links & Thanks
* [Klauspost ReedSolomon](https://github.com/klauspost/reedsolomon)
* [intel ISA-L](https://github.com/01org/isa-l)
* [GF SIMD](http://www.ssrc.ucsc.edu/papers/plank-fast13.pdf)
* [asm2plan9s](https://github.com/fwessels/asm2plan9s)

156
vendor/github.com/templexxx/reedsolomon/matrix.go generated vendored Normal file

@@ -0,0 +1,156 @@
package reedsolomon
import "errors"
type matrix []byte
func genEncMatrixCauchy(d, p int) matrix {
t := d + p
m := make([]byte, t*d)
for i := 0; i < d; i++ {
m[i*d+i] = byte(1)
}
d2 := d * d
for i := d; i < t; i++ {
for j := 0; j < d; j++ {
d := i ^ j
a := inverseTbl[d]
m[d2] = byte(a)
d2++
}
}
return m
}
func gfExp(b byte, n int) byte {
if n == 0 {
return 1
}
if b == 0 {
return 0
}
a := logTbl[b]
ret := int(a) * n
for ret >= 255 {
ret -= 255
}
return byte(expTbl[ret])
}
func genVandMatrix(vm []byte, t, d int) {
for i := 0; i < t; i++ {
for j := 0; j < d; j++ {
vm[i*d+j] = gfExp(byte(i), j)
}
}
}
func (m matrix) mul(right matrix, rows, cols int, r []byte) {
for i := 0; i < rows; i++ {
for j := 0; j < cols; j++ {
var v byte
for k := 0; k < cols; k++ {
v ^= gfMul(m[i*cols+k], right[k*cols+j])
}
r[i*cols+j] = v
}
}
}
func genEncMatrixVand(d, p int) (matrix, error) {
t := d + p
buf := make([]byte, (2*t+4*d)*d)
vm := buf[:t*d]
genVandMatrix(vm, t, d)
top := buf[t*d : (t+d)*d]
copy(top, vm[:d*d])
raw := buf[(t+d)*d : (t+3*d)*d]
im := buf[(t+3*d)*d : (t+4*d)*d]
err := matrix(top).invert(raw, d, im)
if err != nil {
return nil, err
}
r := buf[(t+4*d)*d : (2*t+4*d)*d]
matrix(vm).mul(im, t, d, r)
return matrix(r), nil
}
// [I|m'] -> [m']
func (m matrix) subMatrix(n int, r []byte) {
for i := 0; i < n; i++ {
off := i * n
copy(r[off:off+n], m[2*off+n:2*(off+n)])
}
}
func (m matrix) invert(raw matrix, n int, im []byte) error {
// [m] -> [m|I]
for i := 0; i < n; i++ {
t := i * n
copy(raw[2*t:2*t+n], m[t:t+n])
raw[2*t+i+n] = byte(1)
}
err := gauss(raw, n)
if err != nil {
return err
}
raw.subMatrix(n, im)
return nil
}
func (m matrix) swap(i, j, n int) {
for k := 0; k < n; k++ {
m[i*n+k], m[j*n+k] = m[j*n+k], m[i*n+k]
}
}
func gfMul(a, b byte) byte {
return mulTbl[a][b]
}
var errSingular = errors.New("rs.invert: matrix is singular")
// [m|I] -> [I|m']
func gauss(m matrix, n int) error {
n2 := 2 * n
for i := 0; i < n; i++ {
if m[i*n2+i] == 0 {
for j := i + 1; j < n; j++ {
if m[j*n2+i] != 0 {
m.swap(i, j, n2)
break
}
}
}
if m[i*n2+i] == 0 {
return errSingular
}
if m[i*n2+i] != 1 {
d := m[i*n2+i]
scale := inverseTbl[d]
for c := 0; c < n2; c++ {
m[i*n2+c] = gfMul(m[i*n2+c], scale)
}
}
for j := i + 1; j < n; j++ {
if m[j*n2+i] != 0 {
scale := m[j*n2+i]
for c := 0; c < n2; c++ {
m[j*n2+c] ^= gfMul(scale, m[i*n2+c])
}
}
}
}
for k := 0; k < n; k++ {
for j := 0; j < k; j++ {
if m[j*n2+k] != 0 {
scale := m[j*n2+k]
for c := 0; c < n2; c++ {
m[j*n2+c] ^= gfMul(scale, m[k*n2+c])
}
}
}
}
return nil
}

280
vendor/github.com/templexxx/reedsolomon/rs.go generated vendored Normal file

@@ -0,0 +1,280 @@
/*
Reed-Solomon Codes over GF(2^8)
Primitive Polynomial: x^8+x^4+x^3+x^2+1
Galois Field arithmetic using Intel SIMD instructions (AVX2 or SSSE3)
*/
package reedsolomon
import "errors"
// Encoder implements Reed-Solomon encoding/reconstructing.
type Encoder interface {
// Encode multiplies the generator matrix with the data vects.
// len(vects) must equal the number of data+parity vects.
Encode(vects [][]byte) error
// Reconstruct repairs lost data & parity.
// Set a vect to nil if it is lost.
// The result is written back into the original position of vects:
// if you lost vects[0], after reconstructing its data will be back in vects[0].
Reconstruct(vects [][]byte) error
// ReconstructData repairs lost data only.
// Set a vect to nil if it is lost.
ReconstructData(vects [][]byte) error
// ReconstWithPos repairs lost data & parity given the positions of the surviving and lost vects.
// It saves bandwidth & disk I/O compared with Reconstruct when fewer than parity vects are lost.
// As with any erasure code, we must know which vects are broken, so such APIs are necessary.
// len(has) must equal the number of data vects.
// Example:
// in 3+2, the whole position list is [0,1,2,3,4];
// if vects[0] is lost, "has" could be [1,2,3] or [1,2,4] or ...
// (and you must be sure that vects[1], vects[2], vects[3] hold correct data if "has" is [1,2,3]);
// the "dLost" will be [0].
// ps:
// 1. the above lists are in increasing order (TODO: support out-of-order)
// 2. each vect must have the same length; don't set any of them nil,
// so we don't need to allocate new slices
ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error
// ReconstDataWithPos repairs lost data given the positions of the surviving and lost data vects.
// There is no need to append lost parity positions to the lost list.
ReconstDataWithPos(vects [][]byte, has, dLost []int) error
}
func checkCfg(d, p int) error {
if (d <= 0) || (p <= 0) {
return errors.New("rs.New: data or parity <= 0")
}
if d+p >= 256 {
return errors.New("rs.New: data+parity >= 256")
}
return nil
}
// New creates an Encoder (Vandermonde matrix as the encoding matrix)
func New(data, parity int) (enc Encoder, err error) {
err = checkCfg(data, parity)
if err != nil {
return
}
e, err := genEncMatrixVand(data, parity)
if err != nil {
return
}
return newRS(data, parity, e), nil
}
// NewCauchy creates an Encoder (Cauchy matrix as the generator matrix)
func NewCauchy(data, parity int) (enc Encoder, err error) {
err = checkCfg(data, parity)
if err != nil {
return
}
e := genEncMatrixCauchy(data, parity)
return newRS(data, parity, e), nil
}
type encBase struct {
data int
parity int
encode []byte
gen []byte
}
func checkEnc(d, p int, vs [][]byte) (size int, err error) {
total := len(vs)
if d+p != total {
err = errors.New("rs.checkER: vects not match rs args")
return
}
size = len(vs[0])
if size == 0 {
err = errors.New("rs.checkER: vects size = 0")
return
}
for i := 1; i < total; i++ {
if len(vs[i]) != size {
err = errors.New("rs.checkER: vects size mismatch")
return
}
}
return
}
func (e *encBase) Encode(vects [][]byte) (err error) {
d := e.data
p := e.parity
_, err = checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i], pv[j])
} else {
mulVect(g[j*d], dv[0], pv[j])
}
}
}
return
}
func mulVect(c byte, a, b []byte) {
t := mulTbl[c]
for i := 0; i < len(a); i++ {
b[i] = t[a[i]]
}
}
func mulVectAdd(c byte, a, b []byte) {
t := mulTbl[c]
for i := 0; i < len(a); i++ {
b[i] ^= t[a[i]]
}
}
func (e *encBase) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
func (e *encBase) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
func (e *encBase) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
func (e *encBase) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
func (e *encBase) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
em := e.encode
dCnt := len(dLost)
size := len(vects[has[0]])
if dCnt != 0 {
vtmp := make([][]byte, d+dCnt)
for i, p := range has {
vtmp[i] = vects[p]
}
for i, p := range dLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
matrixbuf := make([]byte, 4*d*d+dCnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return err2
}
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
etmp := &encBase{data: d, parity: dCnt, gen: g}
err2 = etmp.Encode(vtmp[:d+dCnt])
if err2 != nil {
return err2
}
}
if dataOnly {
return
}
pCnt := len(pLost)
if pCnt != 0 {
vtmp := make([][]byte, d+pCnt)
g := make([]byte, pCnt*d)
for i, l := range pLost {
copy(g[i*d:i*d+d], em[l*d:l*d+d])
}
for i := 0; i < d; i++ {
vtmp[i] = vects[i]
}
for i, p := range pLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
etmp := &encBase{data: d, parity: pCnt, gen: g}
err2 := etmp.Encode(vtmp[:d+pCnt])
if err2 != nil {
return err2
}
}
return
}
func (e *encBase) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
p := e.parity
// TODO check more, maybe element in has show in lost & deal with len(has) > d
if len(has) != d {
return errors.New("rs.Reconst: not enough vects")
}
dCnt := len(dLost)
if dCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
pCnt := len(pLost)
if pCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
return e.reconst(vects, has, dLost, pLost, dataOnly)
}
func (e *encBase) reconstruct(vects [][]byte, dataOnly bool) (err error) {
d := e.data
p := e.parity
t := d + p
listBuf := make([]int, t+p)
has := listBuf[:d]
dLost := listBuf[d:t]
pLost := listBuf[t : t+p]
hasCnt, dCnt, pCnt := 0, 0, 0
for i := 0; i < t; i++ {
if vects[i] != nil {
if hasCnt < d {
has[hasCnt] = i
hasCnt++
}
} else {
if i < d {
if dCnt < p {
dLost[dCnt] = i
dCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
} else {
if pCnt < p {
pLost[pCnt] = i
pCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
}
}
}
if hasCnt != d {
return errors.New("rs.Reconst: not enough vects")
}
dLost = dLost[:dCnt]
pLost = pLost[:pCnt]
return e.reconst(vects, has, dLost, pLost, dataOnly)
}

868
vendor/github.com/templexxx/reedsolomon/rs_amd64.go generated vendored Normal file

@@ -0,0 +1,868 @@
package reedsolomon
import (
"errors"
"sync"
"github.com/templexxx/cpufeat"
)
// SIMD Instruction Extensions
const (
none = iota
avx2
ssse3
)
var extension = none
func init() {
getEXT()
}
func getEXT() {
if cpufeat.X86.HasAVX2 {
extension = avx2
return
} else if cpufeat.X86.HasSSSE3 {
extension = ssse3
return
} else {
extension = none
return
}
}
//go:noescape
func copy32B(dst, src []byte) // needs SSE2 (introduced in 2001)
func initTbl(g matrix, rows, cols int, tbl []byte) {
off := 0
for i := 0; i < cols; i++ {
for j := 0; j < rows; j++ {
c := g[j*cols+i]
t := lowhighTbl[c][:]
copy32B(tbl[off:off+32], t)
off += 32
}
}
}
// At most 3060 inverse matrices (when data=14, parity=4; counted by mathtool/cntinverse)
// In practice, data is usually below 12 and parity below 5
func okCache(data, parity int) bool {
if data < 15 && parity < 5 { // you can change these limits, but data+parity can't be bigger than 32 (see how the inverse-matrix cache key is built)
return true
}
return false
}
type (
encSSSE3 encSIMD
encAVX2 encSIMD
encSIMD struct {
data int
parity int
encode matrix
gen matrix
tbl []byte
// the inverse matrix cache is designed for small vect sizes ( < 4KB )
// it saves the time spent calculating the inverse matrix
// but it matters less for big vect sizes
enableCache bool
inverseCache iCache
}
iCache struct {
sync.RWMutex
data map[uint32][]byte
}
)
func newRS(d, p int, em matrix) (enc Encoder) {
g := em[d*d:]
if extension == none {
return &encBase{data: d, parity: p, encode: em, gen: g}
}
t := make([]byte, d*p*32)
initTbl(g, p, d, t)
ok := okCache(d, p)
if extension == avx2 {
e := &encAVX2{data: d, parity: p, encode: em, gen: g, tbl: t, enableCache: ok,
inverseCache: iCache{data: make(map[uint32][]byte)}}
return e
}
e := &encSSSE3{data: d, parity: p, encode: em, gen: g, tbl: t, enableCache: ok,
inverseCache: iCache{data: make(map[uint32][]byte)}}
return e
}
// Size of sub-vector
const unit int = 16 * 1024
func getDo(n int) int {
if n < unit {
c := n >> 4
if c == 0 {
return unit
}
return c << 4
}
return unit
}
func (e *encAVX2) Encode(vects [][]byte) (err error) {
d := e.data
p := e.parity
size, err := checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
start, end := 0, 0
do := getDo(size)
for start < size {
end = start + do
if end <= size {
e.matrixMul(start, end, dv, pv)
start = end
} else {
e.matrixMulRemain(start, size, dv, pv)
start = size
}
}
return
}
//go:noescape
func mulVectAVX2(tbl, d, p []byte)
//go:noescape
func mulVectAddAVX2(tbl, d, p []byte)
func (e *encAVX2) matrixMul(start, end int, dv, pv [][]byte) {
d := e.data
p := e.parity
tbl := e.tbl
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
} else {
mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
}
off += 32
}
}
}
func (e *encAVX2) matrixMulRemain(start, end int, dv, pv [][]byte) {
undone := end - start
do := (undone >> 4) << 4
d := e.data
p := e.parity
tbl := e.tbl
if do >= 16 {
end2 := start + do
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
}
off += 32
}
}
start = end
}
if undone > do {
// may recalculate some data, but still improves performance a lot
start2 := end - 16
if start2 >= 0 {
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
}
off += 32
}
}
} else {
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
// use the generator matrix directly (not the precomputed tbls) for encoding
// it's designed for reconstructing:
// for small vects, initTbl costs too much time, so skip it
// and for big vects the tbls don't help much, because the cache will be filled with the vects' data
func (e *encAVX2) encodeGen(vects [][]byte) (err error) {
d := e.data
p := e.parity
size, err := checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
start, end := 0, 0
do := getDo(size)
for start < size {
end = start + do
if end <= size {
e.matrixMulGen(start, end, dv, pv)
start = end
} else {
e.matrixMulRemainGen(start, size, dv, pv)
start = size
}
}
return
}
func (e *encAVX2) matrixMulGen(start, end int, dv, pv [][]byte) {
d := e.data
p := e.parity
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
} else {
mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
}
}
}
}
func (e *encAVX2) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
undone := end - start
do := (undone >> 4) << 4
d := e.data
p := e.parity
g := e.gen
if do >= 16 {
end2 := start + do
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
}
}
}
start = end
}
if undone > do {
start2 := end - 16
if start2 >= 0 {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
}
}
}
} else {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
func (e *encAVX2) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
func (e *encAVX2) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
func (e *encAVX2) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
func (e *encAVX2) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
func (e *encAVX2) makeGen(has, dLost []int) (gen []byte, err error) {
d := e.data
em := e.encode
cnt := len(dLost)
if !e.enableCache {
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
var ikey uint32
for _, p := range has {
ikey += 1 << uint8(p)
}
e.inverseCache.RLock()
v, ok := e.inverseCache.data[ikey]
if ok {
im := v
g := make([]byte, cnt*d)
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
e.inverseCache.RUnlock()
return g, nil
}
e.inverseCache.RUnlock()
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
e.inverseCache.Lock()
e.inverseCache.data[ikey] = im
e.inverseCache.Unlock()
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
func (e *encAVX2) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
em := e.encode
dCnt := len(dLost)
size := len(vects[has[0]])
if dCnt != 0 {
vtmp := make([][]byte, d+dCnt)
for i, p := range has {
vtmp[i] = vects[p]
}
for i, p := range dLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
g, err2 := e.makeGen(has, dLost)
if err2 != nil {
return
}
etmp := &encAVX2{data: d, parity: dCnt, gen: g}
err2 = etmp.encodeGen(vtmp)
if err2 != nil {
return err2
}
}
if dataOnly {
return
}
pCnt := len(pLost)
if pCnt != 0 {
g := make([]byte, pCnt*d)
for i, l := range pLost {
copy(g[i*d:i*d+d], em[l*d:l*d+d])
}
vtmp := make([][]byte, d+pCnt)
for i := 0; i < d; i++ {
vtmp[i] = vects[i]
}
for i, p := range pLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
etmp := &encAVX2{data: d, parity: pCnt, gen: g}
err2 := etmp.encodeGen(vtmp)
if err2 != nil {
return err2
}
}
return
}
func (e *encAVX2) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
p := e.parity
if len(has) != d {
return errors.New("rs.Reconst: not enough vects")
}
dCnt := len(dLost)
if dCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
pCnt := len(pLost)
if pCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
return e.reconst(vects, has, dLost, pLost, dataOnly)
}
func (e *encAVX2) reconstruct(vects [][]byte, dataOnly bool) (err error) {
d := e.data
p := e.parity
t := d + p
listBuf := make([]int, t+p)
has := listBuf[:d]
dLost := listBuf[d:t]
pLost := listBuf[t : t+p]
hasCnt, dCnt, pCnt := 0, 0, 0
for i := 0; i < t; i++ {
if vects[i] != nil {
if hasCnt < d {
has[hasCnt] = i
hasCnt++
}
} else {
if i < d {
if dCnt < p {
dLost[dCnt] = i
dCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
} else {
if pCnt < p {
pLost[pCnt] = i
pCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
}
}
}
if hasCnt != d {
return errors.New("rs.Reconst: not enough vects")
}
dLost = dLost[:dCnt]
pLost = pLost[:pCnt]
return e.reconst(vects, has, dLost, pLost, dataOnly)
}
func (e *encSSSE3) Encode(vects [][]byte) (err error) {
d := e.data
p := e.parity
size, err := checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
start, end := 0, 0
do := getDo(size)
for start < size {
end = start + do
if end <= size {
e.matrixMul(start, end, dv, pv)
start = end
} else {
e.matrixMulRemain(start, size, dv, pv)
start = size
}
}
return
}
//go:noescape
func mulVectSSSE3(tbl, d, p []byte)
//go:noescape
func mulVectAddSSSE3(tbl, d, p []byte)
func (e *encSSSE3) matrixMul(start, end int, dv, pv [][]byte) {
d := e.data
p := e.parity
tbl := e.tbl
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
} else {
mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
}
off += 32
}
}
}
func (e *encSSSE3) matrixMulRemain(start, end int, dv, pv [][]byte) {
undone := end - start
do := (undone >> 4) << 4
d := e.data
p := e.parity
tbl := e.tbl
if do >= 16 {
end2 := start + do
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
}
off += 32
}
}
start = end
}
if undone > do {
start2 := end - 16
if start2 >= 0 {
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
}
off += 32
}
}
} else {
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
// use the generator matrix directly (not the precomputed tbls) for encoding
// it's designed for reconstructing:
// for small vects, initTbl costs too much time, so skip it
// and for big vects the tbls don't help much, because the cache will be filled with the vects' data
func (e *encSSSE3) encodeGen(vects [][]byte) (err error) {
d := e.data
p := e.parity
size, err := checkEnc(d, p, vects)
if err != nil {
return
}
dv := vects[:d]
pv := vects[d:]
start, end := 0, 0
do := getDo(size)
for start < size {
end = start + do
if end <= size {
e.matrixMulGen(start, end, dv, pv)
start = end
} else {
e.matrixMulRemainGen(start, size, dv, pv)
start = size
}
}
return
}
func (e *encSSSE3) matrixMulGen(start, end int, dv, pv [][]byte) {
d := e.data
p := e.parity
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
} else {
mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
}
}
}
}
func (e *encSSSE3) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
undone := end - start
do := (undone >> 4) << 4
d := e.data
p := e.parity
g := e.gen
if do >= 16 {
end2 := start + do
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
}
}
}
start = end
}
if undone > do {
start2 := end - 16
if start2 >= 0 {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
}
}
}
} else {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
func (e *encSSSE3) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
func (e *encSSSE3) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
func (e *encSSSE3) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
func (e *encSSSE3) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
func (e *encSSSE3) makeGen(has, dLost []int) (gen []byte, err error) {
d := e.data
em := e.encode
cnt := len(dLost)
if !e.enableCache {
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
var ikey uint32
for _, p := range has {
ikey += 1 << uint8(p)
}
e.inverseCache.RLock()
v, ok := e.inverseCache.data[ikey]
if ok {
im := v
g := make([]byte, cnt*d)
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
e.inverseCache.RUnlock()
return g, nil
}
e.inverseCache.RUnlock()
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
e.inverseCache.Lock()
e.inverseCache.data[ikey] = im
e.inverseCache.Unlock()
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
func (e *encSSSE3) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
em := e.encode
dCnt := len(dLost)
size := len(vects[has[0]])
if dCnt != 0 {
vtmp := make([][]byte, d+dCnt)
for i, p := range has {
vtmp[i] = vects[p]
}
for i, p := range dLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
g, err2 := e.makeGen(has, dLost)
if err2 != nil {
return
}
etmp := &encSSSE3{data: d, parity: dCnt, gen: g}
err2 = etmp.encodeGen(vtmp)
if err2 != nil {
return err2
}
}
if dataOnly {
return
}
pCnt := len(pLost)
if pCnt != 0 {
g := make([]byte, pCnt*d)
for i, l := range pLost {
copy(g[i*d:i*d+d], em[l*d:l*d+d])
}
vtmp := make([][]byte, d+pCnt)
for i := 0; i < d; i++ {
vtmp[i] = vects[i]
}
for i, p := range pLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
etmp := &encSSSE3{data: d, parity: pCnt, gen: g}
err2 := etmp.encodeGen(vtmp)
if err2 != nil {
return err2
}
}
return
}
func (e *encSSSE3) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
p := e.parity
if len(has) != d {
return errors.New("rs.Reconst: not enough vects")
}
dCnt := len(dLost)
if dCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
pCnt := len(pLost)
if pCnt > p {
return errors.New("rs.Reconst: not enough vects")
}
return e.reconst(vects, has, dLost, pLost, dataOnly)
}
func (e *encSSSE3) reconstruct(vects [][]byte, dataOnly bool) (err error) {
d := e.data
p := e.parity
t := d + p
listBuf := make([]int, t+p)
has := listBuf[:d]
dLost := listBuf[d:t]
pLost := listBuf[t : t+p]
hasCnt, dCnt, pCnt := 0, 0, 0
for i := 0; i < t; i++ {
if vects[i] != nil {
if hasCnt < d {
has[hasCnt] = i
hasCnt++
}
} else {
if i < d {
if dCnt < p {
dLost[dCnt] = i
dCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
} else {
if pCnt < p {
pLost[pCnt] = i
pCnt++
} else {
return errors.New("rs.Reconst: not enough vects")
}
}
}
}
if hasCnt != d {
return errors.New("rs.Reconst: not enough vects")
}
dLost = dLost[:dCnt]
pLost = pLost[:pCnt]
return e.reconst(vects, has, dLost, pLost, dataOnly)
}

401
vendor/github.com/templexxx/reedsolomon/rs_amd64.s generated vendored Normal file

@@ -0,0 +1,401 @@
// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf
#include "textflag.h"
#define low_tbl Y0
#define high_tbl Y1
#define mask Y2
#define in0 Y3
#define in1 Y4
#define in2 Y5
#define in3 Y6
#define in4 Y7
#define in5 Y8
#define in0_h Y10
#define in1_h Y11
#define in2_h Y12
#define in3_h Y13
#define in4_h Y14
#define in5_h Y15
#define in BX
#define out DI
#define len R8
#define pos R9
#define tmp0 R10
#define low_tblx X0
#define high_tblx X1
#define maskx X2
#define in0x X3
#define in0_hx X10
#define tmp0x X9
#define tmp1x X11
#define tmp2x X12
#define tmp3x X13
// func mulVectAVX2(tbl, d, p []byte)
TEXT ·mulVectAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
TESTQ $31, len
JNZ one16b
ymm:
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
TESTQ $255, len
JNZ not_aligned
// 256bytes/loop
aligned:
MOVQ $0, pos
loop256b:
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, (out)(pos*1)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 32(out)(pos*1)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VMOVDQU in2, 64(out)(pos*1)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VMOVDQU in3, 96(out)(pos*1)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VMOVDQU in4, 128(out)(pos*1)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VMOVDQU in5, 160(out)(pos*1)
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, 192(out)(pos*1)
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 224(out)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
not_aligned:
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
CMPQ len, $0
JNE ymm
RET
// func mulVectAddAVX2(tbl, d, p []byte)
TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
TESTQ $31, len
JNZ one16b
ymm:
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
TESTQ $255, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop256b:
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR (out)(pos*1), in0, in0
VMOVDQU in0, (out)(pos*1)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 32(out)(pos*1), in1, in1
VMOVDQU in1, 32(out)(pos*1)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VPXOR 64(out)(pos*1), in2, in2
VMOVDQU in2, 64(out)(pos*1)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VPXOR 96(out)(pos*1), in3, in3
VMOVDQU in3, 96(out)(pos*1)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VPXOR 128(out)(pos*1), in4, in4
VMOVDQU in4, 128(out)(pos*1)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VPXOR 160(out)(pos*1), in5, in5
VMOVDQU in5, 160(out)(pos*1)
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR 192(out)(pos*1), in0, in0
VMOVDQU in0, 192(out)(pos*1)
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 224(out)(pos*1), in1, in1
VMOVDQU in1, 224(out)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
not_aligned:
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR -32(out)(len*1), in0, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VPXOR -16(out)(len*1), in0x, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
CMPQ len, $0
JNE ymm
RET
// func mulVectSSSE3(tbl, d, p []byte)
TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
SHRQ $4, len
loop:
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
MOVOU tmp2x, (out)
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func mulVectAddSSSE3(tbl, d, p []byte)
TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
SHRQ $4, len
loop:
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
MOVOU (out), tmp3x
PXOR tmp3x, tmp2x
MOVOU tmp2x, (out)
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func copy32B(dst, src []byte)
TEXT ·copy32B(SB), NOSPLIT, $0
MOVQ dst+0(FP), SI
MOVQ src+24(FP), DX
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU X0, (SI)
MOVOU X1, 16(SI)
RET

8
vendor/github.com/templexxx/reedsolomon/rs_other.go generated vendored Normal file

@@ -0,0 +1,8 @@
// +build !amd64
package reedsolomon
func newRS(d, p int, em matrix) (enc Encoder) {
g := em[d*d:]
return &encBase{data: d, parity: p, encode: em, gen: g}
}

44
vendor/github.com/templexxx/reedsolomon/tbl.go generated vendored Normal file

File diff suppressed because one or more lines are too long

1
vendor/github.com/templexxx/xor/.gitattributes generated vendored Normal file

@@ -0,0 +1 @@
*.s linguist-language=go

18
vendor/github.com/templexxx/xor/.gitignore generated vendored Normal file

@@ -0,0 +1,18 @@
# Binaries for programs and plugins
*.exe
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
.glide/
/backup/
/backup2/
/.idea
/backup3/

21
vendor/github.com/templexxx/xor/LICENSE generated vendored Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2017 Temple3x
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

48
vendor/github.com/templexxx/xor/README.md generated vendored Normal file

@@ -0,0 +1,48 @@
# XOR
XOR code engine in pure Go
more than 10 GB/s per core
## Introduction:
1. Uses SIMD (SSE2 or AVX2) for speed-up
2. ...
## Installation
To get the package use the standard:
```bash
go get github.com/templexxx/xor
```
## Documentation
See the associated [GoDoc](http://godoc.org/github.com/templexxx/xor)
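A small usage sketch. The exported helper names here (`xor.Bytes`, `xor.Matrix`) are an assumption based on the internal `bytesNoSIMD`/`matrixNoSIMD` fallbacks below — check the GoDoc above for the real exported API before relying on them:

```go
package main

import (
	"fmt"

	"github.com/templexxx/xor"
)

func main() {
	a := []byte{0x01, 0x02, 0x03, 0x04}
	b := []byte{0xff, 0x00, 0xff, 0x00}
	c := []byte{0x10, 0x20, 0x30, 0x40}
	dst := make([]byte, len(a))

	// XOR two equal-length slices into dst (function name assumed, see lead-in).
	xor.Bytes(dst, a, b)
	fmt.Printf("a^b   = % x\n", dst)

	// XOR several equal-length slices into dst (function name assumed).
	xor.Matrix(dst, [][]byte{a, b, c})
	fmt.Printf("a^b^c = % x\n", dst)
}
```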
## Performance
Performance depends mainly on:
1. SIMD extension
2. unit size of worker
3. hardware (CPU, RAM, etc.)
Example of performance on my MacBook 2014-mid (i5-4278U 2.6GHz, 2 physical cores); shard sizes are given in the table below.
```
speed = ( shards * size ) / cost
```
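For example, reading the 5-shard / 16MB row in the table below with this formula: cost ≈ (5 × 16 MB) / 12750.41 MB/s ≈ 6.3 ms per call.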
| data_shards | shard_size | speed (MB/S) |
|-------------|------------|--------------|
| 2           | 1KB        | 64127.95     |
| 2           | 1400B      | 59657.55     |
| 2           | 16KB       | 35370.84     |
| 2           | 16MB       | 12128.95     |
| 5           | 1KB        | 78837.33     |
| 5           | 1400B      | 58054.89     |
| 5           | 16KB       | 50161.19     |
| 5           | 16MB       | 12750.41     |
## Who is using this?
1. https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang

438
vendor/github.com/templexxx/xor/avx2_amd64.s generated vendored Normal file

@@ -0,0 +1,438 @@
#include "textflag.h"
// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5
// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8
// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
// func bytesAVX2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $31, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop32b:
VMOVDQU (SRC0)(POS*1), Y0
VPXOR (SRC1)(POS*1), Y0, Y0
VMOVDQU Y0, (DST)(POS*1)
ADDQ $32, POS
CMPQ LEN, POS
JNE loop32b
VZEROUPPER
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $31, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $31, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $32
JGE aligned
RET
ret:
RET
// func bytesAVX2small(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2small(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
VMOVDQU (SRC0)(POS*1), Y0
VMOVDQU 32(SRC0)(POS*1), Y1
VMOVDQU 64(SRC0)(POS*1), Y2
VMOVDQU 96(SRC0)(POS*1), Y3
VPXOR (SRC1)(POS*1), Y0, Y0
VPXOR 32(SRC1)(POS*1), Y1, Y1
VPXOR 64(SRC1)(POS*1), Y2, Y2
VPXOR 96(SRC1)(POS*1), Y3, Y3
VMOVDQU Y0, (DST)(POS*1)
VMOVDQU Y1, 32(DST)(POS*1)
VMOVDQU Y2, 64(DST)(POS*1)
VMOVDQU Y3, 96(DST)(POS*1)
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
VZEROUPPER
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $127, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func bytesAVX2big(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2big(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
VMOVDQU (SRC0)(POS*1), Y0
VMOVDQU 32(SRC0)(POS*1), Y1
VMOVDQU 64(SRC0)(POS*1), Y2
VMOVDQU 96(SRC0)(POS*1), Y3
VPXOR (SRC1)(POS*1), Y0, Y0
VPXOR 32(SRC1)(POS*1), Y1, Y1
VPXOR 64(SRC1)(POS*1), Y2, Y2
VPXOR 96(SRC1)(POS*1), Y3, Y3
LONG $0xe77da1c4; WORD $0x0304
LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
SFENCE
VZEROUPPER
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $127, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func matrixAVX2small(dst []byte, src [][]byte)
TEXT ·matrixAVX2small(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y0
VMOVDQU 32(TMP4)(POS*1), Y1
VMOVDQU 64(TMP3)(POS*1), Y2
VMOVDQU 96(TMP4)(POS*1), Y3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y4
VMOVDQU 32(TMP4)(POS*1), Y5
VMOVDQU 64(TMP3)(POS*1), Y6
VMOVDQU 96(TMP4)(POS*1), Y7
VPXOR Y4, Y0, Y0
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
SUBQ $1, TMP1
JGE next_vect
VMOVDQU Y0, (DST)(POS*1)
VMOVDQU Y1, 32(DST)(POS*1)
VMOVDQU Y2, 64(DST)(POS*1)
VMOVDQU Y3, 96(DST)(POS*1)
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
VZEROUPPER
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $127, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET
// func matrixAVX2big(dst []byte, src [][]byte)
TEXT ·matrixAVX2big(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $127, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop128b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y0
VMOVDQU 32(TMP4)(POS*1), Y1
VMOVDQU 64(TMP3)(POS*1), Y2
VMOVDQU 96(TMP4)(POS*1), Y3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
VMOVDQU (TMP3)(POS*1), Y4
VMOVDQU 32(TMP4)(POS*1), Y5
VMOVDQU 64(TMP3)(POS*1), Y6
VMOVDQU 96(TMP4)(POS*1), Y7
VPXOR Y4, Y0, Y0
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
SUBQ $1, TMP1
JGE next_vect
LONG $0xe77da1c4; WORD $0x0304 // VMOVNTDQ (the mnemonic is supported by the go1.8+ assembler)
LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60
ADDQ $128, POS
CMPQ LEN, POS
JNE loop128b
VZEROUPPER
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $127, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $127, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $128
JGE aligned
RET
ret:
RET

116
vendor/github.com/templexxx/xor/nosimd.go generated vendored Normal file
View File

@@ -0,0 +1,116 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package xor
import (
"runtime"
"unsafe"
)
const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
// xor the bytes in a and b. The destination is assumed to have enough space.
func bytesNoSIMD(dst, a, b []byte, size int) {
if supportsUnaligned {
fastXORBytes(dst, a, b, size)
} else {
// TODO(hanwen): if (dst, a, b) have common alignment
// we could still try fastXORBytes. It is not clear
// how often this happens, and it's only worth it if
// the block encryption itself is hardware
// accelerated.
safeXORBytes(dst, a, b, size)
}
}
// split the slice into units to stay cache-friendly
const unitSize = 16 * 1024
func matrixNoSIMD(dst []byte, src [][]byte) {
size := len(src[0])
start := 0
do := unitSize
for start < size {
end := start + do
if end <= size {
partNoSIMD(start, end, dst, src)
start = start + do
} else {
partNoSIMD(start, size, dst, src)
start = size
}
}
}
// splitting the vectors improves performance on big data by reducing cache pollution
func partNoSIMD(start, end int, dst []byte, src [][]byte) {
bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start)
for i := 2; i < len(src); i++ {
bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start)
}
}
// fastXORBytes XORs in bulk. It only works on architectures that
// support unaligned reads/writes.
func fastXORBytes(dst, a, b []byte, n int) {
w := n / wordSize
if w > 0 {
wordBytes := w * wordSize
fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes])
}
for i := n - n%wordSize; i < n; i++ {
dst[i] = a[i] ^ b[i]
}
}
func safeXORBytes(dst, a, b []byte, n int) {
ex := n % 8
for i := 0; i < ex; i++ {
dst[i] = a[i] ^ b[i]
}
for i := ex; i < n; i += 8 {
_dst := dst[i : i+8]
_a := a[i : i+8]
_b := b[i : i+8]
_dst[0] = _a[0] ^ _b[0]
_dst[1] = _a[1] ^ _b[1]
_dst[2] = _a[2] ^ _b[2]
_dst[3] = _a[3] ^ _b[3]
_dst[4] = _a[4] ^ _b[4]
_dst[5] = _a[5] ^ _b[5]
_dst[6] = _a[6] ^ _b[6]
_dst[7] = _a[7] ^ _b[7]
}
}
// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture).
// The arguments are assumed to be of equal length.
func fastXORWords(dst, a, b []byte) {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
n := len(b) / wordSize
ex := n % 8
for i := 0; i < ex; i++ {
dw[i] = aw[i] ^ bw[i]
}
for i := ex; i < n; i += 8 {
_dw := dw[i : i+8]
_aw := aw[i : i+8]
_bw := bw[i : i+8]
_dw[0] = _aw[0] ^ _bw[0]
_dw[1] = _aw[1] ^ _bw[1]
_dw[2] = _aw[2] ^ _bw[2]
_dw[3] = _aw[3] ^ _bw[3]
_dw[4] = _aw[4] ^ _bw[4]
_dw[5] = _aw[5] ^ _bw[5]
_dw[6] = _aw[6] ^ _bw[6]
_dw[7] = _aw[7] ^ _bw[7]
}
}
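The fastXORWords helper above relies on reinterpreting byte slices as []uintptr via unsafe, which is why it is gated behind supportsUnaligned. As a rough illustration of the same word-at-a-time idea without unsafe, here is a minimal sketch using encoding/binary; the helper name xorWords64 is hypothetical and is not part of the vendored package.

// Illustrative sketch only: word-at-a-time XOR without unsafe.
package main

import (
	"encoding/binary"
	"fmt"
)

// xorWords64 XORs a and b into dst eight bytes at a time, then
// finishes the tail byte by byte. All slices must have the same length.
func xorWords64(dst, a, b []byte) {
	n := len(dst) / 8 * 8
	for i := 0; i < n; i += 8 {
		x := binary.LittleEndian.Uint64(a[i:]) ^ binary.LittleEndian.Uint64(b[i:])
		binary.LittleEndian.PutUint64(dst[i:], x)
	}
	for i := n; i < len(dst); i++ {
		dst[i] = a[i] ^ b[i]
	}
}

func main() {
	a := []byte("hello, xor world!")
	b := []byte("0123456789abcdefg")
	dst := make([]byte, len(a))
	xorWords64(dst, a, b)
	fmt.Printf("%x\n", dst)
}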

574
vendor/github.com/templexxx/xor/sse2_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,574 @@
#include "textflag.h"
// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5
// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8
// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
// func bytesSrc0(dst, src0, src1 []byte)
TEXT ·xorSrc0(SB), NOSPLIT, $0
MOVQ len+32(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSrc1(dst, src0, src1 []byte)
TEXT ·xorSrc1(SB), NOSPLIT, $0
MOVQ len+56(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSSE2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2mini(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $15, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop16b:
MOVOU (SRC0)(POS*1), X0
XORPD (SRC1)(POS*1), X0
// MOVOU (SRC1)(POS*1), X4
// PXOR X4, X0
MOVOU X0, (DST)(POS*1)
ADDQ $16, POS
CMPQ LEN, POS
JNE loop16b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $15, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $15, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $16
JGE aligned
RET
ret:
RET
// func bytesSSE2small(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2small(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVOU (SRC0)(POS*1), X0
MOVOU 16(SRC0)(POS*1), X1
MOVOU 32(SRC0)(POS*1), X2
MOVOU 48(SRC0)(POS*1), X3
MOVOU (SRC1)(POS*1), X4
MOVOU 16(SRC1)(POS*1), X5
MOVOU 32(SRC1)(POS*1), X6
MOVOU 48(SRC1)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
MOVOU X0, (DST)(POS*1)
MOVOU X1, 16(DST)(POS*1)
MOVOU X2, 32(DST)(POS*1)
MOVOU X3, 48(DST)(POS*1)
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $63, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func bytesSSE2big(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2big(SB), NOSPLIT, $0
MOVQ len+72(FP), LEN
CMPQ LEN, $0
JE ret
MOVQ dst+0(FP), DST
MOVQ src0+24(FP), SRC0
MOVQ src1+48(FP), SRC1
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVOU (SRC0)(POS*1), X0
MOVOU 16(SRC0)(POS*1), X1
MOVOU 32(SRC0)(POS*1), X2
MOVOU 48(SRC0)(POS*1), X3
MOVOU (SRC1)(POS*1), X4
MOVOU 16(SRC1)(POS*1), X5
MOVOU 32(SRC1)(POS*1), X6
MOVOU 48(SRC1)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ
LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVB -1(SRC0)(LEN*1), TMP1
MOVB -1(SRC1)(LEN*1), TMP2
XORB TMP1, TMP2
MOVB TMP2, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP1
ANDQ $63, TMP1
loop_8b:
MOVQ -8(SRC0)(LEN*1), TMP2
MOVQ -8(SRC1)(LEN*1), TMP3
XORQ TMP2, TMP3
MOVQ TMP3, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP1
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func matrixSSE2small(dst []byte, src [][]byte)
TEXT ·matrixSSE2small(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X0
MOVOU 16(TMP4)(POS*1), X1
MOVOU 32(TMP3)(POS*1), X2
MOVOU 48(TMP4)(POS*1), X3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X4
MOVOU 16(TMP4)(POS*1), X5
MOVOU 32(TMP3)(POS*1), X6
MOVOU 48(TMP4)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
SUBQ $1, TMP1
JGE next_vect
MOVOU X0, (DST)(POS*1)
MOVOU X1, 16(DST)(POS*1)
MOVOU X2, 32(DST)(POS*1)
MOVOU X3, 48(DST)(POS*1)
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $63, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
// func matrixSSE2big(dst []byte, src [][]byte)
TEXT ·matrixSSE2big(SB), NOSPLIT, $0
MOVQ dst+0(FP), DST
MOVQ src+24(FP), SRC
MOVQ vec+32(FP), VECT
MOVQ len+8(FP), LEN
TESTQ $63, LEN
JNZ not_aligned
aligned:
MOVQ $0, POS
loop64b:
MOVQ VECT, TMP1
SUBQ $2, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X0
MOVOU 16(TMP4)(POS*1), X1
MOVOU 32(TMP3)(POS*1), X2
MOVOU 48(TMP4)(POS*1), X3
next_vect:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ TMP3, TMP4
MOVOU (TMP3)(POS*1), X4
MOVOU 16(TMP4)(POS*1), X5
MOVOU 32(TMP3)(POS*1), X6
MOVOU 48(TMP4)(POS*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
SUBQ $1, TMP1
JGE next_vect
LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ (non-temporal store)
LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
ADDQ $64, POS
CMPQ LEN, POS
JNE loop64b
RET
loop_1b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVB -1(TMP3)(LEN*1), TMP5
next_vect_1b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVB -1(TMP3)(LEN*1), TMP6
XORB TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_1b
MOVB TMP5, -1(DST)(LEN*1)
SUBQ $1, LEN
TESTQ $7, LEN
JNZ loop_1b
CMPQ LEN, $0
JE ret
TESTQ $63, LEN
JZ aligned
not_aligned:
TESTQ $7, LEN
JNE loop_1b
MOVQ LEN, TMP4
ANDQ $63, TMP4
loop_8b:
MOVQ VECT, TMP1
MOVQ $0, TMP2
MOVQ (SRC)(TMP2*1), TMP3
SUBQ $2, TMP1
MOVQ -8(TMP3)(LEN*1), TMP5
next_vect_8b:
ADDQ $24, TMP2
MOVQ (SRC)(TMP2*1), TMP3
MOVQ -8(TMP3)(LEN*1), TMP6
XORQ TMP6, TMP5
SUBQ $1, TMP1
JGE next_vect_8b
MOVQ TMP5, -8(DST)(LEN*1)
SUBQ $8, LEN
SUBQ $8, TMP4
JG loop_8b
CMPQ LEN, $64
JGE aligned
RET
ret:
RET
TEXT ·hasSSE2(SB), NOSPLIT, $0
XORQ AX, AX
INCL AX // CPUID leaf 1 (processor info and feature bits)
CPUID
SHRQ $26, DX // the SSE2 flag is bit 26 of EDX
ANDQ $1, DX
MOVB DX, ret+0(FP)
RET

49
vendor/github.com/templexxx/xor/xor.go generated vendored Normal file
View File

@@ -0,0 +1,49 @@
package xor
// SIMD Extensions
const (
none = iota
avx2
// SSE2 was first introduced by Intel with the initial Pentium 4 in 2001,
// so it is safe to assume every amd64 CPU has SSE2
sse2
)
var extension = none
// Bytes : chooses the shortest slice length as the XOR size.
// It's better to use it for big data (> 64 bytes).
func Bytes(dst, src0, src1 []byte) {
size := len(dst)
if size > len(src0) {
size = len(src0)
}
if size > len(src1) {
size = len(src1)
}
xorBytes(dst, src0, src1, size)
}
// BytesSameLen : all slices' lengths must be equal.
// Skips the size checks, saving time for small data.
func BytesSameLen(dst, src0, src1 []byte) {
xorSrc1(dst, src0, src1)
}
// BytesSrc0 : len(src1) >= len(src0) and len(dst) >= len(src0).
// XORs len(src0) bytes.
func BytesSrc0(dst, src0, src1 []byte) {
xorSrc0(dst, src0, src1)
}
// BytesSrc1 : len(src0) >= len(src1) and len(dst) >= len(src1).
// XORs len(src1) bytes.
func BytesSrc1(dst, src0, src1 []byte) {
xorSrc1(dst, src0, src1)
}
// Matrix : all slices' lengths must be equal and non-zero.
// len(src) must be >= 2.
func Matrix(dst []byte, src [][]byte) {
xorMatrix(dst, src)
}
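For orientation, a minimal usage sketch of the public API declared above; it assumes the package is imported as github.com/templexxx/xor and is not part of the vendored file.

package main

import (
	"fmt"

	"github.com/templexxx/xor"
)

func main() {
	a := []byte{0x0f, 0xf0, 0xaa, 0x55}
	b := []byte{0xff, 0x0f, 0x55, 0xaa}
	dst := make([]byte, 4)

	// dst = a XOR b, using the shortest slice length as the XOR size.
	xor.Bytes(dst, a, b)

	// out = XOR of every row; all rows must have equal, non-zero length.
	rows := [][]byte{a, b, {0x01, 0x02, 0x03, 0x04}}
	out := make([]byte, 4)
	xor.Matrix(out, rows)

	fmt.Printf("%x %x\n", dst, out)
}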

120
vendor/github.com/templexxx/xor/xor_amd64.go generated vendored Normal file
View File

@@ -0,0 +1,120 @@
package xor
import "github.com/templexxx/cpufeat"
func init() {
getEXT()
}
func getEXT() {
if cpufeat.X86.HasAVX2 {
extension = avx2
} else {
extension = sse2
}
return
}
func xorBytes(dst, src0, src1 []byte, size int) {
switch extension {
case avx2:
bytesAVX2(dst, src0, src1, size)
default:
bytesSSE2(dst, src0, src1, size)
}
}
// size threshold above which the non-temporal (streaming) store kernels are used
const nontmp = 8 * 1024
const avx2loopsize = 128
func bytesAVX2(dst, src0, src1 []byte, size int) {
if size < avx2loopsize {
bytesAVX2mini(dst, src0, src1, size)
} else if size >= avx2loopsize && size <= nontmp {
bytesAVX2small(dst, src0, src1, size)
} else {
bytesAVX2big(dst, src0, src1, size)
}
}
const sse2loopsize = 64
func bytesSSE2(dst, src0, src1 []byte, size int) {
if size < sse2loopsize {
bytesSSE2mini(dst, src0, src1, size)
} else if size >= sse2loopsize && size <= nontmp {
bytesSSE2small(dst, src0, src1, size)
} else {
bytesSSE2big(dst, src0, src1, size)
}
}
func xorMatrix(dst []byte, src [][]byte) {
switch extension {
case avx2:
matrixAVX2(dst, src)
default:
matrixSSE2(dst, src)
}
}
func matrixAVX2(dst []byte, src [][]byte) {
size := len(dst)
if size > nontmp {
matrixAVX2big(dst, src)
} else {
matrixAVX2small(dst, src)
}
}
func matrixSSE2(dst []byte, src [][]byte) {
size := len(dst)
if size > nontmp {
matrixSSE2big(dst, src)
} else {
matrixSSE2small(dst, src)
}
}
//go:noescape
func xorSrc0(dst, src0, src1 []byte)
//go:noescape
func xorSrc1(dst, src0, src1 []byte)
//go:noescape
func bytesAVX2mini(dst, src0, src1 []byte, size int)
//go:noescape
func bytesAVX2big(dst, src0, src1 []byte, size int)
//go:noescape
func bytesAVX2small(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2mini(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2small(dst, src0, src1 []byte, size int)
//go:noescape
func bytesSSE2big(dst, src0, src1 []byte, size int)
//go:noescape
func matrixAVX2small(dst []byte, src [][]byte)
//go:noescape
func matrixAVX2big(dst []byte, src [][]byte)
//go:noescape
func matrixSSE2small(dst []byte, src [][]byte)
//go:noescape
func matrixSSE2big(dst []byte, src [][]byte)
//go:noescape
func hasAVX2() bool
//go:noescape
func hasSSE2() bool
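The dispatchers above select a kernel by buffer size, using the avx2loopsize/sse2loopsize constants and the nontmp threshold (8 KiB) beyond which the big variants switch to non-temporal stores. The sketch below only mirrors that selection logic for illustration; kernelForSize is a hypothetical helper and is not part of the package.

package main

import "fmt"

// kernelForSize mirrors the size-based dispatch in xor_amd64.go:
// mini below the vector loop size, small up to the nontmp threshold,
// big (non-temporal stores) beyond it.
func kernelForSize(size int, hasAVX2 bool) string {
	const nontmp = 8 * 1024
	if hasAVX2 {
		switch {
		case size < 128:
			return "bytesAVX2mini"
		case size <= nontmp:
			return "bytesAVX2small"
		default:
			return "bytesAVX2big"
		}
	}
	switch {
	case size < 64:
		return "bytesSSE2mini"
	case size <= nontmp:
		return "bytesSSE2small"
	default:
		return "bytesSSE2big"
	}
}

func main() {
	for _, n := range []int{32, 4096, 1 << 20} {
		fmt.Println(n, kernelForSize(n, true))
	}
}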

19
vendor/github.com/templexxx/xor/xor_other.go generated vendored Normal file
View File

@@ -0,0 +1,19 @@
// +build !amd64 noasm
package xor
func xorBytes(dst, src0, src1 []byte, size int) {
bytesNoSIMD(dst, src0, src1, size)
}
func xorMatrix(dst []byte, src [][]byte) {
matrixNoSIMD(dst, src)
}
func xorSrc0(dst, src0, src1 []byte) {
bytesNoSIMD(dst, src0, src1, len(src0))
}
func xorSrc1(dst, src0, src1 []byte) {
bytesNoSIMD(dst, src0, src1, len(src1))
}
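This fallback file is built on non-amd64 platforms or when the noasm build tag is set, so the pure-Go path can also be forced on amd64 for testing, for example:

go build -tags noasm
go test -tags noasm ./...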