mirror of https://github.com/fatedier/frp.git (synced 2025-08-04 04:36:51 +00:00)
Commit: add packages

vendor/github.com/templexxx/xor/avx2_amd64.s (generated, vendored, new file, 438 lines)
@@ -0,0 +1,438 @@
#include "textflag.h"

// addr of mem
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5

// loop args
// num of vect
#define VECT CX
#define LEN DX
// pos of matrix
#define POS R8

// tmp store
// num of vect or ...
#define TMP1 R9
// pos of matrix or ...
#define TMP2 R10
// store addr of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14

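// Argument layout (amd64, stack-based ABI): each []byte argument is a 24-byte
// slice header (data pointer, len, cap), so for the bytes* routines dst starts
// at 0(FP), src0 at 24(FP), src1 at 48(FP), and the size int sits at 72(FP)
// (loaded below as len+72(FP)). For the matrix* routines, len+8(FP) is len(dst)
// and vec+32(FP) is len(src); each element of src is itself a 24-byte slice
// header, which is why those loops walk through SRC in steps of 24.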
// func bytesAVX2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $31, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop32b:
    VMOVDQU (SRC0)(POS*1), Y0
    VPXOR (SRC1)(POS*1), Y0, Y0
    VMOVDQU Y0, (DST)(POS*1)
    ADDQ $32, POS
    CMPQ LEN, POS
    JNE loop32b
    VZEROUPPER
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $31, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $31, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $32
    JGE aligned
    RET

ret:
    RET

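// bytesAVX2small is the same XOR kernel as bytesAVX2mini but unrolled to
// 128 bytes per iteration (four YMM registers). All of these routines share
// the same tail strategy: when the length is not a multiple of the block size,
// not_aligned first peels single bytes off the end of the buffers until the
// remaining length is a multiple of 8, then peels 8-byte words until it is a
// multiple of the block size, and finally falls back into the aligned loop if
// enough data remains.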
// func bytesAVX2small(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2small(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $127, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    VMOVDQU (SRC0)(POS*1), Y0
    VMOVDQU 32(SRC0)(POS*1), Y1
    VMOVDQU 64(SRC0)(POS*1), Y2
    VMOVDQU 96(SRC0)(POS*1), Y3
    VPXOR (SRC1)(POS*1), Y0, Y0
    VPXOR 32(SRC1)(POS*1), Y1, Y1
    VPXOR 64(SRC1)(POS*1), Y2, Y2
    VPXOR 96(SRC1)(POS*1), Y3, Y3
    VMOVDQU Y0, (DST)(POS*1)
    VMOVDQU Y1, 32(DST)(POS*1)
    VMOVDQU Y2, 64(DST)(POS*1)
    VMOVDQU Y3, 96(DST)(POS*1)

    ADDQ $128, POS
    CMPQ LEN, POS
    JNE loop128b
    VZEROUPPER
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $127, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $127, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $128
    JGE aligned
    RET

ret:
    RET

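// bytesAVX2big has the same structure as bytesAVX2small, but the four stores
// in the main loop are emitted as raw LONG/WORD/BYTE encodings; judging by the
// comment on the identical encodings in matrixAVX2big below, these are
// non-temporal VMOVNTDQ stores (they bypass the cache), which is why the loop
// is followed by an SFENCE before returning. Presumably the "big" variants are
// chosen for buffers too large to benefit from caching the destination.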
// func bytesAVX2big(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2big(SB), NOSPLIT, $0
    MOVQ len+72(FP), LEN
    CMPQ LEN, $0
    JE ret
    MOVQ dst+0(FP), DST
    MOVQ src0+24(FP), SRC0
    MOVQ src1+48(FP), SRC1
    TESTQ $127, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    VMOVDQU (SRC0)(POS*1), Y0
    VMOVDQU 32(SRC0)(POS*1), Y1
    VMOVDQU 64(SRC0)(POS*1), Y2
    VMOVDQU 96(SRC0)(POS*1), Y3
    VPXOR (SRC1)(POS*1), Y0, Y0
    VPXOR 32(SRC1)(POS*1), Y1, Y1
    VPXOR 64(SRC1)(POS*1), Y2, Y2
    VPXOR 96(SRC1)(POS*1), Y3, Y3
    LONG $0xe77da1c4; WORD $0x0304
    LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
    LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
    LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60

    ADDQ $128, POS
    CMPQ LEN, POS
    JNE loop128b
    SFENCE
    VZEROUPPER
    RET

loop_1b:
    MOVB -1(SRC0)(LEN*1), TMP1
    MOVB -1(SRC1)(LEN*1), TMP2
    XORB TMP1, TMP2
    MOVB TMP2, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b
    CMPQ LEN, $0
    JE ret
    TESTQ $127, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP1
    ANDQ $127, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG loop_8b

    CMPQ LEN, $128
    JGE aligned
    RET

ret:
    RET

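// matrixAVX2small XORs all of the source vectors in src together into dst,
// 128 bytes per iteration. TMP2 indexes into the slice of slices: it starts at
// the first source slice header, and next_vect advances it by 24 bytes (one
// slice header) per source. TMP1 is initialised to VECT-2 and the inner loop
// runs while it stays >= 0, i.e. VECT-1 iterations, so together with the
// initial load all VECT source vectors are folded into the accumulator.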
// func matrixAVX2small(dst []byte, src [][]byte)
TEXT ·matrixAVX2small(SB), NOSPLIT, $0
    MOVQ dst+0(FP), DST
    MOVQ src+24(FP), SRC
    MOVQ vec+32(FP), VECT
    MOVQ len+8(FP), LEN
    TESTQ $127, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    MOVQ VECT, TMP1
    SUBQ $2, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y0
    VMOVDQU 32(TMP4)(POS*1), Y1
    VMOVDQU 64(TMP3)(POS*1), Y2
    VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y4
    VMOVDQU 32(TMP4)(POS*1), Y5
    VMOVDQU 64(TMP3)(POS*1), Y6
    VMOVDQU 96(TMP4)(POS*1), Y7
    VPXOR Y4, Y0, Y0
    VPXOR Y5, Y1, Y1
    VPXOR Y6, Y2, Y2
    VPXOR Y7, Y3, Y3
    SUBQ $1, TMP1
    JGE next_vect

    VMOVDQU Y0, (DST)(POS*1)
    VMOVDQU Y1, 32(DST)(POS*1)
    VMOVDQU Y2, 64(DST)(POS*1)
    VMOVDQU Y3, 96(DST)(POS*1)

    ADDQ $128, POS
    CMPQ LEN, POS
    JNE loop128b
    VZEROUPPER
    RET

loop_1b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVB -1(TMP3)(LEN*1), TMP6
    XORB TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_1b

    MOVB TMP5, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b

    CMPQ LEN, $0
    JE ret
    TESTQ $127, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP4
    ANDQ $127, TMP4

loop_8b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ -8(TMP3)(LEN*1), TMP6
    XORQ TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_8b

    MOVQ TMP5, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP4
    JG loop_8b

    CMPQ LEN, $128
    JGE aligned
    RET

ret:
    RET

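// matrixAVX2big mirrors matrixAVX2small, except that the accumulated result is
// written back with the hand-encoded non-temporal stores noted below instead
// of VMOVDQU.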
// func matrixAVX2big(dst []byte, src [][]byte)
TEXT ·matrixAVX2big(SB), NOSPLIT, $0
    MOVQ dst+0(FP), DST
    MOVQ src+24(FP), SRC
    MOVQ vec+32(FP), VECT
    MOVQ len+8(FP), LEN
    TESTQ $127, LEN
    JNZ not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    MOVQ VECT, TMP1
    SUBQ $2, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y0
    VMOVDQU 32(TMP4)(POS*1), Y1
    VMOVDQU 64(TMP3)(POS*1), Y2
    VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y4
    VMOVDQU 32(TMP4)(POS*1), Y5
    VMOVDQU 64(TMP3)(POS*1), Y6
    VMOVDQU 96(TMP4)(POS*1), Y7
    VPXOR Y4, Y0, Y0
    VPXOR Y5, Y1, Y1
    VPXOR Y6, Y2, Y2
    VPXOR Y7, Y3, Y3
    SUBQ $1, TMP1
    JGE next_vect

    LONG $0xe77da1c4; WORD $0x0304 // VMOVNTDQ (hand-encoded; the go1.8 assembler has this mnemonic)
    LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
    LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
    LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60

    ADDQ $128, POS
    CMPQ LEN, POS
    JNE loop128b
    VZEROUPPER
    RET

loop_1b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVB -1(TMP3)(LEN*1), TMP6
    XORB TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_1b

    MOVB TMP5, -1(DST)(LEN*1)
    SUBQ $1, LEN
    TESTQ $7, LEN
    JNZ loop_1b

    CMPQ LEN, $0
    JE ret
    TESTQ $127, LEN
    JZ aligned

not_aligned:
    TESTQ $7, LEN
    JNE loop_1b
    MOVQ LEN, TMP4
    ANDQ $127, TMP4

loop_8b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ -8(TMP3)(LEN*1), TMP6
    XORQ TMP6, TMP5
    SUBQ $1, TMP1
    JGE next_vect_8b

    MOVQ TMP5, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP4
    JG loop_8b

    CMPQ LEN, $128
    JGE aligned
    RET

ret:
    RET
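
// For reference, a plain-Go sketch of what these routines compute (the names
// bytesNaive and matrixNaive are illustrative only and not part of this
// package):
//
//     func bytesNaive(dst, src0, src1 []byte, size int) {
//         for i := 0; i < size; i++ {
//             dst[i] = src0[i] ^ src1[i]
//         }
//     }
//
//     func matrixNaive(dst []byte, src [][]byte) {
//         for i := range dst {
//             b := src[0][i]
//             for _, s := range src[1:] {
//                 b ^= s[i]
//             }
//             dst[i] = b
//         }
//     }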