#include "textflag.h"
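// textflag.h supplies the NOSPLIT flag used by the TEXT directives below.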

// addresses of memory operands
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5

// loop arguments
// number of source vectors
#define VECT CX
// length of each vector in bytes
#define LEN DX
// current position (byte offset) in the vectors
#define POS R8

// temporary registers
// number of vectors, etc.
#define TMP1 R9
// offset into the matrix, etc.
#define TMP2 R10
// address of a data/parity vector, etc.
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14

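// These routines XOR byte slices with AVX2. A minimal sketch of the Go
// declarations they would pair with (names taken from the signature
// comments below; the size-based selection between the mini/small/big
// variants is assumed to live in a Go wrapper, not shown here):
//
//	func bytesAVX2mini(dst, src0, src1 []byte, size int)
//	func bytesAVX2small(dst, src0, src1 []byte, size int)
//	func bytesAVX2big(dst, src0, src1 []byte, size int)
//	func matrixAVX2small(dst []byte, src [][]byte)
//	func matrixAVX2big(dst []byte, src [][]byte)
//
// The bytes* routines compute dst[i] = src0[i] ^ src1[i]; the matrix*
// routines XOR every row of src into dst. All variants share one tail
// scheme: loop_1b peels single bytes from the end until LEN is a multiple
// of 8, loop_8b peels 8-byte words until LEN is a multiple of the wide
// stride (32 for mini, 128 otherwise), and the aligned loop then processes
// the rest in YMM registers. The *big variants write with non-temporal
// stores (VMOVNTDQ), presumably for buffers too large to stay in cache.
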
// func bytesAVX2mini(dst, src0, src1 []byte, size int)
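//
// Each []byte argument is a 24-byte slice header (ptr, len, cap) on the
// frame, so the data pointers are at dst+0(FP), src0+24(FP), src1+48(FP),
// and the trailing int argument is at offset 72(FP).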
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $31, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

	// XOR 32 bytes per iteration.
loop32b:
	VMOVDQU (SRC0)(POS*1), Y0
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VMOVDQU Y0, (DST)(POS*1)
	ADDQ    $32, POS
	CMPQ    LEN, POS
	JNE     loop32b
	VZEROUPPER
	RET

	// XOR single bytes from the end until LEN is a multiple of 8.
loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $31, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $31, TMP1

	// XOR 8-byte words from the end until LEN is a multiple of 32.
loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $32
	JGE  aligned
	RET

ret:
	RET

// func bytesAVX2small(dst, src0, src1 []byte, size int)
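//
// Same as bytesAVX2mini, but the wide loop is unrolled to 128 bytes
// (four YMM registers) per iteration, using ordinary cacheable stores.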
TEXT ·bytesAVX2small(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

	// XOR 128 bytes (four YMM registers) per iteration.
loop128b:
	VMOVDQU (SRC0)(POS*1), Y0
	VMOVDQU 32(SRC0)(POS*1), Y1
	VMOVDQU 64(SRC0)(POS*1), Y2
	VMOVDQU 96(SRC0)(POS*1), Y3
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VPXOR   32(SRC1)(POS*1), Y1, Y1
	VPXOR   64(SRC1)(POS*1), Y2, Y2
	VPXOR   96(SRC1)(POS*1), Y3, Y3
	VMOVDQU Y0, (DST)(POS*1)
	VMOVDQU Y1, 32(DST)(POS*1)
	VMOVDQU Y2, 64(DST)(POS*1)
	VMOVDQU Y3, 96(DST)(POS*1)

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	VZEROUPPER
	RET

	// XOR single bytes from the end until LEN is a multiple of 8.
loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $127, TMP1

	// XOR 8-byte words from the end until LEN is a multiple of 128.
loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET

// func bytesAVX2big(dst, src0, src1 []byte, size int)
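//
// Same 128-byte loop as bytesAVX2small, but results are written with
// VMOVNTDQ non-temporal stores. The instruction is hand-encoded with
// LONG/WORD/BYTE, apparently because the mnemonic only became available
// in the Go assembler with Go 1.8.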
TEXT ·bytesAVX2big(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

	// XOR 128 bytes per iteration; results bypass the cache via
	// non-temporal stores.
loop128b:
	VMOVDQU (SRC0)(POS*1), Y0
	VMOVDQU 32(SRC0)(POS*1), Y1
	VMOVDQU 64(SRC0)(POS*1), Y2
	VMOVDQU 96(SRC0)(POS*1), Y3
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VPXOR   32(SRC1)(POS*1), Y1, Y1
	VPXOR   64(SRC1)(POS*1), Y2, Y2
	VPXOR   96(SRC1)(POS*1), Y3, Y3
	LONG $0xe77da1c4; WORD $0x0304             // VMOVNTDQ Y0, (DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 // VMOVNTDQ Y1, 32(DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 // VMOVNTDQ Y2, 64(DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 // VMOVNTDQ Y3, 96(DST)(POS*1)

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	SFENCE // order the non-temporal stores before returning
	VZEROUPPER
	RET

	// XOR single bytes from the end until LEN is a multiple of 8.
loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $127, TMP1

	// XOR 8-byte words from the end until LEN is a multiple of 128.
loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET

// func matrixAVX2small(dst []byte, src [][]byte)
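//
// SRC points at the backing array of src: a table of 24-byte slice
// headers, so ADDQ $24 steps from one vector to the next. VECT is
// len(src) (vec+32(FP)) and LEN is len(dst) (len+8(FP)). Each pass loads
// a block of the first vector, folds in the remaining VECT-1 vectors
// with VPXOR, and stores the result to DST.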
TEXT ·matrixAVX2small(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  vec+32(FP), VECT
	MOVQ  len+8(FP), LEN
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

	// XOR 128 bytes of every source vector per iteration.
loop128b:
	MOVQ VECT, TMP1
	SUBQ $2, TMP1            // TMP1 counts the VECT-1 remaining vectors
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3 // address of the first vector
	MOVQ TMP3, TMP4          // copy of the base; loads alternate between the two
	VMOVDQU (TMP3)(POS*1), Y0
	VMOVDQU 32(TMP4)(POS*1), Y1
	VMOVDQU 64(TMP3)(POS*1), Y2
	VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
	ADDQ $24, TMP2           // step to the next slice header
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y4
	VMOVDQU 32(TMP4)(POS*1), Y5
	VMOVDQU 64(TMP3)(POS*1), Y6
	VMOVDQU 96(TMP4)(POS*1), Y7
	VPXOR Y4, Y0, Y0
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	SUBQ $1, TMP1
	JGE  next_vect

	VMOVDQU Y0, (DST)(POS*1)
	VMOVDQU Y1, 32(DST)(POS*1)
	VMOVDQU Y2, 64(DST)(POS*1)
	VMOVDQU Y3, 96(DST)(POS*1)

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	VZEROUPPER
	RET

	// XOR one byte of every vector at a time until LEN is a multiple of 8.
loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVB -1(TMP3)(LEN*1), TMP6
	XORB TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_1b

	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b

	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $127, TMP4

	// XOR 8 bytes of every vector at a time until LEN is a multiple of 128.
loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b

	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET

// func matrixAVX2big(dst []byte, src [][]byte)
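//
// Same as matrixAVX2small, but the final stores are hand-encoded
// VMOVNTDQ non-temporal stores, followed by SFENCE (see bytesAVX2big).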
TEXT ·matrixAVX2big(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  vec+32(FP), VECT
	MOVQ  len+8(FP), LEN
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

	// XOR 128 bytes of every source vector per iteration; results
	// bypass the cache via non-temporal stores.
loop128b:
	MOVQ VECT, TMP1
	SUBQ $2, TMP1            // TMP1 counts the VECT-1 remaining vectors
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3 // address of the first vector
	MOVQ TMP3, TMP4          // copy of the base; loads alternate between the two
	VMOVDQU (TMP3)(POS*1), Y0
	VMOVDQU 32(TMP4)(POS*1), Y1
	VMOVDQU 64(TMP3)(POS*1), Y2
	VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
	ADDQ $24, TMP2           // step to the next slice header
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y4
	VMOVDQU 32(TMP4)(POS*1), Y5
	VMOVDQU 64(TMP3)(POS*1), Y6
	VMOVDQU 96(TMP4)(POS*1), Y7
	VPXOR Y4, Y0, Y0
	VPXOR Y5, Y1, Y1
	VPXOR Y6, Y2, Y2
	VPXOR Y7, Y3, Y3
	SUBQ $1, TMP1
	JGE  next_vect

	LONG $0xe77da1c4; WORD $0x0304             // VMOVNTDQ Y0, (DST)(POS*1); mnemonic available since Go 1.8
	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 // VMOVNTDQ Y1, 32(DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 // VMOVNTDQ Y2, 64(DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 // VMOVNTDQ Y3, 96(DST)(POS*1)

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	SFENCE // order the non-temporal stores before returning
	VZEROUPPER
	RET

	// XOR one byte of every vector at a time until LEN is a multiple of 8.
loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVB -1(TMP3)(LEN*1), TMP6
	XORB TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_1b

	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b

	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $127, TMP4

	// XOR 8 bytes of every vector at a time until LEN is a multiple of 128.
loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b

	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET