route/vendor/github.com/templexxx/reedsolomon/rs_amd64.s

402 lines
8.8 KiB
ArmAsm

// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf
#include "textflag.h"
#define low_tbl Y0
#define high_tbl Y1
#define mask Y2
#define in0 Y3
#define in1 Y4
#define in2 Y5
#define in3 Y6
#define in4 Y7
#define in5 Y8
#define in0_h Y10
#define in1_h Y11
#define in2_h Y12
#define in3_h Y13
#define in4_h Y14
#define in5_h Y15
#define in BX
#define out DI
#define len R8
#define pos R9
#define tmp0 R10
#define low_tblx X0
#define high_tblx X1
#define maskx X2
#define in0x X3
#define in0_hx X10
#define tmp0x X9
#define tmp1x X11
#define tmp2x X12
#define tmp3x X13
// func mulVectAVX2(tbl, d, p []byte)
TEXT ·mulVectAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
TESTQ $31, len
JNZ one16b
ymm:
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
TESTQ $255, len
JNZ not_aligned
// 256bytes/loop
aligned:
MOVQ $0, pos
loop256b:
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, (out)(pos*1)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 32(out)(pos*1)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VMOVDQU in2, 64(out)(pos*1)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VMOVDQU in3, 96(out)(pos*1)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VMOVDQU in4, 128(out)(pos*1)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VMOVDQU in5, 160(out)(pos*1)
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, 192(out)(pos*1)
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 224(out)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
not_aligned:
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
CMPQ len, $0
JNE ymm
RET
// func mulVectAddAVX2(tbl, d, p []byte)
TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
TESTQ $31, len
JNZ one16b
ymm:
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
TESTQ $255, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop256b:
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR (out)(pos*1), in0, in0
VMOVDQU in0, (out)(pos*1)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 32(out)(pos*1), in1, in1
VMOVDQU in1, 32(out)(pos*1)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VPXOR 64(out)(pos*1), in2, in2
VMOVDQU in2, 64(out)(pos*1)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VPXOR 96(out)(pos*1), in3, in3
VMOVDQU in3, 96(out)(pos*1)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VPXOR 128(out)(pos*1), in4, in4
VMOVDQU in4, 128(out)(pos*1)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VPXOR 160(out)(pos*1), in5, in5
VMOVDQU in5, 160(out)(pos*1)
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR 192(out)(pos*1), in0, in0
VMOVDQU in0, 192(out)(pos*1)
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 224(out)(pos*1), in1, in1
VMOVDQU in1, 224(out)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
not_aligned:
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR -32(out)(len*1), in0, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VPXOR -16(out)(len*1), in0x, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
CMPQ len, $0
JNE ymm
RET
// func mulVectSSSE3(tbl, d, p []byte)
TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
SHRQ $4, len
loop:
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
MOVOU tmp2x, (out)
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func mulVectAddSSSE3(tbl, d, p []byte)
TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
SHRQ $4, len
loop:
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
MOVOU (out), tmp3x
PXOR tmp3x, tmp2x
MOVOU tmp2x, (out)
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func copy32B(dst, src []byte)
TEXT ·copy32B(SB), NOSPLIT, $0
MOVQ dst+0(FP), SI
MOVQ src+24(FP), DX
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU X0, (SI)
MOVOU X1, 16(SI)
RET