789 lines
14 KiB
ArmAsm
789 lines
14 KiB
ArmAsm
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
|
|
// Use of this source code is governed by a license that can be
|
|
// found in the LICENSE file.
|
|
|
|
// +build amd64,!gccgo,!appengine,!nacl
|
|
|
|
#include "textflag.h"
|
|
|
|
// sigma holds the four little-endian 32-bit words whose bytes spell
// "expand 32-byte k"; it is loaded as row 0 of the ChaCha state.
DATA ·sigma<>+0(SB)/4, $0x61707865
DATA ·sigma<>+4(SB)/4, $0x3320646e
DATA ·sigma<>+8(SB)/4, $0x79622d32
DATA ·sigma<>+12(SB)/4, $0x6b206574
GLOBL ·sigma<>(SB), RODATA|NOPTR, $16

// one is the 128-bit value 1 (low quadword 1, high quadword 0). It is
// added with PADDQ to state row 3, bumping its low 64 bits per block.
DATA ·one<>+0(SB)/8, $1
DATA ·one<>+8(SB)/8, $0
GLOBL ·one<>(SB), RODATA|NOPTR, $16

// rol16 is a PSHUFB mask that rotates every 32-bit lane left by 16 bits
// (byte order within each lane becomes 2, 3, 0, 1).
DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0D0C0F0E09080B0A
GLOBL ·rol16<>(SB), RODATA|NOPTR, $16

// rol8 is a PSHUFB mask that rotates every 32-bit lane left by 8 bits
// (byte order within each lane becomes 3, 0, 1, 2).
DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·rol8<>(SB), RODATA|NOPTR, $16
|
|
|
|
// ROTL_SSE2 rotates each 32-bit lane of x left by `bits` using plain SSE2
// shifts. tmp is clobbered: it receives the left-shifted copy, x keeps the
// right-shifted part, and the two halves are merged with PXOR.
#define ROTL_SSE2(bits, tmp, x) \
	MOVO x, tmp; \
	PSRLL $(32-bits), x; \
	PSLLL $bits, tmp; \
	PXOR tmp, x
|
|
|
|
// CHACHA_QROUND_SSE2 applies the ChaCha quarter round to the four rows
// a, b, c, d (one XMM register each, so four columns are processed at once).
// Each line below is one add / xor / rotate step; the rotation amounts are
// 16, 12, 8 and 7 bits. tmp is a scratch register clobbered by ROTL_SSE2.
#define CHACHA_QROUND_SSE2(a, b, c, d, tmp) \
	PADDL b, a; PXOR a, d; ROTL_SSE2(16, tmp, d); \
	PADDL d, c; PXOR c, b; ROTL_SSE2(12, tmp, b); \
	PADDL b, a; PXOR a, d; ROTL_SSE2(8, tmp, d); \
	PADDL d, c; PXOR c, b; ROTL_SSE2(7, tmp, b)
|
|
|
|
// CHACHA_QROUND_SSSE3 is the quarter round on rows a, b, c, d where the
// 16-bit and 8-bit rotations are done with a single PSHUFB each.
// m16 and m8 are the shuffle masks (register or 16-byte aligned memory
// operand); the 12-bit and 7-bit rotations still go through ROTL_SSE2,
// which clobbers tmp.
#define CHACHA_QROUND_SSSE3(a, b, c, d, tmp, m16, m8) \
	PADDL b, a; PXOR a, d; PSHUFB m16, d; \
	PADDL d, c; PXOR c, b; ROTL_SSE2(12, tmp, b); \
	PADDL b, a; PXOR a, d; PSHUFB m8, d; \
	PADDL d, c; PXOR c, b; ROTL_SSE2(7, tmp, b)
|
|
|
|
// CHACHA_SHUFFLE rotates the 32-bit lanes of three rows in place:
//   p by one lane   (0x39: lane i takes lane i+1),
//   q by two lanes  (0x4E: lane i takes lane i+2),
//   r by three lanes (0x93: lane i takes lane i+3).
// Invoking it with the arguments reversed undoes the previous invocation.
#define CHACHA_SHUFFLE(p, q, r) \
	PSHUFL $0x39, p, p; \
	PSHUFL $0x4E, q, q; \
	PSHUFL $0x93, r, r
|
|
|
|
// XOR combines 64 bytes read from disp(srcp) with the four 16-byte rows
// k0..k3 and writes the result to disp(dstp). Loads and stores are
// unaligned (MOVOU); tmp is clobbered, k0..k3 are left unchanged.
#define XOR(dstp, srcp, disp, k0, k1, k2, k3, tmp) \
	MOVOU 0+disp(srcp), tmp; PXOR k0, tmp; MOVOU tmp, 0+disp(dstp); \
	MOVOU 16+disp(srcp), tmp; PXOR k1, tmp; MOVOU tmp, 16+disp(dstp); \
	MOVOU 32+disp(srcp), tmp; PXOR k2, tmp; MOVOU tmp, 32+disp(dstp); \
	MOVOU 48+disp(srcp), tmp; PXOR k3, tmp; MOVOU tmp, 48+disp(dstp)
|
|
|
|
// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
//
// XORs src_len bytes of src with ChaCha keystream into dst, using only SSE2.
//
// Register roles:
//   DI = dst pointer, SI = src pointer, CX = bytes remaining
//   BX = block (receives the last keystream block when it is only partly used)
//   AX = state, DX = rounds, R8 = per-block round counter
//   R9 = original SP, R14 = bytes already XORed by the current path
//   X0..X3 = current state rows, X15 (or 64(SP)) = the constant ·one
//
// Stack layout relative to the 16-byte aligned SP:
//   0..63  saved state rows, 64 = ·one, 80 = spill slot
// i.e. 96 bytes plus up to 16 bytes of alignment slack = the 112-byte frame.
//
// On return only row 3 of state (offset 48) is written back; it has been
// incremented once per generated block. The return value is 0 when src_len
// is a multiple of 64, otherwise the number of bytes consumed from the
// final keystream block that was stored in block.
TEXT ·xorKeyStreamSSE2(SB), NOSPLIT, $112-80
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), CX
	MOVQ block+48(FP), BX
	MOVQ state+56(FP), AX
	MOVQ rounds+64(FP), DX

	// Align SP up to a 16-byte boundary so MOVO/PADDL/PADDQ may use stack
	// memory operands. R9 keeps the original SP for the epilogue.
	MOVQ SP, R9
	ADDQ $16, SP
	ANDQ $-16, SP

	MOVOU 0(AX), X0
	MOVOU 16(AX), X1
	MOVOU 32(AX), X2
	MOVOU 48(AX), X3
	MOVOU ·one<>(SB), X15

	TESTQ CX, CX
	JZ done

	CMPQ CX, $64
	JBE between_0_and_64

	CMPQ CX, $128
	JBE between_64_and_128

	// The 3- and 4-block paths run out of registers, so the initial state
	// and the increment constant are kept on the stack.
	MOVO X0, 0(SP)
	MOVO X1, 16(SP)
	MOVO X2, 32(SP)
	MOVO X3, 48(SP)
	MOVO X15, 64(SP)

	CMPQ CX, $192
	JBE between_128_and_192

	MOVQ $192, R14

	// Four blocks per iteration; entered whenever more than 192 bytes remain.
	// Block i lives in (X0..X3), (X4..X7), (X12..X15), (X8..X11) for
	// i = 0..3, with row 3 offset by i * ·one.
at_least_256:
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	PADDQ 64(SP), X7
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X7, X15
	PADDQ 64(SP), X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X15, X11
	PADDQ 64(SP), X11

	MOVQ DX, R8

chacha_loop_256:
	// All 16 XMM registers hold state, so one register is spilled to
	// 80(SP) to serve as the quarter-round temporary.
	MOVO X8, 80(SP)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X8)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
	MOVO 80(SP), X8

	MOVO X0, 80(SP)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	MOVO 80(SP), X0

	CHACHA_SHUFFLE(X1, X2, X3)
	CHACHA_SHUFFLE(X5, X6, X7)
	CHACHA_SHUFFLE(X13, X14, X15)
	CHACHA_SHUFFLE(X9, X10, X11)

	MOVO X8, 80(SP)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X8)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
	MOVO 80(SP), X8

	MOVO X0, 80(SP)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	MOVO 80(SP), X0

	CHACHA_SHUFFLE(X3, X2, X1)
	CHACHA_SHUFFLE(X7, X6, X5)
	CHACHA_SHUFFLE(X15, X14, X13)
	CHACHA_SHUFFLE(X11, X10, X9)
	SUBQ $2, R8
	JA chacha_loop_256

	// X8 becomes the XOR temporary; park its block row on the stack.
	MOVO X8, 80(SP)

	// Block 0: add the initial state and XOR.
	PADDL 0(SP), X0
	PADDL 16(SP), X1
	PADDL 32(SP), X2
	PADDL 48(SP), X3
	XOR(DI, SI, 0, X0, X1, X2, X3, X8)

	// Reload the initial state; X3 is advanced by ·one for each block.
	MOVO 0(SP), X0
	MOVO 16(SP), X1
	MOVO 32(SP), X2
	MOVO 48(SP), X3
	PADDQ 64(SP), X3

	// Block 1.
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 64(SP), X3
	XOR(DI, SI, 64, X4, X5, X6, X7, X8)

	// X4..X7 are free now: X5 holds ·one, X4 is the XOR temporary.
	MOVO 64(SP), X5
	MOVO 80(SP), X8

	// Block 2.
	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ X5, X3
	XOR(DI, SI, 128, X12, X13, X14, X15, X4)

	// Block 3 (kept in X8..X11, which less_than_64 stores to block).
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ X5, X3

	CMPQ CX, $256
	JB less_than_64

	XOR(DI, SI, 192, X8, X9, X10, X11, X4)
	MOVO X3, 48(SP)
	ADDQ $256, SI
	ADDQ $256, DI
	SUBQ $256, CX
	CMPQ CX, $192
	JA at_least_256

	TESTQ CX, CX
	JZ done
	MOVO 64(SP), X15
	CMPQ CX, $64
	JBE between_0_and_64
	CMPQ CX, $128
	JBE between_64_and_128

	// Three blocks: (X0..X3), (X4..X7), (X8..X11); X12 is the temporary.
between_128_and_192:
	MOVQ $128, R14
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	PADDQ X15, X7
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X7, X11
	PADDQ X15, X11

	MOVQ DX, R8

chacha_loop_192:
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X12)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_SHUFFLE(X1, X2, X3)
	CHACHA_SHUFFLE(X5, X6, X7)
	CHACHA_SHUFFLE(X9, X10, X11)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X12)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_SHUFFLE(X3, X2, X1)
	CHACHA_SHUFFLE(X7, X6, X5)
	CHACHA_SHUFFLE(X11, X10, X9)
	SUBQ $2, R8
	JA chacha_loop_192

	PADDL 0(SP), X0
	PADDL 16(SP), X1
	PADDL 32(SP), X2
	PADDL 48(SP), X3
	XOR(DI, SI, 0, X0, X1, X2, X3, X12)

	MOVO 0(SP), X0
	MOVO 16(SP), X1
	MOVO 32(SP), X2
	MOVO 48(SP), X3
	PADDQ X15, X3

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ X15, X3
	XOR(DI, SI, 64, X4, X5, X6, X7, X12)

	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ X15, X3

	CMPQ CX, $192
	JB less_than_64

	XOR(DI, SI, 128, X8, X9, X10, X11, X12)
	SUBQ $192, CX
	JMP done

	// Two blocks: (X4..X7) and (X8..X11); X0..X3 keep the initial state.
between_64_and_128:
	MOVQ $64, R14
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	PADDQ X15, X11

	MOVQ DX, R8

chacha_loop_128:
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_SHUFFLE(X5, X6, X7)
	CHACHA_SHUFFLE(X9, X10, X11)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_SHUFFLE(X7, X6, X5)
	CHACHA_SHUFFLE(X11, X10, X9)
	SUBQ $2, R8
	JA chacha_loop_128

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ X15, X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ X15, X3
	XOR(DI, SI, 0, X4, X5, X6, X7, X12)

	CMPQ CX, $128
	JB less_than_64

	XOR(DI, SI, 64, X8, X9, X10, X11, X12)
	SUBQ $128, CX
	JMP done

	// One block in (X8..X11).
between_0_and_64:
	MOVQ $0, R14
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	MOVQ DX, R8

chacha_loop_64:
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_SHUFFLE(X9, X10, X11)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_SHUFFLE(X11, X10, X9)
	SUBQ $2, R8
	JA chacha_loop_64

	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ X15, X3
	CMPQ CX, $64
	JB less_than_64

	XOR(DI, SI, 0, X8, X9, X10, X11, X12)
	SUBQ $64, CX
	JMP done

less_than_64:
	// R14 contains the number of bytes already XORed by the current path.
	// After the adjustment CX is in [1, 63]: the bytes still to process,
	// which is also the function's return value.
	ADDQ R14, SI
	ADDQ R14, DI
	SUBQ R14, CX
	// Store the complete last keystream block to block.
	MOVOU X8, 0(BX)
	MOVOU X9, 16(BX)
	MOVOU X10, 32(BX)
	MOVOU X11, 48(BX)
	XORQ R11, R11
	XORQ R12, R12
	MOVQ CX, BP

	// Byte-wise XOR of the tail; BP counts down the remaining bytes.
xor_loop:
	MOVB 0(SI), R11
	MOVB 0(BX), R12
	XORQ R11, R12
	MOVB R12, 0(DI)
	INCQ SI
	INCQ BX
	INCQ DI
	DECQ BP
	// DECQ does not update CF, so branch on ZF only.
	JNZ xor_loop

done:
	// Write back the advanced row 3 and restore the original SP before
	// touching FP-relative operands and returning.
	MOVOU X3, 48(AX)
	MOVQ R9, SP
	MOVQ CX, ret+72(FP)
	RET
|
|
|
|
// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
//
// XORs src_len bytes of src with ChaCha keystream into dst. Same structure
// as the SSE2 variant, but the 16- and 8-bit rotations use PSHUFB with the
// ·rol16 / ·rol8 masks.
//
// Register roles:
//   DI = dst pointer, SI = src pointer, CX = bytes remaining
//   BX = block (receives the last keystream block when it is only partly used)
//   AX = state, DX = rounds, R8 = per-block round counter
//   R9 = original SP, R14 = bytes already XORed by the current path
//   X0..X3 = current state rows, X13 = ·rol16, X14 = ·rol8, X15 = ·one
//
// Stack layout relative to the 16-byte aligned SP:
//   0..63 saved state rows, 64 = ·one, 80 = spill slot,
//   96 = ·rol16, 112 = ·rol8 (used only by the 4-block path)
// i.e. 128 bytes plus up to 16 bytes of alignment slack = the 144-byte frame.
//
// On return only row 3 of state (offset 48) is written back; it has been
// incremented once per generated block. The return value is 0 when src_len
// is a multiple of 64, otherwise the number of bytes consumed from the
// final keystream block that was stored in block.
TEXT ·xorKeyStreamSSSE3(SB), NOSPLIT, $144-80
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), CX
	MOVQ block+48(FP), BX
	MOVQ state+56(FP), AX
	MOVQ rounds+64(FP), DX

	// Align SP up to a 16-byte boundary so aligned SSE memory operands
	// (MOVO, PADDL, PADDQ, PSHUFB) may reference the stack. R9 keeps the
	// original SP for the epilogue.
	MOVQ SP, R9
	ADDQ $16, SP
	ANDQ $-16, SP

	MOVOU 0(AX), X0
	MOVOU 16(AX), X1
	MOVOU 32(AX), X2
	MOVOU 48(AX), X3
	MOVOU ·rol16<>(SB), X13
	MOVOU ·rol8<>(SB), X14
	MOVOU ·one<>(SB), X15

	TESTQ CX, CX
	JZ done

	CMPQ CX, $64
	JBE between_0_and_64

	CMPQ CX, $128
	JBE between_64_and_128

	// The 3- and 4-block paths keep the initial state and the increment
	// constant on the stack.
	MOVO X0, 0(SP)
	MOVO X1, 16(SP)
	MOVO X2, 32(SP)
	MOVO X3, 48(SP)
	MOVO X15, 64(SP)

	CMPQ CX, $192
	JBE between_128_and_192

	// The 4-block path needs X13/X14 for state, so the shuffle masks are
	// used from the stack instead.
	MOVO X13, 96(SP)
	MOVO X14, 112(SP)
	MOVQ $192, R14

	// Four blocks per iteration; entered whenever more than 192 bytes remain.
	// Block i lives in (X0..X3), (X4..X7), (X12..X15), (X8..X11) for
	// i = 0..3, with row 3 offset by i * ·one.
at_least_256:
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	PADDQ 64(SP), X7
	MOVO X0, X12
	MOVO X1, X13
	MOVO X2, X14
	MOVO X7, X15
	PADDQ 64(SP), X15
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X15, X11
	PADDQ 64(SP), X11

	MOVQ DX, R8

chacha_loop_256:
	// All 16 XMM registers hold state, so one register is spilled to
	// 80(SP) to serve as the quarter-round temporary.
	MOVO X8, 80(SP)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X8, 96(SP), 112(SP))
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, 96(SP), 112(SP))
	MOVO 80(SP), X8

	MOVO X0, 80(SP)
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, 96(SP), 112(SP))
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 96(SP), 112(SP))
	MOVO 80(SP), X0

	CHACHA_SHUFFLE(X1, X2, X3)
	CHACHA_SHUFFLE(X5, X6, X7)
	CHACHA_SHUFFLE(X13, X14, X15)
	CHACHA_SHUFFLE(X9, X10, X11)

	MOVO X8, 80(SP)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X8, 96(SP), 112(SP))
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, 96(SP), 112(SP))
	MOVO 80(SP), X8

	MOVO X0, 80(SP)
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, 96(SP), 112(SP))
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 96(SP), 112(SP))
	MOVO 80(SP), X0

	CHACHA_SHUFFLE(X3, X2, X1)
	CHACHA_SHUFFLE(X7, X6, X5)
	CHACHA_SHUFFLE(X15, X14, X13)
	CHACHA_SHUFFLE(X11, X10, X9)
	SUBQ $2, R8
	JA chacha_loop_256

	// X8 becomes the XOR temporary; park its block row on the stack.
	MOVO X8, 80(SP)

	// Block 0: add the initial state and XOR.
	PADDL 0(SP), X0
	PADDL 16(SP), X1
	PADDL 32(SP), X2
	PADDL 48(SP), X3
	XOR(DI, SI, 0, X0, X1, X2, X3, X8)

	// Reload the initial state; X3 is advanced by ·one for each block.
	MOVO 0(SP), X0
	MOVO 16(SP), X1
	MOVO 32(SP), X2
	MOVO 48(SP), X3
	PADDQ 64(SP), X3

	// Block 1.
	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 64(SP), X3
	XOR(DI, SI, 64, X4, X5, X6, X7, X8)

	// X4..X7 are free now: X5 holds ·one, X4 is the XOR temporary.
	MOVO 64(SP), X5
	MOVO 80(SP), X8

	// Block 2.
	PADDL X0, X12
	PADDL X1, X13
	PADDL X2, X14
	PADDL X3, X15
	PADDQ X5, X3
	XOR(DI, SI, 128, X12, X13, X14, X15, X4)

	// Block 3 (kept in X8..X11, which less_than_64 stores to block).
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ X5, X3

	CMPQ CX, $256
	JB less_than_64

	XOR(DI, SI, 192, X8, X9, X10, X11, X4)
	MOVO X3, 48(SP)
	ADDQ $256, SI
	ADDQ $256, DI
	SUBQ $256, CX
	CMPQ CX, $192
	JA at_least_256

	TESTQ CX, CX
	JZ done
	// X13..X15 were used as block state above; restore the masks and ·one
	// for the smaller paths, which expect them in registers.
	MOVOU ·rol16<>(SB), X13
	MOVOU ·rol8<>(SB), X14
	MOVO 64(SP), X15
	CMPQ CX, $64
	JBE between_0_and_64
	CMPQ CX, $128
	JBE between_64_and_128

	// Three blocks: (X0..X3), (X4..X7), (X8..X11); X12 is the temporary.
between_128_and_192:
	MOVQ $128, R14
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	PADDQ X15, X7
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X7, X11
	PADDQ X15, X11

	MOVQ DX, R8

chacha_loop_192:
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_SHUFFLE(X1, X2, X3)
	CHACHA_SHUFFLE(X5, X6, X7)
	CHACHA_SHUFFLE(X9, X10, X11)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_SHUFFLE(X3, X2, X1)
	CHACHA_SHUFFLE(X7, X6, X5)
	CHACHA_SHUFFLE(X11, X10, X9)
	SUBQ $2, R8
	JA chacha_loop_192

	PADDL 0(SP), X0
	PADDL 16(SP), X1
	PADDL 32(SP), X2
	PADDL 48(SP), X3
	XOR(DI, SI, 0, X0, X1, X2, X3, X12)

	MOVO 0(SP), X0
	MOVO 16(SP), X1
	MOVO 32(SP), X2
	MOVO 48(SP), X3
	PADDQ X15, X3

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ X15, X3
	XOR(DI, SI, 64, X4, X5, X6, X7, X12)

	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ X15, X3

	CMPQ CX, $192
	JB less_than_64

	XOR(DI, SI, 128, X8, X9, X10, X11, X12)
	SUBQ $192, CX
	JMP done

	// Two blocks: (X4..X7) and (X8..X11); X0..X3 keep the initial state.
between_64_and_128:
	MOVQ $64, R14
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	PADDQ X15, X11

	MOVQ DX, R8

chacha_loop_128:
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_SHUFFLE(X5, X6, X7)
	CHACHA_SHUFFLE(X9, X10, X11)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_SHUFFLE(X7, X6, X5)
	CHACHA_SHUFFLE(X11, X10, X9)
	SUBQ $2, R8
	JA chacha_loop_128

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ X15, X3
	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ X15, X3
	XOR(DI, SI, 0, X4, X5, X6, X7, X12)

	CMPQ CX, $128
	JB less_than_64

	XOR(DI, SI, 64, X8, X9, X10, X11, X12)
	SUBQ $128, CX
	JMP done

	// One block in (X8..X11).
between_0_and_64:
	MOVQ $0, R14
	MOVO X0, X8
	MOVO X1, X9
	MOVO X2, X10
	MOVO X3, X11
	MOVQ DX, R8

chacha_loop_64:
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_SHUFFLE(X9, X10, X11)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_SHUFFLE(X11, X10, X9)
	SUBQ $2, R8
	JA chacha_loop_64

	PADDL X0, X8
	PADDL X1, X9
	PADDL X2, X10
	PADDL X3, X11
	PADDQ X15, X3
	CMPQ CX, $64
	JB less_than_64

	XOR(DI, SI, 0, X8, X9, X10, X11, X12)
	SUBQ $64, CX
	JMP done

less_than_64:
	// R14 contains the number of bytes already XORed by the current path.
	// After the adjustment CX is in [1, 63]: the bytes still to process,
	// which is also the function's return value.
	ADDQ R14, SI
	ADDQ R14, DI
	SUBQ R14, CX
	// Store the complete last keystream block to block.
	MOVOU X8, 0(BX)
	MOVOU X9, 16(BX)
	MOVOU X10, 32(BX)
	MOVOU X11, 48(BX)
	XORQ R11, R11
	XORQ R12, R12
	MOVQ CX, BP

	// Byte-wise XOR of the tail; BP counts down the remaining bytes.
xor_loop:
	MOVB 0(SI), R11
	MOVB 0(BX), R12
	XORQ R11, R12
	MOVB R12, 0(DI)
	INCQ SI
	INCQ BX
	INCQ DI
	DECQ BP
	// DECQ does not update CF, so branch on ZF only.
	JNZ xor_loop

done:
	// Restore the original SP before touching FP-relative operands, then
	// write back the advanced row 3 of the state.
	MOVQ R9, SP
	MOVOU X3, 48(AX)
	MOVQ CX, ret+72(FP)
	RET
|
|
|
|
// func supportsSSSE3() bool
//
// Executes CPUID leaf 1 and returns bit 9 of ECX (the SSSE3 feature flag).
// CPUID overwrites AX, BX, CX and DX.
TEXT ·supportsSSSE3(SB), NOSPLIT, $0-1
	MOVL $1, AX
	CPUID
	SHRL $9, CX
	ANDL $1, CX
	MOVB CX, ret+0(FP)
	RET
|
|
|
|
// func initialize(state *[64]byte, key []byte, nonce *[16]byte)
//
// Fills the 64-byte state with four 16-byte rows:
//   row 0 = ·sigma, rows 1-2 = key[0:32], row 3 = nonce[0:16].
// key_len is not inspected: exactly 32 bytes are read from key, so the
// caller must pass a key of at least that length.
TEXT ·initialize(SB), NOSPLIT, $0-40
	MOVQ state+0(FP), DI
	MOVQ key_base+8(FP), AX
	MOVQ nonce+32(FP), BX

	MOVOU ·sigma<>(SB), X0
	MOVOU 0(AX), X1
	MOVOU 16(AX), X2
	MOVOU 0(BX), X3

	MOVOU X0, 0(DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)
	MOVOU X3, 48(DI)
	RET
|
|
|
|
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// Builds the state (·sigma, key[0:16], key[16:32], nonce) in X0..X3, runs a
// fixed 20 rounds (10 double rounds) using SSE2 only, and writes rows 0 and
// 3 to out. Unlike the keystream functions, the initial state is not added
// back before the output is stored.
TEXT ·hChaCha20SSE2(SB), NOSPLIT, $0-24
	MOVQ out+0(FP), DI
	MOVQ nonce+8(FP), AX
	MOVQ key+16(FP), BX

	MOVOU ·sigma<>(SB), X0
	MOVOU 0(BX), X1
	MOVOU 16(BX), X2
	MOVOU 0(AX), X3

	// CX counts the remaining rounds; each iteration performs two.
	MOVQ $20, CX

chacha_loop:
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_SHUFFLE(X1, X2, X3)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_SHUFFLE(X3, X2, X1)
	SUBQ $2, CX
	JNZ chacha_loop

	MOVOU X0, 0(DI)
	MOVOU X3, 16(DI)
	RET
|
|
|
|
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// Builds the state (·sigma, key[0:16], key[16:32], nonce) in X0..X3, runs a
// fixed 20 rounds (10 double rounds) with PSHUFB-based 16/8-bit rotations
// (masks in X5 and X6), and writes rows 0 and 3 to out. Unlike the
// keystream functions, the initial state is not added back before the
// output is stored.
TEXT ·hChaCha20SSSE3(SB), NOSPLIT, $0-24
	MOVQ out+0(FP), DI
	MOVQ nonce+8(FP), AX
	MOVQ key+16(FP), BX

	MOVOU ·sigma<>(SB), X0
	MOVOU 0(BX), X1
	MOVOU 16(BX), X2
	MOVOU 0(AX), X3
	MOVOU ·rol16<>(SB), X5
	MOVOU ·rol8<>(SB), X6

	// CX counts the remaining rounds; each iteration performs two.
	MOVQ $20, CX

chacha_loop:
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE(X1, X2, X3)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE(X3, X2, X1)
	SUBQ $2, CX
	JNZ chacha_loop

	MOVOU X0, 0(DI)
	MOVOU X3, 16(DI)
	RET
|