route/vendor/github.com/aead/chacha20/chacha/chacha_amd64.s

789 lines
14 KiB
ArmAsm

// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
//go:build amd64 && !gccgo && !appengine && !nacl
// +build amd64,!gccgo,!appengine,!nacl

#include "textflag.h"
DATA ·sigma<>+0x00(SB)/4, $0x61707865
DATA ·sigma<>+0x04(SB)/4, $0x3320646e
DATA ·sigma<>+0x08(SB)/4, $0x79622d32
DATA ·sigma<>+0x0C(SB)/4, $0x6b206574
GLOBL ·sigma<>(SB), (NOPTR+RODATA), $16
DATA ·one<>+0x00(SB)/8, $1
DATA ·one<>+0x08(SB)/8, $0
GLOBL ·one<>(SB), (NOPTR+RODATA), $16
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $16
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $16
#define ROTL_SSE2(n, t, v) \
MOVO v, t; \
PSLLL $n, t; \
PSRLL $(32-n), v; \
PXOR t, v
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t0) \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(16, t0, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(12, t0, v1); \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE2(8, t0, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(7, t0, v1)
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t0, r16, r8) \
PADDL v1, v0; \
PXOR v0, v3; \
PSHUFB r16, v3; \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(12, t0, v1); \
PADDL v1, v0; \
PXOR v0, v3; \
PSHUFB r8, v3; \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE2(7, t0, v1)
#define CHACHA_SHUFFLE(v1, v2, v3) \
PSHUFL $0x39, v1, v1; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v3, v3
#define XOR(dst, src, off, v0, v1, v2, v3, t0) \
MOVOU 0+off(src), t0; \
PXOR v0, t0; \
MOVOU t0, 0+off(dst); \
MOVOU 16+off(src), t0; \
PXOR v1, t0; \
MOVOU t0, 16+off(dst); \
MOVOU 32+off(src), t0; \
PXOR v2, t0; \
MOVOU t0, 32+off(dst); \
MOVOU 48+off(src), t0; \
PXOR v3, t0; \
MOVOU t0, 48+off(dst)
// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
TEXT ·xorKeyStreamSSE2(SB), 4, $112-80
MOVQ dst_base+0(FP), DI
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), CX
MOVQ block+48(FP), BX
MOVQ state+56(FP), AX
MOVQ rounds+64(FP), DX
MOVQ SP, R9
ADDQ $16, SP
ANDQ $-16, SP
MOVOU 0(AX), X0
MOVOU 16(AX), X1
MOVOU 32(AX), X2
MOVOU 48(AX), X3
MOVOU ·one<>(SB), X15
TESTQ CX, CX
JZ done
CMPQ CX, $64
JBE between_0_and_64
CMPQ CX, $128
JBE between_64_and_128
MOVO X0, 0(SP)
MOVO X1, 16(SP)
MOVO X2, 32(SP)
MOVO X3, 48(SP)
MOVO X15, 64(SP)
CMPQ CX, $192
JBE between_128_and_192
MOVQ $192, R14
at_least_256:
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
PADDQ 64(SP), X7
MOVO X0, X12
MOVO X1, X13
MOVO X2, X14
MOVO X7, X15
PADDQ 64(SP), X15
MOVO X0, X8
MOVO X1, X9
MOVO X2, X10
MOVO X15, X11
PADDQ 64(SP), X11
MOVQ DX, R8
chacha_loop_256:
MOVO X8, 80(SP)
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X8)
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
MOVO 80(SP), X8
MOVO X0, 80(SP)
CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
MOVO 80(SP), X0
CHACHA_SHUFFLE(X1, X2, X3)
CHACHA_SHUFFLE(X5, X6, X7)
CHACHA_SHUFFLE(X13, X14, X15)
CHACHA_SHUFFLE(X9, X10, X11)
MOVO X8, 80(SP)
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X8)
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
MOVO 80(SP), X8
MOVO X0, 80(SP)
CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
MOVO 80(SP), X0
CHACHA_SHUFFLE(X3, X2, X1)
CHACHA_SHUFFLE(X7, X6, X5)
CHACHA_SHUFFLE(X15, X14, X13)
CHACHA_SHUFFLE(X11, X10, X9)
SUBQ $2, R8
JA chacha_loop_256
MOVO X8, 80(SP)
PADDL 0(SP), X0
PADDL 16(SP), X1
PADDL 32(SP), X2
PADDL 48(SP), X3
XOR(DI, SI, 0, X0, X1, X2, X3, X8)
MOVO 0(SP), X0
MOVO 16(SP), X1
MOVO 32(SP), X2
MOVO 48(SP), X3
PADDQ 64(SP), X3
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
PADDQ 64(SP), X3
XOR(DI, SI, 64, X4, X5, X6, X7, X8)
MOVO 64(SP), X5
MOVO 80(SP), X8
PADDL X0, X12
PADDL X1, X13
PADDL X2, X14
PADDL X3, X15
PADDQ X5, X3
XOR(DI, SI, 128, X12, X13, X14, X15, X4)
PADDL X0, X8
PADDL X1, X9
PADDL X2, X10
PADDL X3, X11
PADDQ X5, X3
CMPQ CX, $256
JB less_than_64
XOR(DI, SI, 192, X8, X9, X10, X11, X4)
MOVO X3, 48(SP)
ADDQ $256, SI
ADDQ $256, DI
SUBQ $256, CX
CMPQ CX, $192
JA at_least_256
TESTQ CX, CX
JZ done
MOVO 64(SP), X15
CMPQ CX, $64
JBE between_0_and_64
CMPQ CX, $128
JBE between_64_and_128
between_128_and_192:
MOVQ $128, R14
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
PADDQ X15, X7
MOVO X0, X8
MOVO X1, X9
MOVO X2, X10
MOVO X7, X11
PADDQ X15, X11
MOVQ DX, R8
chacha_loop_192:
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X12)
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
CHACHA_SHUFFLE(X1, X2, X3)
CHACHA_SHUFFLE(X5, X6, X7)
CHACHA_SHUFFLE(X9, X10, X11)
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X12)
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
CHACHA_SHUFFLE(X3, X2, X1)
CHACHA_SHUFFLE(X7, X6, X5)
CHACHA_SHUFFLE(X11, X10, X9)
SUBQ $2, R8
JA chacha_loop_192
PADDL 0(SP), X0
PADDL 16(SP), X1
PADDL 32(SP), X2
PADDL 48(SP), X3
XOR(DI, SI, 0, X0, X1, X2, X3, X12)
MOVO 0(SP), X0
MOVO 16(SP), X1
MOVO 32(SP), X2
MOVO 48(SP), X3
PADDQ X15, X3
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
PADDQ X15, X3
XOR(DI, SI, 64, X4, X5, X6, X7, X12)
PADDL X0, X8
PADDL X1, X9
PADDL X2, X10
PADDL X3, X11
PADDQ X15, X3
CMPQ CX, $192
JB less_than_64
XOR(DI, SI, 128, X8, X9, X10, X11, X12)
SUBQ $192, CX
JMP done
between_64_and_128:
MOVQ $64, R14
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
MOVO X0, X8
MOVO X1, X9
MOVO X2, X10
MOVO X3, X11
PADDQ X15, X11
MOVQ DX, R8
chacha_loop_128:
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
CHACHA_SHUFFLE(X5, X6, X7)
CHACHA_SHUFFLE(X9, X10, X11)
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
CHACHA_SHUFFLE(X7, X6, X5)
CHACHA_SHUFFLE(X11, X10, X9)
SUBQ $2, R8
JA chacha_loop_128
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
PADDQ X15, X3
PADDL X0, X8
PADDL X1, X9
PADDL X2, X10
PADDL X3, X11
PADDQ X15, X3
XOR(DI, SI, 0, X4, X5, X6, X7, X12)
CMPQ CX, $128
JB less_than_64
XOR(DI, SI, 64, X8, X9, X10, X11, X12)
SUBQ $128, CX
JMP done
between_0_and_64:
MOVQ $0, R14
MOVO X0, X8
MOVO X1, X9
MOVO X2, X10
MOVO X3, X11
MOVQ DX, R8
chacha_loop_64:
CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
CHACHA_SHUFFLE(X9, X10, X11)
CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
CHACHA_SHUFFLE(X11, X10, X9)
SUBQ $2, R8
JA chacha_loop_64
PADDL X0, X8
PADDL X1, X9
PADDL X2, X10
PADDL X3, X11
PADDQ X15, X3
CMPQ CX, $64
JB less_than_64
XOR(DI, SI, 0, X8, X9, X10, X11, X12)
SUBQ $64, CX
JMP done
less_than_64:
// R14 contains the num of bytes already xor'd
ADDQ R14, SI
ADDQ R14, DI
SUBQ R14, CX
MOVOU X8, 0(BX)
MOVOU X9, 16(BX)
MOVOU X10, 32(BX)
MOVOU X11, 48(BX)
XORQ R11, R11
XORQ R12, R12
MOVQ CX, BP
xor_loop:
MOVB 0(SI), R11
MOVB 0(BX), R12
XORQ R11, R12
MOVB R12, 0(DI)
INCQ SI
INCQ BX
INCQ DI
DECQ BP
JA xor_loop
done:
MOVOU X3, 48(AX)
MOVQ R9, SP
MOVQ CX, ret+72(FP)
RET
// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
TEXT ·xorKeyStreamSSSE3(SB), 4, $144-80
MOVQ dst_base+0(FP), DI
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), CX
MOVQ block+48(FP), BX
MOVQ state+56(FP), AX
MOVQ rounds+64(FP), DX
MOVQ SP, R9
ADDQ $16, SP
ANDQ $-16, SP
MOVOU 0(AX), X0
MOVOU 16(AX), X1
MOVOU 32(AX), X2
MOVOU 48(AX), X3
MOVOU ·rol16<>(SB), X13
MOVOU ·rol8<>(SB), X14
MOVOU ·one<>(SB), X15
TESTQ CX, CX
JZ done
CMPQ CX, $64
JBE between_0_and_64
CMPQ CX, $128
JBE between_64_and_128
MOVO X0, 0(SP)
MOVO X1, 16(SP)
MOVO X2, 32(SP)
MOVO X3, 48(SP)
MOVO X15, 64(SP)
CMPQ CX, $192
JBE between_128_and_192
MOVO X13, 96(SP)
MOVO X14, 112(SP)
MOVQ $192, R14
at_least_256:
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
PADDQ 64(SP), X7
MOVO X0, X12
MOVO X1, X13
MOVO X2, X14
MOVO X7, X15
PADDQ 64(SP), X15
MOVO X0, X8
MOVO X1, X9
MOVO X2, X10
MOVO X15, X11
PADDQ 64(SP), X11
MOVQ DX, R8
chacha_loop_256:
MOVO X8, 80(SP)
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X8, 96(SP), 112(SP))
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, 96(SP), 112(SP))
MOVO 80(SP), X8
MOVO X0, 80(SP)
CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, 96(SP), 112(SP))
CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 96(SP), 112(SP))
MOVO 80(SP), X0
CHACHA_SHUFFLE(X1, X2, X3)
CHACHA_SHUFFLE(X5, X6, X7)
CHACHA_SHUFFLE(X13, X14, X15)
CHACHA_SHUFFLE(X9, X10, X11)
MOVO X8, 80(SP)
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X8, 96(SP), 112(SP))
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, 96(SP), 112(SP))
MOVO 80(SP), X8
MOVO X0, 80(SP)
CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, 96(SP), 112(SP))
CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 96(SP), 112(SP))
MOVO 80(SP), X0
CHACHA_SHUFFLE(X3, X2, X1)
CHACHA_SHUFFLE(X7, X6, X5)
CHACHA_SHUFFLE(X15, X14, X13)
CHACHA_SHUFFLE(X11, X10, X9)
SUBQ $2, R8
JA chacha_loop_256
MOVO X8, 80(SP)
PADDL 0(SP), X0
PADDL 16(SP), X1
PADDL 32(SP), X2
PADDL 48(SP), X3
XOR(DI, SI, 0, X0, X1, X2, X3, X8)
MOVO 0(SP), X0
MOVO 16(SP), X1
MOVO 32(SP), X2
MOVO 48(SP), X3
PADDQ 64(SP), X3
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
PADDQ 64(SP), X3
XOR(DI, SI, 64, X4, X5, X6, X7, X8)
MOVO 64(SP), X5
MOVO 80(SP), X8
PADDL X0, X12
PADDL X1, X13
PADDL X2, X14
PADDL X3, X15
PADDQ X5, X3
XOR(DI, SI, 128, X12, X13, X14, X15, X4)
PADDL X0, X8
PADDL X1, X9
PADDL X2, X10
PADDL X3, X11
PADDQ X5, X3
CMPQ CX, $256
JB less_than_64
XOR(DI, SI, 192, X8, X9, X10, X11, X4)
MOVO X3, 48(SP)
ADDQ $256, SI
ADDQ $256, DI
SUBQ $256, CX
CMPQ CX, $192
JA at_least_256
TESTQ CX, CX
JZ done
MOVOU ·rol16<>(SB), X13
MOVOU ·rol8<>(SB), X14
MOVO 64(SP), X15
CMPQ CX, $64
JBE between_0_and_64
CMPQ CX, $128
JBE between_64_and_128
between_128_and_192:
MOVQ $128, R14
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
PADDQ X15, X7
MOVO X0, X8
MOVO X1, X9
MOVO X2, X10
MOVO X7, X11
PADDQ X15, X11
MOVQ DX, R8
chacha_loop_192:
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X12, X13, X14)
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
CHACHA_SHUFFLE(X1, X2, X3)
CHACHA_SHUFFLE(X5, X6, X7)
CHACHA_SHUFFLE(X9, X10, X11)
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X12, X13, X14)
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
CHACHA_SHUFFLE(X3, X2, X1)
CHACHA_SHUFFLE(X7, X6, X5)
CHACHA_SHUFFLE(X11, X10, X9)
SUBQ $2, R8
JA chacha_loop_192
PADDL 0(SP), X0
PADDL 16(SP), X1
PADDL 32(SP), X2
PADDL 48(SP), X3
XOR(DI, SI, 0, X0, X1, X2, X3, X12)
MOVO 0(SP), X0
MOVO 16(SP), X1
MOVO 32(SP), X2
MOVO 48(SP), X3
PADDQ X15, X3
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
PADDQ X15, X3
XOR(DI, SI, 64, X4, X5, X6, X7, X12)
PADDL X0, X8
PADDL X1, X9
PADDL X2, X10
PADDL X3, X11
PADDQ X15, X3
CMPQ CX, $192
JB less_than_64
XOR(DI, SI, 128, X8, X9, X10, X11, X12)
SUBQ $192, CX
JMP done
between_64_and_128:
MOVQ $64, R14
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
MOVO X0, X8
MOVO X1, X9
MOVO X2, X10
MOVO X3, X11
PADDQ X15, X11
MOVQ DX, R8
chacha_loop_128:
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
CHACHA_SHUFFLE(X5, X6, X7)
CHACHA_SHUFFLE(X9, X10, X11)
CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
CHACHA_SHUFFLE(X7, X6, X5)
CHACHA_SHUFFLE(X11, X10, X9)
SUBQ $2, R8
JA chacha_loop_128
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
PADDQ X15, X3
PADDL X0, X8
PADDL X1, X9
PADDL X2, X10
PADDL X3, X11
PADDQ X15, X3
XOR(DI, SI, 0, X4, X5, X6, X7, X12)
CMPQ CX, $128
JB less_than_64
XOR(DI, SI, 64, X8, X9, X10, X11, X12)
SUBQ $128, CX
JMP done
between_0_and_64:
MOVQ $0, R14
MOVO X0, X8
MOVO X1, X9
MOVO X2, X10
MOVO X3, X11
MOVQ DX, R8
chacha_loop_64:
CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
CHACHA_SHUFFLE(X9, X10, X11)
CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
CHACHA_SHUFFLE(X11, X10, X9)
SUBQ $2, R8
JA chacha_loop_64
PADDL X0, X8
PADDL X1, X9
PADDL X2, X10
PADDL X3, X11
PADDQ X15, X3
CMPQ CX, $64
JB less_than_64
XOR(DI, SI, 0, X8, X9, X10, X11, X12)
SUBQ $64, CX
JMP done
less_than_64:
// R14 contains the num of bytes already xor'd
ADDQ R14, SI
ADDQ R14, DI
SUBQ R14, CX
MOVOU X8, 0(BX)
MOVOU X9, 16(BX)
MOVOU X10, 32(BX)
MOVOU X11, 48(BX)
XORQ R11, R11
XORQ R12, R12
MOVQ CX, BP
xor_loop:
MOVB 0(SI), R11
MOVB 0(BX), R12
XORQ R11, R12
MOVB R12, 0(DI)
INCQ SI
INCQ BX
INCQ DI
DECQ BP
JA xor_loop
done:
MOVQ R9, SP
MOVOU X3, 48(AX)
MOVQ CX, ret+72(FP)
RET
// func supportsSSSE3() bool
TEXT ·supportsSSSE3(SB), NOSPLIT, $0-1
XORQ AX, AX
INCQ AX
CPUID
SHRQ $9, CX
ANDQ $1, CX
MOVB CX, ret+0(FP)
RET
// func initialize(state *[64]byte, key []byte, nonce *[16]byte)
TEXT ·initialize(SB), 4, $0-40
MOVQ state+0(FP), DI
MOVQ key+8(FP), AX
MOVQ nonce+32(FP), BX
MOVOU ·sigma<>(SB), X0
MOVOU 0(AX), X1
MOVOU 16(AX), X2
MOVOU 0(BX), X3
MOVOU X0, 0(DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
RET
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
TEXT ·hChaCha20SSE2(SB), 4, $0-24
MOVQ out+0(FP), DI
MOVQ nonce+8(FP), AX
MOVQ key+16(FP), BX
MOVOU ·sigma<>(SB), X0
MOVOU 0(BX), X1
MOVOU 16(BX), X2
MOVOU 0(AX), X3
MOVQ $20, CX
chacha_loop:
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
CHACHA_SHUFFLE(X1, X2, X3)
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
CHACHA_SHUFFLE(X3, X2, X1)
SUBQ $2, CX
JNZ chacha_loop
MOVOU X0, 0(DI)
MOVOU X3, 16(DI)
RET
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
TEXT ·hChaCha20SSSE3(SB), 4, $0-24
MOVQ out+0(FP), DI
MOVQ nonce+8(FP), AX
MOVQ key+16(FP), BX
MOVOU ·sigma<>(SB), X0
MOVOU 0(BX), X1
MOVOU 16(BX), X2
MOVOU 0(AX), X3
MOVOU ·rol16<>(SB), X5
MOVOU ·rol8<>(SB), X6
MOVQ $20, CX
chacha_loop:
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
CHACHA_SHUFFLE(X1, X2, X3)
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
CHACHA_SHUFFLE(X3, X2, X1)
SUBQ $2, CX
JNZ chacha_loop
MOVOU X0, 0(DI)
MOVOU X3, 16(DI)
RET