|  | /* | 
|  | * AVX2 implementation of MORUS-1280 | 
|  | * | 
|  | * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> | 
|  | * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. | 
|  | * | 
|  | * This program is free software; you can redistribute it and/or modify it | 
|  | * under the terms of the GNU General Public License version 2 as published | 
|  | * by the Free Software Foundation. | 
|  | */ | 
|  |  | 
|  | #include <linux/linkage.h> | 
|  | #include <asm/frame.h> | 
|  |  | 
|  | #define SHUFFLE_MASK(i0, i1, i2, i3) \ | 
|  | (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) /* imm8 built from four 2-bit selectors, i0 in the lowest bits */ | 
|  |  | 
|  | #define MASK1 SHUFFLE_MASK(3, 0, 1, 2) /* result lanes 0..3 <- source lanes 3, 0, 1, 2 */ | 
|  | #define MASK2 SHUFFLE_MASK(2, 3, 0, 1) /* result lanes 0..3 <- source lanes 2, 3, 0, 1 (halves swapped) */ | 
|  | #define MASK3 SHUFFLE_MASK(1, 2, 3, 0) /* result lanes 0..3 <- source lanes 1, 2, 3, 0 */ | 
|  |  | 
|  | #define STATE0		%ymm0 /* STATE0..STATE4: the five 256-bit state words */ | 
|  | #define STATE0_LOW	%xmm0 /* low 128 bits of STATE0 */ | 
|  | #define STATE1		%ymm1 | 
|  | #define STATE2		%ymm2 | 
|  | #define STATE3		%ymm3 | 
|  | #define STATE4		%ymm4 | 
|  | #define KEY		%ymm5 /* key block in the init routine; same register as MSG */ | 
|  | #define MSG		%ymm5 /* message block consumed by __morus1280_update */ | 
|  | #define MSG_LOW		%xmm5 /* low 128 bits of MSG */ | 
|  | #define T0		%ymm6 /* scratch; overwritten by every morus1280_round */ | 
|  | #define T0_LOW		%xmm6 /* low 128 bits of T0 */ | 
|  | #define T1		%ymm7 /* second scratch register */ | 
|  |  | 
|  | .section .rodata.cst32.morus1280_const, "aM", @progbits, 32 | 
|  | .align 32 | 
|  | .Lmorus1280_const: /* 32-byte constant loaded into STATE4 by the init routine; the bytes follow the Fibonacci sequence modulo 256 */ | 
|  | .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d | 
|  | .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 | 
|  | .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 | 
|  | .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd | 
|  |  | 
|  | .section .rodata.cst32.morus1280_counter, "aM", @progbits, 32 | 
|  | .align 32 | 
|  | .Lmorus1280_counter: /* byte indices 0..31; compared against the byte count in the dec_tail routine to build a mask */ | 
|  | .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 | 
|  | .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f | 
|  | .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 | 
|  | .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f | 
|  |  | 
|  | .text | 
|  |  | 
|  | /* | 
|  | * morus1280_round: one round step. | 
|  | * | 
|  | *   s0 = rotl64_per_lane(s0 ^ s3 ^ (s1 & s2), b) | 
|  | *   s3 = 64-bit lanes of s3 permuted by the immediate w | 
|  | * | 
|  | * s4 is accepted but not referenced. T0 is clobbered and is left holding | 
|  | * the left-shifted half of the rotation. | 
|  | */ | 
|  | .macro morus1280_round s0, s1, s2, s3, s4, b, w | 
|  | /* fold s3 and (s1 & s2) into s0; XOR is commutative, so order is free */ | 
|  | vpxor \s3, \s0, \s0 | 
|  | vpand \s2, \s1, T0 | 
|  | vpxor \s0, T0, \s0 | 
|  | /* rotate every 64-bit lane left by b; the two shifted halves share no bits */ | 
|  | vpsllq $\b, \s0, T0 | 
|  | vpsrlq $(64 - \b), \s0, \s0 | 
|  | vpor \s0, T0, \s0 | 
|  | /* rearrange the 64-bit lanes of s3 */ | 
|  | vpermq $\w, \s3, \s3 | 
|  | .endm | 
|  |  | 
|  | /* | 
|  | * __morus1280_update: internal ABI | 
|  | * | 
|  | * Applies the five round steps in a fixed order, rotating the roles of the | 
|  | * state words from one step to the next. MSG is XORed into STATE1, STATE2, | 
|  | * STATE3 and STATE4, each just before the round step that updates that word. | 
|  | * MSG itself is not modified. | 
|  | * | 
|  | * input: | 
|  | *   STATE[0-4] - input state | 
|  | *   MSG        - message block | 
|  | * output: | 
|  | *   STATE[0-4] - output state | 
|  | * changed: | 
|  | *   T0 | 
|  | */ | 
|  | __morus1280_update: | 
|  | morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 /* updates STATE0, permutes STATE3 */ | 
|  | vpxor MSG, STATE1, STATE1 /* absorb the message into STATE1 */ | 
|  | morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 /* updates STATE1, permutes STATE4 */ | 
|  | vpxor MSG, STATE2, STATE2 /* absorb the message into STATE2 */ | 
|  | morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 /* updates STATE2, permutes STATE0 */ | 
|  | vpxor MSG, STATE3, STATE3 /* absorb the message into STATE3 */ | 
|  | morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2 /* updates STATE3, permutes STATE1 */ | 
|  | vpxor MSG, STATE4, STATE4 /* absorb the message into STATE4 */ | 
|  | morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1 /* updates STATE4, permutes STATE2 */ | 
|  | ret | 
|  | ENDPROC(__morus1280_update) | 
|  |  | 
|  | /* | 
|  | * __morus1280_update_zero: internal ABI | 
|  | * | 
|  | * Same five round steps as __morus1280_update, but without the message | 
|  | * XORs, which gives the same result as an update with an all-zero message | 
|  | * block. The MSG/KEY register is neither read nor written. | 
|  | * | 
|  | * input: | 
|  | *   STATE[0-4] - input state | 
|  | * output: | 
|  | *   STATE[0-4] - output state | 
|  | * changed: | 
|  | *   T0 | 
|  | */ | 
|  | __morus1280_update_zero: | 
|  | morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 /* updates STATE0, permutes STATE3 */ | 
|  | morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 /* updates STATE1, permutes STATE4 */ | 
|  | morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 /* updates STATE2, permutes STATE0 */ | 
|  | morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2 /* updates STATE3, permutes STATE1 */ | 
|  | morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1 /* updates STATE4, permutes STATE2 */ | 
|  | ret | 
|  | ENDPROC(__morus1280_update_zero) | 
|  |  | 
|  | /* | 
|  | * __load_partial: internal ABI | 
|  | * | 
|  | * Loads a partial block from src into MSG, with the remaining bytes of MSG | 
|  | * set to zero. Only bits 0-4 of %rcx are examined, so the count is taken | 
|  | * modulo 32. | 
|  | * | 
|  | * The block is assembled from its end towards its start: for each set bit | 
|  | * of the count (1, 2, 4, 8, 16) a chunk of that size is read from the | 
|  | * offset formed by the higher count bits, and the data gathered so far is | 
|  | * moved up to make room for it. No byte at or beyond src + count is read. | 
|  | * | 
|  | * input: | 
|  | *   %rsi - src | 
|  | *   %rcx - bytes | 
|  | * output: | 
|  | *   MSG  - message block | 
|  | * changed: | 
|  | *   %r8 | 
|  | *   %r9 | 
|  | *   flags | 
|  | */ | 
|  | __load_partial: | 
|  | xor %r9d, %r9d /* r9 accumulates the trailing (up to 7) bytes */ | 
|  | vpxor MSG, MSG, MSG /* start from an all-zero block */ | 
|  |  | 
|  | mov %rcx, %r8 | 
|  | and $0x1, %r8 /* odd count: there is a final single byte */ | 
|  | jz .Lld_partial_1 | 
|  |  | 
|  | mov %rcx, %r8 | 
|  | and $0x1E, %r8 /* offset of that byte = count with bit 0 cleared */ | 
|  | add %rsi, %r8 | 
|  | mov (%r8), %r9b | 
|  |  | 
|  | .Lld_partial_1: | 
|  | mov %rcx, %r8 | 
|  | and $0x2, %r8 /* a 2-byte chunk is present */ | 
|  | jz .Lld_partial_2 | 
|  |  | 
|  | mov %rcx, %r8 | 
|  | and $0x1C, %r8 /* offset = count with bits 0-1 cleared */ | 
|  | add %rsi, %r8 | 
|  | shl $16, %r9 /* move the byte loaded so far above the new word */ | 
|  | mov (%r8), %r9w | 
|  |  | 
|  | .Lld_partial_2: | 
|  | mov %rcx, %r8 | 
|  | and $0x4, %r8 /* a 4-byte chunk is present */ | 
|  | jz .Lld_partial_4 | 
|  |  | 
|  | mov %rcx, %r8 | 
|  | and $0x18, %r8 /* offset = count with bits 0-2 cleared */ | 
|  | add %rsi, %r8 | 
|  | shl $32, %r9 /* move the bytes loaded so far above the new dword */ | 
|  | mov (%r8), %r8d /* 32-bit load zero-extends into r8 */ | 
|  | xor %r8, %r9 /* low 32 bits of r9 are zero here, so this merges */ | 
|  |  | 
|  | .Lld_partial_4: | 
|  | movq %r9, MSG_LOW /* trailing bytes into the low quadword of MSG */ | 
|  |  | 
|  | mov %rcx, %r8 | 
|  | and $0x8, %r8 /* an 8-byte chunk is present */ | 
|  | jz .Lld_partial_8 | 
|  |  | 
|  | mov %rcx, %r8 | 
|  | and $0x10, %r8 /* offset = 0 or 16 */ | 
|  | add %rsi, %r8 | 
|  | pshufd $MASK2, MSG_LOW, MSG_LOW /* swap the 64-bit halves: tail moves to the high quadword */ | 
|  | pinsrq $0, (%r8), MSG_LOW /* the 8-byte chunk takes the low quadword */ | 
|  |  | 
|  | .Lld_partial_8: | 
|  | mov %rcx, %r8 | 
|  | and $0x10, %r8 /* a leading 16-byte chunk is present */ | 
|  | jz .Lld_partial_16 | 
|  |  | 
|  | vpermq $MASK2, MSG, MSG /* swap the 128-bit halves: tail moves to the upper half */ | 
|  | movdqu (%rsi), MSG_LOW /* non-VEX load on purpose: it leaves the upper 128 bits intact (a VEX load would zero them) */ | 
|  |  | 
|  | .Lld_partial_16: | 
|  | ret | 
|  | ENDPROC(__load_partial) | 
|  |  | 
|  | /* | 
|  | * __store_partial: internal ABI | 
|  | * | 
|  | * Writes the first `bytes` bytes of T0 to dst, in chunks of 16, 8, 4, 2 | 
|  | * and 1 bytes. Nothing is written at or beyond dst + bytes. The callers in | 
|  | * this file use it for tail blocks shorter than 32 bytes. | 
|  | * | 
|  | * The byte count originates from an `unsigned int` C argument, whose upper | 
|  | * 32 register bits are not defined by the calling convention. It is | 
|  | * therefore taken from %ecx (zero-extended) and compared as unsigned. | 
|  | * | 
|  | * input: | 
|  | *   %rdx - dst | 
|  | *   %rcx - bytes (only the low 32 bits are used) | 
|  | *   T0   - message block to store | 
|  | * changed: | 
|  | *   %r8 | 
|  | *   %r9 | 
|  | *   %r10 | 
|  | *   T0   (its 128-bit halves are swapped when bytes >= 16) | 
|  | *   flags | 
|  | */ | 
|  | __store_partial: | 
|  | mov %ecx, %r8d /* r8 = bytes left to store, upper 32 bits cleared */ | 
|  | mov %rdx, %r9 /* r9 = write cursor */ | 
|  |  | 
|  | cmp $16, %r8 | 
|  | jb .Lst_partial_16 | 
|  |  | 
|  | movdqu T0_LOW, (%r9) /* emit the low 128 bits */ | 
|  | vpermq $MASK2, T0, T0 /* bring the upper 128 bits down */ | 
|  |  | 
|  | sub $16, %r8 | 
|  | add $16, %r9 | 
|  |  | 
|  | .Lst_partial_16: | 
|  | movq T0_LOW, %r10 /* r10 = next 8 bytes to emit */ | 
|  |  | 
|  | cmp $8, %r8 | 
|  | jb .Lst_partial_8 | 
|  |  | 
|  | mov %r10, (%r9) | 
|  | pextrq $1, T0_LOW, %r10 /* continue with the following quadword */ | 
|  |  | 
|  | sub $8, %r8 | 
|  | add $8, %r9 | 
|  |  | 
|  | .Lst_partial_8: | 
|  | cmp $4, %r8 | 
|  | jb .Lst_partial_4 | 
|  |  | 
|  | mov %r10d, (%r9) | 
|  | shr $32, %r10 /* drop the four bytes just written */ | 
|  |  | 
|  | sub $4, %r8 | 
|  | add $4, %r9 | 
|  |  | 
|  | .Lst_partial_4: | 
|  | cmp $2, %r8 | 
|  | jb .Lst_partial_2 | 
|  |  | 
|  | mov %r10w, (%r9) | 
|  | shr $16, %r10 /* drop the two bytes just written */ | 
|  |  | 
|  | sub $2, %r8 | 
|  | add $2, %r9 | 
|  |  | 
|  | .Lst_partial_2: | 
|  | cmp $1, %r8 | 
|  | jb .Lst_partial_1 | 
|  |  | 
|  | mov %r10b, (%r9) | 
|  |  | 
|  | .Lst_partial_1: | 
|  | ret | 
|  | ENDPROC(__store_partial) | 
|  |  | 
|  | /* | 
|  | * void crypto_morus1280_avx2_init(void *state, const void *key, | 
|  | *                                 const void *iv); | 
|  | * | 
|  | * Builds the initial cipher state and writes it to `state`. | 
|  | * | 
|  | * input: | 
|  | *   %rdi - state (five 32-byte words are written, unaligned stores) | 
|  | *   %rsi - key   (32 bytes are read, unaligned load) | 
|  | *   %rdx - iv    (16 bytes are read, unaligned load) | 
|  | * changed: | 
|  | *   STATE[0-4], KEY, T0 | 
|  | */ | 
|  | ENTRY(crypto_morus1280_avx2_init) | 
|  | FRAME_BEGIN | 
|  |  | 
|  | /* | 
|  | * load IV: the VEX-encoded 128-bit load clears bits 255:128 of STATE0, | 
|  | * so the 16-byte IV ends up zero-padded to 32 bytes. | 
|  | */ | 
|  | vmovdqu (%rdx), STATE0_LOW | 
|  | /* load key: */ | 
|  | vmovdqu (%rsi), KEY | 
|  | vmovdqa KEY, STATE1 | 
|  | /* load all ones: */ | 
|  | vpcmpeqd STATE2, STATE2, STATE2 | 
|  | /* load all zeros: */ | 
|  | vpxor STATE3, STATE3, STATE3 | 
|  | /* load the constant (RIP-relative, so the code is position-independent): */ | 
|  | vmovdqa .Lmorus1280_const(%rip), STATE4 | 
|  |  | 
|  | /* update 16 times with zero (KEY is left untouched by these calls): */ | 
|  | .rept 16 | 
|  | call __morus1280_update_zero | 
|  | .endr | 
|  |  | 
|  | /* xor-in the key again after updates: */ | 
|  | vpxor KEY, STATE1, STATE1 | 
|  |  | 
|  | /* store the state: */ | 
|  | vmovdqu STATE0, (0 * 32)(%rdi) | 
|  | vmovdqu STATE1, (1 * 32)(%rdi) | 
|  | vmovdqu STATE2, (2 * 32)(%rdi) | 
|  | vmovdqu STATE3, (3 * 32)(%rdi) | 
|  | vmovdqu STATE4, (4 * 32)(%rdi) | 
|  |  | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(crypto_morus1280_avx2_init) | 
|  |  | 
|  | /* | 
|  | * void crypto_morus1280_avx2_ad(void *state, const void *data, | 
|  | *                               unsigned int length); | 
|  | * | 
|  | * Absorbs associated data into the state, one full 32-byte block at a | 
|  | * time. A trailing remainder shorter than 32 bytes is not consumed. When | 
|  | * length < 32 the state is neither read nor written. | 
|  | * | 
|  | * input: | 
|  | *   %rdi - state  (five 32-byte words, read and written back) | 
|  | *   %rsi - data | 
|  | *   %edx - length in bytes | 
|  | * changed: | 
|  | *   %rdx, %rsi, %r8, STATE[0-4], MSG, T0, flags | 
|  | */ | 
|  | ENTRY(crypto_morus1280_avx2_ad) | 
|  | FRAME_BEGIN | 
|  |  | 
|  | /* | 
|  | * length is a 32-bit argument, so bits 63:32 of %rdx are not defined | 
|  | * by the calling convention. Clear them before using the full register. | 
|  | */ | 
|  | mov %edx, %edx | 
|  |  | 
|  | cmp $32, %rdx | 
|  | jb .Lad_out | 
|  |  | 
|  | /* load the state: */ | 
|  | vmovdqu (0 * 32)(%rdi), STATE0 | 
|  | vmovdqu (1 * 32)(%rdi), STATE1 | 
|  | vmovdqu (2 * 32)(%rdi), STATE2 | 
|  | vmovdqu (3 * 32)(%rdi), STATE3 | 
|  | vmovdqu (4 * 32)(%rdi), STATE4 | 
|  |  | 
|  | /* pick the aligned loop only when data is 32-byte aligned: */ | 
|  | mov %rsi,  %r8 | 
|  | and $0x1F, %r8 | 
|  | jnz .Lad_u_loop | 
|  |  | 
|  | .align 4 | 
|  | .Lad_a_loop: | 
|  | vmovdqa (%rsi), MSG | 
|  | call __morus1280_update | 
|  | sub $32, %rdx | 
|  | add $32, %rsi | 
|  | cmp $32, %rdx | 
|  | jae .Lad_a_loop /* unsigned: another full block remains */ | 
|  |  | 
|  | jmp .Lad_cont | 
|  | .align 4 | 
|  | .Lad_u_loop: | 
|  | vmovdqu (%rsi), MSG | 
|  | call __morus1280_update | 
|  | sub $32, %rdx | 
|  | add $32, %rsi | 
|  | cmp $32, %rdx | 
|  | jae .Lad_u_loop /* unsigned: another full block remains */ | 
|  |  | 
|  | .Lad_cont: | 
|  | /* store the state: */ | 
|  | vmovdqu STATE0, (0 * 32)(%rdi) | 
|  | vmovdqu STATE1, (1 * 32)(%rdi) | 
|  | vmovdqu STATE2, (2 * 32)(%rdi) | 
|  | vmovdqu STATE3, (3 * 32)(%rdi) | 
|  | vmovdqu STATE4, (4 * 32)(%rdi) | 
|  |  | 
|  | .Lad_out: | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(crypto_morus1280_avx2_ad) | 
|  |  | 
|  | /* | 
|  | * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst, | 
|  | *                                unsigned int length); | 
|  | * | 
|  | * Encrypts full 32-byte blocks from src to dst and absorbs each plaintext | 
|  | * block into the state. A trailing remainder shorter than 32 bytes is not | 
|  | * processed. When length < 32 the state is neither read nor written. | 
|  | * | 
|  | * Per block: | 
|  | *   out = in ^ STATE0 ^ vpermq(STATE1, MASK3) ^ (STATE2 & STATE3) | 
|  | * | 
|  | * input: | 
|  | *   %rdi - state  (five 32-byte words, read and written back) | 
|  | *   %rsi - src | 
|  | *   %rdx - dst | 
|  | *   %ecx - length in bytes | 
|  | * changed: | 
|  | *   %rcx, %rdx, %rsi, %r8, STATE[0-4], MSG, T0, T1, flags | 
|  | */ | 
|  | ENTRY(crypto_morus1280_avx2_enc) | 
|  | FRAME_BEGIN | 
|  |  | 
|  | /* | 
|  | * length is a 32-bit argument, so bits 63:32 of %rcx are not defined | 
|  | * by the calling convention. Clear them before using the full register. | 
|  | */ | 
|  | mov %ecx, %ecx | 
|  |  | 
|  | cmp $32, %rcx | 
|  | jb .Lenc_out | 
|  |  | 
|  | /* load the state: */ | 
|  | vmovdqu (0 * 32)(%rdi), STATE0 | 
|  | vmovdqu (1 * 32)(%rdi), STATE1 | 
|  | vmovdqu (2 * 32)(%rdi), STATE2 | 
|  | vmovdqu (3 * 32)(%rdi), STATE3 | 
|  | vmovdqu (4 * 32)(%rdi), STATE4 | 
|  |  | 
|  | /* pick the aligned loop only when src and dst are both 32-byte aligned: */ | 
|  | mov %rsi,  %r8 | 
|  | or  %rdx,  %r8 | 
|  | and $0x1F, %r8 | 
|  | jnz .Lenc_u_loop | 
|  |  | 
|  | .align 4 | 
|  | .Lenc_a_loop: | 
|  | vmovdqa (%rsi), MSG | 
|  | vmovdqa MSG, T0 | 
|  | vpxor STATE0, T0, T0 | 
|  | vpermq $MASK3, STATE1, T1 | 
|  | vpxor T1, T0, T0 | 
|  | vpand STATE2, STATE3, T1 | 
|  | vpxor T1, T0, T0 | 
|  | vmovdqa T0, (%rdx) | 
|  |  | 
|  | /* absorb the plaintext block (still in MSG): */ | 
|  | call __morus1280_update | 
|  | sub $32, %rcx | 
|  | add $32, %rsi | 
|  | add $32, %rdx | 
|  | cmp $32, %rcx | 
|  | jae .Lenc_a_loop /* unsigned: another full block remains */ | 
|  |  | 
|  | jmp .Lenc_cont | 
|  | .align 4 | 
|  | .Lenc_u_loop: | 
|  | vmovdqu (%rsi), MSG | 
|  | vmovdqa MSG, T0 | 
|  | vpxor STATE0, T0, T0 | 
|  | vpermq $MASK3, STATE1, T1 | 
|  | vpxor T1, T0, T0 | 
|  | vpand STATE2, STATE3, T1 | 
|  | vpxor T1, T0, T0 | 
|  | vmovdqu T0, (%rdx) | 
|  |  | 
|  | /* absorb the plaintext block (still in MSG): */ | 
|  | call __morus1280_update | 
|  | sub $32, %rcx | 
|  | add $32, %rsi | 
|  | add $32, %rdx | 
|  | cmp $32, %rcx | 
|  | jae .Lenc_u_loop /* unsigned: another full block remains */ | 
|  |  | 
|  | .Lenc_cont: | 
|  | /* store the state: */ | 
|  | vmovdqu STATE0, (0 * 32)(%rdi) | 
|  | vmovdqu STATE1, (1 * 32)(%rdi) | 
|  | vmovdqu STATE2, (2 * 32)(%rdi) | 
|  | vmovdqu STATE3, (3 * 32)(%rdi) | 
|  | vmovdqu STATE4, (4 * 32)(%rdi) | 
|  |  | 
|  | .Lenc_out: | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(crypto_morus1280_avx2_enc) | 
|  |  | 
|  | /* | 
|  | * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst, | 
|  | *                                     unsigned int length); | 
|  | * | 
|  | * Encrypts one final partial block: the input is loaded zero-padded via | 
|  | * __load_partial, the leading bytes of the result are written via | 
|  | * __store_partial, and the zero-padded plaintext is absorbed into the | 
|  | * state, which is then written back. | 
|  | */ | 
|  | ENTRY(crypto_morus1280_avx2_enc_tail) | 
|  | FRAME_BEGIN | 
|  |  | 
|  | /* bring the five state words into registers: */ | 
|  | vmovdqu (0 * 32)(%rdi), STATE0 | 
|  | vmovdqu (1 * 32)(%rdi), STATE1 | 
|  | vmovdqu (2 * 32)(%rdi), STATE2 | 
|  | vmovdqu (3 * 32)(%rdi), STATE3 | 
|  | vmovdqu (4 * 32)(%rdi), STATE4 | 
|  |  | 
|  | /* MSG = zero-padded plaintext tail: */ | 
|  | call __load_partial | 
|  |  | 
|  | /* T0 = MSG ^ STATE0 ^ vpermq(STATE1, MASK3) ^ (STATE2 & STATE3): */ | 
|  | vpand STATE3, STATE2, T1 | 
|  | vpermq $MASK3, STATE1, T0 | 
|  | vpxor T1, T0, T0 | 
|  | vpxor STATE0, T0, T0 | 
|  | vpxor MSG, T0, T0 | 
|  |  | 
|  | /* write out the leading bytes of T0: */ | 
|  | call __store_partial | 
|  |  | 
|  | /* absorb the padded plaintext, which is still in MSG: */ | 
|  | call __morus1280_update | 
|  |  | 
|  | /* write the five state words back: */ | 
|  | vmovdqu STATE0, (0 * 32)(%rdi) | 
|  | vmovdqu STATE1, (1 * 32)(%rdi) | 
|  | vmovdqu STATE2, (2 * 32)(%rdi) | 
|  | vmovdqu STATE3, (3 * 32)(%rdi) | 
|  | vmovdqu STATE4, (4 * 32)(%rdi) | 
|  |  | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(crypto_morus1280_avx2_enc_tail) | 
|  |  | 
|  | /* | 
|  | * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst, | 
|  | *                                unsigned int length); | 
|  | * | 
|  | * Decrypts full 32-byte blocks from src to dst and absorbs each recovered | 
|  | * plaintext block into the state. A trailing remainder shorter than 32 | 
|  | * bytes is not processed. When length < 32 the state is neither read nor | 
|  | * written. | 
|  | * | 
|  | * Per block: | 
|  | *   out = in ^ STATE0 ^ vpermq(STATE1, MASK3) ^ (STATE2 & STATE3) | 
|  | * | 
|  | * input: | 
|  | *   %rdi - state  (five 32-byte words, read and written back) | 
|  | *   %rsi - src | 
|  | *   %rdx - dst | 
|  | *   %ecx - length in bytes | 
|  | * changed: | 
|  | *   %rcx, %rdx, %rsi, %r8, STATE[0-4], MSG, T0, flags | 
|  | */ | 
|  | ENTRY(crypto_morus1280_avx2_dec) | 
|  | FRAME_BEGIN | 
|  |  | 
|  | /* | 
|  | * length is a 32-bit argument, so bits 63:32 of %rcx are not defined | 
|  | * by the calling convention. Clear them before using the full register. | 
|  | */ | 
|  | mov %ecx, %ecx | 
|  |  | 
|  | cmp $32, %rcx | 
|  | jb .Ldec_out | 
|  |  | 
|  | /* load the state: */ | 
|  | vmovdqu (0 * 32)(%rdi), STATE0 | 
|  | vmovdqu (1 * 32)(%rdi), STATE1 | 
|  | vmovdqu (2 * 32)(%rdi), STATE2 | 
|  | vmovdqu (3 * 32)(%rdi), STATE3 | 
|  | vmovdqu (4 * 32)(%rdi), STATE4 | 
|  |  | 
|  | /* pick the aligned loop only when src and dst are both 32-byte aligned: */ | 
|  | mov %rsi,  %r8 | 
|  | or  %rdx,  %r8 | 
|  | and $0x1F, %r8 | 
|  | jnz .Ldec_u_loop | 
|  |  | 
|  | .align 4 | 
|  | .Ldec_a_loop: | 
|  | vmovdqa (%rsi), MSG | 
|  | vpxor STATE0, MSG, MSG | 
|  | vpermq $MASK3, STATE1, T0 | 
|  | vpxor T0, MSG, MSG | 
|  | vpand STATE2, STATE3, T0 | 
|  | vpxor T0, MSG, MSG | 
|  | vmovdqa MSG, (%rdx) | 
|  |  | 
|  | /* absorb the recovered plaintext block (now in MSG): */ | 
|  | call __morus1280_update | 
|  | sub $32, %rcx | 
|  | add $32, %rsi | 
|  | add $32, %rdx | 
|  | cmp $32, %rcx | 
|  | jae .Ldec_a_loop /* unsigned: another full block remains */ | 
|  |  | 
|  | jmp .Ldec_cont | 
|  | .align 4 | 
|  | .Ldec_u_loop: | 
|  | vmovdqu (%rsi), MSG | 
|  | vpxor STATE0, MSG, MSG | 
|  | vpermq $MASK3, STATE1, T0 | 
|  | vpxor T0, MSG, MSG | 
|  | vpand STATE2, STATE3, T0 | 
|  | vpxor T0, MSG, MSG | 
|  | vmovdqu MSG, (%rdx) | 
|  |  | 
|  | /* absorb the recovered plaintext block (now in MSG): */ | 
|  | call __morus1280_update | 
|  | sub $32, %rcx | 
|  | add $32, %rsi | 
|  | add $32, %rdx | 
|  | cmp $32, %rcx | 
|  | jae .Ldec_u_loop /* unsigned: another full block remains */ | 
|  |  | 
|  | .Ldec_cont: | 
|  | /* store the state: */ | 
|  | vmovdqu STATE0, (0 * 32)(%rdi) | 
|  | vmovdqu STATE1, (1 * 32)(%rdi) | 
|  | vmovdqu STATE2, (2 * 32)(%rdi) | 
|  | vmovdqu STATE3, (3 * 32)(%rdi) | 
|  | vmovdqu STATE4, (4 * 32)(%rdi) | 
|  |  | 
|  | .Ldec_out: | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(crypto_morus1280_avx2_dec) | 
|  |  | 
|  | /* | 
|  | * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst, | 
|  | *                                     unsigned int length); | 
|  | * | 
|  | * Decrypts one final partial block. The ciphertext tail is loaded | 
|  | * zero-padded, XORed with the keystream and its leading bytes are written | 
|  | * to dst. Bytes of the decrypted block at index >= length hold keystream | 
|  | * rather than plaintext, so they are cleared before the block is absorbed | 
|  | * into the state. | 
|  | * | 
|  | * The mask is built with a signed byte compare on the low byte of the | 
|  | * length, which matches the intended use with length < 32. | 
|  | * | 
|  | * input: | 
|  | *   %rdi - state  (five 32-byte words, read and written back) | 
|  | *   %rsi - src | 
|  | *   %rdx - dst | 
|  | *   %rcx - length in bytes | 
|  | * changed: | 
|  | *   %r8, %r9, %r10, STATE[0-4], MSG, T0, T1, flags | 
|  | */ | 
|  | ENTRY(crypto_morus1280_avx2_dec_tail) | 
|  | FRAME_BEGIN | 
|  |  | 
|  | /* load the state: */ | 
|  | vmovdqu (0 * 32)(%rdi), STATE0 | 
|  | vmovdqu (1 * 32)(%rdi), STATE1 | 
|  | vmovdqu (2 * 32)(%rdi), STATE2 | 
|  | vmovdqu (3 * 32)(%rdi), STATE3 | 
|  | vmovdqu (4 * 32)(%rdi), STATE4 | 
|  |  | 
|  | /* decrypt message: */ | 
|  | call __load_partial | 
|  |  | 
|  | vpxor STATE0, MSG, MSG | 
|  | vpermq $MASK3, STATE1, T0 | 
|  | vpxor T0, MSG, MSG | 
|  | vpand STATE2, STATE3, T0 | 
|  | vpxor T0, MSG, MSG | 
|  | vmovdqa MSG, T0 /* __store_partial takes its data in T0 */ | 
|  |  | 
|  | call __store_partial | 
|  |  | 
|  | /* mask with byte count: keep byte i of MSG only when length > i */ | 
|  | movq %rcx, T0_LOW | 
|  | vpbroadcastb T0_LOW, T0 | 
|  | /* byte-index table, addressed RIP-relative so the code is position-independent: */ | 
|  | vmovdqa .Lmorus1280_counter(%rip), T1 | 
|  | vpcmpgtb T1, T0, T0 | 
|  | vpand T0, MSG, MSG | 
|  |  | 
|  | call __morus1280_update | 
|  |  | 
|  | /* store the state: */ | 
|  | vmovdqu STATE0, (0 * 32)(%rdi) | 
|  | vmovdqu STATE1, (1 * 32)(%rdi) | 
|  | vmovdqu STATE2, (2 * 32)(%rdi) | 
|  | vmovdqu STATE3, (3 * 32)(%rdi) | 
|  | vmovdqu STATE4, (4 * 32)(%rdi) | 
|  |  | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(crypto_morus1280_avx2_dec_tail) | 
|  |  | 
|  | /* | 
|  | * void crypto_morus1280_avx2_final(void *state, void *tag_xor, | 
|  | *                                  u64 assoclen, u64 cryptlen); | 
|  | * | 
|  | * Finalization: folds STATE0 into STATE4, runs ten updates with a block | 
|  | * holding the two lengths converted to bit counts, and XORs the resulting | 
|  | * 32-byte tag value into the buffer at tag_xor. The state in memory is | 
|  | * only read; it is not written back. | 
|  | */ | 
|  | ENTRY(crypto_morus1280_avx2_final) | 
|  | FRAME_BEGIN | 
|  |  | 
|  | /* bring the five state words into registers: */ | 
|  | vmovdqu (0 * 32)(%rdi), STATE0 | 
|  | vmovdqu (1 * 32)(%rdi), STATE1 | 
|  | vmovdqu (2 * 32)(%rdi), STATE2 | 
|  | vmovdqu (3 * 32)(%rdi), STATE3 | 
|  | vmovdqu (4 * 32)(%rdi), STATE4 | 
|  |  | 
|  | /* | 
|  | * length block: assoclen in quadword 0, cryptlen in quadword 1, the | 
|  | * upper 128 bits zero (the VEX move clears everything above bit 63). | 
|  | */ | 
|  | vmovq %rdx, MSG_LOW | 
|  | vpinsrq $1, %rcx, MSG_LOW, MSG_LOW | 
|  | vpsllq $3, MSG, MSG /* bytes -> bits */ | 
|  |  | 
|  | /* STATE4 ^= STATE0: */ | 
|  | vpxor STATE0, STATE4, STATE4 | 
|  |  | 
|  | /* ten updates with the length block: */ | 
|  | .rept 10 | 
|  | call __morus1280_update | 
|  | .endr | 
|  |  | 
|  | /* tag_xor ^= STATE0 ^ vpermq(STATE1, MASK3) ^ (STATE2 & STATE3): */ | 
|  | vmovdqu (%rsi), MSG | 
|  | vpermq $MASK3, STATE1, T0 | 
|  | vpxor STATE0, T0, T0 | 
|  | vpxor T0, MSG, MSG | 
|  | vpand STATE3, STATE2, T0 | 
|  | vpxor T0, MSG, MSG | 
|  | vmovdqu MSG, (%rsi) | 
|  |  | 
|  | FRAME_END | 
|  | ret | 
|  | ENDPROC(crypto_morus1280_avx2_final) | 