| /* | 
 |  * Multi-buffer SHA1 algorithm hash compute routine | 
 |  * | 
 |  * This file is provided under a dual BSD/GPLv2 license.  When using or | 
 |  * redistributing this file, you may do so under either license. | 
 |  * | 
 |  * GPL LICENSE SUMMARY | 
 |  * | 
 |  *  Copyright(c) 2014 Intel Corporation. | 
 |  * | 
 |  *  This program is free software; you can redistribute it and/or modify | 
 |  *  it under the terms of version 2 of the GNU General Public License as | 
 |  *  published by the Free Software Foundation. | 
 |  * | 
 |  *  This program is distributed in the hope that it will be useful, but | 
 |  *  WITHOUT ANY WARRANTY; without even the implied warranty of | 
 |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
 |  *  General Public License for more details. | 
 |  * | 
 |  *  Contact Information: | 
 |  *      James Guilford <james.guilford@intel.com> | 
 |  *	Tim Chen <tim.c.chen@linux.intel.com> | 
 |  * | 
 |  *  BSD LICENSE | 
 |  * | 
 |  *  Copyright(c) 2014 Intel Corporation. | 
 |  * | 
 |  *  Redistribution and use in source and binary forms, with or without | 
 |  *  modification, are permitted provided that the following conditions | 
 |  *  are met: | 
 |  * | 
 |  *    * Redistributions of source code must retain the above copyright | 
 |  *      notice, this list of conditions and the following disclaimer. | 
 |  *    * Redistributions in binary form must reproduce the above copyright | 
 |  *      notice, this list of conditions and the following disclaimer in | 
 |  *      the documentation and/or other materials provided with the | 
 |  *      distribution. | 
 |  *    * Neither the name of Intel Corporation nor the names of its | 
 |  *      contributors may be used to endorse or promote products derived | 
 |  *      from this software without specific prior written permission. | 
 |  * | 
 |  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 
 |  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 
 |  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 
 |  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 
 |  *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 
 |  *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 
 |  *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 
 |  *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 
 |  *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
 |  *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 |  */ | 
 |  | 
 | #include <linux/linkage.h> | 
 | #include "sha1_mb_mgr_datastruct.S" | 
 |  | 
## code to compute oct SHA1 (eight lanes in parallel) using AVX2
## outer calling routine takes care of save and restore of YMM registers
 |  | 
## Function clobbers: rax, rcx, rdx, rsi, r9-r15; ymm0-15
##
## Linux clobbers:    rax rcx rdx rsi r9 r10 r11
## Linux preserves:   rdi rbp r8 r12 r13 r14 r15
##                    (r12-r15 are pushed/popped by this routine)
##
## clobbers ymm0-15
 |  | 
 |  | 
 | # TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 | 
 | # "transpose" data in {r0...r7} using temps {t0...t1} | 
 | # Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} | 
 | # r0 = {a7 a6 a5 a4   a3 a2 a1 a0} | 
 | # r1 = {b7 b6 b5 b4   b3 b2 b1 b0} | 
 | # r2 = {c7 c6 c5 c4   c3 c2 c1 c0} | 
 | # r3 = {d7 d6 d5 d4   d3 d2 d1 d0} | 
 | # r4 = {e7 e6 e5 e4   e3 e2 e1 e0} | 
 | # r5 = {f7 f6 f5 f4   f3 f2 f1 f0} | 
 | # r6 = {g7 g6 g5 g4   g3 g2 g1 g0} | 
 | # r7 = {h7 h6 h5 h4   h3 h2 h1 h0} | 
 | # | 
 | # Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} | 
 | # r0 = {h0 g0 f0 e0   d0 c0 b0 a0} | 
 | # r1 = {h1 g1 f1 e1   d1 c1 b1 a1} | 
 | # r2 = {h2 g2 f2 e2   d2 c2 b2 a2} | 
 | # r3 = {h3 g3 f3 e3   d3 c3 b3 a3} | 
 | # r4 = {h4 g4 f4 e4   d4 c4 b4 a4} | 
 | # r5 = {h5 g5 f5 e5   d5 c5 b5 a5} | 
 | # r6 = {h6 g6 f6 e6   d6 c6 b6 a6} | 
 | # r7 = {h7 g7 f7 e7   d7 c7 b7 a7} | 
 | # | 
 |  | 
 | .macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1 | 
 | 	# process top half (r0..r3) {a...d} | 
 | 	vshufps  $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0} | 
 | 	vshufps  $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2} | 
 | 	vshufps  $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0} | 
 | 	vshufps  $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2} | 
 | 	vshufps  $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1} | 
 | 	vshufps  $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2} | 
 | 	vshufps  $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3} | 
 | 	vshufps  $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0} | 
 |  | 
 | 	# use r2 in place of t0 | 
 | 	# process bottom half (r4..r7) {e...h} | 
 | 	vshufps  $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0} | 
 | 	vshufps  $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2} | 
 | 	vshufps  $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0} | 
 | 	vshufps  $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2} | 
 | 	vshufps  $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1} | 
 | 	vshufps  $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2} | 
 | 	vshufps  $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3} | 
 | 	vshufps  $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0} | 
 |  | 
 | 	vperm2f128      $0x13, \r1, \r5, \r6  # h6...a6 | 
 | 	vperm2f128      $0x02, \r1, \r5, \r2  # h2...a2 | 
 | 	vperm2f128      $0x13, \r3, \r7, \r5  # h5...a5 | 
 | 	vperm2f128      $0x02, \r3, \r7, \r1  # h1...a1 | 
 | 	vperm2f128      $0x13, \r0, \r4, \r7  # h7...a7 | 
 | 	vperm2f128      $0x02, \r0, \r4, \r3  # h3...a3 | 
 | 	vperm2f128      $0x13, \t0, \t1, \r4  # h4...a4 | 
 | 	vperm2f128      $0x02, \t0, \t1, \r0  # h0...a0 | 
 |  | 
 | .endm | 
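
## For reference, a C sketch of what TRANSPOSE8 computes (a hypothetical
## helper, not part of this file): an 8x8 transpose of 32-bit words, so
## that output row i collects element i of every input lane.
##
##	#include <stdint.h>
##	#include <string.h>
##
##	static void transpose8(uint32_t r[8][8])
##	{
##		uint32_t t[8][8];
##
##		for (int i = 0; i < 8; i++)
##			for (int j = 0; j < 8; j++)
##				t[i][j] = r[j][i];
##		memcpy(r, t, sizeof(t));
##	}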
 | ## | 
 | ## Magic functions defined in FIPS 180-1 | 
 | ## | 
 | # macro MAGIC_F0 F,B,C,D,T   ## F = (D ^ (B & (C ^ D))) | 
 | .macro MAGIC_F0 regF regB regC regD regT | 
 |     vpxor \regD, \regC, \regF | 
 |     vpand \regB, \regF, \regF | 
 |     vpxor \regD, \regF, \regF | 
 | .endm | 
 |  | 
 | # macro MAGIC_F1 F,B,C,D,T   ## F = (B ^ C ^ D) | 
 | .macro MAGIC_F1 regF regB regC regD regT | 
 |     vpxor  \regC, \regD, \regF | 
 |     vpxor  \regB, \regF, \regF | 
 | .endm | 
 |  | 
 | # macro MAGIC_F2 F,B,C,D,T   ## F = ((B & C) | (B & D) | (C & D)) | 
 | .macro MAGIC_F2 regF regB regC regD regT | 
 |     vpor  \regC, \regB, \regF | 
 |     vpand \regC, \regB, \regT | 
 |     vpand \regD, \regF, \regF | 
 |     vpor  \regT, \regF, \regF | 
 | .endm | 
 |  | 
 | # macro MAGIC_F3 F,B,C,D,T   ## F = (B ^ C ^ D) | 
 | .macro MAGIC_F3 regF regB regC regD regT | 
 |     MAGIC_F1 \regF,\regB,\regC,\regD,\regT | 
 | .endm | 
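
## Scalar C equivalents of the round functions above, for reference.
## F0 is the standard Ch(B,C,D) = (B & C) | (~B & D), rewritten to save
## one instruction; F2 is Maj(B,C,D):
##
##	uint32_t f0(uint32_t b, uint32_t c, uint32_t d)	/* rounds  0-19 */
##	{ return d ^ (b & (c ^ d)); }
##	uint32_t f1(uint32_t b, uint32_t c, uint32_t d)	/* 20-39, 60-79 */
##	{ return b ^ c ^ d; }
##	uint32_t f2(uint32_t b, uint32_t c, uint32_t d)	/* rounds 40-59 */
##	{ return ((b | c) & d) | (b & c); }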
 |  | 
 | # PROLD reg, imm, tmp | 
 | .macro PROLD reg imm tmp | 
 | 	vpsrld  $(32-\imm), \reg, \tmp | 
 | 	vpslld  $\imm, \reg, \reg | 
 | 	vpor    \tmp, \reg, \reg | 
 | .endm | 
 |  | 
 | .macro PROLD_nd reg imm tmp src | 
 | 	vpsrld  $(32-\imm), \src, \tmp | 
 | 	vpslld  $\imm, \src, \reg | 
 | 	vpor	\tmp, \reg, \reg | 
 | .endm | 
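
## PROLD rotates each dword left by \imm; AVX2 has no vector rotate
## instruction, so it is emulated with two shifts and an OR:
##	rol32(x, n) = (x << n) | (x >> (32 - n))
## PROLD_nd is the non-destructive variant: it reads \src and writes
## \reg, leaving \src intact.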
 |  | 
 | .macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC | 
 | 	vpaddd	\immCNT, \regE, \regE | 
 | 	vpaddd	\memW*32(%rsp), \regE, \regE | 
 | 	PROLD_nd \regT, 5, \regF, \regA | 
 | 	vpaddd	\regT, \regE, \regE | 
 | 	\MAGIC  \regF, \regB, \regC, \regD, \regT | 
	PROLD   \regB, 30, \regT
	vpaddd  \regF, \regE, \regE
 | .endm | 
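
## One round in scalar C, for reference; the sum lands in the register
## currently named E, which ROTATE_ARGS then renames to A:
##	e += K[t] + W[t] + rol32(a, 5) + f(b, c, d);
##	b = rol32(b, 30);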
 |  | 
 | .macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC | 
 | 	vpaddd	\immCNT, \regE, \regE | 
 | 	offset = ((\memW - 14) & 15) * 32 | 
 | 	vmovdqu offset(%rsp), W14 | 
 | 	vpxor	W14, W16, W16 | 
 | 	offset = ((\memW -  8) & 15) * 32 | 
 | 	vpxor	offset(%rsp), W16, W16 | 
 | 	offset = ((\memW -  3) & 15) * 32 | 
 | 	vpxor	offset(%rsp), W16, W16 | 
 | 	vpsrld	$(32-1), W16, \regF | 
 | 	vpslld	$1, W16, W16 | 
 | 	vpor	W16, \regF, \regF | 
 |  | 
 | 	ROTATE_W | 
 |  | 
 | 	offset = ((\memW - 0) & 15) * 32 | 
 | 	vmovdqu	\regF, offset(%rsp) | 
 | 	vpaddd	\regF, \regE, \regE | 
 | 	PROLD_nd \regT, 5, \regF, \regA | 
 | 	vpaddd	\regT, \regE, \regE | 
 | 	\MAGIC \regF,\regB,\regC,\regD,\regT      ## FUN  = MAGIC_Fi(B,C,D) | 
 | 	PROLD   \regB,30, \regT | 
 | 	vpaddd  \regF, \regE, \regE | 
 | .endm | 
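
## Rounds 16-79 extend the message schedule in place:
##	W[t] = rol32(W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3], 1)
## Only the most recent 16 words are live, so the stack holds a
## 16-entry circular buffer of 32-byte rows (one dword per lane);
## W16, W15 and W14 cache the W[t-16], W[t-15] and W[t-14] entries,
## and ROTATE_W renames them instead of copying registers.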
 |  | 
 | ######################################################################## | 
 | ######################################################################## | 
 | ######################################################################## | 
 |  | 
## FRAMESZ: 16 rows x 32 bytes for the circular message schedule, plus
## space reserved for YMM register saves (YMM_SAVE is zero here, since
## the Linux ABI does not require the vector registers to be preserved)
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM  =   FRAMESZ - YMM_SAVE
 |  | 
 | #define VMOVPS   vmovups | 
 |  | 
 | IDX  = %rax | 
 | inp0 = %r9 | 
 | inp1 = %r10 | 
 | inp2 = %r11 | 
 | inp3 = %r12 | 
 | inp4 = %r13 | 
 | inp5 = %r14 | 
 | inp6 = %r15 | 
 | inp7 = %rcx | 
 | arg1 = %rdi | 
 | arg2 = %rsi | 
 | RSP_SAVE = %rdx | 
 |  | 
 | # ymm0 A | 
 | # ymm1 B | 
 | # ymm2 C | 
 | # ymm3 D | 
 | # ymm4 E | 
 | # ymm5         F       AA | 
 | # ymm6         T0      BB | 
 | # ymm7         T1      CC | 
 | # ymm8         T2      DD | 
 | # ymm9         T3      EE | 
 | # ymm10                T4      TMP | 
 | # ymm11                T5      FUN | 
 | # ymm12                T6      K | 
 | # ymm13                T7      W14 | 
 | # ymm14                T8      W15 | 
 | # ymm15                T9      W16 | 
 |  | 
 |  | 
A  =     %ymm0
B  =     %ymm1
C  =     %ymm2
D  =     %ymm3
E  =     %ymm4
F  =     %ymm5
T0 =     %ymm6
T1 =     %ymm7
T2 =     %ymm8
T3 =     %ymm9
T4 =     %ymm10
T5 =     %ymm11
T6 =     %ymm12
T7 =     %ymm13
T8 =     %ymm14
T9 =     %ymm15
 |  | 
 | AA  =     %ymm5 | 
 | BB  =     %ymm6 | 
 | CC  =     %ymm7 | 
 | DD  =     %ymm8 | 
 | EE  =     %ymm9 | 
 | TMP =     %ymm10 | 
 | FUN =     %ymm11 | 
 | K   =     %ymm12 | 
 | W14 =     %ymm13 | 
 | W15 =     %ymm14 | 
 | W16 =     %ymm15 | 
 |  | 
 | .macro ROTATE_ARGS | 
 |  TMP_ = E | 
 |  E = D | 
 |  D = C | 
 |  C = B | 
 |  B = A | 
 |  A = TMP_ | 
 | .endm | 
 |  | 
 | .macro ROTATE_W | 
 | TMP_  = W16 | 
 | W16  = W15 | 
 | W15  = W14 | 
 | W14  = TMP_ | 
 | .endm | 
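
## ROTATE_ARGS and ROTATE_W cost nothing at run time: they only redefine
## the assembler symbols (A..E, W14..W16) between rounds, so the usual
## SHA-1 variable shuffle never issues register-to-register moves.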
 |  | 
# 8 streams x 5 32-bit words per digest x 4 bytes per word
 | #define DIGEST_SIZE (8*5*4) | 
 |  | 
 | .align 32 | 
 |  | 
# void sha1_x8_avx2(void *args, UINT32 size)
# arg 1 : pointer to the args structure: five transposed 32-byte digest
#         rows (words A..E, one dword per lane) at offset 0, plus an
#         array[8] of input data pointers at offset _data_ptr
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
 | ENTRY(sha1_x8_avx2) | 
 |  | 
 | 	# save callee-saved clobbered registers to comply with C function ABI | 
 | 	push	%r12 | 
 | 	push	%r13 | 
 | 	push	%r14 | 
 | 	push	%r15 | 
 |  | 
	# save rsp
 | 	mov	%rsp, RSP_SAVE | 
 | 	sub     $FRAMESZ, %rsp | 
 |  | 
	# align rsp to 32 bytes
 | 	and	$~0x1F, %rsp | 
 |  | 
 | 	## Initialize digests | 
 | 	vmovdqu  0*32(arg1), A | 
 | 	vmovdqu  1*32(arg1), B | 
 | 	vmovdqu  2*32(arg1), C | 
 | 	vmovdqu  3*32(arg1), D | 
 | 	vmovdqu  4*32(arg1), E | 
 |  | 
 | 	## transpose input onto stack | 
 | 	mov     _data_ptr+0*8(arg1),inp0 | 
 | 	mov     _data_ptr+1*8(arg1),inp1 | 
 | 	mov     _data_ptr+2*8(arg1),inp2 | 
 | 	mov     _data_ptr+3*8(arg1),inp3 | 
 | 	mov     _data_ptr+4*8(arg1),inp4 | 
 | 	mov     _data_ptr+5*8(arg1),inp5 | 
 | 	mov     _data_ptr+6*8(arg1),inp6 | 
 | 	mov     _data_ptr+7*8(arg1),inp7 | 
 |  | 
 | 	xor     IDX, IDX | 
 | lloop: | 
 | 	vmovdqu  PSHUFFLE_BYTE_FLIP_MASK(%rip), F | 
	I = 0
 | .rep 2 | 
 | 	VMOVPS   (inp0, IDX), T0 | 
 | 	VMOVPS   (inp1, IDX), T1 | 
 | 	VMOVPS   (inp2, IDX), T2 | 
 | 	VMOVPS   (inp3, IDX), T3 | 
 | 	VMOVPS   (inp4, IDX), T4 | 
 | 	VMOVPS   (inp5, IDX), T5 | 
 | 	VMOVPS   (inp6, IDX), T6 | 
 | 	VMOVPS   (inp7, IDX), T7 | 
 |  | 
 | 	TRANSPOSE8       T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 | 
 | 	vpshufb  F, T0, T0 | 
 | 	vmovdqu  T0, (I*8)*32(%rsp) | 
 | 	vpshufb  F, T1, T1 | 
 | 	vmovdqu  T1, (I*8+1)*32(%rsp) | 
 | 	vpshufb  F, T2, T2 | 
 | 	vmovdqu  T2, (I*8+2)*32(%rsp) | 
 | 	vpshufb  F, T3, T3 | 
 | 	vmovdqu  T3, (I*8+3)*32(%rsp) | 
 | 	vpshufb  F, T4, T4 | 
 | 	vmovdqu  T4, (I*8+4)*32(%rsp) | 
 | 	vpshufb  F, T5, T5 | 
 | 	vmovdqu  T5, (I*8+5)*32(%rsp) | 
 | 	vpshufb  F, T6, T6 | 
 | 	vmovdqu  T6, (I*8+6)*32(%rsp) | 
 | 	vpshufb  F, T7, T7 | 
 | 	vmovdqu  T7, (I*8+7)*32(%rsp) | 
 | 	add     $32, IDX | 
 | 	I = (I+1) | 
 | .endr | 
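	# The two iterations above consume one 64-byte block per lane
	# (2 x 32 bytes each), byte-swapping every dword to big-endian and
	# leaving the 16 message words W[0..15] on the stack as 8-lane rows.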
 | 	# save old digests | 
 | 	vmovdqu  A,AA | 
 | 	vmovdqu  B,BB | 
 | 	vmovdqu  C,CC | 
 | 	vmovdqu  D,DD | 
 | 	vmovdqu  E,EE | 
 |  | 
 | ## | 
 | ## perform 0-79 steps | 
 | ## | 
 | 	vmovdqu  K00_19(%rip), K | 
 | ## do rounds 0...15 | 
 | 	I = 0 | 
 | .rep 16 | 
 | 	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 | 
 | 	ROTATE_ARGS | 
 | 	I = (I+1) | 
 | .endr | 
 |  | 
 | ## do rounds 16...19 | 
 | 	vmovdqu  ((16 - 16) & 15) * 32 (%rsp), W16 | 
 | 	vmovdqu  ((16 - 15) & 15) * 32 (%rsp), W15 | 
 | .rep 4 | 
 | 	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 | 
 | 	ROTATE_ARGS | 
 | 	I = (I+1) | 
 | .endr | 
 |  | 
 | ## do rounds 20...39 | 
 | 	vmovdqu  K20_39(%rip), K | 
 | .rep 20 | 
 | 	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 | 
 | 	ROTATE_ARGS | 
 | 	I = (I+1) | 
 | .endr | 
 |  | 
 | ## do rounds 40...59 | 
 | 	vmovdqu  K40_59(%rip), K | 
 | .rep 20 | 
 | 	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 | 
 | 	ROTATE_ARGS | 
 | 	I = (I+1) | 
 | .endr | 
 |  | 
 | ## do rounds 60...79 | 
 | 	vmovdqu  K60_79(%rip), K | 
 | .rep 20 | 
 | 	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 | 
 | 	ROTATE_ARGS | 
 | 	I = (I+1) | 
 | .endr | 
 |  | 
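	# add the saved chaining values back into the new digest state
	# (the Davies-Meyer feed-forward)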
 | 	vpaddd   AA,A,A | 
 | 	vpaddd   BB,B,B | 
 | 	vpaddd   CC,C,C | 
 | 	vpaddd   DD,D,D | 
 | 	vpaddd   EE,E,E | 
 |  | 
 | 	sub     $1, arg2 | 
 | 	jne     lloop | 
 |  | 
 | 	# write out digests | 
 | 	vmovdqu  A, 0*32(arg1) | 
 | 	vmovdqu  B, 1*32(arg1) | 
 | 	vmovdqu  C, 2*32(arg1) | 
 | 	vmovdqu  D, 3*32(arg1) | 
 | 	vmovdqu  E, 4*32(arg1) | 
 |  | 
 | 	# update input pointers | 
 | 	add     IDX, inp0 | 
 | 	add     IDX, inp1 | 
 | 	add     IDX, inp2 | 
 | 	add     IDX, inp3 | 
 | 	add     IDX, inp4 | 
 | 	add     IDX, inp5 | 
 | 	add     IDX, inp6 | 
 | 	add     IDX, inp7 | 
	mov     inp0, _data_ptr + 0*8(arg1)
	mov     inp1, _data_ptr + 1*8(arg1)
	mov     inp2, _data_ptr + 2*8(arg1)
	mov     inp3, _data_ptr + 3*8(arg1)
	mov     inp4, _data_ptr + 4*8(arg1)
	mov     inp5, _data_ptr + 5*8(arg1)
	mov     inp6, _data_ptr + 6*8(arg1)
	mov     inp7, _data_ptr + 7*8(arg1)
 |  | 
 | 	################ | 
 | 	## Postamble | 
 |  | 
 | 	mov     RSP_SAVE, %rsp | 
 |  | 
 | 	# restore callee-saved clobbered registers | 
 | 	pop	%r15 | 
 | 	pop	%r14 | 
 | 	pop	%r13 | 
 | 	pop	%r12 | 
 |  | 
 | 	ret | 
 | ENDPROC(sha1_x8_avx2) | 
 |  | 
 |  | 
 | .section	.rodata.cst32.K00_19, "aM", @progbits, 32 | 
 | .align 32 | 
 | K00_19: | 
 | .octa 0x5A8279995A8279995A8279995A827999 | 
 | .octa 0x5A8279995A8279995A8279995A827999 | 
 |  | 
 | .section	.rodata.cst32.K20_39, "aM", @progbits, 32 | 
 | .align 32 | 
 | K20_39: | 
 | .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 | 
 | .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 | 
 |  | 
 | .section	.rodata.cst32.K40_59, "aM", @progbits, 32 | 
 | .align 32 | 
 | K40_59: | 
 | .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC | 
 | .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC | 
 |  | 
 | .section	.rodata.cst32.K60_79, "aM", @progbits, 32 | 
 | .align 32 | 
 | K60_79: | 
 | .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 | 
 | .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 | 
 |  | 
 | .section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 | 
 | .align 32 | 
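# vpshufb mask: byte-swaps each 32-bit word, converting little-endian
# input bytes to SHA-1's big-endian word order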
 | PSHUFFLE_BYTE_FLIP_MASK: | 
 | .octa 0x0c0d0e0f08090a0b0405060700010203 | 
 | .octa 0x0c0d0e0f08090a0b0405060700010203 |