| diff --git a/Configure b/Configure |
| index de78469..26743bb 100755 |
| --- a/Configure |
| +++ b/Configure |
| @@ -136,7 +136,8 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a |
| my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::"; |
| my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::"; |
| my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:"; |
| -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void"; |
| +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void"; |
| +my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:"; |
| my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32"; |
| my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64"; |
| my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::"; |
| @@ -350,6 +351,7 @@ my %table=( |
| # It's believed that the majority of ARM toolchains predefine an appropriate -march. |
| # If your compiler does not, do complement the config command line with one! |
| "linux-armv4", "gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", |
| +"linux-aarch64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", |
| #### IA-32 targets... |
| "linux-ia32-icc", "icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", |
| "linux-elf", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", |
| @@ -1503,7 +1505,7 @@ if ($rmd160_obj =~ /\.o$/) |
| } |
| if ($aes_obj =~ /\.o$/) |
| { |
| - $cflags.=" -DAES_ASM"; |
| + $cflags.=" -DAES_ASM" if ($aes_obj =~ m/\baes\-/); |
| # aes-ctr.o is not a real file, only indication that assembler |
| # module implements AES_ctr32_encrypt... |
| $cflags.=" -DAES_CTR_ASM" if ($aes_obj =~ s/\s*aes\-ctr\.o//); |
| @@ -1525,7 +1527,7 @@ else { |
| $wp_obj="wp_block.o"; |
| } |
| $cmll_obj=$cmll_enc unless ($cmll_obj =~ /.o$/); |
| -if ($modes_obj =~ /ghash/) |
| +if ($modes_obj =~ /ghash\-/) |
| { |
| $cflags.=" -DGHASH_ASM"; |
| } |
| diff --git a/config b/config |
| index 41fa2a6..dff7df7 100755 |
| --- a/config |
| +++ b/config |
| @@ -644,6 +644,7 @@ case "$GUESSOS" in |
| armv[1-3]*-*-linux2) OUT="linux-generic32" ;; |
| armv[7-9]*-*-linux2) OUT="linux-armv4"; options="$options -march=armv7-a" ;; |
| arm*-*-linux2) OUT="linux-armv4" ;; |
| + aarch64-*-linux2) OUT="linux-aarch64" ;; |
| sh*b-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; |
| sh*-*-linux2) OUT="linux-generic32"; options="$options -DL_ENDIAN" ;; |
| m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;; |
| diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile |
| index 45ede0a..9181a1a 100644 |
| --- a/crypto/aes/Makefile |
| +++ b/crypto/aes/Makefile |
| @@ -78,9 +78,15 @@ aes-parisc.s: asm/aes-parisc.pl |
| aes-mips.S: asm/aes-mips.pl |
| $(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@ |
| |
| +aesv8-armx.S: asm/aesv8-armx.pl |
| + $(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@ |
| +aesv8-armx.o: aesv8-armx.S |
| + |
| # GNU make "catch all" |
| aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) > $@ |
| aes-armv4.o: aes-armv4.S |
| +bsaes-%.S: asm/bsaes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ |
| +bsaes-armv7.o: bsaes-armv7.S |
| |
| files: |
| $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO |
| diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl |
| index 86b86c4..4f89170 100644 |
| --- a/crypto/aes/asm/aes-armv4.pl |
| +++ b/crypto/aes/asm/aes-armv4.pl |
| @@ -1,7 +1,7 @@ |
| #!/usr/bin/env perl |
| |
| # ==================================================================== |
| -# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| @@ -51,9 +51,23 @@ $key="r11"; |
| $rounds="r12"; |
| |
| $code=<<___; |
| -#include "arm_arch.h" |
| +#ifndef __KERNEL__ |
| +# include "arm_arch.h" |
| +#else |
| +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ |
| +#endif |
| + |
| .text |
| +#if __ARM_ARCH__<7 |
| +.code 32 |
| +#else |
| +.syntax unified |
| +# ifdef __thumb2__ |
| +.thumb |
| +# else |
| .code 32 |
| +# endif |
| +#endif |
| |
| .type AES_Te,%object |
| .align 5 |
| @@ -167,7 +181,11 @@ AES_Te: |
| .type AES_encrypt,%function |
| .align 5 |
| AES_encrypt: |
| +#if __ARM_ARCH__<7 |
| sub r3,pc,#8 @ AES_encrypt |
| +#else |
| + adr r3,AES_encrypt |
| +#endif |
| stmdb sp!,{r1,r4-r12,lr} |
| mov $rounds,r0 @ inp |
| mov $key,r2 |
| @@ -409,11 +427,21 @@ _armv4_AES_encrypt: |
| .align 5 |
| private_AES_set_encrypt_key: |
| _armv4_AES_set_encrypt_key: |
| +#if __ARM_ARCH__<7 |
| sub r3,pc,#8 @ AES_set_encrypt_key |
| +#else |
| + adr r3,private_AES_set_encrypt_key |
| +#endif |
| teq r0,#0 |
| +#if __ARM_ARCH__>=7 |
| + itt eq @ Thumb2 thing, sanity check in ARM |
| +#endif |
| moveq r0,#-1 |
| beq .Labrt |
| teq r2,#0 |
| +#if __ARM_ARCH__>=7 |
| + itt eq @ Thumb2 thing, sanity check in ARM |
| +#endif |
| moveq r0,#-1 |
| beq .Labrt |
| |
| @@ -422,6 +450,9 @@ _armv4_AES_set_encrypt_key: |
| teq r1,#192 |
| beq .Lok |
| teq r1,#256 |
| +#if __ARM_ARCH__>=7 |
| + itt ne @ Thumb2 thing, sanity check in ARM |
| +#endif |
| movne r0,#-1 |
| bne .Labrt |
| |
| @@ -576,6 +607,9 @@ _armv4_AES_set_encrypt_key: |
| str $s2,[$key,#-16] |
| subs $rounds,$rounds,#1 |
| str $s3,[$key,#-12] |
| +#if __ARM_ARCH__>=7 |
| + itt eq @ Thumb2 thing, sanity check in ARM |
| +#endif |
| subeq r2,$key,#216 |
| beq .Ldone |
| |
| @@ -645,6 +679,9 @@ _armv4_AES_set_encrypt_key: |
| str $s2,[$key,#-24] |
| subs $rounds,$rounds,#1 |
| str $s3,[$key,#-20] |
| +#if __ARM_ARCH__>=7 |
| + itt eq @ Thumb2 thing, sanity check in ARM |
| +#endif |
| subeq r2,$key,#256 |
| beq .Ldone |
| |
| @@ -674,11 +711,17 @@ _armv4_AES_set_encrypt_key: |
| str $i3,[$key,#-4] |
| b .L256_loop |
| |
| +.align 2 |
| .Ldone: mov r0,#0 |
| ldmia sp!,{r4-r12,lr} |
| -.Labrt: tst lr,#1 |
| +.Labrt: |
| +#if __ARM_ARCH__>=5 |
| + ret @ bx lr |
| +#else |
| + tst lr,#1 |
| moveq pc,lr @ be binary compatible with V4, yet |
| bx lr @ interoperable with Thumb ISA:-) |
| +#endif |
| .size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key |
| |
| .global private_AES_set_decrypt_key |
| @@ -688,34 +731,57 @@ private_AES_set_decrypt_key: |
| str lr,[sp,#-4]! @ push lr |
| bl _armv4_AES_set_encrypt_key |
| teq r0,#0 |
| - ldrne lr,[sp],#4 @ pop lr |
| + ldr lr,[sp],#4 @ pop lr |
| bne .Labrt |
| |
| - stmdb sp!,{r4-r12} |
| + mov r0,r2 @ AES_set_encrypt_key preserves r2, |
| + mov r1,r2 @ which is AES_KEY *key |
| + b _armv4_AES_set_enc2dec_key |
| +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key |
| |
| - ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2, |
| - mov $key,r2 @ which is AES_KEY *key |
| - mov $i1,r2 |
| - add $i2,r2,$rounds,lsl#4 |
| +@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out) |
| +.global AES_set_enc2dec_key |
| +.type AES_set_enc2dec_key,%function |
| +.align 5 |
| +AES_set_enc2dec_key: |
| +_armv4_AES_set_enc2dec_key: |
| + stmdb sp!,{r4-r12,lr} |
| + |
| + ldr $rounds,[r0,#240] |
| + mov $i1,r0 @ input |
| + add $i2,r0,$rounds,lsl#4 |
| + mov $key,r1 @ output |
| + add $tbl,r1,$rounds,lsl#4 |
| + str $rounds,[r1,#240] |
| + |
| +.Linv: ldr $s0,[$i1],#16 |
| + ldr $s1,[$i1,#-12] |
| + ldr $s2,[$i1,#-8] |
| + ldr $s3,[$i1,#-4] |
| + ldr $t1,[$i2],#-16 |
| + ldr $t2,[$i2,#16+4] |
| + ldr $t3,[$i2,#16+8] |
| + ldr $i3,[$i2,#16+12] |
| + str $s0,[$tbl],#-16 |
| + str $s1,[$tbl,#16+4] |
| + str $s2,[$tbl,#16+8] |
| + str $s3,[$tbl,#16+12] |
| + str $t1,[$key],#16 |
| + str $t2,[$key,#-12] |
| + str $t3,[$key,#-8] |
| + str $i3,[$key,#-4] |
| + teq $i1,$i2 |
| + bne .Linv |
| |
| -.Linv: ldr $s0,[$i1] |
| + ldr $s0,[$i1] |
| ldr $s1,[$i1,#4] |
| ldr $s2,[$i1,#8] |
| ldr $s3,[$i1,#12] |
| - ldr $t1,[$i2] |
| - ldr $t2,[$i2,#4] |
| - ldr $t3,[$i2,#8] |
| - ldr $i3,[$i2,#12] |
| - str $s0,[$i2],#-16 |
| - str $s1,[$i2,#16+4] |
| - str $s2,[$i2,#16+8] |
| - str $s3,[$i2,#16+12] |
| - str $t1,[$i1],#16 |
| - str $t2,[$i1,#-12] |
| - str $t3,[$i1,#-8] |
| - str $i3,[$i1,#-4] |
| - teq $i1,$i2 |
| - bne .Linv |
| + str $s0,[$key] |
| + str $s1,[$key,#4] |
| + str $s2,[$key,#8] |
| + str $s3,[$key,#12] |
| + sub $key,$key,$rounds,lsl#3 |
| ___ |
| $mask80=$i1; |
| $mask1b=$i2; |
| @@ -773,7 +839,7 @@ $code.=<<___; |
| moveq pc,lr @ be binary compatible with V4, yet |
| bx lr @ interoperable with Thumb ISA:-) |
| #endif |
| -.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key |
| +.size AES_set_enc2dec_key,.-AES_set_enc2dec_key |
| |
| .type AES_Td,%object |
| .align 5 |
| @@ -883,7 +949,11 @@ AES_Td: |
| .type AES_decrypt,%function |
| .align 5 |
| AES_decrypt: |
| +#if __ARM_ARCH__<7 |
| sub r3,pc,#8 @ AES_decrypt |
| +#else |
| + adr r3,AES_decrypt |
| +#endif |
| stmdb sp!,{r1,r4-r12,lr} |
| mov $rounds,r0 @ inp |
| mov $key,r2 |
| @@ -1080,8 +1150,9 @@ _armv4_AES_decrypt: |
| ldrb $t3,[$tbl,$i3] @ Td4[s0>>0] |
| and $i3,lr,$s1,lsr#8 |
| |
| + add $s1,$tbl,$s1,lsr#24 |
| ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] |
| - ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] |
| + ldrb $s1,[$s1] @ Td4[s1>>24] |
| ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] |
| eor $s0,$i1,$s0,lsl#24 |
| ldrb $i3,[$tbl,$i3] @ Td4[s1>>8] |
| @@ -1094,7 +1165,8 @@ _armv4_AES_decrypt: |
| ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] |
| and $i3,lr,$s2,lsr#16 |
| |
| - ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] |
| + add $s2,$tbl,$s2,lsr#24 |
| + ldrb $s2,[$s2] @ Td4[s2>>24] |
| eor $s0,$s0,$i1,lsl#8 |
| ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] |
| eor $s1,$i2,$s1,lsl#16 |
| @@ -1106,8 +1178,9 @@ _armv4_AES_decrypt: |
| ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] |
| and $i3,lr,$s3 @ i2 |
| |
| + add $s3,$tbl,$s3,lsr#24 |
| ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] |
| - ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] |
| + ldrb $s3,[$s3] @ Td4[s3>>24] |
| eor $s0,$s0,$i1,lsl#16 |
| ldr $i1,[$key,#0] |
| eor $s1,$s1,$i2,lsl#8 |
| @@ -1130,5 +1203,15 @@ _armv4_AES_decrypt: |
| ___ |
| |
| $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
| +$code =~ s/\bret\b/bx\tlr/gm; |
| + |
| +open SELF,$0; |
| +while(<SELF>) { |
| + next if (/^#!/); |
| + last if (!s/^#/@/ and !/^$/); |
| + print; |
| +} |
| +close SELF; |
| + |
| print $code; |
| close STDOUT; # enforce flush |
| diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl |
| new file mode 100755 |
| index 0000000..415dc04 |
| --- /dev/null |
| +++ b/crypto/aes/asm/aesv8-armx.pl |
| @@ -0,0 +1,980 @@ |
| +#!/usr/bin/env perl |
| +# |
| +# ==================================================================== |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# project. The module is, however, dual licensed under OpenSSL and |
| +# CRYPTOGAMS licenses depending on where you obtain it. For further |
| +# details see http://www.openssl.org/~appro/cryptogams/. |
| +# ==================================================================== |
| +# |
| +# This module implements support for the ARMv8 AES instructions. The |
| +# module is endian-agnostic in the sense that it supports both big- |
| +# and little-endian cases, and it supports both 32- and 64-bit modes |
| +# of operation. The latter is achieved by limiting the number of |
| +# utilized registers to 16, which implies additional instructions. |
| +# This has no effect on the mighty Apple A7, as results are literally |
| +# equal to the theoretical estimates based on instruction latencies |
| +# and issue rate. It remains to be seen how it affects other |
| +# platforms... |
| +# |
| +# Performance in cycles per byte processed with 128-bit key: |
| +# |
| +# CBC enc CBC dec CTR |
| +# Apple A7 2.39 1.20 1.20 |
| +# Cortex-A5x n/a n/a n/a |
| + |
| +$flavour = shift; |
| +open STDOUT,">".shift; |
| + |
| +$prefix="aes_v8"; |
| + |
| +$code=<<___; |
| +#include "arm_arch.h" |
| + |
| +#if __ARM_ARCH__>=7 |
| +.text |
| +___ |
| +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); |
| +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); |
| + |
| +# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax: |
| +# NEON is expressed mostly with 32-bit mnemonics, integer code mostly |
| +# with 64-bit ones. The goal is to maintain both 32- and 64-bit code |
| +# within a single module and transliterate common code to either |
| +# flavour with regex voodoo. |
| +# |
| +{{{ |
| +my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); |
| +my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= |
| + $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); |
| + |
| + |
| +$code.=<<___; |
| +.align 5 |
| +rcon: |
| +.long 0x01,0x01,0x01,0x01 |
| +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat |
| +.long 0x1b,0x1b,0x1b,0x1b |
| + |
| +.globl ${prefix}_set_encrypt_key |
| +.type ${prefix}_set_encrypt_key,%function |
| +.align 5 |
| +${prefix}_set_encrypt_key: |
| +.Lenc_key: |
| +___ |
| +$code.=<<___ if ($flavour =~ /64/); |
| + stp x29,x30,[sp,#-16]! |
| + add x29,sp,#0 |
| +___ |
| +$code.=<<___; |
| + adr $ptr,rcon |
| + cmp $bits,#192 |
| + |
| + veor $zero,$zero,$zero |
| + vld1.8 {$in0},[$inp],#16 |
| + mov $bits,#8 // reuse $bits |
| + vld1.32 {$rcon,$mask},[$ptr],#32 |
| + |
| + b.lt .Loop128 |
| + b.eq .L192 |
| + b .L256 |
| + |
| +.align 4 |
| +.Loop128: |
| + vtbl.8 $key,{$in0},$mask |
| + vext.8 $tmp,$zero,$in0,#12 |
| + vst1.32 {$in0},[$out],#16 |
| + aese $key,$zero |
| + subs $bits,$bits,#1 |
| + |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $key,$key,$rcon |
| + veor $in0,$in0,$tmp |
| + vshl.u8 $rcon,$rcon,#1 |
| + veor $in0,$in0,$key |
| + b.ne .Loop128 |
| + |
| + vld1.32 {$rcon},[$ptr] |
| + |
| + vtbl.8 $key,{$in0},$mask |
| + vext.8 $tmp,$zero,$in0,#12 |
| + vst1.32 {$in0},[$out],#16 |
| + aese $key,$zero |
| + |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $key,$key,$rcon |
| + veor $in0,$in0,$tmp |
| + vshl.u8 $rcon,$rcon,#1 |
| + veor $in0,$in0,$key |
| + |
| + vtbl.8 $key,{$in0},$mask |
| + vext.8 $tmp,$zero,$in0,#12 |
| + vst1.32 {$in0},[$out],#16 |
| + aese $key,$zero |
| + |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $key,$key,$rcon |
| + veor $in0,$in0,$tmp |
| + veor $in0,$in0,$key |
| + vst1.32 {$in0},[$out] |
| + add $out,$out,#0x50 |
| + |
| + mov $rounds,#10 |
| + b .Ldone |
| + |
| +.align 4 |
| +.L192: |
| + vld1.8 {$in1},[$inp],#8 |
| + vmov.i8 $key,#8 // borrow $key |
| + vst1.32 {$in0},[$out],#16 |
| + vsub.i8 $mask,$mask,$key // adjust the mask |
| + |
| +.Loop192: |
| + vtbl.8 $key,{$in1},$mask |
| + vext.8 $tmp,$zero,$in0,#12 |
| + vst1.32 {$in1},[$out],#8 |
| + aese $key,$zero |
| + subs $bits,$bits,#1 |
| + |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $in0,$in0,$tmp |
| + |
| + vdup.32 $tmp,${in0}[3] |
| + veor $tmp,$tmp,$in1 |
| + veor $key,$key,$rcon |
| + vext.8 $in1,$zero,$in1,#12 |
| + vshl.u8 $rcon,$rcon,#1 |
| + veor $in1,$in1,$tmp |
| + veor $in0,$in0,$key |
| + veor $in1,$in1,$key |
| + vst1.32 {$in0},[$out],#16 |
| + b.ne .Loop192 |
| + |
| + mov $rounds,#12 |
| + add $out,$out,#0x20 |
| + b .Ldone |
| + |
| +.align 4 |
| +.L256: |
| + vld1.8 {$in1},[$inp] |
| + mov $bits,#7 |
| + mov $rounds,#14 |
| + vst1.32 {$in0},[$out],#16 |
| + |
| +.Loop256: |
| + vtbl.8 $key,{$in1},$mask |
| + vext.8 $tmp,$zero,$in0,#12 |
| + vst1.32 {$in1},[$out],#16 |
| + aese $key,$zero |
| + subs $bits,$bits,#1 |
| + |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $in0,$in0,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $key,$key,$rcon |
| + veor $in0,$in0,$tmp |
| + vshl.u8 $rcon,$rcon,#1 |
| + veor $in0,$in0,$key |
| + vst1.32 {$in0},[$out],#16 |
| + b.eq .Ldone |
| + |
| + vdup.32 $key,${in0}[3] // just splat |
| + vext.8 $tmp,$zero,$in1,#12 |
| + aese $key,$zero |
| + |
| + veor $in1,$in1,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $in1,$in1,$tmp |
| + vext.8 $tmp,$zero,$tmp,#12 |
| + veor $in1,$in1,$tmp |
| + |
| + veor $in1,$in1,$key |
| + b .Loop256 |
| + |
| +.Ldone: |
| + str $rounds,[$out] |
| + |
| + eor x0,x0,x0 // return value |
| + `"ldr x29,[sp],#16" if ($flavour =~ /64/)` |
| + ret |
| +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key |
| + |
| +.globl ${prefix}_set_decrypt_key |
| +.type ${prefix}_set_decrypt_key,%function |
| +.align 5 |
| +${prefix}_set_decrypt_key: |
| +___ |
| +$code.=<<___ if ($flavour =~ /64/); |
| + stp x29,x30,[sp,#-16]! |
| + add x29,sp,#0 |
| +___ |
| +$code.=<<___ if ($flavour !~ /64/); |
| + stmdb sp!,{r4,lr} |
| +___ |
| +$code.=<<___; |
| + bl .Lenc_key |
| + |
| + sub $out,$out,#240 // restore original $out |
| + mov x4,#-16 |
| + add $inp,$out,x12,lsl#4 // end of key schedule |
| + |
| + vld1.32 {v0.16b},[$out] |
| + vld1.32 {v1.16b},[$inp] |
| + vst1.32 {v0.16b},[$inp],x4 |
| + vst1.32 {v1.16b},[$out],#16 |
| + |
| +.Loop_imc: |
| + vld1.32 {v0.16b},[$out] |
| + vld1.32 {v1.16b},[$inp] |
| + aesimc v0.16b,v0.16b |
| + aesimc v1.16b,v1.16b |
| + vst1.32 {v0.16b},[$inp],x4 |
| + vst1.32 {v1.16b},[$out],#16 |
| + cmp $inp,$out |
| + b.hi .Loop_imc |
| + |
| + vld1.32 {v0.16b},[$out] |
| + aesimc v0.16b,v0.16b |
| + vst1.32 {v0.16b},[$inp] |
| + |
| + eor x0,x0,x0 // return value |
| +___ |
| +$code.=<<___ if ($flavour !~ /64/); |
| + ldmia sp!,{r4,pc} |
| +___ |
| +$code.=<<___ if ($flavour =~ /64/); |
| + ldp x29,x30,[sp],#16 |
| + ret |
| +___ |
| +$code.=<<___; |
| +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key |
| +___ |
| +}}} |
| +{{{ |
| +sub gen_block () { |
| +my $dir = shift; |
| +my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); |
| +my ($inp,$out,$key)=map("x$_",(0..2)); |
| +my $rounds="w3"; |
| +my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); |
| + |
| +$code.=<<___; |
| +.globl ${prefix}_${dir}crypt |
| +.type ${prefix}_${dir}crypt,%function |
| +.align 5 |
| +${prefix}_${dir}crypt: |
| + ldr $rounds,[$key,#240] |
| + vld1.32 {$rndkey0},[$key],#16 |
| + vld1.8 {$inout},[$inp] |
| + sub $rounds,$rounds,#2 |
| + vld1.32 {$rndkey1},[$key],#16 |
| + |
| +.Loop_${dir}c: |
| + aes$e $inout,$rndkey0 |
| + vld1.32 {$rndkey0},[$key],#16 |
| + aes$mc $inout,$inout |
| + subs $rounds,$rounds,#2 |
| + aes$e $inout,$rndkey1 |
| + vld1.32 {$rndkey1},[$key],#16 |
| + aes$mc $inout,$inout |
| + b.gt .Loop_${dir}c |
| + |
| + aes$e $inout,$rndkey0 |
| + vld1.32 {$rndkey0},[$key] |
| + aes$mc $inout,$inout |
| + aes$e $inout,$rndkey1 |
| + veor $inout,$inout,$rndkey0 |
| + |
| + vst1.8 {$inout},[$out] |
| + ret |
| +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt |
| +___ |
| +} |
| +&gen_block("en"); |
| +&gen_block("de"); |
| +}}} |
| +{{{ |
| +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; |
| +my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); |
| +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); |
| + |
| +my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); |
| + |
| +### q8-q15 preloaded key schedule |
| + |
| +$code.=<<___; |
| +.globl ${prefix}_cbc_encrypt |
| +.type ${prefix}_cbc_encrypt,%function |
| +.align 5 |
| +${prefix}_cbc_encrypt: |
| +___ |
| +$code.=<<___ if ($flavour =~ /64/); |
| + stp x29,x30,[sp,#-16]! |
| + add x29,sp,#0 |
| +___ |
| +$code.=<<___ if ($flavour !~ /64/); |
| + mov ip,sp |
| + stmdb sp!,{r4-r8,lr} |
| + vstmdb sp!,{d8-d15} @ ABI specification says so |
| + ldmia ip,{r4-r5} @ load remaining args |
| +___ |
| +$code.=<<___; |
| + subs $len,$len,#16 |
| + mov $step,#16 |
| + b.lo .Lcbc_abort |
| + cclr $step,eq |
| + |
| + cmp $enc,#0 // en- or decrypting? |
| + ldr $rounds,[$key,#240] |
| + and $len,$len,#-16 |
| + vld1.8 {$ivec},[$ivp] |
| + vld1.8 {$dat},[$inp],$step |
| + |
| + vld1.32 {q8-q9},[$key] // load key schedule... |
| + sub $rounds,$rounds,#6 |
| + add $key_,$key,x5,lsl#4 // pointer to last 7 round keys |
| + sub $rounds,$rounds,#2 |
| + vld1.32 {q10-q11},[$key_],#32 |
| + vld1.32 {q12-q13},[$key_],#32 |
| + vld1.32 {q14-q15},[$key_],#32 |
| + vld1.32 {$rndlast},[$key_] |
| + |
| + add $key_,$key,#32 |
| + mov $cnt,$rounds |
| + b.eq .Lcbc_dec |
| + |
| + cmp $rounds,#2 |
| + veor $dat,$dat,$ivec |
| + veor $rndzero_n_last,q8,$rndlast |
| + b.eq .Lcbc_enc128 |
| + |
| +.Loop_cbc_enc: |
| + aese $dat,q8 |
| + vld1.32 {q8},[$key_],#16 |
| + aesmc $dat,$dat |
| + subs $cnt,$cnt,#2 |
| + aese $dat,q9 |
| + vld1.32 {q9},[$key_],#16 |
| + aesmc $dat,$dat |
| + b.gt .Loop_cbc_enc |
| + |
| + aese $dat,q8 |
| + aesmc $dat,$dat |
| + subs $len,$len,#16 |
| + aese $dat,q9 |
| + aesmc $dat,$dat |
| + cclr $step,eq |
| + aese $dat,q10 |
| + aesmc $dat,$dat |
| + add $key_,$key,#16 |
| + aese $dat,q11 |
| + aesmc $dat,$dat |
| + vld1.8 {q8},[$inp],$step |
| + aese $dat,q12 |
| + aesmc $dat,$dat |
| + veor q8,q8,$rndzero_n_last |
| + aese $dat,q13 |
| + aesmc $dat,$dat |
| + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] |
| + aese $dat,q14 |
| + aesmc $dat,$dat |
| + aese $dat,q15 |
| + |
| + mov $cnt,$rounds |
| + veor $ivec,$dat,$rndlast |
| + vst1.8 {$ivec},[$out],#16 |
| + b.hs .Loop_cbc_enc |
| + |
| + b .Lcbc_done |
| + |
| +.align 5 |
| +.Lcbc_enc128: |
| + vld1.32 {$in0-$in1},[$key_] |
| + aese $dat,q8 |
| + aesmc $dat,$dat |
| + b .Lenter_cbc_enc128 |
| +.Loop_cbc_enc128: |
| + aese $dat,q8 |
| + aesmc $dat,$dat |
| + vst1.8 {$ivec},[$out],#16 |
| +.Lenter_cbc_enc128: |
| + aese $dat,q9 |
| + aesmc $dat,$dat |
| + subs $len,$len,#16 |
| + aese $dat,$in0 |
| + aesmc $dat,$dat |
| + cclr $step,eq |
| + aese $dat,$in1 |
| + aesmc $dat,$dat |
| + aese $dat,q10 |
| + aesmc $dat,$dat |
| + aese $dat,q11 |
| + aesmc $dat,$dat |
| + vld1.8 {q8},[$inp],$step |
| + aese $dat,q12 |
| + aesmc $dat,$dat |
| + aese $dat,q13 |
| + aesmc $dat,$dat |
| + aese $dat,q14 |
| + aesmc $dat,$dat |
| + veor q8,q8,$rndzero_n_last |
| + aese $dat,q15 |
| + veor $ivec,$dat,$rndlast |
| + b.hs .Loop_cbc_enc128 |
| + |
| + vst1.8 {$ivec},[$out],#16 |
| + b .Lcbc_done |
| + |
| +.align 5 |
| +.Lcbc_dec128: |
| + vld1.32 {$tmp0-$tmp1},[$key_] |
| + veor $ivec,$ivec,$rndlast |
| + veor $in0,$dat0,$rndlast |
| + mov $step1,$step |
| + |
| +.Loop2x_cbc_dec128: |
| + aesd $dat0,q8 |
| + aesd $dat1,q8 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + subs $len,$len,#32 |
| + aesd $dat0,q9 |
| + aesd $dat1,q9 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + cclr $step,lo |
| + aesd $dat0,$tmp0 |
| + aesd $dat1,$tmp0 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + cclr $step1,ls |
| + aesd $dat0,$tmp1 |
| + aesd $dat1,$tmp1 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + aesd $dat0,q10 |
| + aesd $dat1,q10 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + aesd $dat0,q11 |
| + aesd $dat1,q11 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + aesd $dat0,q12 |
| + aesd $dat1,q12 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + aesd $dat0,q13 |
| + aesd $dat1,q13 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + aesd $dat0,q14 |
| + aesd $dat1,q14 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + aesd $dat0,q15 |
| + aesd $dat1,q15 |
| + |
| + veor $ivec,$ivec,$dat0 |
| + vld1.8 {$dat0},[$inp],$step |
| + veor $in0,$in0,$dat1 |
| + vld1.8 {$dat1},[$inp],$step1 |
| + vst1.8 {$ivec},[$out],#16 |
| + veor $ivec,$in1,$rndlast |
| + vst1.8 {$in0},[$out],#16 |
| + veor $in0,$dat0,$rndlast |
| + vorr $in1,$dat1,$dat1 |
| + b.hs .Loop2x_cbc_dec128 |
| + |
| + adds $len,$len,#32 |
| + veor $ivec,$ivec,$rndlast |
| + b.eq .Lcbc_done |
| + veor $in0,$in0,$rndlast |
| + b .Lcbc_dec_tail |
| + |
| +.align 5 |
| +.Lcbc_dec: |
| + subs $len,$len,#16 |
| + vorr $in0,$dat,$dat |
| + b.lo .Lcbc_dec_tail |
| + |
| + cclr $step,eq |
| + cmp $rounds,#2 |
| + vld1.8 {$dat1},[$inp],$step |
| + vorr $in1,$dat1,$dat1 |
| + b.eq .Lcbc_dec128 |
| + |
| +.Loop2x_cbc_dec: |
| + aesd $dat0,q8 |
| + aesd $dat1,q8 |
| + vld1.32 {q8},[$key_],#16 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + subs $cnt,$cnt,#2 |
| + aesd $dat0,q9 |
| + aesd $dat1,q9 |
| + vld1.32 {q9},[$key_],#16 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + b.gt .Loop2x_cbc_dec |
| + |
| + aesd $dat0,q8 |
| + aesd $dat1,q8 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + veor $tmp0,$ivec,$rndlast |
| + veor $tmp1,$in0,$rndlast |
| + aesd $dat0,q9 |
| + aesd $dat1,q9 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + vorr $ivec,$in1,$in1 |
| + subs $len,$len,#32 |
| + aesd $dat0,q10 |
| + aesd $dat1,q10 |
| + aesimc $dat0,$dat0 |
| + cclr $step,lo |
| + aesimc $dat1,$dat1 |
| + mov $key_,$key |
| + aesd $dat0,q11 |
| + aesd $dat1,q11 |
| + aesimc $dat0,$dat0 |
| + vld1.8 {$in0},[$inp],$step |
| + aesimc $dat1,$dat1 |
| + cclr $step,ls |
| + aesd $dat0,q12 |
| + aesd $dat1,q12 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + vld1.8 {$in1},[$inp],$step |
| + aesd $dat0,q13 |
| + aesd $dat1,q13 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] |
| + aesd $dat0,q14 |
| + aesd $dat1,q14 |
| + aesimc $dat0,$dat0 |
| + aesimc $dat1,$dat1 |
| + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] |
| + aesd $dat0,q15 |
| + aesd $dat1,q15 |
| + |
| + mov $cnt,$rounds |
| + veor $tmp0,$tmp0,$dat0 |
| + veor $tmp1,$tmp1,$dat1 |
| + vorr $dat0,$in0,$in0 |
| + vst1.8 {$tmp0},[$out],#16 |
| + vorr $dat1,$in1,$in1 |
| + vst1.8 {$tmp1},[$out],#16 |
| + b.hs .Loop2x_cbc_dec |
| + |
| + adds $len,$len,#32 |
| + b.eq .Lcbc_done |
| + |
| +.Lcbc_dec_tail: |
| + aesd $dat,q8 |
| + vld1.32 {q8},[$key_],#16 |
| + aesimc $dat,$dat |
| + subs $cnt,$cnt,#2 |
| + aesd $dat,q9 |
| + vld1.32 {q9},[$key_],#16 |
| + aesimc $dat,$dat |
| + b.gt .Lcbc_dec_tail |
| + |
| + aesd $dat,q8 |
| + aesimc $dat,$dat |
| + aesd $dat,q9 |
| + aesimc $dat,$dat |
| + veor $tmp,$ivec,$rndlast |
| + aesd $dat,q10 |
| + aesimc $dat,$dat |
| + vorr $ivec,$in0,$in0 |
| + aesd $dat,q11 |
| + aesimc $dat,$dat |
| + aesd $dat,q12 |
| + aesimc $dat,$dat |
| + aesd $dat,q13 |
| + aesimc $dat,$dat |
| + aesd $dat,q14 |
| + aesimc $dat,$dat |
| + aesd $dat,q15 |
| + |
| + veor $tmp,$tmp,$dat |
| + vst1.8 {$tmp},[$out],#16 |
| + |
| +.Lcbc_done: |
| + vst1.8 {$ivec},[$ivp] |
| +.Lcbc_abort: |
| +___ |
| +$code.=<<___ if ($flavour !~ /64/); |
| + vldmia sp!,{d8-d15} |
| + ldmia sp!,{r4-r8,pc} |
| +___ |
| +$code.=<<___ if ($flavour =~ /64/); |
| + ldr x29,[sp],#16 |
| + ret |
| +___ |
| +$code.=<<___; |
| +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt |
| +___ |
| +}}} |
| +{{{ |
| +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); |
| +my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10"); |
| +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); |
| + |
| +my ($dat,$tmp)=($dat0,$tmp0); |
| + |
| +### q8-q15 preloaded key schedule |
| + |
| +$code.=<<___; |
| +.globl ${prefix}_ctr32_encrypt_blocks |
| +.type ${prefix}_ctr32_encrypt_blocks,%function |
| +.align 5 |
| +${prefix}_ctr32_encrypt_blocks: |
| +___ |
| +$code.=<<___ if ($flavour =~ /64/); |
| + stp x29,x30,[sp,#-16]! |
| + add x29,sp,#0 |
| +___ |
| +$code.=<<___ if ($flavour !~ /64/); |
| + mov ip,sp |
| + stmdb sp!,{r4-r10,lr} |
| + vstmdb sp!,{d8-d15} @ ABI specification says so |
| + ldr r4, [ip] @ load remaining arg |
| +___ |
| +$code.=<<___; |
| + ldr $rounds,[$key,#240] |
| + |
| + ldr $ctr, [$ivp, #12] |
| + vld1.32 {$dat0},[$ivp] |
| + |
| + vld1.32 {q8-q9},[$key] // load key schedule... |
| + sub $rounds,$rounds,#6 |
| + add $key_,$key,x5,lsl#4 // pointer to last 7 round keys |
| + sub $rounds,$rounds,#2 |
| + vld1.32 {q10-q11},[$key_],#32 |
| + vld1.32 {q12-q13},[$key_],#32 |
| + vld1.32 {q14-q15},[$key_],#32 |
| + vld1.32 {$rndlast},[$key_] |
| + |
| + add $key_,$key,#32 |
| + mov $cnt,$rounds |
| + |
| + subs $len,$len,#2 |
| + b.lo .Lctr32_tail |
| + |
| +#ifndef __ARMEB__ |
| + rev $ctr, $ctr |
| +#endif |
| + vorr $dat1,$dat0,$dat0 |
| + add $ctr, $ctr, #1 |
| + vorr $ivec,$dat0,$dat0 |
| + rev $tctr1, $ctr |
| + cmp $rounds,#2 |
| + vmov.32 ${dat1}[3],$tctr1 |
| + b.eq .Lctr32_128 |
| + |
| +.Loop2x_ctr32: |
| + aese $dat0,q8 |
| + aese $dat1,q8 |
| + vld1.32 {q8},[$key_],#16 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + subs $cnt,$cnt,#2 |
| + aese $dat0,q9 |
| + aese $dat1,q9 |
| + vld1.32 {q9},[$key_],#16 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + b.gt .Loop2x_ctr32 |
| + |
| + aese $dat0,q8 |
| + aese $dat1,q8 |
| + aesmc $tmp0,$dat0 |
| + vorr $dat0,$ivec,$ivec |
| + aesmc $tmp1,$dat1 |
| + vorr $dat1,$ivec,$ivec |
| + aese $tmp0,q9 |
| + aese $tmp1,q9 |
| + vld1.8 {$in0},[$inp],#16 |
| + aesmc $tmp0,$tmp0 |
| + vld1.8 {$in1},[$inp],#16 |
| + aesmc $tmp1,$tmp1 |
| + add $ctr,$ctr,#1 |
| + aese $tmp0,q10 |
| + aese $tmp1,q10 |
| + rev $tctr,$ctr |
| + aesmc $tmp0,$tmp0 |
| + aesmc $tmp1,$tmp1 |
| + add $ctr,$ctr,#1 |
| + aese $tmp0,q11 |
| + aese $tmp1,q11 |
| + veor $in0,$in0,$rndlast |
| + rev $tctr1,$ctr |
| + aesmc $tmp0,$tmp0 |
| + aesmc $tmp1,$tmp1 |
| + veor $in1,$in1,$rndlast |
| + mov $key_,$key |
| + aese $tmp0,q12 |
| + aese $tmp1,q12 |
| + subs $len,$len,#2 |
| + aesmc $tmp0,$tmp0 |
| + aesmc $tmp1,$tmp1 |
| + vld1.32 {q8-q9},[$key_],#32 // re-pre-load rndkey[0-1] |
| + aese $tmp0,q13 |
| + aese $tmp1,q13 |
| + aesmc $tmp0,$tmp0 |
| + aesmc $tmp1,$tmp1 |
| + aese $tmp0,q14 |
| + aese $tmp1,q14 |
| + vmov.32 ${dat0}[3], $tctr |
| + aesmc $tmp0,$tmp0 |
| + vmov.32 ${dat1}[3], $tctr1 |
| + aesmc $tmp1,$tmp1 |
| + aese $tmp0,q15 |
| + aese $tmp1,q15 |
| + |
| + mov $cnt,$rounds |
| + veor $in0,$in0,$tmp0 |
| + veor $in1,$in1,$tmp1 |
| + vst1.8 {$in0},[$out],#16 |
| + vst1.8 {$in1},[$out],#16 |
| + b.hs .Loop2x_ctr32 |
| + |
| + adds $len,$len,#2 |
| + b.eq .Lctr32_done |
| + b .Lctr32_tail |
| + |
| +.Lctr32_128: |
| + vld1.32 {$tmp0-$tmp1},[$key_] |
| + |
| +.Loop2x_ctr32_128: |
| + aese $dat0,q8 |
| + aese $dat1,q8 |
| + aesmc $dat0,$dat0 |
| + vld1.8 {$in0},[$inp],#16 |
| + aesmc $dat1,$dat1 |
| + vld1.8 {$in1},[$inp],#16 |
| + aese $dat0,q9 |
| + aese $dat1,q9 |
| + add $ctr,$ctr,#1 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + rev $tctr,$ctr |
| + aese $dat0,$tmp0 |
| + aese $dat1,$tmp0 |
| + add $ctr,$ctr,#1 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + rev $tctr1,$ctr |
| + aese $dat0,$tmp1 |
| + aese $dat1,$tmp1 |
| + subs $len,$len,#2 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + aese $dat0,q10 |
| + aese $dat1,q10 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + aese $dat0,q11 |
| + aese $dat1,q11 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + aese $dat0,q12 |
| + aese $dat1,q12 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + aese $dat0,q13 |
| + aese $dat1,q13 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + aese $dat0,q14 |
| + aese $dat1,q14 |
| + aesmc $dat0,$dat0 |
| + aesmc $dat1,$dat1 |
| + veor $in0,$in0,$rndlast |
| + aese $dat0,q15 |
| + veor $in1,$in1,$rndlast |
| + aese $dat1,q15 |
| + |
| + veor $in0,$in0,$dat0 |
| + vorr $dat0,$ivec,$ivec |
| + veor $in1,$in1,$dat1 |
| + vorr $dat1,$ivec,$ivec |
| + vst1.8 {$in0},[$out],#16 |
| + vmov.32 ${dat0}[3], $tctr |
| + vst1.8 {$in1},[$out],#16 |
| + vmov.32 ${dat1}[3], $tctr1 |
| + b.hs .Loop2x_ctr32_128 |
| + |
| + adds $len,$len,#2 |
| + b.eq .Lctr32_done |
| + |
| +.Lctr32_tail: |
| + aese $dat,q8 |
| + vld1.32 {q8},[$key_],#16 |
| + aesmc $dat,$dat |
| + subs $cnt,$cnt,#2 |
| + aese $dat,q9 |
| + vld1.32 {q9},[$key_],#16 |
| + aesmc $dat,$dat |
| + b.gt .Lctr32_tail |
| + |
| + aese $dat,q8 |
| + aesmc $dat,$dat |
| + aese $dat,q9 |
| + aesmc $dat,$dat |
| + vld1.8 {$in0},[$inp] |
| + aese $dat,q10 |
| + aesmc $dat,$dat |
| + aese $dat,q11 |
| + aesmc $dat,$dat |
| + aese $dat,q12 |
| + aesmc $dat,$dat |
| + aese $dat,q13 |
| + aesmc $dat,$dat |
| + aese $dat,q14 |
| + aesmc $dat,$dat |
| + veor $in0,$in0,$rndlast |
| + aese $dat,q15 |
| + |
| + veor $in0,$in0,$dat |
| + vst1.8 {$in0},[$out] |
| + |
| +.Lctr32_done: |
| +___ |
| +$code.=<<___ if ($flavour !~ /64/); |
| + vldmia sp!,{d8-d15} |
| + ldmia sp!,{r4-r10,pc} |
| +___ |
| +$code.=<<___ if ($flavour =~ /64/); |
| + ldr x29,[sp],#16 |
| + ret |
| +___ |
| +$code.=<<___; |
| +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks |
| +___ |
| +}}} |
| +$code.=<<___; |
| +#endif |
| +___ |
| +######################################## |
| +if ($flavour =~ /64/) { ######## 64-bit code |
| + my %opcode = ( |
| + "aesd" => 0x4e285800, "aese" => 0x4e284800, |
| + "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); |
| + |
| + local *unaes = sub { |
| + my ($mnemonic,$arg)=@_; |
| + |
| + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && |
| + sprintf ".inst\t0x%08x\t//%s %s", |
| + $opcode{$mnemonic}|$1|($2<<5), |
| + $mnemonic,$arg; |
| + }; |
| + |
| + foreach(split("\n",$code)) { |
| + s/\`([^\`]*)\`/eval($1)/geo; |
| + |
| + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers |
| + s/@\s/\/\//o; # old->new style commentary |
| + |
| + #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or |
| + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or |
| + s/vmov\.i8/movi/o or # fix up legacy mnemonics |
| + s/vext\.8/ext/o or |
| + s/vrev32\.8/rev32/o or |
| + s/vtst\.8/cmtst/o or |
| + s/vshr/ushr/o or |
| + s/^(\s+)v/$1/o or # strip off v prefix |
| + s/\bbx\s+lr\b/ret/o; |
| + |
| + # fix up remaining legacy suffixes |
| + s/\.[ui]?8//o; |
| + m/\],#8/o and s/\.16b/\.8b/go; |
| + s/\.[ui]?32//o and s/\.16b/\.4s/go; |
| + s/\.[ui]?64//o and s/\.16b/\.2d/go; |
| + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; |
| + |
| + print $_,"\n"; |
| + } |
| +} else { ######## 32-bit code |
| + my %opcode = ( |
| + "aesd" => 0xf3b00340, "aese" => 0xf3b00300, |
| + "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); |
| + |
| + local *unaes = sub { |
| + my ($mnemonic,$arg)=@_; |
| + |
| + if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { |
| + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) |
| + |(($2&7)<<1) |(($2&8)<<2); |
| + # The opcode is emitted as raw bytes, least-significant byte first, |
| + # since ARMv7 instructions are always encoded little-endian. The |
| + # correct solution is to use the .inst directive, but older |
| + # assemblers don't implement it:-( |
| + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", |
| + $word&0xff,($word>>8)&0xff, |
| + ($word>>16)&0xff,($word>>24)&0xff, |
| + $mnemonic,$arg; |
| + } |
| + }; |
| + |
| + sub unvtbl { |
| + my $arg=shift; |
| + |
| + $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && |
| + sprintf "vtbl.8 d%d,{q%d},d%d\n\t". |
| + "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; |
| + } |
| + |
| + sub unvdup32 { |
| + my $arg=shift; |
| + |
| + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && |
| + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; |
| + } |
| + |
| + sub unvmov32 { |
| + my $arg=shift; |
| + |
| + $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && |
| + sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; |
| + } |
| + |
| + foreach(split("\n",$code)) { |
| + s/\`([^\`]*)\`/eval($1)/geo; |
| + |
| + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers |
| + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers |
| + s/\/\/\s?/@ /o; # new->old style commentary |
| + |
| + # fix up remaining new-style suffixes |
| + s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or |
| + s/\],#[0-9]+/]!/o; |
| + |
| + s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or |
| + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or |
| + s/vtbl\.8\s+(.*)/unvtbl($1)/geo or |
| + s/vdup\.32\s+(.*)/unvdup32($1)/geo or |
| + s/vmov\.32\s+(.*)/unvmov32($1)/geo or |
| + s/^(\s+)b\./$1b/o or |
| + s/^(\s+)ret/$1bx\tlr/o; |
| + |
| + print $_,"\n"; |
| + } |
| +} |
| + |
| +close STDOUT; |
| diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl |
| new file mode 100644 |
| index 0000000..f3d96d9 |
| --- /dev/null |
| +++ b/crypto/aes/asm/bsaes-armv7.pl |
| @@ -0,0 +1,2467 @@ |
| +#!/usr/bin/env perl |
| + |
| +# ==================================================================== |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# project. The module is, however, dual licensed under OpenSSL and |
| +# CRYPTOGAMS licenses depending on where you obtain it. For further |
| +# details see http://www.openssl.org/~appro/cryptogams/. |
| +# |
| +# Specific modes and adaptation for Linux kernel by Ard Biesheuvel |
| +# <[email protected]>. Permission to use under GPL terms is |
| +# granted. |
| +# ==================================================================== |
| + |
| +# Bit-sliced AES for ARM NEON |
| +# |
| +# February 2012. |
| +# |
| +# This implementation is a direct adaptation of the bsaes-x86_64 module |
| +# for ARM NEON, except that this module is endian-neutral [in the sense |
| +# that it can be compiled for either endianness] courtesy of vld1.8's |
| +# neutrality. The initial version doesn't implement an interface to |
| +# OpenSSL, only low-level primitives and unsupported entry points, just |
| +# enough to collect performance results, which for the Cortex-A8 core are: |
| +# |
| +# encrypt 19.5 cycles per byte processed with 128-bit key |
| +# decrypt 22.1 cycles per byte processed with 128-bit key |
| +# key conv. 440 cycles per 128-bit key/0.18 of 8x block |
| +# |
| +# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7, |
| +# which is [much] worse than anticipated (for further details see |
| +# http://www.openssl.org/~appro/Snapdragon-S4.html). |
| +# |
| +# Cortex-A15 manages 14.2/16.1 cycles [while the integer-only code |
| +# manages 20.0 cycles]. |
| +# |
| +# When comparing to x86_64 results, keep in mind that the NEON unit is |
| +# [mostly] single-issue and thus can't [fully] benefit from |
| +# instruction-level parallelism. And when comparing to aes-armv4 |
| +# results, keep in mind the key schedule conversion overhead (see |
| +# bsaes-x86_64.pl for further details)... |
| +# |
| +# <[email protected]> |
| + |
| +# April-August 2013 |
| +# |
| +# Add CBC, CTR and XTS subroutines, adapt for kernel use. |
| +# |
| +# <[email protected]> |
| + |
| +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| +open STDOUT,">$output"; |
| + |
| +my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); |
| +my @XMM=map("q$_",(0..15)); |
| + |
| +{ |
| +my ($key,$rounds,$const)=("r4","r5","r6"); |
| + |
| +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } |
| +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } |
| + |
| +sub Sbox { |
| +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb |
| +# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb |
| +my @b=@_[0..7]; |
| +my @t=@_[8..11]; |
| +my @s=@_[12..15]; |
| + &InBasisChange (@b); |
| + &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); |
| + &OutBasisChange (@b[7,1,4,2,6,5,0,3]); |
| +} |
| + |
| +sub InBasisChange { |
| +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb |
| +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb |
| +my @b=@_[0..7]; |
| +$code.=<<___; |
| + veor @b[2], @b[2], @b[1] |
| + veor @b[5], @b[5], @b[6] |
| + veor @b[3], @b[3], @b[0] |
| + veor @b[6], @b[6], @b[2] |
| + veor @b[5], @b[5], @b[0] |
| + |
| + veor @b[6], @b[6], @b[3] |
| + veor @b[3], @b[3], @b[7] |
| + veor @b[7], @b[7], @b[5] |
| + veor @b[3], @b[3], @b[4] |
| + veor @b[4], @b[4], @b[5] |
| + |
| + veor @b[2], @b[2], @b[7] |
| + veor @b[3], @b[3], @b[1] |
| + veor @b[1], @b[1], @b[5] |
| +___ |
| +} |
| + |
| +sub OutBasisChange { |
| +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb |
| +# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb |
| +my @b=@_[0..7]; |
| +$code.=<<___; |
| + veor @b[0], @b[0], @b[6] |
| + veor @b[1], @b[1], @b[4] |
| + veor @b[4], @b[4], @b[6] |
| + veor @b[2], @b[2], @b[0] |
| + veor @b[6], @b[6], @b[1] |
| + |
| + veor @b[1], @b[1], @b[5] |
| + veor @b[5], @b[5], @b[3] |
| + veor @b[3], @b[3], @b[7] |
| + veor @b[7], @b[7], @b[5] |
| + veor @b[2], @b[2], @b[5] |
| + |
| + veor @b[4], @b[4], @b[7] |
| +___ |
| +} |
| + |
| +sub InvSbox { |
| +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb |
| +# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb |
| +my @b=@_[0..7]; |
| +my @t=@_[8..11]; |
| +my @s=@_[12..15]; |
| + &InvInBasisChange (@b); |
| + &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); |
| + &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); |
| +} |
| + |
| +sub InvInBasisChange { # OutBasisChange in reverse (with twist) |
| +my @b=@_[5,1,2,6,3,7,0,4]; |
| +$code.=<<___ |
| + veor @b[1], @b[1], @b[7] |
| + veor @b[4], @b[4], @b[7] |
| + |
| + veor @b[7], @b[7], @b[5] |
| + veor @b[1], @b[1], @b[3] |
| + veor @b[2], @b[2], @b[5] |
| + veor @b[3], @b[3], @b[7] |
| + |
| + veor @b[6], @b[6], @b[1] |
| + veor @b[2], @b[2], @b[0] |
| + veor @b[5], @b[5], @b[3] |
| + veor @b[4], @b[4], @b[6] |
| + veor @b[0], @b[0], @b[6] |
| + veor @b[1], @b[1], @b[4] |
| +___ |
| +} |
| + |
| +sub InvOutBasisChange { # InBasisChange in reverse |
| +my @b=@_[2,5,7,3,6,1,0,4]; |
| +$code.=<<___; |
| + veor @b[1], @b[1], @b[5] |
| + veor @b[2], @b[2], @b[7] |
| + |
| + veor @b[3], @b[3], @b[1] |
| + veor @b[4], @b[4], @b[5] |
| + veor @b[7], @b[7], @b[5] |
| + veor @b[3], @b[3], @b[4] |
| + veor @b[5], @b[5], @b[0] |
| + veor @b[3], @b[3], @b[7] |
| + veor @b[6], @b[6], @b[2] |
| + veor @b[2], @b[2], @b[1] |
| + veor @b[6], @b[6], @b[3] |
| + |
| + veor @b[3], @b[3], @b[0] |
| + veor @b[5], @b[5], @b[6] |
| +___ |
| +} |
| + |
| +sub Mul_GF4 { |
| +#;************************************************************* |
| +#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * |
| +#;************************************************************* |
| +my ($x0,$x1,$y0,$y1,$t0,$t1)=@_; |
| +$code.=<<___; |
| + veor $t0, $y0, $y1 |
| + vand $t0, $t0, $x0 |
| + veor $x0, $x0, $x1 |
| + vand $t1, $x1, $y0 |
| + vand $x0, $x0, $y1 |
| + veor $x1, $t1, $t0 |
| + veor $x0, $x0, $t1 |
| +___ |
| +} |
| + |
| +sub Mul_GF4_N { # not used, see next subroutine |
| +# multiply and scale by N |
| +my ($x0,$x1,$y0,$y1,$t0)=@_; |
| +$code.=<<___; |
| + veor $t0, $y0, $y1 |
| + vand $t0, $t0, $x0 |
| + veor $x0, $x0, $x1 |
| + vand $x1, $x1, $y0 |
| + vand $x0, $x0, $y1 |
| + veor $x1, $x1, $x0 |
| + veor $x0, $x0, $t0 |
| +___ |
| +} |
| + |
| +sub Mul_GF4_N_GF4 { |
| +# interleaved Mul_GF4_N and Mul_GF4 |
| +my ($x0,$x1,$y0,$y1,$t0, |
| + $x2,$x3,$y2,$y3,$t1)=@_; |
| +$code.=<<___; |
| + veor $t0, $y0, $y1 |
| + veor $t1, $y2, $y3 |
| + vand $t0, $t0, $x0 |
| + vand $t1, $t1, $x2 |
| + veor $x0, $x0, $x1 |
| + veor $x2, $x2, $x3 |
| + vand $x1, $x1, $y0 |
| + vand $x3, $x3, $y2 |
| + vand $x0, $x0, $y1 |
| + vand $x2, $x2, $y3 |
| + veor $x1, $x1, $x0 |
| + veor $x2, $x2, $x3 |
| + veor $x0, $x0, $t0 |
| + veor $x3, $x3, $t1 |
| +___ |
| +} |
| +sub Mul_GF16_2 { |
| +my @x=@_[0..7]; |
| +my @y=@_[8..11]; |
| +my @t=@_[12..15]; |
| +$code.=<<___; |
| + veor @t[0], @x[0], @x[2] |
| + veor @t[1], @x[1], @x[3] |
| +___ |
| + &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]); |
| +$code.=<<___; |
| + veor @y[0], @y[0], @y[2] |
| + veor @y[1], @y[1], @y[3] |
| +___ |
| + Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], |
| + @x[2], @x[3], @y[2], @y[3], @t[2]); |
| +$code.=<<___; |
| + veor @x[0], @x[0], @t[0] |
| + veor @x[2], @x[2], @t[0] |
| + veor @x[1], @x[1], @t[1] |
| + veor @x[3], @x[3], @t[1] |
| + |
| + veor @t[0], @x[4], @x[6] |
| + veor @t[1], @x[5], @x[7] |
| +___ |
| + &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], |
| + @x[6], @x[7], @y[2], @y[3], @t[2]); |
| +$code.=<<___; |
| + veor @y[0], @y[0], @y[2] |
| + veor @y[1], @y[1], @y[3] |
| +___ |
| + &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]); |
| +$code.=<<___; |
| + veor @x[4], @x[4], @t[0] |
| + veor @x[6], @x[6], @t[0] |
| + veor @x[5], @x[5], @t[1] |
| + veor @x[7], @x[7], @t[1] |
| +___ |
| +} |
| +sub Inv_GF256 { |
| +#;******************************************************************** |
| +#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * |
| +#;******************************************************************** |
| +my @x=@_[0..7]; |
| +my @t=@_[8..11]; |
| +my @s=@_[12..15]; |
| +# direct optimizations from hardware |
| +$code.=<<___; |
| + veor @t[3], @x[4], @x[6] |
| + veor @t[2], @x[5], @x[7] |
| + veor @t[1], @x[1], @x[3] |
| + veor @s[1], @x[7], @x[6] |
| + vmov @t[0], @t[2] |
| + veor @s[0], @x[0], @x[2] |
| + |
| + vorr @t[2], @t[2], @t[1] |
| + veor @s[3], @t[3], @t[0] |
| + vand @s[2], @t[3], @s[0] |
| + vorr @t[3], @t[3], @s[0] |
| + veor @s[0], @s[0], @t[1] |
| + vand @t[0], @t[0], @t[1] |
| + veor @t[1], @x[3], @x[2] |
| + vand @s[3], @s[3], @s[0] |
| + vand @s[1], @s[1], @t[1] |
| + veor @t[1], @x[4], @x[5] |
| + veor @s[0], @x[1], @x[0] |
| + veor @t[3], @t[3], @s[1] |
| + veor @t[2], @t[2], @s[1] |
| + vand @s[1], @t[1], @s[0] |
| + vorr @t[1], @t[1], @s[0] |
| + veor @t[3], @t[3], @s[3] |
| + veor @t[0], @t[0], @s[1] |
| + veor @t[2], @t[2], @s[2] |
| + veor @t[1], @t[1], @s[3] |
| + veor @t[0], @t[0], @s[2] |
| + vand @s[0], @x[7], @x[3] |
| + veor @t[1], @t[1], @s[2] |
| + vand @s[1], @x[6], @x[2] |
| + vand @s[2], @x[5], @x[1] |
| + vorr @s[3], @x[4], @x[0] |
| + veor @t[3], @t[3], @s[0] |
| + veor @t[1], @t[1], @s[2] |
| + veor @t[0], @t[0], @s[3] |
| + veor @t[2], @t[2], @s[1] |
| + |
| + @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 |
| + |
| + @ new smaller inversion |
| + |
| + vand @s[2], @t[3], @t[1] |
| + vmov @s[0], @t[0] |
| + |
| + veor @s[1], @t[2], @s[2] |
| + veor @s[3], @t[0], @s[2] |
| + veor @s[2], @t[0], @s[2] @ @s[2]=@s[3] |
| + |
| + vbsl @s[1], @t[1], @t[0] |
| + vbsl @s[3], @t[3], @t[2] |
| + veor @t[3], @t[3], @t[2] |
| + |
| + vbsl @s[0], @s[1], @s[2] |
| + vbsl @t[0], @s[2], @s[1] |
| + |
| + vand @s[2], @s[0], @s[3] |
| + veor @t[1], @t[1], @t[0] |
| + |
| + veor @s[2], @s[2], @t[3] |
| +___ |
| +# output in s3, s2, s1, t1 |
| + |
| +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 |
| + |
| +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 |
| + &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); |
| + |
| +### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb |
| +} |
| + |
| +# AES linear components |
| + |
| +sub ShiftRows { |
| +my @x=@_[0..7]; |
| +my @t=@_[8..11]; |
| +my $mask=pop; |
| +$code.=<<___; |
| + vldmia $key!, {@t[0]-@t[3]} |
| + veor @t[0], @t[0], @x[0] |
| + veor @t[1], @t[1], @x[1] |
| + vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)` |
| + vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)` |
| + vldmia $key!, {@t[0]} |
| + veor @t[2], @t[2], @x[2] |
| + vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)` |
| + vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)` |
| + vldmia $key!, {@t[1]} |
| + veor @t[3], @t[3], @x[3] |
| + vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)` |
| + vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)` |
| + vldmia $key!, {@t[2]} |
| + vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)` |
| + vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)` |
| + vldmia $key!, {@t[3]} |
| + veor @t[0], @t[0], @x[4] |
| + veor @t[1], @t[1], @x[5] |
| + vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)` |
| + vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)` |
| + veor @t[2], @t[2], @x[6] |
| + vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)` |
| + vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)` |
| + veor @t[3], @t[3], @x[7] |
| + vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)` |
| + vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)` |
| + vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)` |
| + vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)` |
| +___ |
| +} |
| + |
| +sub MixColumns { |
| +# modified to emit output in order suitable for feeding back to aesenc[last] |
| +my @x=@_[0..7]; |
| +my @t=@_[8..15]; |
| +my $inv=@_[16]; # optional |
| +$code.=<<___; |
| + vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32 |
| + vext.8 @t[1], @x[1], @x[1], #12 |
| + veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32) |
| + vext.8 @t[2], @x[2], @x[2], #12 |
| + veor @x[1], @x[1], @t[1] |
| + vext.8 @t[3], @x[3], @x[3], #12 |
| + veor @x[2], @x[2], @t[2] |
| + vext.8 @t[4], @x[4], @x[4], #12 |
| + veor @x[3], @x[3], @t[3] |
| + vext.8 @t[5], @x[5], @x[5], #12 |
| + veor @x[4], @x[4], @t[4] |
| + vext.8 @t[6], @x[6], @x[6], #12 |
| + veor @x[5], @x[5], @t[5] |
| + vext.8 @t[7], @x[7], @x[7], #12 |
| + veor @x[6], @x[6], @t[6] |
| + |
| + veor @t[1], @t[1], @x[0] |
| + veor @x[7], @x[7], @t[7] |
| + vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64) |
| + veor @t[2], @t[2], @x[1] |
| + veor @t[0], @t[0], @x[7] |
| + veor @t[1], @t[1], @x[7] |
| + vext.8 @x[1], @x[1], @x[1], #8 |
| + veor @t[5], @t[5], @x[4] |
| + veor @x[0], @x[0], @t[0] |
| + veor @t[6], @t[6], @x[5] |
| + veor @x[1], @x[1], @t[1] |
| + vext.8 @t[0], @x[4], @x[4], #8 |
| + veor @t[4], @t[4], @x[3] |
| + vext.8 @t[1], @x[5], @x[5], #8 |
| + veor @t[7], @t[7], @x[6] |
| + vext.8 @x[4], @x[3], @x[3], #8 |
| + veor @t[3], @t[3], @x[2] |
| + vext.8 @x[5], @x[7], @x[7], #8 |
| + veor @t[4], @t[4], @x[7] |
| + vext.8 @x[3], @x[6], @x[6], #8 |
| + veor @t[3], @t[3], @x[7] |
| + vext.8 @x[6], @x[2], @x[2], #8 |
| + veor @x[7], @t[1], @t[5] |
| +___ |
| +$code.=<<___ if (!$inv); |
| + veor @x[2], @t[0], @t[4] |
| + veor @x[4], @x[4], @t[3] |
| + veor @x[5], @x[5], @t[7] |
| + veor @x[3], @x[3], @t[6] |
| + @ vmov @x[2], @t[0] |
| + veor @x[6], @x[6], @t[2] |
| + @ vmov @x[7], @t[1] |
| +___ |
| +$code.=<<___ if ($inv); |
| + veor @t[3], @t[3], @x[4] |
| + veor @x[5], @x[5], @t[7] |
| + veor @x[2], @x[3], @t[6] |
| + veor @x[3], @t[0], @t[4] |
| + veor @x[4], @x[6], @t[2] |
| + vmov @x[6], @t[3] |
| + @ vmov @x[7], @t[1] |
| +___ |
| +} |
| + |
| +sub InvMixColumns_orig { |
| +my @x=@_[0..7]; |
| +my @t=@_[8..15]; |
| + |
| +$code.=<<___; |
| + @ multiplication by 0x0e |
| + vext.8 @t[7], @x[7], @x[7], #12 |
| + vmov @t[2], @x[2] |
| + veor @x[2], @x[2], @x[5] @ 2 5 |
| + veor @x[7], @x[7], @x[5] @ 7 5 |
| + vext.8 @t[0], @x[0], @x[0], #12 |
| + vmov @t[5], @x[5] |
| + veor @x[5], @x[5], @x[0] @ 5 0 [1] |
| + veor @x[0], @x[0], @x[1] @ 0 1 |
| + vext.8 @t[1], @x[1], @x[1], #12 |
| + veor @x[1], @x[1], @x[2] @ 1 25 |
| + veor @x[0], @x[0], @x[6] @ 01 6 [2] |
| + vext.8 @t[3], @x[3], @x[3], #12 |
| + veor @x[1], @x[1], @x[3] @ 125 3 [4] |
| + veor @x[2], @x[2], @x[0] @ 25 016 [3] |
| + veor @x[3], @x[3], @x[7] @ 3 75 |
| + veor @x[7], @x[7], @x[6] @ 75 6 [0] |
| + vext.8 @t[6], @x[6], @x[6], #12 |
| + vmov @t[4], @x[4] |
| + veor @x[6], @x[6], @x[4] @ 6 4 |
| + veor @x[4], @x[4], @x[3] @ 4 375 [6] |
| + veor @x[3], @x[3], @x[7] @ 375 756=36 |
| + veor @x[6], @x[6], @t[5] @ 64 5 [7] |
| + veor @x[3], @x[3], @t[2] @ 36 2 |
| + vext.8 @t[5], @t[5], @t[5], #12 |
| + veor @x[3], @x[3], @t[4] @ 362 4 [5] |
| +___ |
| + my @y = @x[7,5,0,2,1,3,4,6]; |
| +$code.=<<___; |
| + @ multiplication by 0x0b |
| + veor @y[1], @y[1], @y[0] |
| + veor @y[0], @y[0], @t[0] |
| + vext.8 @t[2], @t[2], @t[2], #12 |
| + veor @y[1], @y[1], @t[1] |
| + veor @y[0], @y[0], @t[5] |
| + vext.8 @t[4], @t[4], @t[4], #12 |
| + veor @y[1], @y[1], @t[6] |
| + veor @y[0], @y[0], @t[7] |
| + veor @t[7], @t[7], @t[6] @ clobber t[7] |
| + |
| + veor @y[3], @y[3], @t[0] |
| + veor @y[1], @y[1], @y[0] |
| + vext.8 @t[0], @t[0], @t[0], #12 |
| + veor @y[2], @y[2], @t[1] |
| + veor @y[4], @y[4], @t[1] |
| + vext.8 @t[1], @t[1], @t[1], #12 |
| + veor @y[2], @y[2], @t[2] |
| + veor @y[3], @y[3], @t[2] |
| + veor @y[5], @y[5], @t[2] |
| + veor @y[2], @y[2], @t[7] |
| + vext.8 @t[2], @t[2], @t[2], #12 |
| + veor @y[3], @y[3], @t[3] |
| + veor @y[6], @y[6], @t[3] |
| + veor @y[4], @y[4], @t[3] |
| + veor @y[7], @y[7], @t[4] |
| + vext.8 @t[3], @t[3], @t[3], #12 |
| + veor @y[5], @y[5], @t[4] |
| + veor @y[7], @y[7], @t[7] |
| + veor @t[7], @t[7], @t[5] @ clobber t[7] even more |
| + veor @y[3], @y[3], @t[5] |
| + veor @y[4], @y[4], @t[4] |
| + |
| + veor @y[5], @y[5], @t[7] |
| + vext.8 @t[4], @t[4], @t[4], #12 |
| + veor @y[6], @y[6], @t[7] |
| + veor @y[4], @y[4], @t[7] |
| + |
| + veor @t[7], @t[7], @t[5] |
| + vext.8 @t[5], @t[5], @t[5], #12 |
| + |
| + @ multiplication by 0x0d |
| + veor @y[4], @y[4], @y[7] |
| + veor @t[7], @t[7], @t[6] @ restore t[7] |
| + veor @y[7], @y[7], @t[4] |
| + vext.8 @t[6], @t[6], @t[6], #12 |
| + veor @y[2], @y[2], @t[0] |
| + veor @y[7], @y[7], @t[5] |
| + vext.8 @t[7], @t[7], @t[7], #12 |
| + veor @y[2], @y[2], @t[2] |
| + |
| + veor @y[3], @y[3], @y[1] |
| + veor @y[1], @y[1], @t[1] |
| + veor @y[0], @y[0], @t[0] |
| + veor @y[3], @y[3], @t[0] |
| + veor @y[1], @y[1], @t[5] |
| + veor @y[0], @y[0], @t[5] |
| + vext.8 @t[0], @t[0], @t[0], #12 |
| + veor @y[1], @y[1], @t[7] |
| + veor @y[0], @y[0], @t[6] |
| + veor @y[3], @y[3], @y[1] |
| + veor @y[4], @y[4], @t[1] |
| + vext.8 @t[1], @t[1], @t[1], #12 |
| + |
| + veor @y[7], @y[7], @t[7] |
| + veor @y[4], @y[4], @t[2] |
| + veor @y[5], @y[5], @t[2] |
| + veor @y[2], @y[2], @t[6] |
| + veor @t[6], @t[6], @t[3] @ clobber t[6] |
| + vext.8 @t[2], @t[2], @t[2], #12 |
| + veor @y[4], @y[4], @y[7] |
| + veor @y[3], @y[3], @t[6] |
| + |
| + veor @y[6], @y[6], @t[6] |
| + veor @y[5], @y[5], @t[5] |
| + vext.8 @t[5], @t[5], @t[5], #12 |
| + veor @y[6], @y[6], @t[4] |
| + vext.8 @t[4], @t[4], @t[4], #12 |
| + veor @y[5], @y[5], @t[6] |
| + veor @y[6], @y[6], @t[7] |
| + vext.8 @t[7], @t[7], @t[7], #12 |
| + veor @t[6], @t[6], @t[3] @ restore t[6] |
| + vext.8 @t[3], @t[3], @t[3], #12 |
| + |
| + @ multiplication by 0x09 |
| + veor @y[4], @y[4], @y[1] |
| + veor @t[1], @t[1], @y[1] @ t[1]=y[1] |
| + veor @t[0], @t[0], @t[5] @ clobber t[0] |
| + vext.8 @t[6], @t[6], @t[6], #12 |
| + veor @t[1], @t[1], @t[5] |
| + veor @y[3], @y[3], @t[0] |
| + veor @t[0], @t[0], @y[0] @ t[0]=y[0] |
| + veor @t[1], @t[1], @t[6] |
| + veor @t[6], @t[6], @t[7] @ clobber t[6] |
| + veor @y[4], @y[4], @t[1] |
| + veor @y[7], @y[7], @t[4] |
| + veor @y[6], @y[6], @t[3] |
| + veor @y[5], @y[5], @t[2] |
| + veor @t[4], @t[4], @y[4] @ t[4]=y[4] |
| + veor @t[3], @t[3], @y[3] @ t[3]=y[3] |
| + veor @t[5], @t[5], @y[5] @ t[5]=y[5] |
| + veor @t[2], @t[2], @y[2] @ t[2]=y[2] |
| + veor @t[3], @t[3], @t[7] |
| + veor @XMM[5], @t[5], @t[6] |
| + veor @XMM[6], @t[6], @y[6] @ t[6]=y[6] |
| + veor @XMM[2], @t[2], @t[6] |
| + veor @XMM[7], @t[7], @y[7] @ t[7]=y[7] |
| + |
| + vmov @XMM[0], @t[0] |
| + vmov @XMM[1], @t[1] |
| + @ vmov @XMM[2], @t[2] |
| + vmov @XMM[3], @t[3] |
| + vmov @XMM[4], @t[4] |
| + @ vmov @XMM[5], @t[5] |
| + @ vmov @XMM[6], @t[6] |
| + @ vmov @XMM[7], @t[7] |
| +___ |
| +} |
| + |
| +sub InvMixColumns { |
| +my @x=@_[0..7]; |
| +my @t=@_[8..15]; |
| + |
| +# Thanks to Jussi Kivilinna for providing a pointer to |
| +# |
| +# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | |
| +# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | |
| +# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | |
| +# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | |
| + |
| +$code.=<<___; |
| + @ multiplication by 0x05-0x00-0x04-0x00 |
| + vext.8 @t[0], @x[0], @x[0], #8 |
| + vext.8 @t[6], @x[6], @x[6], #8 |
| + vext.8 @t[7], @x[7], @x[7], #8 |
| + veor @t[0], @t[0], @x[0] |
| + vext.8 @t[1], @x[1], @x[1], #8 |
| + veor @t[6], @t[6], @x[6] |
| + vext.8 @t[2], @x[2], @x[2], #8 |
| + veor @t[7], @t[7], @x[7] |
| + vext.8 @t[3], @x[3], @x[3], #8 |
| + veor @t[1], @t[1], @x[1] |
| + vext.8 @t[4], @x[4], @x[4], #8 |
| + veor @t[2], @t[2], @x[2] |
| + vext.8 @t[5], @x[5], @x[5], #8 |
| + veor @t[3], @t[3], @x[3] |
| + veor @t[4], @t[4], @x[4] |
| + veor @t[5], @t[5], @x[5] |
| + |
| + veor @x[0], @x[0], @t[6] |
| + veor @x[1], @x[1], @t[6] |
| + veor @x[2], @x[2], @t[0] |
| + veor @x[4], @x[4], @t[2] |
| + veor @x[3], @x[3], @t[1] |
| + veor @x[1], @x[1], @t[7] |
| + veor @x[2], @x[2], @t[7] |
| + veor @x[4], @x[4], @t[6] |
| + veor @x[5], @x[5], @t[3] |
| + veor @x[3], @x[3], @t[6] |
| + veor @x[6], @x[6], @t[4] |
| + veor @x[4], @x[4], @t[7] |
| + veor @x[5], @x[5], @t[7] |
| + veor @x[7], @x[7], @t[5] |
| +___ |
| + &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 |
| +} |
| + |
| +sub swapmove { |
| +my ($a,$b,$n,$mask,$t)=@_; |
| +$code.=<<___; |
| + vshr.u64 $t, $b, #$n |
| + veor $t, $t, $a |
| + vand $t, $t, $mask |
| + veor $a, $a, $t |
| + vshl.u64 $t, $t, #$n |
| + veor $b, $b, $t |
| +___ |
| +} |
| +sub swapmove2x { |
| +my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; |
| +$code.=<<___; |
| + vshr.u64 $t0, $b0, #$n |
| + vshr.u64 $t1, $b1, #$n |
| + veor $t0, $t0, $a0 |
| + veor $t1, $t1, $a1 |
| + vand $t0, $t0, $mask |
| + vand $t1, $t1, $mask |
| + veor $a0, $a0, $t0 |
| + vshl.u64 $t0, $t0, #$n |
| + veor $a1, $a1, $t1 |
| + vshl.u64 $t1, $t1, #$n |
| + veor $b0, $b0, $t0 |
| + veor $b1, $b1, $t1 |
| +___ |
| +} |
| + |
| +sub bitslice { |
| +my @x=reverse(@_[0..7]); |
| +my ($t0,$t1,$t2,$t3)=@_[8..11]; |
| +$code.=<<___; |
| + vmov.i8 $t0,#0x55 @ compose .LBS0 |
| + vmov.i8 $t1,#0x33 @ compose .LBS1 |
| +___ |
| + &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); |
| + &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); |
| +$code.=<<___; |
| + vmov.i8 $t0,#0x0f @ compose .LBS2 |
| +___ |
| + &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); |
| + &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); |
| + |
| + &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); |
| + &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); |
| +} |
| + |
| +$code.=<<___; |
| +#ifndef __KERNEL__ |
| +# include "arm_arch.h" |
| + |
| +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} |
| +# define VFP_ABI_POP vldmia sp!,{d8-d15} |
| +# define VFP_ABI_FRAME 0x40 |
| +#else |
| +# define VFP_ABI_PUSH |
| +# define VFP_ABI_POP |
| +# define VFP_ABI_FRAME 0 |
| +# define BSAES_ASM_EXTENDED_KEY |
| +# define XTS_CHAIN_TWEAK |
| +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ |
| +#endif |
| + |
| +#ifdef __thumb__ |
| +# define adrl adr |
| +#endif |
| + |
| +#if __ARM_ARCH__>=7 |
| +.text |
| +.syntax unified @ ARMv7-capable assembler is expected to handle this |
| +#ifdef __thumb2__ |
| +.thumb |
| +#else |
| +.code 32 |
| +#endif |
| + |
| +.fpu neon |
| + |
| +.type _bsaes_decrypt8,%function |
| +.align 4 |
| +_bsaes_decrypt8: |
| + adr $const,_bsaes_decrypt8 |
| + vldmia $key!, {@XMM[9]} @ round 0 key |
| + add $const,$const,#.LM0ISR-_bsaes_decrypt8 |
| + |
| + vldmia $const!, {@XMM[8]} @ .LM0ISR |
| + veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key |
| + veor @XMM[11], @XMM[1], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` |
| + veor @XMM[12], @XMM[2], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` |
| + veor @XMM[13], @XMM[3], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` |
| + veor @XMM[14], @XMM[4], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` |
| + veor @XMM[15], @XMM[5], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` |
| + veor @XMM[10], @XMM[6], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` |
| + veor @XMM[11], @XMM[7], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` |
| + vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` |
| +___ |
| + &bitslice (@XMM[0..7, 8..11]); |
| +$code.=<<___; |
| + sub $rounds,$rounds,#1 |
| + b .Ldec_sbox |
| +.align 4 |
| +.Ldec_loop: |
| +___ |
| + &ShiftRows (@XMM[0..7, 8..12]); |
| +$code.=".Ldec_sbox:\n"; |
| + &InvSbox (@XMM[0..7, 8..15]); |
| +$code.=<<___; |
| + subs $rounds,$rounds,#1 |
| + bcc .Ldec_done |
| +___ |
| + &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); |
| +$code.=<<___; |
| + vldmia $const, {@XMM[12]} @ .LISR |
| + ite eq @ Thumb2 thing, sanity check in ARM |
| + addeq $const,$const,#0x10 |
| + bne .Ldec_loop |
| + vldmia $const, {@XMM[12]} @ .LISRM0 |
| + b .Ldec_loop |
| +.align 4 |
| +.Ldec_done: |
| +___ |
| + &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); |
| +$code.=<<___; |
| + vldmia $key, {@XMM[8]} @ last round key |
| + veor @XMM[6], @XMM[6], @XMM[8] |
| + veor @XMM[4], @XMM[4], @XMM[8] |
| + veor @XMM[2], @XMM[2], @XMM[8] |
| + veor @XMM[7], @XMM[7], @XMM[8] |
| + veor @XMM[3], @XMM[3], @XMM[8] |
| + veor @XMM[5], @XMM[5], @XMM[8] |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + bx lr |
| +.size _bsaes_decrypt8,.-_bsaes_decrypt8 |
| + |
| +.type _bsaes_const,%object |
| +.align 6 |
| +_bsaes_const: |
| +.LM0ISR: @ InvShiftRows constants |
| + .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 |
| +.LISR: |
| + .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 |
| +.LISRM0: |
| + .quad 0x01040b0e0205080f, 0x0306090c00070a0d |
| +.LM0SR: @ ShiftRows constants |
| + .quad 0x0a0e02060f03070b, 0x0004080c05090d01 |
| +.LSR: |
| + .quad 0x0504070600030201, 0x0f0e0d0c0a09080b |
| +.LSRM0: |
| + .quad 0x0304090e00050a0f, 0x01060b0c0207080d |
| +.LM0: |
| + .quad 0x02060a0e03070b0f, 0x0004080c0105090d |
| +.LREVM0SR: |
| + .quad 0x090d01050c000408, 0x03070b0f060a0e02 |
| +.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>" |
| +.align 6 |
| +.size _bsaes_const,.-_bsaes_const |
| + |
| +.type _bsaes_encrypt8,%function |
| +.align 4 |
| +_bsaes_encrypt8: |
| + adr $const,_bsaes_encrypt8 |
| + vldmia $key!, {@XMM[9]} @ round 0 key |
| + sub $const,$const,#_bsaes_encrypt8-.LM0SR |
| + |
| + vldmia $const!, {@XMM[8]} @ .LM0SR |
| +_bsaes_encrypt8_alt: |
| + veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key |
| + veor @XMM[11], @XMM[1], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` |
| + veor @XMM[12], @XMM[2], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` |
| + veor @XMM[13], @XMM[3], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` |
| + veor @XMM[14], @XMM[4], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` |
| + veor @XMM[15], @XMM[5], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` |
| + veor @XMM[10], @XMM[6], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` |
| + veor @XMM[11], @XMM[7], @XMM[9] |
| + vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` |
| + vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` |
| + vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` |
| +_bsaes_encrypt8_bitslice: |
| +___ |
| + &bitslice (@XMM[0..7, 8..11]); |
| +$code.=<<___; |
| + sub $rounds,$rounds,#1 |
| + b .Lenc_sbox |
| +.align 4 |
| +.Lenc_loop: |
| +___ |
| + &ShiftRows (@XMM[0..7, 8..12]); |
| +$code.=".Lenc_sbox:\n"; |
| + &Sbox (@XMM[0..7, 8..15]); |
| +$code.=<<___; |
| + subs $rounds,$rounds,#1 |
| + bcc .Lenc_done |
| +___ |
| + &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); |
| +$code.=<<___; |
| + vldmia $const, {@XMM[12]} @ .LSR |
| +	ite	eq			@ Thumb2 thing, sanity check in ARM |
| + addeq $const,$const,#0x10 |
| + bne .Lenc_loop |
| + vldmia $const, {@XMM[12]} @ .LSRM0 |
| + b .Lenc_loop |
| +.align 4 |
| +.Lenc_done: |
| +___ |
| + # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb |
| + &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); |
| +$code.=<<___; |
| + vldmia $key, {@XMM[8]} @ last round key |
| + veor @XMM[4], @XMM[4], @XMM[8] |
| + veor @XMM[6], @XMM[6], @XMM[8] |
| + veor @XMM[3], @XMM[3], @XMM[8] |
| + veor @XMM[7], @XMM[7], @XMM[8] |
| + veor @XMM[2], @XMM[2], @XMM[8] |
| + veor @XMM[5], @XMM[5], @XMM[8] |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + bx lr |
| +.size _bsaes_encrypt8,.-_bsaes_encrypt8 |
| +___ |
| +} |
| +{ |
| +my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6"); |
| + |
| +sub bitslice_key { |
| +my @x=reverse(@_[0..7]); |
| +my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; |
| + |
| + &swapmove (@x[0,1],1,$bs0,$t2,$t3); |
| +$code.=<<___; |
| + @ &swapmove(@x[2,3],1,$t0,$t2,$t3); |
| + vmov @x[2], @x[0] |
| + vmov @x[3], @x[1] |
| +___ |
| + #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); |
| + |
| + &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); |
| +$code.=<<___; |
| + @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); |
| + vmov @x[4], @x[0] |
| + vmov @x[6], @x[2] |
| + vmov @x[5], @x[1] |
| + vmov @x[7], @x[3] |
| +___ |
| + &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); |
| + &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); |
| +} |
| + |
| +$code.=<<___; |
| +.type _bsaes_key_convert,%function |
| +.align 4 |
| +_bsaes_key_convert: |
| + adr $const,_bsaes_key_convert |
| + vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key |
| + sub $const,$const,#_bsaes_key_convert-.LM0 |
| + vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key |
| + |
| + vmov.i8 @XMM[8], #0x01 @ bit masks |
| + vmov.i8 @XMM[9], #0x02 |
| + vmov.i8 @XMM[10], #0x04 |
| + vmov.i8 @XMM[11], #0x08 |
| + vmov.i8 @XMM[12], #0x10 |
| + vmov.i8 @XMM[13], #0x20 |
| + vldmia $const, {@XMM[14]} @ .LM0 |
| + |
| +#ifdef __ARMEL__ |
| + vrev32.8 @XMM[7], @XMM[7] |
| + vrev32.8 @XMM[15], @XMM[15] |
| +#endif |
| + sub $rounds,$rounds,#1 |
| + vstmia $out!, {@XMM[7]} @ save round 0 key |
| + b .Lkey_loop |
| + |
| +.align 4 |
| +.Lkey_loop: |
| + vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])` |
| + vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])` |
| + vmov.i8 @XMM[6], #0x40 |
| + vmov.i8 @XMM[15], #0x80 |
| + |
| + vtst.8 @XMM[0], @XMM[7], @XMM[8] |
| + vtst.8 @XMM[1], @XMM[7], @XMM[9] |
| + vtst.8 @XMM[2], @XMM[7], @XMM[10] |
| + vtst.8 @XMM[3], @XMM[7], @XMM[11] |
| + vtst.8 @XMM[4], @XMM[7], @XMM[12] |
| + vtst.8 @XMM[5], @XMM[7], @XMM[13] |
| + vtst.8 @XMM[6], @XMM[7], @XMM[6] |
| + vtst.8 @XMM[7], @XMM[7], @XMM[15] |
| + vld1.8 {@XMM[15]}, [$inp]! @ load next round key |
| + vmvn @XMM[0], @XMM[0] @ "pnot" |
| + vmvn @XMM[1], @XMM[1] |
| + vmvn @XMM[5], @XMM[5] |
| + vmvn @XMM[6], @XMM[6] |
| +#ifdef __ARMEL__ |
| + vrev32.8 @XMM[15], @XMM[15] |
| +#endif |
| + subs $rounds,$rounds,#1 |
| + vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key |
| + bne .Lkey_loop |
| + |
| + vmov.i8 @XMM[7],#0x63 @ compose .L63 |
| + @ don't save last round key |
| + bx lr |
| +.size _bsaes_key_convert,.-_bsaes_key_convert |
| +___ |
| +} |
| + |
| +if (0) {	# the following four functions are an unsupported interface |
| + # used for benchmarking... |
| +$code.=<<___; |
| +.globl bsaes_enc_key_convert |
| +.type bsaes_enc_key_convert,%function |
| +.align 4 |
| +bsaes_enc_key_convert: |
| + stmdb sp!,{r4-r6,lr} |
| + vstmdb sp!,{d8-d15} @ ABI specification says so |
| + |
| + ldr r5,[$inp,#240] @ pass rounds |
| + mov r4,$inp @ pass key |
| + mov r12,$out @ pass key schedule |
| + bl _bsaes_key_convert |
| + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key |
| + vstmia r12, {@XMM[7]} @ save last round key |
| + |
| + vldmia sp!,{d8-d15} |
| + ldmia sp!,{r4-r6,pc} |
| +.size bsaes_enc_key_convert,.-bsaes_enc_key_convert |
| + |
| +.globl bsaes_encrypt_128 |
| +.type bsaes_encrypt_128,%function |
| +.align 4 |
| +bsaes_encrypt_128: |
| + stmdb sp!,{r4-r6,lr} |
| + vstmdb sp!,{d8-d15} @ ABI specification says so |
| +.Lenc128_loop: |
| + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input |
| + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! |
| + mov r4,$key @ pass the key |
| + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! |
| + mov r5,#10 @ pass rounds |
| + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! |
| + |
| + bl _bsaes_encrypt8 |
| + |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + vst1.8 {@XMM[4]}, [$out]! |
| + vst1.8 {@XMM[6]}, [$out]! |
| + vst1.8 {@XMM[3]}, [$out]! |
| + vst1.8 {@XMM[7]}, [$out]! |
| + vst1.8 {@XMM[2]}, [$out]! |
| + subs $len,$len,#0x80 |
| + vst1.8 {@XMM[5]}, [$out]! |
| + bhi .Lenc128_loop |
| + |
| + vldmia sp!,{d8-d15} |
| + ldmia sp!,{r4-r6,pc} |
| +.size bsaes_encrypt_128,.-bsaes_encrypt_128 |
| + |
| +.globl bsaes_dec_key_convert |
| +.type bsaes_dec_key_convert,%function |
| +.align 4 |
| +bsaes_dec_key_convert: |
| + stmdb sp!,{r4-r6,lr} |
| + vstmdb sp!,{d8-d15} @ ABI specification says so |
| + |
| + ldr r5,[$inp,#240] @ pass rounds |
| + mov r4,$inp @ pass key |
| + mov r12,$out @ pass key schedule |
| + bl _bsaes_key_convert |
| + vldmia $out, {@XMM[6]} |
| + vstmia r12, {@XMM[15]} @ save last round key |
| + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key |
| + vstmia $out, {@XMM[7]} |
| + |
| + vldmia sp!,{d8-d15} |
| + ldmia sp!,{r4-r6,pc} |
| +.size bsaes_dec_key_convert,.-bsaes_dec_key_convert |
| + |
| +.globl bsaes_decrypt_128 |
| +.type bsaes_decrypt_128,%function |
| +.align 4 |
| +bsaes_decrypt_128: |
| + stmdb sp!,{r4-r6,lr} |
| + vstmdb sp!,{d8-d15} @ ABI specification says so |
| +.Ldec128_loop: |
| + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input |
| + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! |
| + mov r4,$key @ pass the key |
| + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! |
| + mov r5,#10 @ pass rounds |
| + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + vst1.8 {@XMM[6]}, [$out]! |
| + vst1.8 {@XMM[4]}, [$out]! |
| + vst1.8 {@XMM[2]}, [$out]! |
| + vst1.8 {@XMM[7]}, [$out]! |
| + vst1.8 {@XMM[3]}, [$out]! |
| + subs $len,$len,#0x80 |
| + vst1.8 {@XMM[5]}, [$out]! |
| + bhi .Ldec128_loop |
| + |
| + vldmia sp!,{d8-d15} |
| + ldmia sp!,{r4-r6,pc} |
| +.size bsaes_decrypt_128,.-bsaes_decrypt_128 |
| +___ |
| +} |
| +{ |
| +my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10)); |
| +my ($keysched)=("sp"); |
| + |
| +$code.=<<___; |
| +.extern AES_cbc_encrypt |
| +.extern AES_decrypt |
| + |
| +.global bsaes_cbc_encrypt |
| +.type bsaes_cbc_encrypt,%function |
| +.align 5 |
| +bsaes_cbc_encrypt: |
| +#ifndef __KERNEL__ |
| + cmp $len, #128 |
| +#ifndef __thumb__ |
| + blo AES_cbc_encrypt |
| +#else |
| + bhs 1f |
| + b AES_cbc_encrypt |
| +1: |
| +#endif |
| +#endif |
| + |
| + @ it is up to the caller to make sure we are called with enc == 0 |
| + |
| + mov ip, sp |
| + stmdb sp!, {r4-r10, lr} |
| + VFP_ABI_PUSH |
| + ldr $ivp, [ip] @ IV is 1st arg on the stack |
| + mov $len, $len, lsr#4 @ len in 16 byte blocks |
| + sub sp, #0x10 @ scratch space to carry over the IV |
| + mov $fp, sp @ save sp |
| + |
| + ldr $rounds, [$key, #240] @ get # of rounds |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + @ allocate the key schedule on the stack |
| + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key |
| +	add	r12, #`128-32`	@ size of bit-sliced key schedule |
| + |
| + @ populate the key schedule |
| + mov r4, $key @ pass key |
| + mov r5, $rounds @ pass # of rounds |
| + mov sp, r12 @ sp is $keysched |
| + bl _bsaes_key_convert |
| + vldmia $keysched, {@XMM[6]} |
| + vstmia r12, {@XMM[15]} @ save last round key |
| + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key |
| + vstmia $keysched, {@XMM[7]} |
| +#else |
| + ldr r12, [$key, #244] |
| + eors r12, #1 |
| + beq 0f |
| + |
| + @ populate the key schedule |
| + str r12, [$key, #244] |
| + mov r4, $key @ pass key |
| + mov r5, $rounds @ pass # of rounds |
| + add r12, $key, #248 @ pass key schedule |
| + bl _bsaes_key_convert |
| + add r4, $key, #248 |
| + vldmia r4, {@XMM[6]} |
| + vstmia r12, {@XMM[15]} @ save last round key |
| + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key |
| + vstmia r4, {@XMM[7]} |
| + |
| +.align 2 |
| +0: |
| +#endif |
| + |
| + vld1.8 {@XMM[15]}, [$ivp] @ load IV |
| + b .Lcbc_dec_loop |
| + |
| +.align 4 |
| +.Lcbc_dec_loop: |
| + subs $len, $len, #0x8 |
| + bmi .Lcbc_dec_loop_finish |
| + |
| + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input |
| + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + mov r4, $keysched @ pass the key |
| +#else |
| + add r4, $key, #248 |
| +#endif |
| + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! |
| + mov r5, $rounds |
| + vld1.8 {@XMM[6]-@XMM[7]}, [$inp] |
| + sub $inp, $inp, #0x60 |
| + vstmia $fp, {@XMM[15]} @ put aside IV |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vldmia $fp, {@XMM[14]} @ reload IV |
| + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input |
| + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV |
| + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + veor @XMM[6], @XMM[6], @XMM[9] |
| + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! |
| + veor @XMM[4], @XMM[4], @XMM[10] |
| + veor @XMM[2], @XMM[2], @XMM[11] |
| + vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! |
| + veor @XMM[7], @XMM[7], @XMM[12] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + veor @XMM[3], @XMM[3], @XMM[13] |
| + vst1.8 {@XMM[6]}, [$out]! |
| + veor @XMM[5], @XMM[5], @XMM[14] |
| + vst1.8 {@XMM[4]}, [$out]! |
| + vst1.8 {@XMM[2]}, [$out]! |
| + vst1.8 {@XMM[7]}, [$out]! |
| + vst1.8 {@XMM[3]}, [$out]! |
| + vst1.8 {@XMM[5]}, [$out]! |
| + |
| + b .Lcbc_dec_loop |
| + |
| +.Lcbc_dec_loop_finish: |
| + adds $len, $len, #8 |
| + beq .Lcbc_dec_done |
| + |
| + vld1.8 {@XMM[0]}, [$inp]! @ load input |
| + cmp $len, #2 |
| + blo .Lcbc_dec_one |
| + vld1.8 {@XMM[1]}, [$inp]! |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + mov r4, $keysched @ pass the key |
| +#else |
| + add r4, $key, #248 |
| +#endif |
| + mov r5, $rounds |
| + vstmia $fp, {@XMM[15]} @ put aside IV |
| + beq .Lcbc_dec_two |
| + vld1.8 {@XMM[2]}, [$inp]! |
| + cmp $len, #4 |
| + blo .Lcbc_dec_three |
| + vld1.8 {@XMM[3]}, [$inp]! |
| + beq .Lcbc_dec_four |
| + vld1.8 {@XMM[4]}, [$inp]! |
| + cmp $len, #6 |
| + blo .Lcbc_dec_five |
| + vld1.8 {@XMM[5]}, [$inp]! |
| + beq .Lcbc_dec_six |
| + vld1.8 {@XMM[6]}, [$inp]! |
| + sub $inp, $inp, #0x70 |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vldmia $fp, {@XMM[14]} @ reload IV |
| + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input |
| + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV |
| + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + veor @XMM[6], @XMM[6], @XMM[9] |
| + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! |
| + veor @XMM[4], @XMM[4], @XMM[10] |
| + veor @XMM[2], @XMM[2], @XMM[11] |
| + vld1.8 {@XMM[15]}, [$inp]! |
| + veor @XMM[7], @XMM[7], @XMM[12] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + veor @XMM[3], @XMM[3], @XMM[13] |
| + vst1.8 {@XMM[6]}, [$out]! |
| + vst1.8 {@XMM[4]}, [$out]! |
| + vst1.8 {@XMM[2]}, [$out]! |
| + vst1.8 {@XMM[7]}, [$out]! |
| + vst1.8 {@XMM[3]}, [$out]! |
| + b .Lcbc_dec_done |
| +.align 4 |
| +.Lcbc_dec_six: |
| + sub $inp, $inp, #0x60 |
| + bl _bsaes_decrypt8 |
| + vldmia $fp,{@XMM[14]} @ reload IV |
| + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input |
| + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV |
| + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + veor @XMM[6], @XMM[6], @XMM[9] |
| + vld1.8 {@XMM[12]}, [$inp]! |
| + veor @XMM[4], @XMM[4], @XMM[10] |
| + veor @XMM[2], @XMM[2], @XMM[11] |
| + vld1.8 {@XMM[15]}, [$inp]! |
| + veor @XMM[7], @XMM[7], @XMM[12] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + vst1.8 {@XMM[6]}, [$out]! |
| + vst1.8 {@XMM[4]}, [$out]! |
| + vst1.8 {@XMM[2]}, [$out]! |
| + vst1.8 {@XMM[7]}, [$out]! |
| + b .Lcbc_dec_done |
| +.align 4 |
| +.Lcbc_dec_five: |
| + sub $inp, $inp, #0x50 |
| + bl _bsaes_decrypt8 |
| + vldmia $fp, {@XMM[14]} @ reload IV |
| + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input |
| + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV |
| + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + veor @XMM[6], @XMM[6], @XMM[9] |
| + vld1.8 {@XMM[15]}, [$inp]! |
| + veor @XMM[4], @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + veor @XMM[2], @XMM[2], @XMM[11] |
| + vst1.8 {@XMM[6]}, [$out]! |
| + vst1.8 {@XMM[4]}, [$out]! |
| + vst1.8 {@XMM[2]}, [$out]! |
| + b .Lcbc_dec_done |
| +.align 4 |
| +.Lcbc_dec_four: |
| + sub $inp, $inp, #0x40 |
| + bl _bsaes_decrypt8 |
| + vldmia $fp, {@XMM[14]} @ reload IV |
| + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input |
| + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV |
| + vld1.8 {@XMM[10]}, [$inp]! |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + veor @XMM[6], @XMM[6], @XMM[9] |
| + vld1.8 {@XMM[15]}, [$inp]! |
| + veor @XMM[4], @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + vst1.8 {@XMM[6]}, [$out]! |
| + vst1.8 {@XMM[4]}, [$out]! |
| + b .Lcbc_dec_done |
| +.align 4 |
| +.Lcbc_dec_three: |
| + sub $inp, $inp, #0x30 |
| + bl _bsaes_decrypt8 |
| + vldmia $fp, {@XMM[14]} @ reload IV |
| + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input |
| + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV |
| + vld1.8 {@XMM[15]}, [$inp]! |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + veor @XMM[6], @XMM[6], @XMM[9] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + vst1.8 {@XMM[6]}, [$out]! |
| + b .Lcbc_dec_done |
| +.align 4 |
| +.Lcbc_dec_two: |
| + sub $inp, $inp, #0x20 |
| + bl _bsaes_decrypt8 |
| + vldmia $fp, {@XMM[14]} @ reload IV |
| + vld1.8 {@XMM[8]}, [$inp]! @ reload input |
| + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV |
| + vld1.8 {@XMM[15]}, [$inp]! @ reload input |
| + veor @XMM[1], @XMM[1], @XMM[8] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + b .Lcbc_dec_done |
| +.align 4 |
| +.Lcbc_dec_one: |
| + sub $inp, $inp, #0x10 |
| + mov $rounds, $out @ save original out pointer |
| + mov $out, $fp @ use the iv scratch space as out buffer |
| + mov r2, $key |
| + vmov @XMM[4],@XMM[15] @ just in case ensure that IV |
| + vmov @XMM[5],@XMM[0] @ and input are preserved |
| + bl AES_decrypt |
| + vld1.8 {@XMM[0]}, [$fp,:64] @ load result |
| + veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV |
| + vmov @XMM[15], @XMM[5] @ @XMM[5] holds input |
| + vst1.8 {@XMM[0]}, [$rounds] @ write output |
| + |
| +.Lcbc_dec_done: |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + vmov.i32 q0, #0 |
| + vmov.i32 q1, #0 |
| +.Lcbc_dec_bzero: @ wipe key schedule [if any] |
| + vstmia $keysched!, {q0-q1} |
| + cmp $keysched, $fp |
| + bne .Lcbc_dec_bzero |
| +#endif |
| + |
| + mov sp, $fp |
| + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb |
| + vst1.8 {@XMM[15]}, [$ivp] @ return IV |
| + VFP_ABI_POP |
| + ldmia sp!, {r4-r10, pc} |
| +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt |
| +___ |
| +} |
| +{ |
| +my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10))); |
| +my $const = "r6"; # shared with _bsaes_encrypt8_alt |
| +my $keysched = "sp"; |
| + |
| +$code.=<<___; |
| +.extern AES_encrypt |
| +.global bsaes_ctr32_encrypt_blocks |
| +.type bsaes_ctr32_encrypt_blocks,%function |
| +.align 5 |
| +bsaes_ctr32_encrypt_blocks: |
| + cmp $len, #8 @ use plain AES for |
| + blo .Lctr_enc_short @ small sizes |
| + |
| + mov ip, sp |
| + stmdb sp!, {r4-r10, lr} |
| + VFP_ABI_PUSH |
| + ldr $ctr, [ip] @ ctr is 1st arg on the stack |
| + sub sp, sp, #0x10 @ scratch space to carry over the ctr |
| + mov $fp, sp @ save sp |
| + |
| + ldr $rounds, [$key, #240] @ get # of rounds |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + @ allocate the key schedule on the stack |
| + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key |
| + add r12, #`128-32` @ size of bit-sliced key schedule |
| + |
| + @ populate the key schedule |
| + mov r4, $key @ pass key |
| + mov r5, $rounds @ pass # of rounds |
| + mov sp, r12 @ sp is $keysched |
| + bl _bsaes_key_convert |
| + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key |
| + vstmia r12, {@XMM[7]} @ save last round key |
| + |
| + vld1.8 {@XMM[0]}, [$ctr] @ load counter |
| + add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr |
| + vldmia $keysched, {@XMM[4]} @ load round0 key |
| +#else |
| + ldr r12, [$key, #244] |
| + eors r12, #1 |
| + beq 0f |
| + |
| + @ populate the key schedule |
| + str r12, [$key, #244] |
| + mov r4, $key @ pass key |
| + mov r5, $rounds @ pass # of rounds |
| + add r12, $key, #248 @ pass key schedule |
| + bl _bsaes_key_convert |
| + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key |
| + vstmia r12, {@XMM[7]} @ save last round key |
| + |
| +.align 2 |
| +0: add r12, $key, #248 |
| + vld1.8 {@XMM[0]}, [$ctr] @ load counter |
| + adrl $ctr, .LREVM0SR @ borrow $ctr |
| + vldmia r12, {@XMM[4]} @ load round0 key |
| + sub sp, #0x10 @ place for adjusted round0 key |
| +#endif |
| + |
| + vmov.i32 @XMM[8],#1 @ compose 1<<96 |
| + veor @XMM[9],@XMM[9],@XMM[9] |
| + vrev32.8 @XMM[0],@XMM[0] |
| + vext.8 @XMM[8],@XMM[9],@XMM[8],#4 |
| + vrev32.8 @XMM[4],@XMM[4] |
| + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 |
| + vstmia $keysched, {@XMM[4]} @ save adjusted round0 key |
| + b .Lctr_enc_loop |
| + |
| +.align 4 |
| +.Lctr_enc_loop: |
| + vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96 |
| + vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1 |
| + vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2 |
| + vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3 |
| + vadd.u32 @XMM[4], @XMM[1], @XMM[10] |
| + vadd.u32 @XMM[5], @XMM[2], @XMM[10] |
| + vadd.u32 @XMM[6], @XMM[3], @XMM[10] |
| + vadd.u32 @XMM[7], @XMM[4], @XMM[10] |
| + vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter |
| + |
| + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity |
| + @ to flip byte order in 32-bit counter |
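| +	@ (the .LREVM0SR table used below folds the 32-bit byte swap into the |
| +	@ .LM0SR shuffle, and the round-0 key saved at setup was pre-reversed |
| +	@ to match) |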
| + |
| + vldmia $keysched, {@XMM[9]} @ load round0 key |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, $keysched, #0x10 @ pass next round key |
| +#else |
| + add r4, $key, #`248+16` |
| +#endif |
| + vldmia $ctr, {@XMM[8]} @ .LREVM0SR |
| + mov r5, $rounds @ pass rounds |
| + vstmia $fp, {@XMM[10]} @ save next counter |
| + sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants |
| + |
| + bl _bsaes_encrypt8_alt |
| + |
| + subs $len, $len, #8 |
| + blo .Lctr_enc_loop_done |
| + |
| + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input |
| + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! |
| + veor @XMM[0], @XMM[8] |
| + veor @XMM[1], @XMM[9] |
| + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! |
| + veor @XMM[4], @XMM[10] |
| + veor @XMM[6], @XMM[11] |
| + vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! |
| + veor @XMM[3], @XMM[12] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output |
| + veor @XMM[7], @XMM[13] |
| + veor @XMM[2], @XMM[14] |
| + vst1.8 {@XMM[4]}, [$out]! |
| + veor @XMM[5], @XMM[15] |
| + vst1.8 {@XMM[6]}, [$out]! |
| + vmov.i32 @XMM[8], #1 @ compose 1<<96 |
| + vst1.8 {@XMM[3]}, [$out]! |
| + veor @XMM[9], @XMM[9], @XMM[9] |
| + vst1.8 {@XMM[7]}, [$out]! |
| + vext.8 @XMM[8], @XMM[9], @XMM[8], #4 |
| + vst1.8 {@XMM[2]}, [$out]! |
| + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 |
| + vst1.8 {@XMM[5]}, [$out]! |
| + vldmia $fp, {@XMM[0]} @ load counter |
| + |
| + bne .Lctr_enc_loop |
| + b .Lctr_enc_done |
| + |
| +.align 4 |
| +.Lctr_enc_loop_done: |
| + add $len, $len, #8 |
| + vld1.8 {@XMM[8]}, [$inp]! @ load input |
| + veor @XMM[0], @XMM[8] |
| + vst1.8 {@XMM[0]}, [$out]! @ write output |
| + cmp $len, #2 |
| + blo .Lctr_enc_done |
| + vld1.8 {@XMM[9]}, [$inp]! |
| + veor @XMM[1], @XMM[9] |
| + vst1.8 {@XMM[1]}, [$out]! |
| + beq .Lctr_enc_done |
| + vld1.8 {@XMM[10]}, [$inp]! |
| + veor @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[4]}, [$out]! |
| + cmp $len, #4 |
| + blo .Lctr_enc_done |
| + vld1.8 {@XMM[11]}, [$inp]! |
| + veor @XMM[6], @XMM[11] |
| + vst1.8 {@XMM[6]}, [$out]! |
| + beq .Lctr_enc_done |
| + vld1.8 {@XMM[12]}, [$inp]! |
| + veor @XMM[3], @XMM[12] |
| + vst1.8 {@XMM[3]}, [$out]! |
| + cmp $len, #6 |
| + blo .Lctr_enc_done |
| + vld1.8 {@XMM[13]}, [$inp]! |
| + veor @XMM[7], @XMM[13] |
| + vst1.8 {@XMM[7]}, [$out]! |
| + beq .Lctr_enc_done |
| + vld1.8 {@XMM[14]}, [$inp] |
| + veor @XMM[2], @XMM[14] |
| + vst1.8 {@XMM[2]}, [$out]! |
| + |
| +.Lctr_enc_done: |
| + vmov.i32 q0, #0 |
| + vmov.i32 q1, #0 |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| +.Lctr_enc_bzero: @ wipe key schedule [if any] |
| + vstmia $keysched!, {q0-q1} |
| + cmp $keysched, $fp |
| + bne .Lctr_enc_bzero |
| +#else |
| + vstmia $keysched, {q0-q1} |
| +#endif |
| + |
| + mov sp, $fp |
| + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb |
| + VFP_ABI_POP |
| + ldmia sp!, {r4-r10, pc} @ return |
| + |
| +.align 4 |
| +.Lctr_enc_short: |
| + ldr ip, [sp] @ ctr pointer is passed on stack |
| + stmdb sp!, {r4-r8, lr} |
| + |
| + mov r4, $inp @ copy arguments |
| + mov r5, $out |
| + mov r6, $len |
| + mov r7, $key |
| + ldr r8, [ip, #12] @ load counter LSW |
| + vld1.8 {@XMM[1]}, [ip] @ load whole counter value |
| +#ifdef __ARMEL__ |
| + rev r8, r8 |
| +#endif |
| + sub sp, sp, #0x10 |
| + vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value |
| + sub sp, sp, #0x10 |
| + |
| +.Lctr_enc_short_loop: |
| + add r0, sp, #0x10 @ input counter value |
| + mov r1, sp @ output on the stack |
| + mov r2, r7 @ key |
| + |
| + bl AES_encrypt |
| + |
| + vld1.8 {@XMM[0]}, [r4]! @ load input |
| + vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter |
| + add r8, r8, #1 |
| +#ifdef __ARMEL__ |
| + rev r0, r8 |
| + str r0, [sp, #0x1c] @ next counter value |
| +#else |
| + str r8, [sp, #0x1c] @ next counter value |
| +#endif |
| + veor @XMM[0],@XMM[0],@XMM[1] |
| + vst1.8 {@XMM[0]}, [r5]! @ store output |
| + subs r6, r6, #1 |
| + bne .Lctr_enc_short_loop |
| + |
| + vmov.i32 q0, #0 |
| + vmov.i32 q1, #0 |
| + vstmia sp!, {q0-q1} |
| + |
| + ldmia sp!, {r4-r8, pc} |
| +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks |
| +___ |
| +} |
| +{ |
| +###################################################################### |
| +# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, |
| +# const AES_KEY *key1, const AES_KEY *key2, |
| +# const unsigned char iv[16]); |
| +# |
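| +# key1 is used for the data blocks themselves, while key2 is only used to |
| +# encrypt iv[] into the initial tweak (with XTS_CHAIN_TWEAK the caller |
| +# passes a pointer to the tweak instead, and the updated tweak is written |
| +# back on return). |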
| +my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3))); |
| +my $const="r6"; # returned by _bsaes_key_convert |
| +my $twmask=@XMM[5]; |
| +my @T=@XMM[6..7]; |
| + |
| +$code.=<<___; |
| +.globl bsaes_xts_encrypt |
| +.type bsaes_xts_encrypt,%function |
| +.align 4 |
| +bsaes_xts_encrypt: |
| + mov ip, sp |
| + stmdb sp!, {r4-r10, lr} @ 0x20 |
| + VFP_ABI_PUSH |
| + mov r6, sp @ future $fp |
| + |
| + mov $inp, r0 |
| + mov $out, r1 |
| + mov $len, r2 |
| + mov $key, r3 |
| + |
| + sub r0, sp, #0x10 @ 0x10 |
| + bic r0, #0xf @ align at 16 bytes |
| + mov sp, r0 |
| + |
| +#ifdef XTS_CHAIN_TWEAK |
| + ldr r0, [ip] @ pointer to input tweak |
| +#else |
| + @ generate initial tweak |
| + ldr r0, [ip, #4] @ iv[] |
| + mov r1, sp |
| + ldr r2, [ip, #0] @ key2 |
| + bl AES_encrypt |
| + mov r0,sp @ pointer to initial tweak |
| +#endif |
| + |
| + ldr $rounds, [$key, #240] @ get # of rounds |
| + mov $fp, r6 |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + @ allocate the key schedule on the stack |
| + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key |
| + @ add r12, #`128-32` @ size of bit-sliced key schedule |
| + sub r12, #`32+16` @ place for tweak[9] |
| + |
| + @ populate the key schedule |
| + mov r4, $key @ pass key |
| + mov r5, $rounds @ pass # of rounds |
| + mov sp, r12 |
| + add r12, #0x90 @ pass key schedule |
| + bl _bsaes_key_convert |
| + veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key |
| + vstmia r12, {@XMM[7]} @ save last round key |
| +#else |
| + ldr r12, [$key, #244] |
| + eors r12, #1 |
| + beq 0f |
| + |
| + str r12, [$key, #244] |
| + mov r4, $key @ pass key |
| + mov r5, $rounds @ pass # of rounds |
| + add r12, $key, #248 @ pass key schedule |
| + bl _bsaes_key_convert |
| + veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key |
| + vstmia r12, {@XMM[7]} |
| + |
| +.align 2 |
| +0: sub sp, #0x90 @ place for tweak[9] |
| +#endif |
| + |
| + vld1.8 {@XMM[8]}, [r0] @ initial tweak |
| + adr $magic, .Lxts_magic |
| + |
| + subs $len, #0x80 |
| + blo .Lxts_enc_short |
| + b .Lxts_enc_loop |
| + |
| +.align 4 |
| +.Lxts_enc_loop: |
| + vldmia $magic, {$twmask} @ load XTS magic |
| + vshr.s64 @T[0], @XMM[8], #63 |
| + mov r0, sp |
| + vand @T[0], @T[0], $twmask |
| +___ |
| +for($i=9;$i<16;$i++) { |
| +$code.=<<___; |
| + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] |
| + vst1.64 {@XMM[$i-1]}, [r0,:128]! |
| + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` |
| + vshr.s64 @T[1], @XMM[$i], #63 |
| + veor @XMM[$i], @XMM[$i], @T[0] |
| + vand @T[1], @T[1], $twmask |
| +___ |
| + @T=reverse(@T); |
| + |
| +$code.=<<___ if ($i>=10); |
| + vld1.8 {@XMM[$i-10]}, [$inp]! |
| +___ |
| +$code.=<<___ if ($i>=11); |
| + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] |
| +___ |
| +} |
| +$code.=<<___; |
| + vadd.u64 @XMM[8], @XMM[15], @XMM[15] |
| + vst1.64 {@XMM[15]}, [r0,:128]! |
| + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` |
| + veor @XMM[8], @XMM[8], @T[0] |
| + vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + |
| + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! |
| + veor @XMM[5], @XMM[5], @XMM[13] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[6], @XMM[6], @XMM[14] |
| + mov r5, $rounds @ pass rounds |
| + veor @XMM[7], @XMM[7], @XMM[15] |
| + mov r0, sp |
| + |
| + bl _bsaes_encrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[6], @XMM[11] |
| + vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]! |
| + veor @XMM[10], @XMM[3], @XMM[12] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + veor @XMM[11], @XMM[7], @XMM[13] |
| + veor @XMM[12], @XMM[2], @XMM[14] |
| + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! |
| + veor @XMM[13], @XMM[5], @XMM[15] |
| + vst1.8 {@XMM[12]-@XMM[13]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + |
| + subs $len, #0x80 |
| + bpl .Lxts_enc_loop |
| + |
| +.Lxts_enc_short: |
| + adds $len, #0x70 |
| + bmi .Lxts_enc_done |
| + |
| + vldmia $magic, {$twmask} @ load XTS magic |
| + vshr.s64 @T[0], @XMM[8], #63 |
| + mov r0, sp |
| + vand @T[0], @T[0], $twmask |
| +___ |
| +for($i=9;$i<16;$i++) { |
| +$code.=<<___; |
| + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] |
| + vst1.64 {@XMM[$i-1]}, [r0,:128]! |
| + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` |
| + vshr.s64 @T[1], @XMM[$i], #63 |
| + veor @XMM[$i], @XMM[$i], @T[0] |
| + vand @T[1], @T[1], $twmask |
| +___ |
| + @T=reverse(@T); |
| + |
| +$code.=<<___ if ($i>=10); |
| + vld1.8 {@XMM[$i-10]}, [$inp]! |
| + subs $len, #0x10 |
| + bmi .Lxts_enc_`$i-9` |
| +___ |
| +$code.=<<___ if ($i>=11); |
| + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] |
| +___ |
| +} |
| +$code.=<<___; |
| + sub $len, #0x10 |
| + vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak |
| + |
| + vld1.8 {@XMM[6]}, [$inp]! |
| + veor @XMM[5], @XMM[5], @XMM[13] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[6], @XMM[6], @XMM[14] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_encrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[6], @XMM[11] |
| + vld1.64 {@XMM[14]}, [r0,:128]! |
| + veor @XMM[10], @XMM[3], @XMM[12] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + veor @XMM[11], @XMM[7], @XMM[13] |
| + veor @XMM[12], @XMM[2], @XMM[14] |
| + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! |
| + vst1.8 {@XMM[12]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_enc_done |
| +.align 4 |
| +.Lxts_enc_6: |
| + vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[4], @XMM[4], @XMM[12] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[5], @XMM[5], @XMM[13] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_encrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[6], @XMM[11] |
| + veor @XMM[10], @XMM[3], @XMM[12] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + veor @XMM[11], @XMM[7], @XMM[13] |
| + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_enc_done |
| + |
| +@ put this in range for both ARM and Thumb mode adr instructions |
| +.align 5 |
| +.Lxts_magic: |
| + .quad 1, 0x87 |
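| +	@ {1, 0x87}: 0x87 is the GF(2^128) feedback x^7+x^2+x+1 applied when the |
| +	@ tweak MSB carries out, while the 1 propagates the carry between the |
| +	@ two 64-bit halves |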
| + |
| +.align 5 |
| +.Lxts_enc_5: |
| + vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[3], @XMM[3], @XMM[11] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[4], @XMM[4], @XMM[12] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_encrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + vld1.64 {@XMM[12]}, [r0,:128]! |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[6], @XMM[11] |
| + veor @XMM[10], @XMM[3], @XMM[12] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + vst1.8 {@XMM[10]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_enc_done |
| +.align 4 |
| +.Lxts_enc_4: |
| + vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[2], @XMM[2], @XMM[10] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[3], @XMM[3], @XMM[11] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_encrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[6], @XMM[11] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_enc_done |
| +.align 4 |
| +.Lxts_enc_3: |
| + vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[1], @XMM[1], @XMM[9] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[2], @XMM[2], @XMM[10] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_encrypt8 |
| + |
| + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[4], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + vst1.8 {@XMM[8]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_enc_done |
| +.align 4 |
| +.Lxts_enc_2: |
| + vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[1], @XMM[1], @XMM[9] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_encrypt8 |
| + |
| + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_enc_done |
| +.align 4 |
| +.Lxts_enc_1: |
| + mov r0, sp |
| + veor @XMM[0], @XMM[8] |
| + mov r1, sp |
| + vst1.8 {@XMM[0]}, [sp,:128] |
| + mov r2, $key |
| + mov r4, $fp @ preserve fp |
| + |
| + bl AES_encrypt |
| + |
| + vld1.8 {@XMM[0]}, [sp,:128] |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| + vst1.8 {@XMM[0]}, [$out]! |
| + mov $fp, r4 |
| + |
| + vmov @XMM[8], @XMM[9] @ next round tweak |
| + |
| +.Lxts_enc_done: |
| +#ifndef XTS_CHAIN_TWEAK |
| + adds $len, #0x10 |
| + beq .Lxts_enc_ret |
| + sub r6, $out, #0x10 |
| + |
| +.Lxts_enc_steal: |
| + ldrb r0, [$inp], #1 |
| + ldrb r1, [$out, #-0x10] |
| + strb r0, [$out, #-0x10] |
| + strb r1, [$out], #1 |
| + |
| + subs $len, #1 |
| + bhi .Lxts_enc_steal |
| + |
| + vld1.8 {@XMM[0]}, [r6] |
| + mov r0, sp |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| + mov r1, sp |
| + vst1.8 {@XMM[0]}, [sp,:128] |
| + mov r2, $key |
| + mov r4, $fp @ preserve fp |
| + |
| + bl AES_encrypt |
| + |
| + vld1.8 {@XMM[0]}, [sp,:128] |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| + vst1.8 {@XMM[0]}, [r6] |
| + mov $fp, r4 |
| +#endif |
| + |
| +.Lxts_enc_ret: |
| + bic r0, $fp, #0xf |
| + vmov.i32 q0, #0 |
| + vmov.i32 q1, #0 |
| +#ifdef XTS_CHAIN_TWEAK |
| + ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak |
| +#endif |
| +.Lxts_enc_bzero: @ wipe key schedule [if any] |
| + vstmia sp!, {q0-q1} |
| + cmp sp, r0 |
| + bne .Lxts_enc_bzero |
| + |
| + mov sp, $fp |
| +#ifdef XTS_CHAIN_TWEAK |
| + vst1.8 {@XMM[8]}, [r1] |
| +#endif |
| + VFP_ABI_POP |
| + ldmia sp!, {r4-r10, pc} @ return |
| + |
| +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt |
| + |
| +.globl bsaes_xts_decrypt |
| +.type bsaes_xts_decrypt,%function |
| +.align 4 |
| +bsaes_xts_decrypt: |
| + mov ip, sp |
| + stmdb sp!, {r4-r10, lr} @ 0x20 |
| + VFP_ABI_PUSH |
| + mov r6, sp @ future $fp |
| + |
| + mov $inp, r0 |
| + mov $out, r1 |
| + mov $len, r2 |
| + mov $key, r3 |
| + |
| + sub r0, sp, #0x10 @ 0x10 |
| + bic r0, #0xf @ align at 16 bytes |
| + mov sp, r0 |
| + |
| +#ifdef XTS_CHAIN_TWEAK |
| + ldr r0, [ip] @ pointer to input tweak |
| +#else |
| + @ generate initial tweak |
| + ldr r0, [ip, #4] @ iv[] |
| + mov r1, sp |
| + ldr r2, [ip, #0] @ key2 |
| + bl AES_encrypt |
| + mov r0, sp @ pointer to initial tweak |
| +#endif |
| + |
| + ldr $rounds, [$key, #240] @ get # of rounds |
| + mov $fp, r6 |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + @ allocate the key schedule on the stack |
| + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key |
| + @ add r12, #`128-32` @ size of bit-sliced key schedule |
| + sub r12, #`32+16` @ place for tweak[9] |
| + |
| + @ populate the key schedule |
| + mov r4, $key @ pass key |
| + mov r5, $rounds @ pass # of rounds |
| + mov sp, r12 |
| + add r12, #0x90 @ pass key schedule |
| + bl _bsaes_key_convert |
| + add r4, sp, #0x90 |
| + vldmia r4, {@XMM[6]} |
| + vstmia r12, {@XMM[15]} @ save last round key |
| + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key |
| + vstmia r4, {@XMM[7]} |
| +#else |
| + ldr r12, [$key, #244] |
| + eors r12, #1 |
| + beq 0f |
| + |
| + str r12, [$key, #244] |
| + mov r4, $key @ pass key |
| + mov r5, $rounds @ pass # of rounds |
| + add r12, $key, #248 @ pass key schedule |
| + bl _bsaes_key_convert |
| + add r4, $key, #248 |
| + vldmia r4, {@XMM[6]} |
| + vstmia r12, {@XMM[15]} @ save last round key |
| + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key |
| + vstmia r4, {@XMM[7]} |
| + |
| +.align 2 |
| +0: sub sp, #0x90 @ place for tweak[9] |
| +#endif |
| + vld1.8 {@XMM[8]}, [r0] @ initial tweak |
| + adr $magic, .Lxts_magic |
| + |
| + tst $len, #0xf @ if not multiple of 16 |
| + it ne @ Thumb2 thing, sanity check in ARM |
| + subne $len, #0x10 @ subtract another 16 bytes |
| + subs $len, #0x80 |
| + |
| + blo .Lxts_dec_short |
| + b .Lxts_dec_loop |
| + |
| +.align 4 |
| +.Lxts_dec_loop: |
| + vldmia $magic, {$twmask} @ load XTS magic |
| + vshr.s64 @T[0], @XMM[8], #63 |
| + mov r0, sp |
| + vand @T[0], @T[0], $twmask |
| +___ |
| +for($i=9;$i<16;$i++) { |
| +$code.=<<___; |
| + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] |
| + vst1.64 {@XMM[$i-1]}, [r0,:128]! |
| + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` |
| + vshr.s64 @T[1], @XMM[$i], #63 |
| + veor @XMM[$i], @XMM[$i], @T[0] |
| + vand @T[1], @T[1], $twmask |
| +___ |
| + @T=reverse(@T); |
| + |
| +$code.=<<___ if ($i>=10); |
| + vld1.8 {@XMM[$i-10]}, [$inp]! |
| +___ |
| +$code.=<<___ if ($i>=11); |
| + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] |
| +___ |
| +} |
| +$code.=<<___; |
| + vadd.u64 @XMM[8], @XMM[15], @XMM[15] |
| + vst1.64 {@XMM[15]}, [r0,:128]! |
| + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` |
| + veor @XMM[8], @XMM[8], @T[0] |
| + vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + |
| + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! |
| + veor @XMM[5], @XMM[5], @XMM[13] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[6], @XMM[6], @XMM[14] |
| + mov r5, $rounds @ pass rounds |
| + veor @XMM[7], @XMM[7], @XMM[15] |
| + mov r0, sp |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[6], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[4], @XMM[11] |
| + vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]! |
| + veor @XMM[10], @XMM[2], @XMM[12] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + veor @XMM[11], @XMM[7], @XMM[13] |
| + veor @XMM[12], @XMM[3], @XMM[14] |
| + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! |
| + veor @XMM[13], @XMM[5], @XMM[15] |
| + vst1.8 {@XMM[12]-@XMM[13]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + |
| + subs $len, #0x80 |
| + bpl .Lxts_dec_loop |
| + |
| +.Lxts_dec_short: |
| + adds $len, #0x70 |
| + bmi .Lxts_dec_done |
| + |
| + vldmia $magic, {$twmask} @ load XTS magic |
| + vshr.s64 @T[0], @XMM[8], #63 |
| + mov r0, sp |
| + vand @T[0], @T[0], $twmask |
| +___ |
| +for($i=9;$i<16;$i++) { |
| +$code.=<<___; |
| + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] |
| + vst1.64 {@XMM[$i-1]}, [r0,:128]! |
| + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` |
| + vshr.s64 @T[1], @XMM[$i], #63 |
| + veor @XMM[$i], @XMM[$i], @T[0] |
| + vand @T[1], @T[1], $twmask |
| +___ |
| + @T=reverse(@T); |
| + |
| +$code.=<<___ if ($i>=10); |
| + vld1.8 {@XMM[$i-10]}, [$inp]! |
| + subs $len, #0x10 |
| + bmi .Lxts_dec_`$i-9` |
| +___ |
| +$code.=<<___ if ($i>=11); |
| + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] |
| +___ |
| +} |
| +$code.=<<___; |
| + sub $len, #0x10 |
| + vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak |
| + |
| + vld1.8 {@XMM[6]}, [$inp]! |
| + veor @XMM[5], @XMM[5], @XMM[13] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[6], @XMM[6], @XMM[14] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[6], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[4], @XMM[11] |
| + vld1.64 {@XMM[14]}, [r0,:128]! |
| + veor @XMM[10], @XMM[2], @XMM[12] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + veor @XMM[11], @XMM[7], @XMM[13] |
| + veor @XMM[12], @XMM[3], @XMM[14] |
| + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! |
| + vst1.8 {@XMM[12]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_dec_done |
| +.align 4 |
| +.Lxts_dec_6: |
| + vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[4], @XMM[4], @XMM[12] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[5], @XMM[5], @XMM[13] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[6], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[4], @XMM[11] |
| + veor @XMM[10], @XMM[2], @XMM[12] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + veor @XMM[11], @XMM[7], @XMM[13] |
| + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_dec_done |
| +.align 4 |
| +.Lxts_dec_5: |
| + vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[3], @XMM[3], @XMM[11] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[4], @XMM[4], @XMM[12] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + vld1.64 {@XMM[12]}, [r0,:128]! |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[6], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[4], @XMM[11] |
| + veor @XMM[10], @XMM[2], @XMM[12] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + vst1.8 {@XMM[10]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_dec_done |
| +.align 4 |
| +.Lxts_dec_4: |
| + vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[2], @XMM[2], @XMM[10] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[3], @XMM[3], @XMM[11] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[6], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + veor @XMM[9], @XMM[4], @XMM[11] |
| + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_dec_done |
| +.align 4 |
| +.Lxts_dec_3: |
| + vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[1], @XMM[1], @XMM[9] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[2], @XMM[2], @XMM[10] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! |
| + vld1.64 {@XMM[10]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + veor @XMM[8], @XMM[6], @XMM[10] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + vst1.8 {@XMM[8]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_dec_done |
| +.align 4 |
| +.Lxts_dec_2: |
| + vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak |
| + |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| +#ifndef BSAES_ASM_EXTENDED_KEY |
| + add r4, sp, #0x90 @ pass key schedule |
| +#else |
| + add r4, $key, #248 @ pass key schedule |
| +#endif |
| + veor @XMM[1], @XMM[1], @XMM[9] |
| + mov r5, $rounds @ pass rounds |
| + mov r0, sp |
| + |
| + bl _bsaes_decrypt8 |
| + |
| + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! |
| + veor @XMM[0], @XMM[0], @XMM[ 8] |
| + veor @XMM[1], @XMM[1], @XMM[ 9] |
| + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! |
| + |
| + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak |
| + b .Lxts_dec_done |
| +.align 4 |
| +.Lxts_dec_1: |
| + mov r0, sp |
| + veor @XMM[0], @XMM[8] |
| + mov r1, sp |
| + vst1.8 {@XMM[0]}, [sp,:128] |
| + mov r2, $key |
| + mov r4, $fp @ preserve fp |
| + mov r5, $magic @ preserve magic |
| + |
| + bl AES_decrypt |
| + |
| + vld1.8 {@XMM[0]}, [sp,:128] |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| + vst1.8 {@XMM[0]}, [$out]! |
| + mov $fp, r4 |
| + mov $magic, r5 |
| + |
| + vmov @XMM[8], @XMM[9] @ next round tweak |
| + |
| +.Lxts_dec_done: |
| +#ifndef XTS_CHAIN_TWEAK |
| + adds $len, #0x10 |
| + beq .Lxts_dec_ret |
| + |
| + @ calculate one round of extra tweak for the stolen ciphertext |
| + vldmia $magic, {$twmask} |
| + vshr.s64 @XMM[6], @XMM[8], #63 |
| + vand @XMM[6], @XMM[6], $twmask |
| + vadd.u64 @XMM[9], @XMM[8], @XMM[8] |
| + vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")` |
| + veor @XMM[9], @XMM[9], @XMM[6] |
| + |
| + @ perform the final decryption with the last tweak value |
| + vld1.8 {@XMM[0]}, [$inp]! |
| + mov r0, sp |
| + veor @XMM[0], @XMM[0], @XMM[9] |
| + mov r1, sp |
| + vst1.8 {@XMM[0]}, [sp,:128] |
| + mov r2, $key |
| + mov r4, $fp @ preserve fp |
| + |
| + bl AES_decrypt |
| + |
| + vld1.8 {@XMM[0]}, [sp,:128] |
| + veor @XMM[0], @XMM[0], @XMM[9] |
| + vst1.8 {@XMM[0]}, [$out] |
| + |
| + mov r6, $out |
| +.Lxts_dec_steal: |
| + ldrb r1, [$out] |
| + ldrb r0, [$inp], #1 |
| + strb r1, [$out, #0x10] |
| + strb r0, [$out], #1 |
| + |
| + subs $len, #1 |
| + bhi .Lxts_dec_steal |
| + |
| + vld1.8 {@XMM[0]}, [r6] |
| + mov r0, sp |
| + veor @XMM[0], @XMM[8] |
| + mov r1, sp |
| + vst1.8 {@XMM[0]}, [sp,:128] |
| + mov r2, $key |
| + |
| + bl AES_decrypt |
| + |
| + vld1.8 {@XMM[0]}, [sp,:128] |
| + veor @XMM[0], @XMM[0], @XMM[8] |
| + vst1.8 {@XMM[0]}, [r6] |
| + mov $fp, r4 |
| +#endif |
| + |
| +.Lxts_dec_ret: |
| + bic r0, $fp, #0xf |
| + vmov.i32 q0, #0 |
| + vmov.i32 q1, #0 |
| +#ifdef XTS_CHAIN_TWEAK |
| + ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak |
| +#endif |
| +.Lxts_dec_bzero: @ wipe key schedule [if any] |
| + vstmia sp!, {q0-q1} |
| + cmp sp, r0 |
| + bne .Lxts_dec_bzero |
| + |
| + mov sp, $fp |
| +#ifdef XTS_CHAIN_TWEAK |
| + vst1.8 {@XMM[8]}, [r1] |
| +#endif |
| + VFP_ABI_POP |
| + ldmia sp!, {r4-r10, pc} @ return |
| + |
| +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt |
| +___ |
| +} |
| +$code.=<<___; |
| +#endif |
| +___ |
| + |
| +$code =~ s/\`([^\`]*)\`/eval($1)/gem; |
| + |
| +open SELF,$0; |
| +while(<SELF>) { |
| + next if (/^#!/); |
| + last if (!s/^#/@/ and !/^$/); |
| + print; |
| +} |
| +close SELF; |
| + |
| +print $code; |
| + |
| +close STDOUT; |
| diff --git a/crypto/arm64cpuid.S b/crypto/arm64cpuid.S |
| new file mode 100644 |
| index 0000000..4778ac1 |
| --- /dev/null |
| +++ b/crypto/arm64cpuid.S |
| @@ -0,0 +1,46 @@ |
| +#include "arm_arch.h" |
| + |
| +.text |
| +.arch armv8-a+crypto |
| + |
| +.align 5 |
| +.global _armv7_neon_probe |
| +.type _armv7_neon_probe,%function |
| +_armv7_neon_probe: |
| + orr v15.16b, v15.16b, v15.16b |
| + ret |
| +.size _armv7_neon_probe,.-_armv7_neon_probe |
| + |
| +.global _armv7_tick |
| +.type _armv7_tick,%function |
| +_armv7_tick: |
| + mrs x0, CNTVCT_EL0 |
| + ret |
| +.size _armv7_tick,.-_armv7_tick |
| + |
| +.global _armv8_aes_probe |
| +.type _armv8_aes_probe,%function |
| +_armv8_aes_probe: |
| + aese v0.16b, v0.16b |
| + ret |
| +.size _armv8_aes_probe,.-_armv8_aes_probe |
| + |
| +.global _armv8_sha1_probe |
| +.type _armv8_sha1_probe,%function |
| +_armv8_sha1_probe: |
| + sha1h s0, s0 |
| + ret |
| +.size _armv8_sha1_probe,.-_armv8_sha1_probe |
| + |
| +.global _armv8_sha256_probe |
| +.type _armv8_sha256_probe,%function |
| +_armv8_sha256_probe: |
| + sha256su0 v0.4s, v0.4s |
| + ret |
| +.size _armv8_sha256_probe,.-_armv8_sha256_probe |
| +.global _armv8_pmull_probe |
| +.type _armv8_pmull_probe,%function |
| +_armv8_pmull_probe: |
| + pmull v0.1q, v0.1d, v0.1d |
| + ret |
| +.size _armv8_pmull_probe,.-_armv8_pmull_probe |
| diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h |
| index 5a83107..6fa8724 100644 |
| --- a/crypto/arm_arch.h |
| +++ b/crypto/arm_arch.h |
| @@ -10,13 +10,24 @@ |
| # define __ARMEL__ |
| # endif |
| # elif defined(__GNUC__) |
| +# if defined(__aarch64__) |
| +# define __ARM_ARCH__ 8 |
| +# if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__ |
| +# define __ARMEB__ |
| +# else |
| +# define __ARMEL__ |
| +# endif |
| /* |
| * Why doesn't gcc define __ARM_ARCH__? Instead it defines |
| * bunch of below macros. See all_architectires[] table in |
| * gcc/config/arm/arm.c. On a side note it defines |
| * __ARMEL__/__ARMEB__ for little-/big-endian. |
| */ |
| -# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ |
| +# elif defined(__ARM_ARCH) |
| +# define __ARM_ARCH__ __ARM_ARCH |
| +# elif defined(__ARM_ARCH_8A__) |
| +# define __ARM_ARCH__ 8 |
| +# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ |
| defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ |
| defined(__ARM_ARCH_7EM__) |
| # define __ARM_ARCH__ 7 |
| @@ -43,9 +54,13 @@ |
| |
| #if !__ASSEMBLER__ |
| extern unsigned int OPENSSL_armcap_P; |
| +#endif |
| |
| #define ARMV7_NEON (1<<0) |
| #define ARMV7_TICK (1<<1) |
| -#endif |
| +#define ARMV8_AES (1<<2) |
| +#define ARMV8_SHA1 (1<<3) |
| +#define ARMV8_SHA256 (1<<4) |
| +#define ARMV8_PMULL (1<<5) |
| |
| #endif |
| diff --git a/crypto/armcap.c b/crypto/armcap.c |
| index 9abaf39..7e46d07 100644 |
| --- a/crypto/armcap.c |
| +++ b/crypto/armcap.c |
| @@ -19,9 +19,13 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } |
| * ARM compilers support inline assembler... |
| */ |
| void _armv7_neon_probe(void); |
| -unsigned int _armv7_tick(void); |
| +void _armv8_aes_probe(void); |
| +void _armv8_sha1_probe(void); |
| +void _armv8_sha256_probe(void); |
| +void _armv8_pmull_probe(void); |
| +unsigned long _armv7_tick(void); |
| |
| -unsigned int OPENSSL_rdtsc(void) |
| +unsigned long OPENSSL_rdtsc(void) |
| { |
| if (OPENSSL_armcap_P & ARMV7_TICK) |
| return _armv7_tick(); |
| @@ -29,9 +33,41 @@ unsigned int OPENSSL_rdtsc(void) |
| return 0; |
| } |
| |
| +/* |
| + * Use a weak reference to getauxval() so we can use it if it is available but |
| + * don't break the build if it is not. If libc does not provide it, the weak |
| + * symbol resolves to NULL and we fall back to the SIGILL probes below. |
| + */ |
| #if defined(__GNUC__) && __GNUC__>=2 |
| void OPENSSL_cpuid_setup(void) __attribute__((constructor)); |
| +extern unsigned long getauxval(unsigned long type) __attribute__((weak)); |
| +#else |
| +static unsigned long (*getauxval)(unsigned long) = NULL; |
| #endif |
| + |
| +/* |
| + * ARM puts the feature bits for Crypto Extensions in AT_HWCAP2, whereas |
| + * AArch64 uses AT_HWCAP. |
| + */ |
| +#if defined(__arm__) || defined (__arm) |
| +# define HWCAP 16 /* AT_HWCAP */ |
| +# define HWCAP_NEON (1 << 12) |
| + |
| +# define HWCAP_CE 26 /* AT_HWCAP2 */ |
| +# define HWCAP_CE_AES (1 << 0) |
| +# define HWCAP_CE_PMULL (1 << 1) |
| +# define HWCAP_CE_SHA1 (1 << 2) |
| +# define HWCAP_CE_SHA256 (1 << 3) |
| +#elif defined(__aarch64__) |
| +# define HWCAP 16 /* AT_HWCAP */ |
| +# define HWCAP_NEON (1 << 1) |
| + |
| +# define HWCAP_CE HWCAP |
| +# define HWCAP_CE_AES (1 << 3) |
| +# define HWCAP_CE_PMULL (1 << 4) |
| +# define HWCAP_CE_SHA1 (1 << 5) |
| +# define HWCAP_CE_SHA256 (1 << 6) |
| +#endif |
| + |
| void OPENSSL_cpuid_setup(void) |
| { |
| char *e; |
| @@ -44,7 +80,7 @@ void OPENSSL_cpuid_setup(void) |
| |
| if ((e=getenv("OPENSSL_armcap"))) |
| { |
| - OPENSSL_armcap_P=strtoul(e,NULL,0); |
| + OPENSSL_armcap_P=(unsigned int)strtoul(e,NULL,0); |
| return; |
| } |
| |
| @@ -64,10 +100,51 @@ void OPENSSL_cpuid_setup(void) |
| sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); |
| sigaction(SIGILL,&ill_act,&ill_oact); |
| |
| - if (sigsetjmp(ill_jmp,1) == 0) |
| + if (getauxval != NULL) |
| + { |
| + if (getauxval(HWCAP) & HWCAP_NEON) |
| + { |
| + unsigned long hwcap = getauxval(HWCAP_CE); |
| + |
| + OPENSSL_armcap_P |= ARMV7_NEON; |
| + |
| + if (hwcap & HWCAP_CE_AES) |
| + OPENSSL_armcap_P |= ARMV8_AES; |
| + |
| + if (hwcap & HWCAP_CE_PMULL) |
| + OPENSSL_armcap_P |= ARMV8_PMULL; |
| + |
| + if (hwcap & HWCAP_CE_SHA1) |
| + OPENSSL_armcap_P |= ARMV8_SHA1; |
| + |
| + if (hwcap & HWCAP_CE_SHA256) |
| + OPENSSL_armcap_P |= ARMV8_SHA256; |
| + } |
| + } |
| + else if (sigsetjmp(ill_jmp,1) == 0) |
| { |
| _armv7_neon_probe(); |
| OPENSSL_armcap_P |= ARMV7_NEON; |
| + if (sigsetjmp(ill_jmp,1) == 0) |
| + { |
| + _armv8_pmull_probe(); |
| + OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES; |
| + } |
| + else if (sigsetjmp(ill_jmp,1) == 0) |
| + { |
| + _armv8_aes_probe(); |
| + OPENSSL_armcap_P |= ARMV8_AES; |
| + } |
| + if (sigsetjmp(ill_jmp,1) == 0) |
| + { |
| + _armv8_sha1_probe(); |
| + OPENSSL_armcap_P |= ARMV8_SHA1; |
| + } |
| + if (sigsetjmp(ill_jmp,1) == 0) |
| + { |
| + _armv8_sha256_probe(); |
| + OPENSSL_armcap_P |= ARMV8_SHA256; |
| + } |
| } |
| if (sigsetjmp(ill_jmp,1) == 0) |
| { |
| diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S |
| index 2d618de..add11d4 100644 |
| --- a/crypto/armv4cpuid.S |
| +++ b/crypto/armv4cpuid.S |
| @@ -7,17 +7,49 @@ |
| .global _armv7_neon_probe |
| .type _armv7_neon_probe,%function |
| _armv7_neon_probe: |
| - .word 0xf26ee1fe @ vorr q15,q15,q15 |
| - .word 0xe12fff1e @ bx lr |
| + .byte 0xf0,0x01,0x60,0xf2 @ vorr q8,q8,q8 |
| + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr |
| .size _armv7_neon_probe,.-_armv7_neon_probe |
| |
| .global _armv7_tick |
| .type _armv7_tick,%function |
| _armv7_tick: |
| - mrc p15,0,r0,c9,c13,0 |
| - .word 0xe12fff1e @ bx lr |
| + mrrc p15,1,r0,r1,c14 @ CNTVCT |
| +#if __ARM_ARCH__>=5 |
| + bx lr |
| +#else |
| + .word 0xe12fff1e @ bx lr |
| +#endif |
| .size _armv7_tick,.-_armv7_tick |
| |
| +.global _armv8_aes_probe |
| +.type _armv8_aes_probe,%function |
| +_armv8_aes_probe: |
| + .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0 |
| + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr |
| +.size _armv8_aes_probe,.-_armv8_aes_probe |
| + |
| +.global _armv8_sha1_probe |
| +.type _armv8_sha1_probe,%function |
| +_armv8_sha1_probe: |
| + .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0 |
| + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr |
| +.size _armv8_sha1_probe,.-_armv8_sha1_probe |
| + |
| +.global _armv8_sha256_probe |
| +.type _armv8_sha256_probe,%function |
| +_armv8_sha256_probe: |
| + .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0 |
| + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr |
| +.size _armv8_sha256_probe,.-_armv8_sha256_probe |
| +.global _armv8_pmull_probe |
| +.type _armv8_pmull_probe,%function |
| +_armv8_pmull_probe: |
| + .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0 |
| + .byte 0x1e,0xff,0x2f,0xe1 @ bx lr |
| +.size _armv8_pmull_probe,.-_armv8_pmull_probe |
| + |
| +.align 5 |
| .global OPENSSL_atomic_add |
| .type OPENSSL_atomic_add,%function |
| OPENSSL_atomic_add: |
| @@ -28,7 +60,7 @@ OPENSSL_atomic_add: |
| cmp r2,#0 |
| bne .Ladd |
| mov r0,r3 |
| - .word 0xe12fff1e @ bx lr |
| + bx lr |
| #else |
| stmdb sp!,{r4-r6,lr} |
| ldr r2,.Lspinlock |
| @@ -81,9 +113,13 @@ OPENSSL_cleanse: |
| adds r1,r1,#4 |
| bne .Little |
| .Lcleanse_done: |
| +#if __ARM_ARCH__>=5 |
| + bx lr |
| +#else |
| tst lr,#1 |
| moveq pc,lr |
| .word 0xe12fff1e @ bx lr |
| +#endif |
| .size OPENSSL_cleanse,.-OPENSSL_cleanse |
| |
| .global OPENSSL_wipe_cpu |
| @@ -97,41 +133,53 @@ OPENSSL_wipe_cpu: |
| eor ip,ip,ip |
| tst r0,#1 |
| beq .Lwipe_done |
| - .word 0xf3000150 @ veor q0, q0, q0 |
| - .word 0xf3022152 @ veor q1, q1, q1 |
| - .word 0xf3044154 @ veor q2, q2, q2 |
| - .word 0xf3066156 @ veor q3, q3, q3 |
| - .word 0xf34001f0 @ veor q8, q8, q8 |
| - .word 0xf34221f2 @ veor q9, q9, q9 |
| - .word 0xf34441f4 @ veor q10, q10, q10 |
| - .word 0xf34661f6 @ veor q11, q11, q11 |
| - .word 0xf34881f8 @ veor q12, q12, q12 |
| - .word 0xf34aa1fa @ veor q13, q13, q13 |
| - .word 0xf34cc1fc @ veor q14, q14, q14 |
| - .word 0xf34ee1fe @ veor q15, q15, q15 |
| + .byte 0x50,0x01,0x00,0xf3 @ veor q0, q0, q0 |
| + .byte 0x52,0x21,0x02,0xf3 @ veor q1, q1, q1 |
| + .byte 0x54,0x41,0x04,0xf3 @ veor q2, q2, q2 |
| + .byte 0x56,0x61,0x06,0xf3 @ veor q3, q3, q3 |
| + .byte 0xf0,0x01,0x40,0xf3 @ veor q8, q8, q8 |
| + .byte 0xf2,0x21,0x42,0xf3 @ veor q9, q9, q9 |
| + .byte 0xf4,0x41,0x44,0xf3 @ veor q10, q10, q10 |
| + .byte 0xf6,0x61,0x46,0xf3 @ veor q11, q11, q11 |
| + .byte 0xf8,0x81,0x48,0xf3 @ veor q12, q12, q12 |
| + .byte 0xfa,0xa1,0x4a,0xf3 @ veor q13, q13, q13 |
| + .byte 0xfc,0xc1,0x4c,0xf3 @ veor q14, q14, q14 |
| +	.byte	0xfe,0xe1,0x4e,0xf3	@ veor	q15, q15, q15 |
| .Lwipe_done: |
| mov r0,sp |
| +#if __ARM_ARCH__>=5 |
| + bx lr |
| +#else |
| tst lr,#1 |
| moveq pc,lr |
| .word 0xe12fff1e @ bx lr |
| +#endif |
| .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu |
| |
| .global OPENSSL_instrument_bus |
| .type OPENSSL_instrument_bus,%function |
| OPENSSL_instrument_bus: |
| eor r0,r0,r0 |
| +#if __ARM_ARCH__>=5 |
| + bx lr |
| +#else |
| tst lr,#1 |
| moveq pc,lr |
| .word 0xe12fff1e @ bx lr |
| +#endif |
| .size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus |
| |
| .global OPENSSL_instrument_bus2 |
| .type OPENSSL_instrument_bus2,%function |
| OPENSSL_instrument_bus2: |
| eor r0,r0,r0 |
| +#if __ARM_ARCH__>=5 |
| + bx lr |
| +#else |
| tst lr,#1 |
| moveq pc,lr |
| .word 0xe12fff1e @ bx lr |
| +#endif |
| .size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 |
| |
| .align 5 |
| diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile |
| index 6dd136b..effc409 100644 |
| --- a/crypto/bn/Makefile |
| +++ b/crypto/bn/Makefile |
| @@ -130,9 +130,10 @@ alpha-mont.s: asm/alpha-mont.pl |
| $(CC) -E $$preproc > $@ && rm $$preproc) |
| |
| # GNU make "catch all" |
| -%-mont.s: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@ |
| +%-mont.S: asm/%-mont.pl; $(PERL) $< $(PERLASM_SCHEME) $@ |
| %-gf2m.S: asm/%-gf2m.pl; $(PERL) $< $(PERLASM_SCHEME) $@ |
| |
| +armv4-mont.o: armv4-mont.S |
| armv4-gf2m.o: armv4-gf2m.S |
| |
| files: |
| diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl |
| index c52e0b7..b781afb 100644 |
| --- a/crypto/bn/asm/armv4-gf2m.pl |
| +++ b/crypto/bn/asm/armv4-gf2m.pl |
| @@ -20,14 +20,21 @@ |
| # length, more for longer keys. Even though NEON 1x1 multiplication |
| # runs in even less cycles, ~30, improvement is measurable only on |
| # longer keys. One has to optimize code elsewhere to get NEON glow... |
| +# |
| +# April 2014 |
| +# |
| +# Double bn_GF2m_mul_2x2 performance by using the algorithm from the paper |
| +# referred to below, which improves ECDH and ECDSA verify benchmarks |
| +# by 18-40%. |
| +# |
| +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software |
| +# Polynomial Multiplication on ARM Processors using the NEON Engine. |
| +# |
| +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf |
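| +# |
| +# In rough outline (an explanatory sketch, not verbatim code): a1:a0 and |
| +# b1:b0 are each packed into a single 64-bit NEON register and the whole |
| +# 64x64->128-bit carry-less product is assembled from eight vmull.p8 |
| +# (8x8-bit polynomial) multiplications of byte-rotated copies of the |
| +# operands, aligned with vext and masked with the k48/k32/k16 constants |
| +# seen in the code below. |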
| |
| while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| open STDOUT,">$output"; |
| |
| -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } |
| -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } |
| -sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } |
| - |
| $code=<<___; |
| #include "arm_arch.h" |
| |
| @@ -36,31 +43,6 @@ $code=<<___; |
| |
| #if __ARM_ARCH__>=7 |
| .fpu neon |
| - |
| -.type mul_1x1_neon,%function |
| -.align 5 |
| -mul_1x1_neon: |
| - vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a |
| - vmull.p8 `&Q("d0")`,d16,d17 @ a·bb |
| - vshl.u64 `&Dlo("q2")`,d16,#16 |
| - vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb |
| - vshl.u64 `&Dlo("q3")`,d16,#24 |
| - vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb |
| - vshr.u64 `&Dlo("q1")`,#8 |
| - vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb |
| - vshl.u64 `&Dhi("q1")`,#24 |
| - veor d0,`&Dlo("q1")` |
| - vshr.u64 `&Dlo("q2")`,#16 |
| - veor d0,`&Dhi("q1")` |
| - vshl.u64 `&Dhi("q2")`,#16 |
| - veor d0,`&Dlo("q2")` |
| - vshr.u64 `&Dlo("q3")`,#24 |
| - veor d0,`&Dhi("q2")` |
| - vshl.u64 `&Dhi("q3")`,#8 |
| - veor d0,`&Dlo("q3")` |
| - veor d0,`&Dhi("q3")` |
| - bx lr |
| -.size mul_1x1_neon,.-mul_1x1_neon |
| #endif |
| ___ |
| ################ |
| @@ -159,8 +141,9 @@ ___ |
| # void bn_GF2m_mul_2x2(BN_ULONG *r, |
| # BN_ULONG a1,BN_ULONG a0, |
| # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 |
| - |
| -($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); |
| +{ |
| +my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12)); |
| +my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31)); |
| |
| $code.=<<___; |
| .global bn_GF2m_mul_2x2 |
| @@ -173,44 +156,58 @@ bn_GF2m_mul_2x2: |
| tst r12,#1 |
| beq .Lialu |
| |
| - veor $A1,$A1 |
| - vmov.32 $B1,r3,r3 @ two copies of b1 |
| - vmov.32 ${A1}[0],r1 @ a1 |
| - |
| - veor $A0,$A0 |
| - vld1.32 ${B0}[],[sp,:32] @ two copies of b0 |
| - vmov.32 ${A0}[0],r2 @ a0 |
| - mov r12,lr |
| - |
| - vmov d16,$A1 |
| - vmov d17,$B1 |
| - bl mul_1x1_neon @ a1·b1 |
| - vmov $A1B1,d0 |
| - |
| - vmov d16,$A0 |
| - vmov d17,$B0 |
| - bl mul_1x1_neon @ a0·b0 |
| - vmov $A0B0,d0 |
| - |
| - veor d16,$A0,$A1 |
| - veor d17,$B0,$B1 |
| - veor $A0,$A0B0,$A1B1 |
| - bl mul_1x1_neon @ (a0+a1)·(b0+b1) |
| - |
| - veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 |
| - vshl.u64 d1,d0,#32 |
| - vshr.u64 d0,d0,#32 |
| - veor $A0B0,d1 |
| - veor $A1B1,d0 |
| - vst1.32 {${A0B0}[0]},[r0,:32]! |
| - vst1.32 {${A0B0}[1]},[r0,:32]! |
| - vst1.32 {${A1B1}[0]},[r0,:32]! |
| - vst1.32 {${A1B1}[1]},[r0,:32] |
| - bx r12 |
| + ldr r12, [sp] @ 5th argument |
| + vmov.32 $a, r2, r1 |
| + vmov.32 $b, r12, r3 |
| + vmov.i64 $k48, #0x0000ffffffffffff |
| + vmov.i64 $k32, #0x00000000ffffffff |
| + vmov.i64 $k16, #0x000000000000ffff |
| + |
| + vext.8 $t0#lo, $a, $a, #1 @ A1 |
| + vmull.p8 $t0, $t0#lo, $b @ F = A1*B |
| + vext.8 $r#lo, $b, $b, #1 @ B1 |
| + vmull.p8 $r, $a, $r#lo @ E = A*B1 |
| + vext.8 $t1#lo, $a, $a, #2 @ A2 |
| + vmull.p8 $t1, $t1#lo, $b @ H = A2*B |
| + vext.8 $t3#lo, $b, $b, #2 @ B2 |
| + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 |
| + vext.8 $t2#lo, $a, $a, #3 @ A3 |
| + veor $t0, $t0, $r @ L = E + F |
| + vmull.p8 $t2, $t2#lo, $b @ J = A3*B |
| + vext.8 $r#lo, $b, $b, #3 @ B3 |
| + veor $t1, $t1, $t3 @ M = G + H |
| + vmull.p8 $r, $a, $r#lo @ I = A*B3 |
| + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 |
| + vand $t0#hi, $t0#hi, $k48 |
| + vext.8 $t3#lo, $b, $b, #4 @ B4 |
| + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 |
| + vand $t1#hi, $t1#hi, $k32 |
| + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 |
| + veor $t2, $t2, $r @ N = I + J |
| + veor $t0#lo, $t0#lo, $t0#hi |
| + veor $t1#lo, $t1#lo, $t1#hi |
| + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 |
| + vand $t2#hi, $t2#hi, $k16 |
| + vext.8 $t0, $t0, $t0, #15 |
| + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 |
| + vmov.i64 $t3#hi, #0 |
| + vext.8 $t1, $t1, $t1, #14 |
| + veor $t2#lo, $t2#lo, $t2#hi |
| + vmull.p8 $r, $a, $b @ D = A*B |
| + vext.8 $t3, $t3, $t3, #12 |
| + vext.8 $t2, $t2, $t2, #13 |
| + veor $t0, $t0, $t1 |
| + veor $t2, $t2, $t3 |
| + veor $r, $r, $t0 |
| + veor $r, $r, $t2 |
| + |
| + vst1.32 {$r}, [r0] |
| + ret @ bx lr |
| .align 4 |
| .Lialu: |
| #endif |
| ___ |
| +} |
| $ret="r10"; # reassigned 1st argument |
| $code.=<<___; |
| stmdb sp!,{r4-r10,lr} |
| @@ -272,7 +269,13 @@ $code.=<<___; |
| .comm OPENSSL_armcap_P,4,4 |
| ___ |
| |
| -$code =~ s/\`([^\`]*)\`/eval $1/gem; |
| -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
| -print $code; |
| +foreach (split("\n",$code)) { |
| + s/\`([^\`]*)\`/eval $1/geo; |
| + |
| + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or |
| + s/\bret\b/bx lr/go or |
| + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 |
| + |
| + print $_,"\n"; |
| +} |
| close STDOUT; # enforce flush |
| diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl |
| index f78a8b5..72bad8e 100644 |
| --- a/crypto/bn/asm/armv4-mont.pl |
| +++ b/crypto/bn/asm/armv4-mont.pl |
| @@ -1,7 +1,7 @@ |
| #!/usr/bin/env perl |
| |
| # ==================================================================== |
| -# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| @@ -23,6 +23,21 @@ |
| # than 1/2KB. Windows CE port would be trivial, as it's exclusively |
| # about decorations, ABI and instruction syntax are identical. |
| |
| +# November 2013 |
| +# |
| +# Add NEON code path, which handles lengths divisible by 8. RSA/DSA |
| +# performance improvement on Cortex-A8 is ~45-100% depending on key |
| +# length, more for longer keys. On Cortex-A15 the span is ~10-105%. |
| +# On Snapdragon S4 improvement was measured to vary from ~70% to |
| +# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is |
| +# rather because the original integer-only code seems to perform |
| +# suboptimally on S4. The situation on Cortex-A9 is unfortunately |
| +# different. It's being looked into, but the trouble is that |
| +# performance for vectors longer than 256 bits is actually a couple |
| +# of percent worse than for integer-only code. The code is chosen |
| +# for execution on all NEON-capable processors, because the gain on |
| +# others outweighs the marginal loss on Cortex-A9. |
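| +# |
| +# (Dispatch sketch:) bn_mul_mont below takes the NEON path only when the |
| +# vector length is a multiple of 8 words and the ARMV7_NEON bit is set in |
| +# OPENSSL_armcap_P; everything else falls through to the integer-only code. |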
| + |
| while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| open STDOUT,">$output"; |
| |
| @@ -52,16 +67,40 @@ $_n0="$num,#14*4"; |
| $_num="$num,#15*4"; $_bpend=$_num; |
| |
| $code=<<___; |
| +#include "arm_arch.h" |
| + |
| .text |
| +.code 32 |
| + |
| +#if __ARM_ARCH__>=7 |
| +.align 5 |
| +.LOPENSSL_armcap: |
| +.word OPENSSL_armcap_P-bn_mul_mont |
| +#endif |
| |
| .global bn_mul_mont |
| .type bn_mul_mont,%function |
| |
| -.align 2 |
| +.align 5 |
| bn_mul_mont: |
| + ldr ip,[sp,#4] @ load num |
| stmdb sp!,{r0,r2} @ sp points at argument block |
| - ldr $num,[sp,#3*4] @ load num |
| - cmp $num,#2 |
| +#if __ARM_ARCH__>=7 |
| + tst ip,#7 |
| + bne .Lialu |
| + adr r0,bn_mul_mont |
| + ldr r2,.LOPENSSL_armcap |
| + ldr r0,[r0,r2] |
| + tst r0,#1 @ NEON available? |
| + ldmia sp, {r0,r2} |
| + beq .Lialu |
| + add sp,sp,#8 |
| + b bn_mul8x_mont_neon |
| +.align 4 |
| +.Lialu: |
| +#endif |
| + cmp ip,#2 |
| + mov $num,ip @ load num |
| movlt r0,#0 |
| addlt sp,sp,#2*4 |
| blt .Labrt |
| @@ -191,14 +230,446 @@ bn_mul_mont: |
| ldmia sp!,{r4-r12,lr} @ restore registers |
| add sp,sp,#2*4 @ skip over {r0,r2} |
| mov r0,#1 |
| -.Labrt: tst lr,#1 |
| +.Labrt: |
| +#if __ARM_ARCH__>=5 |
| + ret @ bx lr |
| +#else |
| + tst lr,#1 |
| moveq pc,lr @ be binary compatible with V4, yet |
| bx lr @ interoperable with Thumb ISA:-) |
| +#endif |
| .size bn_mul_mont,.-bn_mul_mont |
| -.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
| +___ |
| +{ |
| +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } |
| +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } |
| + |
| +my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); |
| +my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); |
| +my ($Z,$Temp)=("q4","q5"); |
| +my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); |
| +my ($Bi,$Ni,$M0)=map("d$_",(28..31)); |
| +my $zero=&Dlo($Z); |
| +my $temp=&Dlo($Temp); |
| + |
| +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); |
| +my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); |
| + |
| +$code.=<<___; |
| +#if __ARM_ARCH__>=7 |
| +.fpu neon |
| + |
| +.type bn_mul8x_mont_neon,%function |
| +.align 5 |
| +bn_mul8x_mont_neon: |
| + mov ip,sp |
| + stmdb sp!,{r4-r11} |
| + vstmdb sp!,{d8-d15} @ ABI specification says so |
| + ldmia ip,{r4-r5} @ load rest of parameter block |
| + |
| + sub $toutptr,sp,#16 |
| + vld1.32 {${Bi}[0]}, [$bptr,:32]! |
| + sub $toutptr,$toutptr,$num,lsl#4 |
| + vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( |
| + and $toutptr,$toutptr,#-64 |
| + vld1.32 {${M0}[0]}, [$n0,:32] |
| + mov sp,$toutptr @ alloca |
| + veor $zero,$zero,$zero |
| + subs $inner,$num,#8 |
| + vzip.16 $Bi,$zero |
| + |
| + vmull.u32 $A0xB,$Bi,${A0}[0] |
| + vmull.u32 $A1xB,$Bi,${A0}[1] |
| + vmull.u32 $A2xB,$Bi,${A1}[0] |
| + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 |
| + vmull.u32 $A3xB,$Bi,${A1}[1] |
| + |
| + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` |
| + veor $zero,$zero,$zero |
| + vmul.u32 $Ni,$temp,$M0 |
| + |
| + vmull.u32 $A4xB,$Bi,${A2}[0] |
| + vld1.32 {$N0-$N3}, [$nptr]! |
| + vmull.u32 $A5xB,$Bi,${A2}[1] |
| + vmull.u32 $A6xB,$Bi,${A3}[0] |
| + vzip.16 $Ni,$zero |
| + vmull.u32 $A7xB,$Bi,${A3}[1] |
| + |
| + bne .LNEON_1st |
| + |
| + @ special case for num=8, everything is in register bank... |
| + |
| + vmlal.u32 $A0xB,$Ni,${N0}[0] |
| + sub $outer,$num,#1 |
| + vmlal.u32 $A1xB,$Ni,${N0}[1] |
| + vmlal.u32 $A2xB,$Ni,${N1}[0] |
| + vmlal.u32 $A3xB,$Ni,${N1}[1] |
| + |
| + vmlal.u32 $A4xB,$Ni,${N2}[0] |
| + vmov $Temp,$A0xB |
| + vmlal.u32 $A5xB,$Ni,${N2}[1] |
| + vmov $A0xB,$A1xB |
| + vmlal.u32 $A6xB,$Ni,${N3}[0] |
| + vmov $A1xB,$A2xB |
| + vmlal.u32 $A7xB,$Ni,${N3}[1] |
| + vmov $A2xB,$A3xB |
| + vmov $A3xB,$A4xB |
| + vshr.u64 $temp,$temp,#16 |
| + vmov $A4xB,$A5xB |
| + vmov $A5xB,$A6xB |
| + vadd.u64 $temp,$temp,`&Dhi("$Temp")` |
| + vmov $A6xB,$A7xB |
| + veor $A7xB,$A7xB |
| + vshr.u64 $temp,$temp,#16 |
| + |
| + b .LNEON_outer8 |
| + |
| +.align 4 |
| +.LNEON_outer8: |
| + vld1.32 {${Bi}[0]}, [$bptr,:32]! |
| + veor $zero,$zero,$zero |
| + vzip.16 $Bi,$zero |
| + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp |
| + |
| + vmlal.u32 $A0xB,$Bi,${A0}[0] |
| + vmlal.u32 $A1xB,$Bi,${A0}[1] |
| + vmlal.u32 $A2xB,$Bi,${A1}[0] |
| + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 |
| + vmlal.u32 $A3xB,$Bi,${A1}[1] |
| + |
| + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` |
| + veor $zero,$zero,$zero |
| + subs $outer,$outer,#1 |
| + vmul.u32 $Ni,$temp,$M0 |
| + |
| + vmlal.u32 $A4xB,$Bi,${A2}[0] |
| + vmlal.u32 $A5xB,$Bi,${A2}[1] |
| + vmlal.u32 $A6xB,$Bi,${A3}[0] |
| + vzip.16 $Ni,$zero |
| + vmlal.u32 $A7xB,$Bi,${A3}[1] |
| + |
| + vmlal.u32 $A0xB,$Ni,${N0}[0] |
| + vmlal.u32 $A1xB,$Ni,${N0}[1] |
| + vmlal.u32 $A2xB,$Ni,${N1}[0] |
| + vmlal.u32 $A3xB,$Ni,${N1}[1] |
| + |
| + vmlal.u32 $A4xB,$Ni,${N2}[0] |
| + vmov $Temp,$A0xB |
| + vmlal.u32 $A5xB,$Ni,${N2}[1] |
| + vmov $A0xB,$A1xB |
| + vmlal.u32 $A6xB,$Ni,${N3}[0] |
| + vmov $A1xB,$A2xB |
| + vmlal.u32 $A7xB,$Ni,${N3}[1] |
| + vmov $A2xB,$A3xB |
| + vmov $A3xB,$A4xB |
| + vshr.u64 $temp,$temp,#16 |
| + vmov $A4xB,$A5xB |
| + vmov $A5xB,$A6xB |
| + vadd.u64 $temp,$temp,`&Dhi("$Temp")` |
| + vmov $A6xB,$A7xB |
| + veor $A7xB,$A7xB |
| + vshr.u64 $temp,$temp,#16 |
| + |
| + bne .LNEON_outer8 |
| + |
| + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp |
| + mov $toutptr,sp |
| + vshr.u64 $temp,`&Dlo("$A0xB")`,#16 |
| + mov $inner,$num |
| + vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp |
| + add $tinptr,sp,#16 |
| + vshr.u64 $temp,`&Dhi("$A0xB")`,#16 |
| + vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` |
| + |
| + b .LNEON_tail2 |
| + |
| +.align 4 |
| +.LNEON_1st: |
| + vmlal.u32 $A0xB,$Ni,${N0}[0] |
| + vld1.32 {$A0-$A3}, [$aptr]! |
| + vmlal.u32 $A1xB,$Ni,${N0}[1] |
| + subs $inner,$inner,#8 |
| + vmlal.u32 $A2xB,$Ni,${N1}[0] |
| + vmlal.u32 $A3xB,$Ni,${N1}[1] |
| + |
| + vmlal.u32 $A4xB,$Ni,${N2}[0] |
| + vld1.32 {$N0-$N1}, [$nptr]! |
| + vmlal.u32 $A5xB,$Ni,${N2}[1] |
| + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! |
| + vmlal.u32 $A6xB,$Ni,${N3}[0] |
| + vmlal.u32 $A7xB,$Ni,${N3}[1] |
| + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! |
| + |
| + vmull.u32 $A0xB,$Bi,${A0}[0] |
| + vld1.32 {$N2-$N3}, [$nptr]! |
| + vmull.u32 $A1xB,$Bi,${A0}[1] |
| + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! |
| + vmull.u32 $A2xB,$Bi,${A1}[0] |
| + vmull.u32 $A3xB,$Bi,${A1}[1] |
| + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! |
| + |
| + vmull.u32 $A4xB,$Bi,${A2}[0] |
| + vmull.u32 $A5xB,$Bi,${A2}[1] |
| + vmull.u32 $A6xB,$Bi,${A3}[0] |
| + vmull.u32 $A7xB,$Bi,${A3}[1] |
| + |
| + bne .LNEON_1st |
| + |
| + vmlal.u32 $A0xB,$Ni,${N0}[0] |
| + add $tinptr,sp,#16 |
| + vmlal.u32 $A1xB,$Ni,${N0}[1] |
| + sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr |
| + vmlal.u32 $A2xB,$Ni,${N1}[0] |
| + vld1.64 {$Temp}, [sp,:128] |
| + vmlal.u32 $A3xB,$Ni,${N1}[1] |
| + sub $outer,$num,#1 |
| + |
| + vmlal.u32 $A4xB,$Ni,${N2}[0] |
| + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! |
| + vmlal.u32 $A5xB,$Ni,${N2}[1] |
| + vshr.u64 $temp,$temp,#16 |
| + vld1.64 {$A0xB}, [$tinptr, :128]! |
| + vmlal.u32 $A6xB,$Ni,${N3}[0] |
| + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! |
| + vmlal.u32 $A7xB,$Ni,${N3}[1] |
| + |
| + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! |
| + vadd.u64 $temp,$temp,`&Dhi("$Temp")` |
| + veor $Z,$Z,$Z |
| + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! |
| + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! |
| + vst1.64 {$Z}, [$toutptr,:128] |
| + vshr.u64 $temp,$temp,#16 |
| + |
| + b .LNEON_outer |
| + |
| +.align 4 |
| +.LNEON_outer: |
| + vld1.32 {${Bi}[0]}, [$bptr,:32]! |
| + sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr |
| + vld1.32 {$A0-$A3}, [$aptr]! |
| + veor $zero,$zero,$zero |
| + mov $toutptr,sp |
| + vzip.16 $Bi,$zero |
| + sub $inner,$num,#8 |
| + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp |
| + |
| + vmlal.u32 $A0xB,$Bi,${A0}[0] |
| + vld1.64 {$A3xB-$A4xB},[$tinptr,:256]! |
| + vmlal.u32 $A1xB,$Bi,${A0}[1] |
| + vmlal.u32 $A2xB,$Bi,${A1}[0] |
| + vld1.64 {$A5xB-$A6xB},[$tinptr,:256]! |
| + vmlal.u32 $A3xB,$Bi,${A1}[1] |
| + |
| + vshl.i64 $temp,`&Dhi("$A0xB")`,#16 |
| + veor $zero,$zero,$zero |
| + vadd.u64 $temp,$temp,`&Dlo("$A0xB")` |
| + vld1.64 {$A7xB},[$tinptr,:128]! |
| + vmul.u32 $Ni,$temp,$M0 |
| + |
| + vmlal.u32 $A4xB,$Bi,${A2}[0] |
| + vld1.32 {$N0-$N3}, [$nptr]! |
| + vmlal.u32 $A5xB,$Bi,${A2}[1] |
| + vmlal.u32 $A6xB,$Bi,${A3}[0] |
| + vzip.16 $Ni,$zero |
| + vmlal.u32 $A7xB,$Bi,${A3}[1] |
| + |
| +.LNEON_inner: |
| + vmlal.u32 $A0xB,$Ni,${N0}[0] |
| + vld1.32 {$A0-$A3}, [$aptr]! |
| + vmlal.u32 $A1xB,$Ni,${N0}[1] |
| + subs $inner,$inner,#8 |
| + vmlal.u32 $A2xB,$Ni,${N1}[0] |
| + vmlal.u32 $A3xB,$Ni,${N1}[1] |
| + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! |
| + |
| + vmlal.u32 $A4xB,$Ni,${N2}[0] |
| + vld1.64 {$A0xB}, [$tinptr, :128]! |
| + vmlal.u32 $A5xB,$Ni,${N2}[1] |
| + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! |
| + vmlal.u32 $A6xB,$Ni,${N3}[0] |
| + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! |
| + vmlal.u32 $A7xB,$Ni,${N3}[1] |
| + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! |
| + |
| + vmlal.u32 $A0xB,$Bi,${A0}[0] |
| + vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! |
| + vmlal.u32 $A1xB,$Bi,${A0}[1] |
| + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! |
| + vmlal.u32 $A2xB,$Bi,${A1}[0] |
| + vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! |
| + vmlal.u32 $A3xB,$Bi,${A1}[1] |
| + vld1.32 {$N0-$N3}, [$nptr]! |
| + |
| + vmlal.u32 $A4xB,$Bi,${A2}[0] |
| + vld1.64 {$A7xB}, [$tinptr, :128]! |
| + vmlal.u32 $A5xB,$Bi,${A2}[1] |
| + vmlal.u32 $A6xB,$Bi,${A3}[0] |
| + vmlal.u32 $A7xB,$Bi,${A3}[1] |
| + |
| + bne .LNEON_inner |
| + |
| + vmlal.u32 $A0xB,$Ni,${N0}[0] |
| + add $tinptr,sp,#16 |
| + vmlal.u32 $A1xB,$Ni,${N0}[1] |
| + sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr |
| + vmlal.u32 $A2xB,$Ni,${N1}[0] |
| + vld1.64 {$Temp}, [sp,:128] |
| + vmlal.u32 $A3xB,$Ni,${N1}[1] |
| + subs $outer,$outer,#1 |
| + |
| + vmlal.u32 $A4xB,$Ni,${N2}[0] |
| + vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! |
| + vmlal.u32 $A5xB,$Ni,${N2}[1] |
| + vld1.64 {$A0xB}, [$tinptr, :128]! |
| + vshr.u64 $temp,$temp,#16 |
| + vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! |
| + vmlal.u32 $A6xB,$Ni,${N3}[0] |
| + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! |
| + vmlal.u32 $A7xB,$Ni,${N3}[1] |
| + |
| + vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! |
| + vadd.u64 $temp,$temp,`&Dhi("$Temp")` |
| + vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! |
| + vshr.u64 $temp,$temp,#16 |
| + |
| + bne .LNEON_outer |
| + |
| + mov $toutptr,sp |
| + mov $inner,$num |
| + |
| +.LNEON_tail: |
| + vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp |
| + vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! |
| + vshr.u64 $temp,`&Dlo("$A0xB")`,#16 |
| + vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp |
| + vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! |
| + vshr.u64 $temp,`&Dhi("$A0xB")`,#16 |
| + vld1.64 {$A7xB}, [$tinptr, :128]! |
| + vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` |
| + |
| +.LNEON_tail2: |
| + vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp |
| + vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! |
| + vshr.u64 $temp,`&Dlo("$A1xB")`,#16 |
| + vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp |
| + vshr.u64 $temp,`&Dhi("$A1xB")`,#16 |
| + vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")` |
| + |
| + vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp |
| + vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! |
| + vshr.u64 $temp,`&Dlo("$A2xB")`,#16 |
| + vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp |
| + vshr.u64 $temp,`&Dhi("$A2xB")`,#16 |
| + vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")` |
| + |
| + vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp |
| + vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]! |
| + vshr.u64 $temp,`&Dlo("$A3xB")`,#16 |
| + vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp |
| + vshr.u64 $temp,`&Dhi("$A3xB")`,#16 |
| + vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")` |
| + |
| + vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp |
| + vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]! |
| + vshr.u64 $temp,`&Dlo("$A4xB")`,#16 |
| + vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp |
| + vshr.u64 $temp,`&Dhi("$A4xB")`,#16 |
| + vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")` |
| + |
| + vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp |
| + vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]! |
| + vshr.u64 $temp,`&Dlo("$A5xB")`,#16 |
| + vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp |
| + vshr.u64 $temp,`&Dhi("$A5xB")`,#16 |
| + vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")` |
| + |
| + vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp |
| + vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]! |
| + vshr.u64 $temp,`&Dlo("$A6xB")`,#16 |
| + vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp |
| + vld1.64 {$A0xB}, [$tinptr, :128]! |
| + vshr.u64 $temp,`&Dhi("$A6xB")`,#16 |
| + vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")` |
| + |
| + vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp |
| + vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]! |
| + vshr.u64 $temp,`&Dlo("$A7xB")`,#16 |
| + vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp |
| + vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! |
| + vshr.u64 $temp,`&Dhi("$A7xB")`,#16 |
| + vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")` |
| + subs $inner,$inner,#8 |
| + vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! |
| + |
| + bne .LNEON_tail |
| + |
| + vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit |
| + sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr |
| + subs $aptr,sp,#0 @ clear carry flag |
| + add $bptr,sp,$num,lsl#2 |
| + |
| +.LNEON_sub: |
| + ldmia $aptr!, {r4-r7} |
| + ldmia $nptr!, {r8-r11} |
| + sbcs r8, r4,r8 |
| + sbcs r9, r5,r9 |
| + sbcs r10,r6,r10 |
| + sbcs r11,r7,r11 |
| + teq $aptr,$bptr @ preserves carry |
| + stmia $rptr!, {r8-r11} |
| + bne .LNEON_sub |
| + |
| + ldr r10, [$aptr] @ load top-most bit |
| + veor q0,q0,q0 |
| + sub r11,$bptr,sp @ this is num*4 |
| + veor q1,q1,q1 |
| + mov $aptr,sp |
| + sub $rptr,$rptr,r11 @ rewind $rptr |
| + mov $nptr,$bptr @ second 3/4th of frame |
| + sbcs r10,r10,#0 @ result is carry flag |
| + |
| +.LNEON_copy_n_zap: |
| + ldmia $aptr!, {r4-r7} |
| + ldmia $rptr, {r8-r11} |
| + movcc r8, r4 |
| + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe |
| + movcc r9, r5 |
| + movcc r10,r6 |
| + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe |
| + movcc r11,r7 |
| + ldmia $aptr, {r4-r7} |
| + stmia $rptr!, {r8-r11} |
| + sub $aptr,$aptr,#16 |
| + ldmia $rptr, {r8-r11} |
| + movcc r8, r4 |
| + vst1.64 {q0-q1}, [$aptr,:256]! @ wipe |
| + movcc r9, r5 |
| + movcc r10,r6 |
| + vst1.64 {q0-q1}, [$nptr,:256]! @ wipe |
| + movcc r11,r7 |
| + teq $aptr,$bptr @ preserves carry |
| + stmia $rptr!, {r8-r11} |
| + bne .LNEON_copy_n_zap |
| + |
| + sub sp,ip,#96 |
| + vldmia sp!,{d8-d15} |
| + ldmia sp!,{r4-r11} |
| + ret @ bx lr |
| +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon |
| +#endif |
| +___ |
| +} |
| +$code.=<<___; |
| +.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" |
| .align 2 |
| +#if __ARM_ARCH__>=7 |
| +.comm OPENSSL_armcap_P,4,4 |
| +#endif |
| ___ |
| |
| +$code =~ s/\`([^\`]*)\`/eval $1/gem; |
| $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
| +$code =~ s/\bret\b/bx lr/gm; |
| print $code; |
| close STDOUT; |
| diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c |
| index c7869b6..ad0f7a4 100644 |
| --- a/crypto/evp/e_aes.c |
| +++ b/crypto/evp/e_aes.c |
| @@ -62,7 +62,7 @@ |
| |
| typedef struct |
| { |
| - AES_KEY ks; |
| + union { double align; AES_KEY ks; } ks; |
| block128_f block; |
| union { |
| cbc128_f cbc; |
| @@ -72,7 +72,7 @@ typedef struct |
| |
| typedef struct |
| { |
| - AES_KEY ks; /* AES key schedule to use */ |
| + union { double align; AES_KEY ks; } ks; /* AES key schedule to use */ |
| int key_set; /* Set if key initialised */ |
| int iv_set; /* Set if an iv is set */ |
| GCM128_CONTEXT gcm; |
| @@ -86,7 +86,7 @@ typedef struct |
| |
| typedef struct |
| { |
| - AES_KEY ks1, ks2; /* AES key schedules to use */ |
| + union { double align; AES_KEY ks; } ks1, ks2; /* AES key schedules to use */ |
| XTS128_CONTEXT xts; |
| void (*stream)(const unsigned char *in, |
| unsigned char *out, size_t length, |
| @@ -96,7 +96,7 @@ typedef struct |
| |
| typedef struct |
| { |
| - AES_KEY ks; /* AES key schedule to use */ |
| + union { double align; AES_KEY ks; } ks; /* AES key schedule to use */ |
| int key_set; /* Set if key initialised */ |
| int iv_set; /* Set if an iv is set */ |
| int tag_set; /* Set if tag is valid */ |
| @@ -160,7 +160,7 @@ void AES_xts_decrypt(const char *inp,char *out,size_t len, |
| defined(_M_AMD64) || defined(_M_X64) || \ |
| defined(__INTEL__) ) |
| |
| -extern unsigned int OPENSSL_ia32cap_P[2]; |
| +extern unsigned int OPENSSL_ia32cap_P[]; |
| |
| #ifdef VPAES_ASM |
| #define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32))) |
| @@ -310,7 +310,7 @@ static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| return 1; |
| if (key) |
| { |
| - aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); |
| + aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks); |
| CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, |
| (block128_f)aesni_encrypt); |
| gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks; |
| @@ -355,19 +355,19 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| /* key_len is two AES keys */ |
| if (enc) |
| { |
| - aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); |
| + aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); |
| xctx->xts.block1 = (block128_f)aesni_encrypt; |
| xctx->stream = aesni_xts_encrypt; |
| } |
| else |
| { |
| - aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); |
| + aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); |
| xctx->xts.block1 = (block128_f)aesni_decrypt; |
| xctx->stream = aesni_xts_decrypt; |
| } |
| |
| aesni_set_encrypt_key(key + ctx->key_len/2, |
| - ctx->key_len * 4, &xctx->ks2); |
| + ctx->key_len * 4, &xctx->ks2.ks); |
| xctx->xts.block2 = (block128_f)aesni_encrypt; |
| |
| xctx->xts.key1 = &xctx->ks1; |
| @@ -394,7 +394,7 @@ static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| return 1; |
| if (key) |
| { |
| - aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); |
| + aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks); |
| CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, |
| &cctx->ks, (block128_f)aesni_encrypt); |
| cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks : |
| @@ -484,6 +484,38 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ |
| { return &aes_##keylen##_##mode; } |
| #endif |
| |
| +#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) |
| +#include "arm_arch.h" |
| +#if __ARM_ARCH__>=7 |
| +# if defined(BSAES_ASM) |
| +# define BSAES_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) |
| +# endif |
| +# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) |
| +# define HWAES_set_encrypt_key aes_v8_set_encrypt_key |
| +# define HWAES_set_decrypt_key aes_v8_set_decrypt_key |
| +# define HWAES_encrypt aes_v8_encrypt |
| +# define HWAES_decrypt aes_v8_decrypt |
| +# define HWAES_cbc_encrypt aes_v8_cbc_encrypt |
| +# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks |
| +#endif |
| +#endif |
| + |
| +#if defined(HWAES_CAPABLE) |
| +int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits, |
| + AES_KEY *key); |
| +int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits, |
| + AES_KEY *key); |
| +void HWAES_encrypt(const unsigned char *in, unsigned char *out, |
| + const AES_KEY *key); |
| +void HWAES_decrypt(const unsigned char *in, unsigned char *out, |
| + const AES_KEY *key); |
| +void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out, |
| + size_t length, const AES_KEY *key, |
| + unsigned char *ivec, const int enc); |
| +void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, |
| + size_t len, const AES_KEY *key, const unsigned char ivec[16]); |
| +#endif |
| + |
| #define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \ |
| BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ |
| BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ |
| @@ -502,10 +534,23 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| mode = ctx->cipher->flags & EVP_CIPH_MODE; |
| if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) |
| && !enc) |
| +#ifdef HWAES_CAPABLE |
| + if (HWAES_CAPABLE) |
| + { |
| + ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); |
| + dat->block = (block128_f)HWAES_decrypt; |
| + dat->stream.cbc = NULL; |
| +#ifdef HWAES_cbc_encrypt |
| + if (mode==EVP_CIPH_CBC_MODE) |
| + dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt; |
| +#endif |
| + } |
| + else |
| +#endif |
| #ifdef BSAES_CAPABLE |
| if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE) |
| { |
| - ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); |
| + ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); |
| dat->block = (block128_f)AES_decrypt; |
| dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt; |
| } |
| @@ -514,7 +559,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| #ifdef VPAES_CAPABLE |
| if (VPAES_CAPABLE) |
| { |
| - ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks); |
| + ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); |
| dat->block = (block128_f)vpaes_decrypt; |
| dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? |
| (cbc128_f)vpaes_cbc_encrypt : |
| @@ -523,17 +568,37 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| else |
| #endif |
| { |
| - ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); |
| + ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks.ks); |
| dat->block = (block128_f)AES_decrypt; |
| dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? |
| (cbc128_f)AES_cbc_encrypt : |
| NULL; |
| } |
| else |
| +#ifdef HWAES_CAPABLE |
| + if (HWAES_CAPABLE) |
| + { |
| + ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); |
| + dat->block = (block128_f)HWAES_encrypt; |
| + dat->stream.cbc = NULL; |
| +#ifdef HWAES_cbc_encrypt |
| + if (mode==EVP_CIPH_CBC_MODE) |
| + dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt; |
| + else |
| +#endif |
| +#ifdef HWAES_ctr32_encrypt_blocks |
| + if (mode==EVP_CIPH_CTR_MODE) |
| + dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks; |
| + else |
| +#endif |
| + (void)0; /* terminate potentially open 'else' */ |
| + } |
| + else |
| +#endif |
| #ifdef BSAES_CAPABLE |
| if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE) |
| { |
| - ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); |
| + ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); |
| dat->block = (block128_f)AES_encrypt; |
| dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; |
| } |
| @@ -542,7 +607,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| #ifdef VPAES_CAPABLE |
| if (VPAES_CAPABLE) |
| { |
| - ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks); |
| + ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); |
| dat->block = (block128_f)vpaes_encrypt; |
| dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? |
| (cbc128_f)vpaes_cbc_encrypt : |
| @@ -551,7 +616,7 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| else |
| #endif |
| { |
| - ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); |
| + ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks.ks); |
| dat->block = (block128_f)AES_encrypt; |
| dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? |
| (cbc128_f)AES_cbc_encrypt : |
| @@ -822,10 +887,25 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| return 1; |
| if (key) |
| { do { |
| +#ifdef HWAES_CAPABLE |
| + if (HWAES_CAPABLE) |
| + { |
| + HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); |
| + CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, |
| + (block128_f)HWAES_encrypt); |
| +#ifdef HWAES_ctr32_encrypt_blocks |
| + gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks; |
| +#else |
| + gctx->ctr = NULL; |
| +#endif |
| + break; |
| + } |
| + else |
| +#endif |
| #ifdef BSAES_CAPABLE |
| if (BSAES_CAPABLE) |
| { |
| - AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); |
| + AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); |
| CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, |
| (block128_f)AES_encrypt); |
| gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; |
| @@ -836,7 +916,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| #ifdef VPAES_CAPABLE |
| if (VPAES_CAPABLE) |
| { |
| - vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); |
| + vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks.ks); |
| CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, |
| (block128_f)vpaes_encrypt); |
| gctx->ctr = NULL; |
| @@ -846,7 +926,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| #endif |
| (void)0; /* terminate potentially open 'else' */ |
| |
| - AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); |
| + AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks.ks); |
| CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt); |
| #ifdef AES_CTR_ASM |
| gctx->ctr = (ctr128_f)AES_ctr32_encrypt; |
| @@ -1067,6 +1147,29 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| xctx->stream = NULL; |
| #endif |
| /* key_len is two AES keys */ |
| +#ifdef HWAES_CAPABLE |
| + if (HWAES_CAPABLE) |
| + { |
| + if (enc) |
| + { |
| + HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); |
| + xctx->xts.block1 = (block128_f)HWAES_encrypt; |
| + } |
| + else |
| + { |
| + HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); |
| + xctx->xts.block1 = (block128_f)HWAES_decrypt; |
| + } |
| + |
| + HWAES_set_encrypt_key(key + ctx->key_len/2, |
| + ctx->key_len * 4, &xctx->ks2.ks); |
| + xctx->xts.block2 = (block128_f)HWAES_encrypt; |
| + |
| + xctx->xts.key1 = &xctx->ks1; |
| + break; |
| + } |
| + else |
| +#endif |
| #ifdef BSAES_CAPABLE |
| if (BSAES_CAPABLE) |
| xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt; |
| @@ -1077,17 +1180,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| { |
| if (enc) |
| { |
| - vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); |
| + vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); |
| xctx->xts.block1 = (block128_f)vpaes_encrypt; |
| } |
| else |
| { |
| - vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); |
| + vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); |
| xctx->xts.block1 = (block128_f)vpaes_decrypt; |
| } |
| |
| vpaes_set_encrypt_key(key + ctx->key_len/2, |
| - ctx->key_len * 4, &xctx->ks2); |
| + ctx->key_len * 4, &xctx->ks2.ks); |
| xctx->xts.block2 = (block128_f)vpaes_encrypt; |
| |
| xctx->xts.key1 = &xctx->ks1; |
| @@ -1099,17 +1202,17 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| |
| if (enc) |
| { |
| - AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); |
| + AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); |
| xctx->xts.block1 = (block128_f)AES_encrypt; |
| } |
| else |
| { |
| - AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); |
| + AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1.ks); |
| xctx->xts.block1 = (block128_f)AES_decrypt; |
| } |
| |
| AES_set_encrypt_key(key + ctx->key_len/2, |
| - ctx->key_len * 4, &xctx->ks2); |
| + ctx->key_len * 4, &xctx->ks2.ks); |
| xctx->xts.block2 = (block128_f)AES_encrypt; |
| |
| xctx->xts.key1 = &xctx->ks1; |
| @@ -1217,10 +1320,23 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| return 1; |
| if (key) do |
| { |
| +#ifdef HWAES_CAPABLE |
| + if (HWAES_CAPABLE) |
| + { |
| + HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks.ks); |
| + |
| + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, |
| + &cctx->ks, (block128_f)HWAES_encrypt); |
| + cctx->str = NULL; |
| + cctx->key_set = 1; |
| + break; |
| + } |
| + else |
| +#endif |
| #ifdef VPAES_CAPABLE |
| if (VPAES_CAPABLE) |
| { |
| - vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks); |
| + vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks.ks); |
| CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, |
| &cctx->ks, (block128_f)vpaes_encrypt); |
| cctx->str = NULL; |
| @@ -1228,7 +1344,7 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, |
| break; |
| } |
| #endif |
| - AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); |
| + AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks.ks); |
| CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, |
| &cctx->ks, (block128_f)AES_encrypt); |
| cctx->str = NULL; |
| diff --git a/crypto/modes/Makefile b/crypto/modes/Makefile |
| index 3d8bafd..9bcfa0e 100644 |
| --- a/crypto/modes/Makefile |
| +++ b/crypto/modes/Makefile |
| @@ -56,14 +56,16 @@ ghash-alpha.s: asm/ghash-alpha.pl |
| (preproc=/tmp/$$$$.$@; trap "rm $$preproc" INT; \ |
| $(PERL) asm/ghash-alpha.pl > $$preproc && \ |
| $(CC) -E $$preproc > $@ && rm $$preproc) |
| - |
| ghash-parisc.s: asm/ghash-parisc.pl |
| $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@ |
| +ghashv8-armx.S: asm/ghashv8-armx.pl |
| + $(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@ |
| |
| # GNU make "catch all" |
| ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ |
| |
| ghash-armv4.o: ghash-armv4.S |
| +ghashv8-armx.o: ghashv8-armx.S |
| |
| files: |
| $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO |
| diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl |
| index d91586e..0023bf9 100644 |
| --- a/crypto/modes/asm/ghash-armv4.pl |
| +++ b/crypto/modes/asm/ghash-armv4.pl |
| @@ -35,6 +35,20 @@ |
| # Add NEON implementation featuring polynomial multiplication, i.e. no |
| # lookup tables involved. On Cortex A8 it was measured to process one |
| # byte in 15 cycles or 55% faster than integer-only code. |
| +# |
| +# April 2014 |
| +# |
| +# Switch to the multiplication algorithm suggested in the paper referred |
| +# to below and combine it with the reduction algorithm from the x86 module. |
| +# Performance improvement over the previous version varies from 65% on |
| +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 |
| +# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 - |
| +# in 9.33. |
| +# |
| +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software |
| +# Polynomial Multiplication on ARM Processors using the NEON Engine. |
| +# |
| +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf |
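| +# |
| +# In outline (an explanatory sketch, not verbatim code): with H and Xi |
| +# split into 64-bit halves, the 128x128-bit carry-less product is built |
| +# from three 64x64-bit ones, Karatsuba-style, with "+" denoting XOR: |
| +# |
| +#	lo   = H.lo*Xi.lo |
| +#	hi   = H.hi*Xi.hi |
| +#	mid  = (H.lo+H.hi)*(Xi.lo+Xi.hi) + lo + hi |
| +#	H*Xi = hi*x^128 + mid*x^64 + lo |
| +# |
| +# which is what the "Karatsuba pre-/post-processing" comments below refer to. |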
| |
| # ==================================================================== |
| # Note about "528B" variant. In ARM case it makes lesser sense to |
| @@ -303,117 +317,160 @@ $code.=<<___; |
| .size gcm_gmult_4bit,.-gcm_gmult_4bit |
| ___ |
| { |
| -my $cnt=$Htbl; # $Htbl is used once in the very beginning |
| - |
| -my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); |
| -my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); |
| - |
| -# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit |
| -# in Zo. Or should I say "top bit", because GHASH is specified in |
| -# reverse bit order? Otherwise straightforward 128-bt H by one input |
| -# byte multiplication and modulo-reduction, times 16. |
| +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); |
| +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); |
| +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); |
| |
| -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } |
| -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } |
| -sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } |
| +sub clmul64x64 { |
| +my ($r,$a,$b)=@_; |
| +$code.=<<___; |
| + vext.8 $t0#lo, $a, $a, #1 @ A1 |
| + vmull.p8 $t0, $t0#lo, $b @ F = A1*B |
| + vext.8 $r#lo, $b, $b, #1 @ B1 |
| + vmull.p8 $r, $a, $r#lo @ E = A*B1 |
| + vext.8 $t1#lo, $a, $a, #2 @ A2 |
| + vmull.p8 $t1, $t1#lo, $b @ H = A2*B |
| + vext.8 $t3#lo, $b, $b, #2 @ B2 |
| + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 |
| + vext.8 $t2#lo, $a, $a, #3 @ A3 |
| + veor $t0, $t0, $r @ L = E + F |
| + vmull.p8 $t2, $t2#lo, $b @ J = A3*B |
| + vext.8 $r#lo, $b, $b, #3 @ B3 |
| + veor $t1, $t1, $t3 @ M = G + H |
| + vmull.p8 $r, $a, $r#lo @ I = A*B3 |
| + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 |
| + vand $t0#hi, $t0#hi, $k48 |
| + vext.8 $t3#lo, $b, $b, #4 @ B4 |
| + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 |
| + vand $t1#hi, $t1#hi, $k32 |
| + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 |
| + veor $t2, $t2, $r @ N = I + J |
| + veor $t0#lo, $t0#lo, $t0#hi |
| + veor $t1#lo, $t1#lo, $t1#hi |
| + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 |
| + vand $t2#hi, $t2#hi, $k16 |
| + vext.8 $t0, $t0, $t0, #15 |
| + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 |
| + vmov.i64 $t3#hi, #0 |
| + vext.8 $t1, $t1, $t1, #14 |
| + veor $t2#lo, $t2#lo, $t2#hi |
| + vmull.p8 $r, $a, $b @ D = A*B |
| + vext.8 $t3, $t3, $t3, #12 |
| + vext.8 $t2, $t2, $t2, #13 |
| + veor $t0, $t0, $t1 |
| + veor $t2, $t2, $t3 |
| + veor $r, $r, $t0 |
| + veor $r, $r, $t2 |
| +___ |
| +} |
| |
| $code.=<<___; |
| #if __ARM_ARCH__>=7 |
| .fpu neon |
| |
| +.global gcm_init_neon |
| +.type gcm_init_neon,%function |
| +.align 4 |
| +gcm_init_neon: |
| + vld1.64 $IN#hi,[r1,:64]! @ load H |
| + vmov.i8 $t0,#0xe1 |
| + vld1.64 $IN#lo,[r1,:64] |
| + vshl.i64 $t0#hi,#57 |
| + vshr.u64 $t0#lo,#63 @ t0=0xc2....01 |
| + vdup.8 $t1,$IN#hi[7] |
| + vshr.u64 $Hlo,$IN#lo,#63 |
| + vshr.s8 $t1,#7 @ broadcast carry bit |
| + vshl.i64 $IN,$IN,#1 |
| + vand $t0,$t0,$t1 |
| + vorr $IN#hi,$Hlo @ H<<<=1 |
| + veor $IN,$IN,$t0 @ twisted H |
| + vstmia r0,{$IN} |
| + |
| + ret @ bx lr |
| +.size gcm_init_neon,.-gcm_init_neon |
| + |
| .global gcm_gmult_neon |
| .type gcm_gmult_neon,%function |
| .align 4 |
| gcm_gmult_neon: |
| - sub $Htbl,#16 @ point at H in GCM128_CTX |
| - vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi |
| - vmov.i32 $mod,#0xe1 @ our irreducible polynomial |
| - vld1.64 `&Dlo("$IN")`,[$Xi,:64]! |
| - vshr.u64 $mod,#32 |
| - vldmia $Htbl,{$Hhi-$Hlo} @ load H |
| - veor $zero,$zero |
| + vld1.64 $IN#hi,[$Xi,:64]! @ load Xi |
| + vld1.64 $IN#lo,[$Xi,:64]! |
| + vmov.i64 $k48,#0x0000ffffffffffff |
| + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H |
| + vmov.i64 $k32,#0x00000000ffffffff |
| #ifdef __ARMEL__ |
| vrev64.8 $IN,$IN |
| #endif |
| - veor $Qpost,$Qpost |
| - veor $R,$R |
| - mov $cnt,#16 |
| - veor $Z,$Z |
| + vmov.i64 $k16,#0x000000000000ffff |
| + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing |
| mov $len,#16 |
| - veor $Zo,$Zo |
| - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte |
| - b .Linner_neon |
| + b .Lgmult_neon |
| .size gcm_gmult_neon,.-gcm_gmult_neon |
| |
| .global gcm_ghash_neon |
| .type gcm_ghash_neon,%function |
| .align 4 |
| gcm_ghash_neon: |
| - vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi |
| - vmov.i32 $mod,#0xe1 @ our irreducible polynomial |
| - vld1.64 `&Dlo("$Z")`,[$Xi,:64]! |
| - vshr.u64 $mod,#32 |
| - vldmia $Xi,{$Hhi-$Hlo} @ load H |
| - veor $zero,$zero |
| - nop |
| + vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi |
| + vld1.64 $Xl#lo,[$Xi,:64]! |
| + vmov.i64 $k48,#0x0000ffffffffffff |
| + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H |
| + vmov.i64 $k32,#0x00000000ffffffff |
| #ifdef __ARMEL__ |
| - vrev64.8 $Z,$Z |
| + vrev64.8 $Xl,$Xl |
| #endif |
| -.Louter_neon: |
| - vld1.64 `&Dhi($IN)`,[$inp]! @ load inp |
| - veor $Qpost,$Qpost |
| - vld1.64 `&Dlo($IN)`,[$inp]! |
| - veor $R,$R |
| - mov $cnt,#16 |
| + vmov.i64 $k16,#0x000000000000ffff |
| + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing |
| + |
| +.Loop_neon: |
| + vld1.64 $IN#hi,[$inp]! @ load inp |
| + vld1.64 $IN#lo,[$inp]! |
| #ifdef __ARMEL__ |
| vrev64.8 $IN,$IN |
| #endif |
| - veor $Zo,$Zo |
| - veor $IN,$Z @ inp^=Xi |
| - veor $Z,$Z |
| - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte |
| -.Linner_neon: |
| - subs $cnt,$cnt,#1 |
| - vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] |
| - vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] |
| - vext.8 $IN,$zero,#1 @ IN>>=8 |
| - |
| - veor $Z,$Qpost @ modulo-scheduled part |
| - vshl.i64 `&Dlo("$R")`,#48 |
| - vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte |
| - veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` |
| - |
| - veor `&Dhi("$Z")`,`&Dlo("$R")` |
| - vuzp.8 $Qlo,$Qhi |
| - vsli.8 $Zo,$T,#1 @ compose the "carry" byte |
| - vext.8 $Z,$zero,#1 @ Z>>=8 |
| - |
| - vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 |
| - vshr.u8 $Zo,$T,#7 @ save Z's bottom bit |
| - vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 |
| - veor $Z,$Qhi |
| - bne .Linner_neon |
| - |
| - veor $Z,$Qpost @ modulo-scheduled artefact |
| - vshl.i64 `&Dlo("$R")`,#48 |
| - veor `&Dhi("$Z")`,`&Dlo("$R")` |
| - |
| - @ finalization, normalize Z:Zo |
| - vand $Zo,$mod @ suffices to mask the bit |
| - vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 |
| - vshl.i64 $Z,#1 |
| + veor $IN,$Xl @ inp^=Xi |
| +.Lgmult_neon: |
| +___ |
| + &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo |
| +$code.=<<___; |
| + veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing |
| +___ |
| + &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) |
| + &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi |
| +$code.=<<___; |
| + veor $Xm,$Xm,$Xl @ Karatsuba post-processing |
| + veor $Xm,$Xm,$Xh |
| + veor $Xl#hi,$Xl#hi,$Xm#lo |
| + veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result |
| + |
| + @ equivalent of reduction_avx from ghash-x86_64.pl |
| + vshl.i64 $t1,$Xl,#57 @ 1st phase |
| + vshl.i64 $t2,$Xl,#62 |
| + veor $t2,$t2,$t1 @ |
| + vshl.i64 $t1,$Xl,#63 |
| + veor $t2, $t2, $t1 @ |
| + veor $Xl#hi,$Xl#hi,$t2#lo @ |
| + veor $Xh#lo,$Xh#lo,$t2#hi |
| + |
| + vshr.u64 $t2,$Xl,#1 @ 2nd phase |
| + veor $Xh,$Xh,$Xl |
| + veor $Xl,$Xl,$t2 @ |
| + vshr.u64 $t2,$t2,#6 |
| + vshr.u64 $Xl,$Xl,#1 @ |
| + veor $Xl,$Xl,$Xh @ |
| + veor $Xl,$Xl,$t2 @ |
| + |
| subs $len,#16 |
| - vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 |
| - bne .Louter_neon |
| + bne .Loop_neon |
| |
| #ifdef __ARMEL__ |
| - vrev64.8 $Z,$Z |
| + vrev64.8 $Xl,$Xl |
| #endif |
| sub $Xi,#16 |
| - vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi |
| - vst1.64 `&Dlo("$Z")`,[$Xi,:64] |
| + vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi |
| + vst1.64 $Xl#lo,[$Xi,:64] |
| |
| - bx lr |
| + ret @ bx lr |
| .size gcm_ghash_neon,.-gcm_ghash_neon |
| #endif |
| ___ |
| @@ -423,7 +480,13 @@ $code.=<<___; |
| .align 2 |
| ___ |
| |
| -$code =~ s/\`([^\`]*)\`/eval $1/gem; |
| -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
| -print $code; |
| +foreach (split("\n",$code)) { |
| + s/\`([^\`]*)\`/eval $1/geo; |
| + |
| + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or |
| + s/\bret\b/bx lr/go or |
| + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 |
| + |
| + print $_,"\n"; |
| +} |
| close STDOUT; # enforce flush |
| diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl |
| new file mode 100644 |
| index 0000000..b24f3d7 |
| --- /dev/null |
| +++ b/crypto/modes/asm/ghashv8-armx.pl |
| @@ -0,0 +1,240 @@ |
| +#!/usr/bin/env perl |
| +# |
| +# ==================================================================== |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# project. The module is, however, dual licensed under OpenSSL and |
| +# CRYPTOGAMS licenses depending on where you obtain it. For further |
| +# details see http://www.openssl.org/~appro/cryptogams/. |
| +# ==================================================================== |
| +# |
| +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. |
| +# |
| +# June 2014 |
| +# |
| +# Initial version was developed in tight cooperation with Ard |
| +# Biesheuvel <[email protected]> from bits-n-pieces from |
| +# other assembly modules. Just like aesv8-armx.pl this module |
| +# supports both AArch32 and AArch64 execution modes. |
| +# |
| +# Current performance in cycles per processed byte: |
| +# |
| +# PMULL[2] 32-bit NEON(*) |
| +# Apple A7 1.76 5.62 |
| +# Cortex-A5x n/a n/a |
| +# |
| +# (*) presented for reference/comparison purposes; |
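| +# |
| +# (An implementation note, summarized from the translation loop at the end |
| +# of this file:) the code below is written once in AArch32 NEON syntax; for |
| +# 64-bit flavours it is converted to AArch64 mnemonics and register names by |
| +# regex substitutions before being printed. |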
| + |
| +$flavour = shift; |
| +open STDOUT,">".shift; |
| + |
| +$Xi="x0"; # argument block |
| +$Htbl="x1"; |
| +$inp="x2"; |
| +$len="x3"; |
| + |
| +$inc="x12"; |
| + |
| +{ |
| +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); |
| +my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14)); |
| + |
| +$code=<<___; |
| +#include "arm_arch.h" |
| + |
| +.text |
| +___ |
| +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); |
| +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); |
| + |
| +$code.=<<___; |
| +.global gcm_init_v8 |
| +.type gcm_init_v8,%function |
| +.align 4 |
| +gcm_init_v8: |
| + vld1.64 {$t1},[x1] @ load H |
| + vmov.i8 $t0,#0xe1 |
| + vext.8 $IN,$t1,$t1,#8 |
| + vshl.i64 $t0,$t0,#57 |
| + vshr.u64 $t2,$t0,#63 |
| + vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01 |
| + vdup.32 $t1,${t1}[1] |
| + vshr.u64 $t3,$IN,#63 |
| + vshr.s32 $t1,$t1,#31 @ broadcast carry bit |
| + vand $t3,$t3,$t0 |
| + vshl.i64 $IN,$IN,#1 |
| + vext.8 $t3,$t3,$t3,#8 |
| + vand $t0,$t0,$t1 |
| + vorr $IN,$IN,$t3 @ H<<<=1 |
| + veor $IN,$IN,$t0 @ twisted H |
| + vst1.64 {$IN},[x0] |
| + |
| + ret |
| +.size gcm_init_v8,.-gcm_init_v8 |
| + |
| +.global gcm_gmult_v8 |
| +.type gcm_gmult_v8,%function |
| +.align 4 |
| +gcm_gmult_v8: |
| + vld1.64 {$t1},[$Xi] @ load Xi |
| + vmov.i8 $t3,#0xe1 |
| + vld1.64 {$H},[$Htbl] @ load twisted H |
| + vshl.u64 $t3,$t3,#57 |
| +#ifndef __ARMEB__ |
| + vrev64.8 $t1,$t1 |
| +#endif |
| + vext.8 $Hhl,$H,$H,#8 |
| + mov $len,#0 |
| + vext.8 $IN,$t1,$t1,#8 |
| + mov $inc,#0 |
| + veor $Hhl,$Hhl,$H @ Karatsuba pre-processing |
| + mov $inp,$Xi |
| + b .Lgmult_v8 |
| +.size gcm_gmult_v8,.-gcm_gmult_v8 |
| + |
| +.global gcm_ghash_v8 |
| +.type gcm_ghash_v8,%function |
| +.align 4 |
| +gcm_ghash_v8: |
| + vld1.64 {$Xl},[$Xi] @ load [rotated] Xi |
| + subs $len,$len,#16 |
| + vmov.i8 $t3,#0xe1 |
| + mov $inc,#16 |
| + vld1.64 {$H},[$Htbl] @ load twisted H |
| + cclr $inc,eq |
| + vext.8 $Xl,$Xl,$Xl,#8 |
| + vshl.u64 $t3,$t3,#57 |
| + vld1.64 {$t1},[$inp],$inc @ load [rotated] inp |
| + vext.8 $Hhl,$H,$H,#8 |
| +#ifndef __ARMEB__ |
| + vrev64.8 $Xl,$Xl |
| + vrev64.8 $t1,$t1 |
| +#endif |
| + veor $Hhl,$Hhl,$H @ Karatsuba pre-processing |
| + vext.8 $IN,$t1,$t1,#8 |
| + b .Loop_v8 |
| + |
| +.align 4 |
| +.Loop_v8: |
| + vext.8 $t2,$Xl,$Xl,#8 |
| + veor $IN,$IN,$Xl @ inp^=Xi |
| + veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi |
| + |
| +.Lgmult_v8: |
| + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo |
| + veor $t1,$t1,$IN @ Karatsuba pre-processing |
| + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi |
| + subs $len,$len,#16 |
| + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) |
| + cclr $inc,eq |
| + |
| + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing |
| + veor $t2,$Xl,$Xh |
| + veor $Xm,$Xm,$t1 |
| + vld1.64 {$t1},[$inp],$inc @ load [rotated] inp |
| + veor $Xm,$Xm,$t2 |
| + vpmull.p64 $t2,$Xl,$t3 @ 1st phase |
| + |
| + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result |
| + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl |
| +#ifndef __ARMEB__ |
| + vrev64.8 $t1,$t1 |
| +#endif |
| + veor $Xl,$Xm,$t2 |
| + vext.8 $IN,$t1,$t1,#8 |
| + |
| + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase |
| + vpmull.p64 $Xl,$Xl,$t3 |
| + veor $t2,$t2,$Xh |
| + veor $Xl,$Xl,$t2 |
| + b.hs .Loop_v8 |
| + |
| +#ifndef __ARMEB__ |
| + vrev64.8 $Xl,$Xl |
| +#endif |
| + vext.8 $Xl,$Xl,$Xl,#8 |
| + vst1.64 {$Xl},[$Xi] @ write out Xi |
| + |
| + ret |
| +.size gcm_ghash_v8,.-gcm_ghash_v8 |
| +___ |
| +} |
| +$code.=<<___; |
| +.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" |
| +.align 2 |
| +___ |
| + |
| +if ($flavour =~ /64/) { ######## 64-bit code |
| + sub unvmov { |
| + my $arg=shift; |
| + |
| + $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && |
| + sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; |
| + } |
| + foreach(split("\n",$code)) { |
| + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or |
| + s/vmov\.i8/movi/o or # fix up legacy mnemonics |
| + s/vmov\s+(.*)/unvmov($1)/geo or |
| + s/vext\.8/ext/o or |
| + s/vshr\.s/sshr\.s/o or |
| + s/vshr/ushr/o or |
| + s/^(\s+)v/$1/o or # strip off v prefix |
| + s/\bbx\s+lr\b/ret/o; |
| + |
| + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers |
| + s/@\s/\/\//o; # old->new style commentary |
| + |
| +	# fix up remaining legacy suffixes |
| + s/\.[ui]?8(\s)/$1/o; |
| + s/\.[uis]?32//o and s/\.16b/\.4s/go; |
| + m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument |
| + m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments |
| + s/\.[uisp]?64//o and s/\.16b/\.2d/go; |
| + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; |
| + |
| + print $_,"\n"; |
| + } |
| +} else { ######## 32-bit code |
| + sub unvdup32 { |
| + my $arg=shift; |
| + |
| + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && |
| + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; |
| + } |
| + sub unvpmullp64 { |
| + my ($mnemonic,$arg)=@_; |
| + |
| + if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { |
| + my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) |
| + |(($2&7)<<17)|(($2&8)<<4) |
| + |(($3&7)<<1) |(($3&8)<<2); |
| + $word |= 0x00010001 if ($mnemonic =~ "2"); |
| + # since ARMv7 instructions are always encoded little-endian. |
| + # correct solution is to use .inst directive, but older |
| + # assemblers don't implement it:-( |
| + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", |
| + $word&0xff,($word>>8)&0xff, |
| + ($word>>16)&0xff,($word>>24)&0xff, |
| + $mnemonic,$arg; |
| + } |
| + } |
| + |
| + foreach(split("\n",$code)) { |
| + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers |
| + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers |
| + s/\/\/\s?/@ /o; # new->old style commentary |
| + |
| +	# fix up remaining new-style suffixes |
| + s/\],#[0-9]+/]!/o; |
| + |
| + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or |
| + s/vdup\.32\s+(.*)/unvdup32($1)/geo or |
| + s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or |
| + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or |
| + s/^(\s+)b\./$1b/o or |
| + s/^(\s+)ret/$1bx\tlr/o; |
| + |
| + print $_,"\n"; |
| + } |
| +} |
| + |
| +close STDOUT; # enforce flush |
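| |
| ghashv8-armx.pl keeps a single $code and renders it two ways: for AArch64 the legacy mnemonics are rewritten in place (vpmull.p64 becomes pmull with .1q/.1d arrangements), while for AArch32 PMULL is unknown to older assemblers and is emitted as raw bytes via unvpmullp64. The sketch below replays that bit packing as a standalone script, passing register numbers directly instead of parsing the operand text; the (0,12,3) triple mirrors the `vpmull.p64 $Xl,$H,$IN` multiply above, and the remark that the "2" variant selects the odd (high) D halves is my reading of the encoding, not text from the patch. |
| |
| #!/usr/bin/env perl |
| # Standalone replay of the 32-bit PMULL fallback: pack the operands into the |
| # VMULL.P64 encoding and emit it as .byte, little-endian, the way the |
| # assembler would have done had it known the mnemonic. |
| sub unvpmullp64 { |
|     my ($mnemonic,$d,$n,$m)=@_; |
|     my $word = 0xf2a00e00|(($d&7)<<13)|(($d&8)<<19) |
|                          |(($n&7)<<17)|(($n&8)<<4) |
|                          |(($m&7)<<1) |(($m&8)<<2); |
|     $word |= 0x00010001 if ($mnemonic =~ "2");  # "2" variant: odd (high) D halves |
|     sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t\@ %s q%d,q%d,q%d", |
|             $word&0xff,($word>>8)&0xff,($word>>16)&0xff,($word>>24)&0xff, |
|             $mnemonic,$d,$n,$m; |
| } |
| print unvpmullp64("pmull", 0,12,3),"\n";   # q0 <- q12 (x) q3, low halves |
| print unvpmullp64("pmull2",0,12,3),"\n";   # same registers, high halves |
| |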
| diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c |
| index e1dc2b0..79ebb66 100644 |
| --- a/crypto/modes/gcm128.c |
| +++ b/crypto/modes/gcm128.c |
| @@ -642,7 +642,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2]) |
| |
| #endif |
| |
| -#if TABLE_BITS==4 && defined(GHASH_ASM) |
| +#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ)) |
| # if !defined(I386_ONLY) && \ |
| (defined(__i386) || defined(__i386__) || \ |
| defined(__x86_64) || defined(__x86_64__) || \ |
| @@ -663,13 +663,21 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len |
| void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); |
| void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); |
| # endif |
| -# elif defined(__arm__) || defined(__arm) |
| +# elif defined(__arm__) || defined(__arm) || defined(__aarch64__) |
| # include "arm_arch.h" |
| # if __ARM_ARCH__>=7 |
| # define GHASH_ASM_ARM |
| # define GCM_FUNCREF_4BIT |
| +# define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL) |
| +# if defined(__arm__) || defined(__arm) |
| +# define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON) |
| +# endif |
| +void gcm_init_neon(u128 Htable[16],const u64 Xi[2]); |
| void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]); |
| void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); |
| +void gcm_init_v8(u128 Htable[16],const u64 Xi[2]); |
| +void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); |
| +void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); |
| # endif |
| # endif |
| #endif |
| @@ -739,10 +747,21 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) |
| ctx->ghash = gcm_ghash_4bit; |
| # endif |
| # elif defined(GHASH_ASM_ARM) |
| - if (OPENSSL_armcap_P & ARMV7_NEON) { |
| +# ifdef PMULL_CAPABLE |
| + if (PMULL_CAPABLE) { |
| + gcm_init_v8(ctx->Htable,ctx->H.u); |
| + ctx->gmult = gcm_gmult_v8; |
| + ctx->ghash = gcm_ghash_v8; |
| + } else |
| +# endif |
| +# ifdef NEON_CAPABLE |
| + if (NEON_CAPABLE) { |
| + gcm_init_neon(ctx->Htable,ctx->H.u); |
| ctx->gmult = gcm_gmult_neon; |
| ctx->ghash = gcm_ghash_neon; |
| - } else { |
| + } else |
| +# endif |
| + { |
| gcm_init_4bit(ctx->Htable,ctx->H.u); |
| ctx->gmult = gcm_gmult_4bit; |
| ctx->ghash = gcm_ghash_4bit; |
| diff --git a/crypto/sha/Makefile b/crypto/sha/Makefile |
| index 2eb2b7a..6ef027d 100644 |
| --- a/crypto/sha/Makefile |
| +++ b/crypto/sha/Makefile |
| @@ -92,6 +92,9 @@ sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ |
| sha1-armv4-large.o: sha1-armv4-large.S |
| sha256-armv4.o: sha256-armv4.S |
| sha512-armv4.o: sha512-armv4.S |
| +sha1-armv8.o: sha1-armv8.S |
| +sha256-armv8.o: sha256-armv8.S |
| +sha512-armv8.o: sha512-armv8.S |
| |
| files: |
| $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO |
| diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl |
| index 33da3e0..50bd07b 100644 |
| --- a/crypto/sha/asm/sha1-armv4-large.pl |
| +++ b/crypto/sha/asm/sha1-armv4-large.pl |
| @@ -1,7 +1,7 @@ |
| #!/usr/bin/env perl |
| |
| # ==================================================================== |
| -# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| @@ -52,6 +52,20 @@ |
| # Profiler-assisted and platform-specific optimization resulted in 10% |
| # improvement on Cortex A8 core and 12.2 cycles per byte. |
| |
| +# September 2013. |
| +# |
| +# Add NEON implementation (see sha1-586.pl for background info). On |
| +# Cortex A8 it was measured to process one byte in 6.7 cycles or >80% |
| +# faster than integer-only code. Because [fully unrolled] NEON code |
| +# is ~2.5x larger and there are some redundant instructions executed |
| +# when processing last block, improvement is not as big for smallest |
| +# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per |
| +# byte, which is also >80% faster than integer-only code. |
| + |
| +# May 2014. |
| +# |
| +# Add ARMv8 code path performing at 2.35 cpb on Apple A7. |
| + |
| while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| open STDOUT,">$output"; |
| |
| @@ -153,12 +167,22 @@ $code=<<___; |
| #include "arm_arch.h" |
| |
| .text |
| +.code 32 |
| |
| .global sha1_block_data_order |
| .type sha1_block_data_order,%function |
| |
| -.align 2 |
| +.align 5 |
| sha1_block_data_order: |
| +#if __ARM_ARCH__>=7 |
| + sub r3,pc,#8 @ sha1_block_data_order |
| + ldr r12,.LOPENSSL_armcap |
| + ldr r12,[r3,r12] @ OPENSSL_armcap_P |
| + tst r12,#ARMV8_SHA1 |
| + bne .LARMv8 |
| + tst r12,#ARMV7_NEON |
| + bne .LNEON |
| +#endif |
| stmdb sp!,{r4-r12,lr} |
| add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp |
| ldmia $ctx,{$a,$b,$c,$d,$e} |
| @@ -233,16 +257,422 @@ $code.=<<___; |
| moveq pc,lr @ be binary compatible with V4, yet |
| bx lr @ interoperable with Thumb ISA:-) |
| #endif |
| -.align 2 |
| +.size sha1_block_data_order,.-sha1_block_data_order |
| + |
| +.align 5 |
| .LK_00_19: .word 0x5a827999 |
| .LK_20_39: .word 0x6ed9eba1 |
| .LK_40_59: .word 0x8f1bbcdc |
| .LK_60_79: .word 0xca62c1d6 |
| -.size sha1_block_data_order,.-sha1_block_data_order |
| -.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
| -.align 2 |
| +.LOPENSSL_armcap: |
| +.word OPENSSL_armcap_P-sha1_block_data_order |
| +.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" |
| +.align 5 |
| +___ |
| +##################################################################### |
| +# NEON stuff |
| +# |
| +{{{ |
| +my @V=($a,$b,$c,$d,$e); |
| +my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14)); |
| +my $Xi=4; |
| +my @X=map("q$_",(8..11,0..3)); |
| +my @Tx=("q12","q13"); |
| +my ($K,$zero)=("q14","q15"); |
| +my $j=0; |
| + |
| +sub AUTOLOAD() # thunk [simplified] x86-style perlasm |
| +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; |
| + my $arg = pop; |
| + $arg = "#$arg" if ($arg*1 eq $arg); |
| + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; |
| +} |
| + |
| +sub body_00_19 () { |
| + ( |
| + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. |
| + '&bic ($t0,$d,$b)', |
| + '&add ($e,$e,$Ki)', # e+=X[i]+K |
| + '&and ($t1,$c,$b)', |
| + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))', |
| + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) |
| + '&eor ($t1,$t1,$t0)', # F_00_19 |
| + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) |
| + '&add ($e,$e,$t1);'. # e+=F_00_19 |
| + '$j++; unshift(@V,pop(@V));' |
| + ) |
| +} |
| +sub body_20_39 () { |
| + ( |
| + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. |
| + '&eor ($t0,$b,$d)', |
| + '&add ($e,$e,$Ki)', # e+=X[i]+K |
| + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)', |
| + '&eor ($t1,$t0,$c)', # F_20_39 |
| + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) |
| + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) |
| + '&add ($e,$e,$t1);'. # e+=F_20_39 |
| + '$j++; unshift(@V,pop(@V));' |
| + ) |
| +} |
| +sub body_40_59 () { |
| + ( |
| + '($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'. |
| + '&add ($e,$e,$Ki)', # e+=X[i]+K |
| + '&and ($t0,$c,$d)', |
| + '&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))', |
| + '&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27) |
| + '&eor ($t1,$c,$d)', |
| + '&add ($e,$e,$t0)', |
| + '&and ($t1,$t1,$b)', |
| + '&mov ($b,$b,"ror#2")', # b=ROR(b,2) |
| + '&add ($e,$e,$t1);'. # e+=F_40_59 |
| + '$j++; unshift(@V,pop(@V));' |
| + ) |
| +} |
| + |
| +sub Xupdate_16_31 () |
| +{ use integer; |
| + my $body = shift; |
| + my @insns = (&$body,&$body,&$body,&$body); |
| + my ($a,$b,$c,$d,$e); |
| + |
| + &vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]" |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 (@Tx[1],@X[-1&7],$K); |
| + eval(shift(@insns)); |
| + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0); |
| + eval(shift(@insns)); |
| + &vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8] |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer |
| + &sub ($Xfer,$Xfer,64) if ($Xi%4==0); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 (@X[0],@Tx[0],@Tx[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1 |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 (@Tx[0],@Tx[1],30); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshl_u32 (@Tx[1],@Tx[1],2); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor (@X[0],@X[0],@Tx[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 |
| + |
| + foreach (@insns) { eval; } # remaining instructions [if any] |
| + |
| + $Xi++; push(@X,shift(@X)); # "rotate" X[] |
| +} |
| + |
| +sub Xupdate_32_79 () |
| +{ use integer; |
| + my $body = shift; |
| + my @insns = (&$body,&$body,&$body,&$body); |
| + my ($a,$b,$c,$d,$e); |
| + |
| + &vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]" |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 (@Tx[1],@X[-1&7],$K); |
| + eval(shift(@insns)); |
| + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0); |
| + eval(shift(@insns)); |
| + &veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]" |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 (@X[0],@Tx[0],30); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer |
| + &sub ($Xfer,$Xfer,64) if ($Xi%4==0); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2 |
| + |
| + foreach (@insns) { eval; } # remaining instructions [if any] |
| + |
| + $Xi++; push(@X,shift(@X)); # "rotate" X[] |
| +} |
| + |
| +sub Xuplast_80 () |
| +{ use integer; |
| + my $body = shift; |
| + my @insns = (&$body,&$body,&$body,&$body); |
| + my ($a,$b,$c,$d,$e); |
| + |
| + &vadd_i32 (@Tx[1],@X[-1&7],$K); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); |
| + &sub ($Xfer,$Xfer,64); |
| + |
| + &teq ($inp,$len); |
| + &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX |
| + &subeq ($inp,$inp,64); # reload last block to avoid SEGV |
| + &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!"); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19 |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vrev32_8 (@X[-4&7],@X[-4&7]); |
| + |
| + foreach (@insns) { eval; } # remaining instructions |
| + |
| + $Xi=0; |
| +} |
| + |
| +sub Xloop() |
| +{ use integer; |
| + my $body = shift; |
| + my @insns = (&$body,&$body,&$body,&$body); |
| + my ($a,$b,$c,$d,$e); |
| + |
| + &vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU |
| + |
| + foreach (@insns) { eval; } |
| + |
| + $Xi++; |
| +} |
| + |
| +$code.=<<___; |
| +#if __ARM_ARCH__>=7 |
| +.fpu neon |
| + |
| +.type sha1_block_data_order_neon,%function |
| +.align 4 |
| +sha1_block_data_order_neon: |
| +.LNEON: |
| + stmdb sp!,{r4-r12,lr} |
| + add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp |
| + @ dmb @ errata #451034 on early Cortex A8 |
| + @ vstmdb sp!,{d8-d15} @ ABI specification says so |
| + mov $saved_sp,sp |
| + sub sp,sp,#64 @ alloca |
| + adr $K_XX_XX,.LK_00_19 |
| + bic sp,sp,#15 @ align for 128-bit stores |
| + |
| + ldmia $ctx,{$a,$b,$c,$d,$e} @ load context |
| + mov $Xfer,sp |
| + |
| + vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned |
| + veor $zero,$zero,$zero |
| + vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]! |
| + vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19 |
| + vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on |
| + vrev32.8 @X[-3&7],@X[-3&7] @ big-endian... |
| + vrev32.8 @X[-2&7],@X[-2&7] |
| + vadd.i32 @X[0],@X[-4&7],$K |
| + vrev32.8 @X[-1&7],@X[-1&7] |
| + vadd.i32 @X[1],@X[-3&7],$K |
| + vst1.32 {@X[0]},[$Xfer,:128]! |
| + vadd.i32 @X[2],@X[-2&7],$K |
| + vst1.32 {@X[1]},[$Xfer,:128]! |
| + vst1.32 {@X[2]},[$Xfer,:128]! |
| + ldr $Ki,[sp] @ big RAW stall |
| + |
| +.Loop_neon: |
| +___ |
| + &Xupdate_16_31(\&body_00_19); |
| + &Xupdate_16_31(\&body_00_19); |
| + &Xupdate_16_31(\&body_00_19); |
| + &Xupdate_16_31(\&body_00_19); |
| + &Xupdate_32_79(\&body_00_19); |
| + &Xupdate_32_79(\&body_20_39); |
| + &Xupdate_32_79(\&body_20_39); |
| + &Xupdate_32_79(\&body_20_39); |
| + &Xupdate_32_79(\&body_20_39); |
| + &Xupdate_32_79(\&body_20_39); |
| + &Xupdate_32_79(\&body_40_59); |
| + &Xupdate_32_79(\&body_40_59); |
| + &Xupdate_32_79(\&body_40_59); |
| + &Xupdate_32_79(\&body_40_59); |
| + &Xupdate_32_79(\&body_40_59); |
| + &Xupdate_32_79(\&body_20_39); |
| + &Xuplast_80(\&body_20_39); |
| + &Xloop(\&body_20_39); |
| + &Xloop(\&body_20_39); |
| + &Xloop(\&body_20_39); |
| +$code.=<<___; |
| + ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context |
| + add $a,$a,$Ki |
| + ldr $Ki,[$ctx,#16] |
| + add $b,$b,$t0 |
| + add $c,$c,$t1 |
| + add $d,$d,$Xfer |
| + moveq sp,$saved_sp |
| + add $e,$e,$Ki |
| + ldrne $Ki,[sp] |
| + stmia $ctx,{$a,$b,$c,$d,$e} |
| + addne $Xfer,sp,#3*16 |
| + bne .Loop_neon |
| + |
| + @ vldmia sp!,{d8-d15} |
| + ldmia sp!,{r4-r12,pc} |
| +.size sha1_block_data_order_neon,.-sha1_block_data_order_neon |
| +#endif |
| +___ |
| +}}} |
| +##################################################################### |
| +# ARMv8 stuff |
| +# |
| +{{{ |
| +my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3)); |
| +my @MSG=map("q$_",(4..7)); |
| +my @Kxx=map("q$_",(8..11)); |
| +my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); |
| + |
| +$code.=<<___; |
| +#if __ARM_ARCH__>=7 |
| +.type sha1_block_data_order_armv8,%function |
| +.align 5 |
| +sha1_block_data_order_armv8: |
| +.LARMv8: |
| + vstmdb sp!,{d8-d15} @ ABI specification says so |
| + |
| + veor $E,$E,$E |
| + adr r3,.LK_00_19 |
| + vld1.32 {$ABCD},[$ctx]! |
| + vld1.32 {$E\[0]},[$ctx] |
| + sub $ctx,$ctx,#16 |
| + vld1.32 {@Kxx[0]\[]},[r3,:32]! |
| + vld1.32 {@Kxx[1]\[]},[r3,:32]! |
| + vld1.32 {@Kxx[2]\[]},[r3,:32]! |
| + vld1.32 {@Kxx[3]\[]},[r3,:32] |
| + |
| +.Loop_v8: |
| + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! |
| + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! |
| + vrev32.8 @MSG[0],@MSG[0] |
| + vrev32.8 @MSG[1],@MSG[1] |
| + |
| + vadd.i32 $W0,@Kxx[0],@MSG[0] |
| + vrev32.8 @MSG[2],@MSG[2] |
| + vmov $ABCD_SAVE,$ABCD @ offload |
| + subs $len,$len,#1 |
| + |
| + vadd.i32 $W1,@Kxx[0],@MSG[1] |
| + vrev32.8 @MSG[3],@MSG[3] |
| + sha1h $E1,$ABCD @ 0 |
| + sha1c $ABCD,$E,$W0 |
| + vadd.i32 $W0,@Kxx[$j],@MSG[2] |
| + sha1su0 @MSG[0],@MSG[1],@MSG[2] |
| +___ |
| +for ($j=0,$i=1;$i<20-3;$i++) { |
| +my $f=("c","p","m","p")[$i/5]; |
| +$code.=<<___; |
| + sha1h $E0,$ABCD @ $i |
| + sha1$f $ABCD,$E1,$W1 |
| + vadd.i32 $W1,@Kxx[$j],@MSG[3] |
| + sha1su1 @MSG[0],@MSG[3] |
| +___ |
| +$code.=<<___ if ($i<20-4); |
| + sha1su0 @MSG[1],@MSG[2],@MSG[3] |
| ___ |
| + ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0); |
| + push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0); |
| +} |
| +$code.=<<___; |
| + sha1h $E0,$ABCD @ $i |
| + sha1p $ABCD,$E1,$W1 |
| + vadd.i32 $W1,@Kxx[$j],@MSG[3] |
| + |
| + sha1h $E1,$ABCD @ 18 |
| + sha1p $ABCD,$E0,$W0 |
| + |
| + sha1h $E0,$ABCD @ 19 |
| + sha1p $ABCD,$E1,$W1 |
| + |
| + vadd.i32 $E,$E,$E0 |
| + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE |
| + bne .Loop_v8 |
| + |
| + vst1.32 {$ABCD},[$ctx]! |
| + vst1.32 {$E\[0]},[$ctx] |
| + |
| + vldmia sp!,{d8-d15} |
| + ret @ bx lr |
| +.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 |
| +#endif |
| +___ |
| +}}} |
| +$code.=<<___; |
| +.comm OPENSSL_armcap_P,4,4 |
| +___ |
| + |
| +{ my %opcode = ( |
| + "sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40, |
| + "sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40, |
| + "sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 ); |
| + |
| + sub unsha1 { |
| + my ($mnemonic,$arg)=@_; |
| + |
| + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { |
| + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) |
| + |(($2&7)<<17)|(($2&8)<<4) |
| + |(($3&7)<<1) |(($3&8)<<2); |
| + # since ARMv7 instructions are always encoded little-endian. |
| + # correct solution is to use .inst directive, but older |
| + # assemblers don't implement it:-( |
| + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", |
| + $word&0xff,($word>>8)&0xff, |
| + ($word>>16)&0xff,($word>>24)&0xff, |
| + $mnemonic,$arg; |
| + } |
| + } |
| +} |
| + |
| +foreach (split($/,$code)) { |
| + s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or |
| + s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo; |
| + |
| + s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo; |
| + |
| + s/\bret\b/bx lr/o or |
| + s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4 |
| + |
| + print $_,$/; |
| +} |
| |
| -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
| -print $code; |
| close STDOUT; # enforce flush |
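| |
| A note on the generator itself: the NEON section of sha1-armv4-large.pl is written through Perl's AUTOLOAD. Calls such as &vext_8 or &vadd_i32 have no definition, so they fall through to the thunk, which turns the sub name back into a mnemonic (underscore to dot) and appends a line of assembly to $code; that is what lets the Xupdate routines interleave vector instructions with the eval'ed scalar round bodies. A self-contained sketch of the same idiom, with made-up operands, looks like this: |
| |
| #!/usr/bin/env perl |
| # Sketch of the AUTOLOAD thunk: undefined subs become assembly lines. |
| my $code = ""; |
| sub AUTOLOAD {                              # same shape as in the patch |
|     my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; |
|     my $arg = pop; |
|     $arg = "#$arg" if ($arg*1 eq $arg);     # bare numbers become immediates |
|     $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; |
| } |
| # Hypothetical calls, only to show the expansion: |
| &vext_8 ("q8","q0","q1",8);                 # -> vext.8  q8,q0,q1,#8 |
| &vadd_i32("q12","q1","q14");                # -> vadd.i32 q12,q1,q14 |
| print $code; |
| |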
| diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl |
| new file mode 100644 |
| index 0000000..c1f552b |
| --- /dev/null |
| +++ b/crypto/sha/asm/sha1-armv8.pl |
| @@ -0,0 +1,333 @@ |
| +#!/usr/bin/env perl |
| +# |
| +# ==================================================================== |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# project. The module is, however, dual licensed under OpenSSL and |
| +# CRYPTOGAMS licenses depending on where you obtain it. For further |
| +# details see http://www.openssl.org/~appro/cryptogams/. |
| +# ==================================================================== |
| +# |
| +# SHA1 for ARMv8. |
| +# |
| +# Performance in cycles per processed byte and improvement coefficient |
| +# over code generated with "default" compiler: |
| +# |
| +# hardware-assisted software(*) |
| +# Apple A7 2.31 4.13 (+14%) |
| +# Cortex-A5x n/a n/a |
| +# |
| +# (*) Software results are presented mostly for reference purposes. |
| + |
| +$flavour = shift; |
| +open STDOUT,">".shift; |
| + |
| +($ctx,$inp,$num)=("x0","x1","x2"); |
| +@Xw=map("w$_",(3..17,19)); |
| +@Xx=map("x$_",(3..17,19)); |
| +@V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); |
| +($t0,$t1,$t2,$K)=map("w$_",(25..28)); |
| + |
| + |
| +sub BODY_00_19 { |
| +my ($i,$a,$b,$c,$d,$e)=@_; |
| +my $j=($i+2)&15; |
| + |
| +$code.=<<___ if ($i<15 && !($i&1)); |
| + lsr @Xx[$i+1],@Xx[$i],#32 |
| +___ |
| +$code.=<<___ if ($i<14 && !($i&1)); |
| + ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`] |
| +___ |
| +$code.=<<___ if ($i<14 && ($i&1)); |
| +#ifdef __ARMEB__ |
| + ror @Xx[$i+1],@Xx[$i+1],#32 |
| +#else |
| + rev32 @Xx[$i+1],@Xx[$i+1] |
| +#endif |
| +___ |
| +$code.=<<___ if ($i<14); |
| + bic $t0,$d,$b |
| + and $t1,$c,$b |
| + ror $t2,$a,#27 |
| + add $d,$d,$K // future e+=K |
| + orr $t0,$t0,$t1 |
| + add $e,$e,$t2 // e+=rot(a,5) |
| + ror $b,$b,#2 |
| + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] |
| + add $e,$e,$t0 // e+=F(b,c,d) |
| +___ |
| +$code.=<<___ if ($i==19); |
| + movz $K,#0xeba1 |
| + movk $K,#0x6ed9,lsl#16 |
| +___ |
| +$code.=<<___ if ($i>=14); |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] |
| + bic $t0,$d,$b |
| + and $t1,$c,$b |
| + ror $t2,$a,#27 |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] |
| + add $d,$d,$K // future e+=K |
| + orr $t0,$t0,$t1 |
| + add $e,$e,$t2 // e+=rot(a,5) |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] |
| + ror $b,$b,#2 |
| + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] |
| + add $e,$e,$t0 // e+=F(b,c,d) |
| + ror @Xw[$j],@Xw[$j],#31 |
| +___ |
| +} |
| + |
| +sub BODY_40_59 { |
| +my ($i,$a,$b,$c,$d,$e)=@_; |
| +my $j=($i+2)&15; |
| + |
| +$code.=<<___ if ($i==59); |
| + movz $K,#0xc1d6 |
| + movk $K,#0xca62,lsl#16 |
| +___ |
| +$code.=<<___; |
| + orr $t0,$b,$c |
| + and $t1,$b,$c |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] |
| + ror $t2,$a,#27 |
| + and $t0,$t0,$d |
| + add $d,$d,$K // future e+=K |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] |
| + add $e,$e,$t2 // e+=rot(a,5) |
| + orr $t0,$t0,$t1 |
| + ror $b,$b,#2 |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] |
| + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] |
| + add $e,$e,$t0 // e+=F(b,c,d) |
| + ror @Xw[$j],@Xw[$j],#31 |
| +___ |
| +} |
| + |
| +sub BODY_20_39 { |
| +my ($i,$a,$b,$c,$d,$e)=@_; |
| +my $j=($i+2)&15; |
| + |
| +$code.=<<___ if ($i==39); |
| + movz $K,#0xbcdc |
| + movk $K,#0x8f1b,lsl#16 |
| +___ |
| +$code.=<<___ if ($i<78); |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] |
| + eor $t0,$d,$b |
| + ror $t2,$a,#27 |
| + add $d,$d,$K // future e+=K |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] |
| + eor $t0,$t0,$c |
| + add $e,$e,$t2 // e+=rot(a,5) |
| + ror $b,$b,#2 |
| + eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] |
| + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] |
| + add $e,$e,$t0 // e+=F(b,c,d) |
| + ror @Xw[$j],@Xw[$j],#31 |
| +___ |
| +$code.=<<___ if ($i==78); |
| + ldp @Xw[1],@Xw[2],[$ctx] |
| + eor $t0,$d,$b |
| + ror $t2,$a,#27 |
| + add $d,$d,$K // future e+=K |
| + eor $t0,$t0,$c |
| + add $e,$e,$t2 // e+=rot(a,5) |
| + ror $b,$b,#2 |
| + add $d,$d,@Xw[($i+1)&15] // future e+=X[i] |
| + add $e,$e,$t0 // e+=F(b,c,d) |
| +___ |
| +$code.=<<___ if ($i==79); |
| + ldp @Xw[3],@Xw[4],[$ctx,#8] |
| + eor $t0,$d,$b |
| + ror $t2,$a,#27 |
| + eor $t0,$t0,$c |
| + add $e,$e,$t2 // e+=rot(a,5) |
| + ror $b,$b,#2 |
| + ldr @Xw[5],[$ctx,#16] |
| + add $e,$e,$t0 // e+=F(b,c,d) |
| +___ |
| +} |
| + |
| +$code.=<<___; |
| +#include "arm_arch.h" |
| + |
| +.text |
| + |
| +.globl sha1_block_data_order |
| +.type sha1_block_data_order,%function |
| +.align 6 |
| +sha1_block_data_order: |
| + ldr x16,.LOPENSSL_armcap_P |
| + adr x17,.LOPENSSL_armcap_P |
| + add x16,x16,x17 |
| + ldr w16,[x16] |
| + tst w16,#ARMV8_SHA1 |
| + b.ne .Lv8_entry |
| + |
| + stp x29,x30,[sp,#-96]! |
| + add x29,sp,#0 |
| + stp x19,x20,[sp,#16] |
| + stp x21,x22,[sp,#32] |
| + stp x23,x24,[sp,#48] |
| + stp x25,x26,[sp,#64] |
| + stp x27,x28,[sp,#80] |
| + |
| + ldp $A,$B,[$ctx] |
| + ldp $C,$D,[$ctx,#8] |
| + ldr $E,[$ctx,#16] |
| + |
| +.Loop: |
| + ldr @Xx[0],[$inp],#64 |
| + movz $K,#0x7999 |
| + sub $num,$num,#1 |
| + movk $K,#0x5a82,lsl#16 |
| +#ifdef __ARMEB__ |
| + ror $Xx[0],@Xx[0],#32 |
| +#else |
| + rev32 @Xx[0],@Xx[0] |
| +#endif |
| + add $E,$E,$K // warm it up |
| + add $E,$E,@Xw[0] |
| +___ |
| +for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } |
| +for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
| +for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } |
| +for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } |
| +$code.=<<___; |
| + add $B,$B,@Xw[2] |
| + add $C,$C,@Xw[3] |
| + add $A,$A,@Xw[1] |
| + add $D,$D,@Xw[4] |
| + add $E,$E,@Xw[5] |
| + stp $A,$B,[$ctx] |
| + stp $C,$D,[$ctx,#8] |
| + str $E,[$ctx,#16] |
| + cbnz $num,.Loop |
| + |
| + ldp x19,x20,[sp,#16] |
| + ldp x21,x22,[sp,#32] |
| + ldp x23,x24,[sp,#48] |
| + ldp x25,x26,[sp,#64] |
| + ldp x27,x28,[sp,#80] |
| + ldr x29,[sp],#96 |
| + ret |
| +.size sha1_block_data_order,.-sha1_block_data_order |
| +___ |
| +{{{ |
| +my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3)); |
| +my @MSG=map("v$_.16b",(4..7)); |
| +my @Kxx=map("v$_.4s",(16..19)); |
| +my ($W0,$W1)=("v20.4s","v21.4s"); |
| +my $ABCD_SAVE="v22.16b"; |
| + |
| +$code.=<<___; |
| +.type sha1_block_armv8,%function |
| +.align 6 |
| +sha1_block_armv8: |
| +.Lv8_entry: |
| + stp x29,x30,[sp,#-16]! |
| + add x29,sp,#0 |
| + |
| + adr x4,.Lconst |
| + eor $E,$E,$E |
| + ld1.32 {$ABCD},[$ctx],#16 |
| + ld1.32 {$E}[0],[$ctx] |
| + sub $ctx,$ctx,#16 |
| + ld1.32 {@Kxx[0]-@Kxx[3]},[x4] |
| + |
| +.Loop_hw: |
| + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 |
| + sub $num,$num,#1 |
| + rev32 @MSG[0],@MSG[0] |
| + rev32 @MSG[1],@MSG[1] |
| + |
| + add.i32 $W0,@Kxx[0],@MSG[0] |
| + rev32 @MSG[2],@MSG[2] |
| + orr $ABCD_SAVE,$ABCD,$ABCD // offload |
| + |
| + add.i32 $W1,@Kxx[0],@MSG[1] |
| + rev32 @MSG[3],@MSG[3] |
| + sha1h $E1,$ABCD |
| + sha1c $ABCD,$E,$W0 // 0 |
| + add.i32 $W0,@Kxx[$j],@MSG[2] |
| + sha1su0 @MSG[0],@MSG[1],@MSG[2] |
| +___ |
| +for ($j=0,$i=1;$i<20-3;$i++) { |
| +my $f=("c","p","m","p")[$i/5]; |
| +$code.=<<___; |
| + sha1h $E0,$ABCD // $i |
| + sha1$f $ABCD,$E1,$W1 |
| + add.i32 $W1,@Kxx[$j],@MSG[3] |
| + sha1su1 @MSG[0],@MSG[3] |
| +___ |
| +$code.=<<___ if ($i<20-4); |
| + sha1su0 @MSG[1],@MSG[2],@MSG[3] |
| +___ |
| + ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0); |
| + push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0); |
| +} |
| +$code.=<<___; |
| + sha1h $E0,$ABCD // $i |
| + sha1p $ABCD,$E1,$W1 |
| + add.i32 $W1,@Kxx[$j],@MSG[3] |
| + |
| + sha1h $E1,$ABCD // 18 |
| + sha1p $ABCD,$E0,$W0 |
| + |
| + sha1h $E0,$ABCD // 19 |
| + sha1p $ABCD,$E1,$W1 |
| + |
| + add.i32 $E,$E,$E0 |
| + add.i32 $ABCD,$ABCD,$ABCD_SAVE |
| + |
| + cbnz $num,.Loop_hw |
| + |
| + st1.32 {$ABCD},[$ctx],#16 |
| + st1.32 {$E}[0],[$ctx] |
| + |
| + ldr x29,[sp],#16 |
| + ret |
| +.size sha1_block_armv8,.-sha1_block_armv8 |
| +.align 6 |
| +.Lconst: |
| +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 |
| +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 |
| +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 |
| +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 |
| +.LOPENSSL_armcap_P: |
| +.quad OPENSSL_armcap_P-. |
| +.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" |
| +.align 2 |
| +.comm OPENSSL_armcap_P,4,4 |
| +___ |
| +}}} |
| + |
| +{ my %opcode = ( |
| + "sha1c" => 0x5e000000, "sha1p" => 0x5e001000, |
| + "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000, |
| + "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 ); |
| + |
| + sub unsha1 { |
| + my ($mnemonic,$arg)=@_; |
| + |
| + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o |
| + && |
| + sprintf ".inst\t0x%08x\t//%s %s", |
| + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), |
| + $mnemonic,$arg; |
| + } |
| +} |
| + |
| +foreach(split("\n",$code)) { |
| + |
| + s/\`([^\`]*)\`/eval($1)/geo; |
| + |
| + s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo; |
| + |
| + s/\.\w?32\b//o and s/\.16b/\.4s/go; |
| + m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; |
| + |
| + print $_,"\n"; |
| +} |
| + |
| +close STDOUT; |
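| |
| On the 64-bit side no .byte games are needed: sha1-armv8.pl packs the SHA1 crypto instructions as Rd | Rn<<5 | Rm<<16 on top of fixed opcodes and emits them through .inst, which assemblers lacking the crypto extension still accept. The snippet below reuses the opcode table and regex from the patch as a standalone check; the sample operand string corresponds to the first `sha1c $ABCD,$E,$W0` round, and the 0x5e140020 value in the comment is my own arithmetic, not output quoted from the module. |
| |
| #!/usr/bin/env perl |
| # Standalone check of the AArch64 .inst packing for the SHA1 instructions. |
| my %opcode = ( |
|     "sha1c" => 0x5e000000, "sha1p" => 0x5e001000, |
|     "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000, |
|     "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 ); |
| sub unsha1 { |
|     my ($mnemonic,$arg)=@_; |
|     $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o |
|     && |
|     sprintf ".inst\t0x%08x\t//%s %s", |
|             $opcode{$mnemonic}|$1|($2<<5)|($3<<16), |
|             $mnemonic,$arg; |
| } |
| # First round of the hardware path, "sha1c v0.16b,v1.16b,v20.4s": |
| print unsha1("sha1c","v0.16b,v1.16b,v20.4s"),"\n";   # .inst 0x5e140020 |
| |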
| diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl |
| index 9c84e8d..505ca8f 100644 |
| --- a/crypto/sha/asm/sha256-armv4.pl |
| +++ b/crypto/sha/asm/sha256-armv4.pl |
| @@ -1,7 +1,7 @@ |
| #!/usr/bin/env perl |
| |
| # ==================================================================== |
| -# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| # project. The module is, however, dual licensed under OpenSSL and |
| # CRYPTOGAMS licenses depending on where you obtain it. For further |
| # details see http://www.openssl.org/~appro/cryptogams/. |
| @@ -21,15 +21,27 @@ |
| # February 2011. |
| # |
| # Profiler-assisted and platform-specific optimization resulted in 16% |
| -# improvement on Cortex A8 core and ~17 cycles per processed byte. |
| +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. |
| + |
| +# September 2013. |
| +# |
| +# Add NEON implementation. On Cortex A8 it was measured to process one |
| +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon |
| +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only |
| +# code (meaning that latter performs sub-optimally, nothing was done |
| +# about it). |
| + |
| +# May 2014. |
| +# |
| +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. |
| |
| while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} |
| open STDOUT,">$output"; |
| |
| $ctx="r0"; $t0="r0"; |
| -$inp="r1"; $t3="r1"; |
| +$inp="r1"; $t4="r1"; |
| $len="r2"; $t1="r2"; |
| -$T1="r3"; |
| +$T1="r3"; $t3="r3"; |
| $A="r4"; |
| $B="r5"; |
| $C="r6"; |
| @@ -52,71 +64,88 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| |
| $code.=<<___ if ($i<16); |
| #if __ARM_ARCH__>=7 |
| - ldr $T1,[$inp],#4 |
| + @ ldr $t1,[$inp],#4 @ $i |
| +# if $i==15 |
| + str $inp,[sp,#17*4] @ make room for $t4 |
| +# endif |
| + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` |
| + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past |
| + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) |
| + rev $t1,$t1 |
| #else |
| - ldrb $T1,[$inp,#3] @ $i |
| + @ ldrb $t1,[$inp,#3] @ $i |
| + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past |
| ldrb $t2,[$inp,#2] |
| - ldrb $t1,[$inp,#1] |
| - ldrb $t0,[$inp],#4 |
| - orr $T1,$T1,$t2,lsl#8 |
| - orr $T1,$T1,$t1,lsl#16 |
| - orr $T1,$T1,$t0,lsl#24 |
| + ldrb $t0,[$inp,#1] |
| + orr $t1,$t1,$t2,lsl#8 |
| + ldrb $t2,[$inp],#4 |
| + orr $t1,$t1,$t0,lsl#16 |
| +# if $i==15 |
| + str $inp,[sp,#17*4] @ make room for $t4 |
| +# endif |
| + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` |
| + orr $t1,$t1,$t2,lsl#24 |
| + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) |
| #endif |
| ___ |
| $code.=<<___; |
| - mov $t0,$e,ror#$Sigma1[0] |
| ldr $t2,[$Ktbl],#4 @ *K256++ |
| - eor $t0,$t0,$e,ror#$Sigma1[1] |
| + add $h,$h,$t1 @ h+=X[i] |
| + str $t1,[sp,#`$i%16`*4] |
| eor $t1,$f,$g |
| -#if $i>=16 |
| - add $T1,$T1,$t3 @ from BODY_16_xx |
| -#elif __ARM_ARCH__>=7 && defined(__ARMEL__) |
| - rev $T1,$T1 |
| -#endif |
| -#if $i==15 |
| - str $inp,[sp,#17*4] @ leave room for $t3 |
| -#endif |
| - eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) |
| + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) |
| and $t1,$t1,$e |
| - str $T1,[sp,#`$i%16`*4] |
| - add $T1,$T1,$t0 |
| + add $h,$h,$t2 @ h+=K256[i] |
| eor $t1,$t1,$g @ Ch(e,f,g) |
| - add $T1,$T1,$h |
| - mov $h,$a,ror#$Sigma0[0] |
| - add $T1,$T1,$t1 |
| - eor $h,$h,$a,ror#$Sigma0[1] |
| - add $T1,$T1,$t2 |
| - eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) |
| -#if $i>=15 |
| - ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx |
| + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` |
| + add $h,$h,$t1 @ h+=Ch(e,f,g) |
| +#if $i==31 |
| + and $t2,$t2,#0xff |
| + cmp $t2,#0xf2 @ done? |
| #endif |
| - orr $t0,$a,$b |
| - and $t1,$a,$b |
| - and $t0,$t0,$c |
| - add $h,$h,$T1 |
| - orr $t0,$t0,$t1 @ Maj(a,b,c) |
| - add $d,$d,$T1 |
| - add $h,$h,$t0 |
| +#if $i<15 |
| +# if __ARM_ARCH__>=7 |
| + ldr $t1,[$inp],#4 @ prefetch |
| +# else |
| + ldrb $t1,[$inp,#3] |
| +# endif |
| + eor $t2,$a,$b @ a^b, b^c in next round |
| +#else |
| + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx |
| + eor $t2,$a,$b @ a^b, b^c in next round |
| + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx |
| +#endif |
| + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) |
| + and $t3,$t3,$t2 @ (b^c)&=(a^b) |
| + add $d,$d,$h @ d+=h |
| + eor $t3,$t3,$b @ Maj(a,b,c) |
| + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) |
| + @ add $h,$h,$t3 @ h+=Maj(a,b,c) |
| ___ |
| + ($t2,$t3)=($t3,$t2); |
| } |
| |
| sub BODY_16_XX { |
| my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; |
| |
| $code.=<<___; |
| - @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i |
| - ldr $t2,[sp,#`($i+14)%16`*4] |
| - mov $t0,$t3,ror#$sigma0[0] |
| - ldr $T1,[sp,#`($i+0)%16`*4] |
| - eor $t0,$t0,$t3,ror#$sigma0[1] |
| - ldr $t1,[sp,#`($i+9)%16`*4] |
| - eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1]) |
| - mov $t3,$t2,ror#$sigma1[0] |
| - add $T1,$T1,$t0 |
| - eor $t3,$t3,$t2,ror#$sigma1[1] |
| - add $T1,$T1,$t1 |
| - eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) |
| - @ add $T1,$T1,$t3 |
| + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i |
| + @ ldr $t4,[sp,#`($i+14)%16`*4] |
| + mov $t0,$t1,ror#$sigma0[0] |
| + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past |
| + mov $t2,$t4,ror#$sigma1[0] |
| + eor $t0,$t0,$t1,ror#$sigma0[1] |
| + eor $t2,$t2,$t4,ror#$sigma1[1] |
| + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) |
| + ldr $t1,[sp,#`($i+0)%16`*4] |
| + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) |
| + ldr $t4,[sp,#`($i+9)%16`*4] |
| + |
| + add $t2,$t2,$t0 |
| + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 |
| + add $t1,$t1,$t2 |
| + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) |
| + add $t1,$t1,$t4 @ X[i] |
| ___ |
| &BODY_00_15(@_); |
| } |
| @@ -147,46 +176,64 @@ K256: |
| .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
| .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
| .size K256,.-K256 |
| +.word 0 @ terminator |
| +.LOPENSSL_armcap: |
| +.word OPENSSL_armcap_P-sha256_block_data_order |
| +.align 5 |
| |
| .global sha256_block_data_order |
| .type sha256_block_data_order,%function |
| sha256_block_data_order: |
| sub r3,pc,#8 @ sha256_block_data_order |
| add $len,$inp,$len,lsl#6 @ len to point at the end of inp |
| +#if __ARM_ARCH__>=7 |
| + ldr r12,.LOPENSSL_armcap |
| + ldr r12,[r3,r12] @ OPENSSL_armcap_P |
| + tst r12,#ARMV8_SHA256 |
| + bne .LARMv8 |
| + tst r12,#ARMV7_NEON |
| + bne .LNEON |
| +#endif |
| stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} |
| ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} |
| - sub $Ktbl,r3,#256 @ K256 |
| + sub $Ktbl,r3,#256+32 @ K256 |
| sub sp,sp,#16*4 @ alloca(X[16]) |
| .Loop: |
| +# if __ARM_ARCH__>=7 |
| + ldr $t1,[$inp],#4 |
| +# else |
| + ldrb $t1,[$inp,#3] |
| +# endif |
| + eor $t3,$B,$C @ magic |
| + eor $t2,$t2,$t2 |
| ___ |
| for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } |
| $code.=".Lrounds_16_xx:\n"; |
| for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } |
| $code.=<<___; |
| - and $t2,$t2,#0xff |
| - cmp $t2,#0xf2 |
| + ldreq $t3,[sp,#16*4] @ pull ctx |
| bne .Lrounds_16_xx |
| |
| - ldr $T1,[sp,#16*4] @ pull ctx |
| - ldr $t0,[$T1,#0] |
| - ldr $t1,[$T1,#4] |
| - ldr $t2,[$T1,#8] |
| + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past |
| + ldr $t0,[$t3,#0] |
| + ldr $t1,[$t3,#4] |
| + ldr $t2,[$t3,#8] |
| add $A,$A,$t0 |
| - ldr $t0,[$T1,#12] |
| + ldr $t0,[$t3,#12] |
| add $B,$B,$t1 |
| - ldr $t1,[$T1,#16] |
| + ldr $t1,[$t3,#16] |
| add $C,$C,$t2 |
| - ldr $t2,[$T1,#20] |
| + ldr $t2,[$t3,#20] |
| add $D,$D,$t0 |
| - ldr $t0,[$T1,#24] |
| + ldr $t0,[$t3,#24] |
| add $E,$E,$t1 |
| - ldr $t1,[$T1,#28] |
| + ldr $t1,[$t3,#28] |
| add $F,$F,$t2 |
| ldr $inp,[sp,#17*4] @ pull inp |
| ldr $t2,[sp,#18*4] @ pull inp+len |
| add $G,$G,$t0 |
| add $H,$H,$t1 |
| - stmia $T1,{$A,$B,$C,$D,$E,$F,$G,$H} |
| + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} |
| cmp $inp,$t2 |
| sub $Ktbl,$Ktbl,#256 @ rewind Ktbl |
| bne .Loop |
| @@ -200,12 +247,410 @@ $code.=<<___; |
| moveq pc,lr @ be binary compatible with V4, yet |
| bx lr @ interoperable with Thumb ISA:-) |
| #endif |
| -.size sha256_block_data_order,.-sha256_block_data_order |
| -.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" |
| +.size sha256_block_data_order,.-sha256_block_data_order |
| +___ |
| +###################################################################### |
| +# NEON stuff |
| +# |
| +{{{ |
| +my @X=map("q$_",(0..3)); |
| +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); |
| +my $Xfer=$t4; |
| +my $j=0; |
| + |
| +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } |
| +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } |
| + |
| +sub AUTOLOAD() # thunk [simplified] x86-style perlasm |
| +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; |
| + my $arg = pop; |
| + $arg = "#$arg" if ($arg*1 eq $arg); |
| + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; |
| +} |
| + |
| +sub Xupdate() |
| +{ use integer; |
| + my $body = shift; |
| + my @insns = (&$body,&$body,&$body,&$body); |
| + my ($a,$b,$c,$d,$e,$f,$g,$h); |
| + |
| + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T2,$T0,$sigma0[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T1,$T0,$sigma0[2]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vsli_32 ($T2,$T0,32-$sigma0[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T3,$T0,$sigma0[1]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor ($T1,$T1,$T2); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vsli_32 ($T3,$T0,32-$sigma0[1]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor ($T5,$T5,$T4); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor ($T5,$T5,$T4); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 ($T0,$T0,@X[0]); |
| + while($#insns>=2) { eval(shift(@insns)); } |
| + &vst1_32 ("{$T0}","[$Xfer,:128]!"); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + |
| + push(@X,shift(@X)); # "rotate" X[] |
| +} |
| + |
| +sub Xpreload() |
| +{ use integer; |
| + my $body = shift; |
| + my @insns = (&$body,&$body,&$body,&$body); |
| + my ($a,$b,$c,$d,$e,$f,$g,$h); |
| + |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vrev32_8 (@X[0],@X[0]); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + eval(shift(@insns)); |
| + &vadd_i32 ($T0,$T0,@X[0]); |
| + foreach (@insns) { eval; } # remaining instructions |
| + &vst1_32 ("{$T0}","[$Xfer,:128]!"); |
| + |
| + push(@X,shift(@X)); # "rotate" X[] |
| +} |
| + |
| +sub body_00_15 () { |
| + ( |
| + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. |
| + '&add ($h,$h,$t1)', # h+=X[i]+K[i] |
| + '&eor ($t1,$f,$g)', |
| + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', |
| + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past |
| + '&and ($t1,$t1,$e)', |
| + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) |
| + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', |
| + '&eor ($t1,$t1,$g)', # Ch(e,f,g) |
| + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) |
| + '&eor ($t2,$a,$b)', # a^b, b^c in next round |
| + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) |
| + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) |
| + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. |
| + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. |
| + '&ldr ($t1,"[sp,#64]") if ($j==31)', |
| + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) |
| + '&add ($d,$d,$h)', # d+=h |
| + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) |
| + '&eor ($t3,$t3,$b)', # Maj(a,b,c) |
| + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' |
| + ) |
| +} |
| + |
| +$code.=<<___; |
| +#if __ARM_ARCH__>=7 |
| +.fpu neon |
| + |
| +.type sha256_block_data_order_neon,%function |
| +.align 4 |
| +sha256_block_data_order_neon: |
| +.LNEON: |
| + stmdb sp!,{r4-r12,lr} |
| + |
| + mov $t2,sp |
| + sub sp,sp,#16*4+16 @ alloca |
| + sub $Ktbl,r3,#256+32 @ K256 |
| + bic sp,sp,#15 @ align for 128-bit stores |
| + |
| + vld1.8 {@X[0]},[$inp]! |
| + vld1.8 {@X[1]},[$inp]! |
| + vld1.8 {@X[2]},[$inp]! |
| + vld1.8 {@X[3]},[$inp]! |
| + vld1.32 {$T0},[$Ktbl,:128]! |
| + vld1.32 {$T1},[$Ktbl,:128]! |
| + vld1.32 {$T2},[$Ktbl,:128]! |
| + vld1.32 {$T3},[$Ktbl,:128]! |
| + vrev32.8 @X[0],@X[0] @ yes, even on |
| + str $ctx,[sp,#64] |
| + vrev32.8 @X[1],@X[1] @ big-endian |
| + str $inp,[sp,#68] |
| + mov $Xfer,sp |
| + vrev32.8 @X[2],@X[2] |
| + str $len,[sp,#72] |
| + vrev32.8 @X[3],@X[3] |
| + str $t2,[sp,#76] @ save original sp |
| + vadd.i32 $T0,$T0,@X[0] |
| + vadd.i32 $T1,$T1,@X[1] |
| + vst1.32 {$T0},[$Xfer,:128]! |
| + vadd.i32 $T2,$T2,@X[2] |
| + vst1.32 {$T1},[$Xfer,:128]! |
| + vadd.i32 $T3,$T3,@X[3] |
| + vst1.32 {$T2},[$Xfer,:128]! |
| + vst1.32 {$T3},[$Xfer,:128]! |
| + |
| + ldmia $ctx,{$A-$H} |
| + sub $Xfer,$Xfer,#64 |
| + ldr $t1,[sp,#0] |
| + eor $t2,$t2,$t2 |
| + eor $t3,$B,$C |
| + b .L_00_48 |
| + |
| +.align 4 |
| +.L_00_48: |
| +___ |
| + &Xupdate(\&body_00_15); |
| + &Xupdate(\&body_00_15); |
| + &Xupdate(\&body_00_15); |
| + &Xupdate(\&body_00_15); |
| +$code.=<<___; |
| + teq $t1,#0 @ check for K256 terminator |
| + ldr $t1,[sp,#0] |
| + sub $Xfer,$Xfer,#64 |
| + bne .L_00_48 |
| + |
| + ldr $inp,[sp,#68] |
| + ldr $t0,[sp,#72] |
| + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl |
| + teq $inp,$t0 |
| + subeq $inp,$inp,#64 @ avoid SEGV |
| + vld1.8 {@X[0]},[$inp]! @ load next input block |
| + vld1.8 {@X[1]},[$inp]! |
| + vld1.8 {@X[2]},[$inp]! |
| + vld1.8 {@X[3]},[$inp]! |
| + strne $inp,[sp,#68] |
| + mov $Xfer,sp |
| +___ |
| + &Xpreload(\&body_00_15); |
| + &Xpreload(\&body_00_15); |
| + &Xpreload(\&body_00_15); |
| + &Xpreload(\&body_00_15); |
| +$code.=<<___; |
| + ldr $t0,[$t1,#0] |
| + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past |
| + ldr $t2,[$t1,#4] |
| + ldr $t3,[$t1,#8] |
| + ldr $t4,[$t1,#12] |
| + add $A,$A,$t0 @ accumulate |
| + ldr $t0,[$t1,#16] |
| + add $B,$B,$t2 |
| + ldr $t2,[$t1,#20] |
| + add $C,$C,$t3 |
| + ldr $t3,[$t1,#24] |
| + add $D,$D,$t4 |
| + ldr $t4,[$t1,#28] |
| + add $E,$E,$t0 |
| + str $A,[$t1],#4 |
| + add $F,$F,$t2 |
| + str $B,[$t1],#4 |
| + add $G,$G,$t3 |
| + str $C,[$t1],#4 |
| + add $H,$H,$t4 |
| + str $D,[$t1],#4 |
| + stmia $t1,{$E-$H} |
| + |
| + movne $Xfer,sp |
| + ldrne $t1,[sp,#0] |
| + eorne $t2,$t2,$t2 |
| + ldreq sp,[sp,#76] @ restore original sp |
| + eorne $t3,$B,$C |
| + bne .L_00_48 |
| + |
| + ldmia sp!,{r4-r12,pc} |
| +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon |
| +#endif |
| +___ |
| +}}} |
| +###################################################################### |
| +# ARMv8 stuff |
| +# |
| +{{{ |
| +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); |
| +my @MSG=map("q$_",(8..11)); |
| +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); |
| +my $Ktbl="r3"; |
| + |
| +$code.=<<___; |
| +#if __ARM_ARCH__>=7 |
| +.type sha256_block_data_order_armv8,%function |
| +.align 5 |
| +sha256_block_data_order_armv8: |
| +.LARMv8: |
| + vld1.32 {$ABCD,$EFGH},[$ctx] |
| + sub $Ktbl,r3,#sha256_block_data_order-K256 |
| + |
| +.Loop_v8: |
| + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! |
| + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! |
| + vld1.32 {$W0},[$Ktbl]! |
| + vrev32.8 @MSG[0],@MSG[0] |
| + vrev32.8 @MSG[1],@MSG[1] |
| + vrev32.8 @MSG[2],@MSG[2] |
| + vrev32.8 @MSG[3],@MSG[3] |
| + vmov $ABCD_SAVE,$ABCD @ offload |
| + vmov $EFGH_SAVE,$EFGH |
| + teq $inp,$len |
| +___ |
| +for($i=0;$i<12;$i++) { |
| +$code.=<<___; |
| + vld1.32 {$W1},[$Ktbl]! |
| + vadd.i32 $W0,$W0,@MSG[0] |
| + sha256su0 @MSG[0],@MSG[1] |
| + vmov $abcd,$ABCD |
| + sha256h $ABCD,$EFGH,$W0 |
| + sha256h2 $EFGH,$abcd,$W0 |
| + sha256su1 @MSG[0],@MSG[2],@MSG[3] |
| +___ |
| + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); |
| +} |
| +$code.=<<___; |
| + vld1.32 {$W1},[$Ktbl]! |
| + vadd.i32 $W0,$W0,@MSG[0] |
| + vmov $abcd,$ABCD |
| + sha256h $ABCD,$EFGH,$W0 |
| + sha256h2 $EFGH,$abcd,$W0 |
| + |
| + vld1.32 {$W0},[$Ktbl]! |
| + vadd.i32 $W1,$W1,@MSG[1] |
| + vmov $abcd,$ABCD |
| + sha256h $ABCD,$EFGH,$W1 |
| + sha256h2 $EFGH,$abcd,$W1 |
| + |
| + vld1.32 {$W1},[$Ktbl] |
| + vadd.i32 $W0,$W0,@MSG[2] |
| + sub $Ktbl,$Ktbl,#256-16 @ rewind |
| + vmov $abcd,$ABCD |
| + sha256h $ABCD,$EFGH,$W0 |
| + sha256h2 $EFGH,$abcd,$W0 |
| + |
| + vadd.i32 $W1,$W1,@MSG[3] |
| + vmov $abcd,$ABCD |
| + sha256h $ABCD,$EFGH,$W1 |
| + sha256h2 $EFGH,$abcd,$W1 |
| + |
| + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE |
| + vadd.i32 $EFGH,$EFGH,$EFGH_SAVE |
| + bne .Loop_v8 |
| + |
| + vst1.32 {$ABCD,$EFGH},[$ctx] |
| + |
| + ret @ bx lr |
| +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 |
| +#endif |
| +___ |
| +}}} |
| +$code.=<<___; |
| +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" |
| .align 2 |
| +.comm OPENSSL_armcap_P,4,4 |
| ___ |
| |
| -$code =~ s/\`([^\`]*)\`/eval $1/gem; |
| -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
| -print $code; |
| +{ my %opcode = ( |
| + "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, |
| + "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); |
| + |
| + sub unsha256 { |
| + my ($mnemonic,$arg)=@_; |
| + |
| + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { |
| + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) |
| + |(($2&7)<<17)|(($2&8)<<4) |
| + |(($3&7)<<1) |(($3&8)<<2); |
| + # since ARMv7 instructions are always encoded little-endian. |
| + # correct solution is to use .inst directive, but older |
| + # assemblers don't implement it:-( |
| + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", |
| + $word&0xff,($word>>8)&0xff, |
| + ($word>>16)&0xff,($word>>24)&0xff, |
| + $mnemonic,$arg; |
| + } |
| + } |
| +} |
| + |
| +foreach (split($/,$code)) { |
| + |
| + s/\`([^\`]*)\`/eval $1/geo; |
| + |
| + s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; |
| + |
| + s/\bret\b/bx lr/go or |
| + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 |
| + |
| + print $_,"\n"; |
| +} |
| + |
| close STDOUT; # enforce flush |
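| |
| One scheduling idiom in the NEON code above (shared with the sha1 module) is easy to miss: body_00_15 returns the scalar round as a list of strings of Perl code, and Xupdate/Xpreload eval() a couple of those strings after every vector instruction, so the integer rounds run in the shadow of NEON latency. The sketch below shows only that interleaving mechanism; the operations are placeholders, not the real SHA-256 round. |
| |
| #!/usr/bin/env perl |
| # Sketch of the interleaving idiom: scalar "round" snippets are strings of |
| # Perl code, eval'ed a few at a time between vector instructions. |
| my $code = ""; |
| sub body { ( '$code .= "\tscalar_op_a\n"', |
|              '$code .= "\tscalar_op_b\n"', |
|              '$code .= "\tscalar_op_c\n"' ) } |
| sub neon { $code .= "\t$_[0]\n" } |
| |
| my @insns = (&body, &body);          # queue up two rounds' worth of snippets |
| &neon("vext.8   q8,q0,q1,#4"); |
| eval(shift(@insns)); eval(shift(@insns)); |
| &neon("vadd.i32 q0,q0,q9"); |
| eval(shift(@insns)); eval(shift(@insns)); |
| foreach (@insns) { eval; }           # flush whatever is left |
| print $code; |
| |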
| diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl |
| index 7faf37b..71aa935 100644 |
| --- a/crypto/sha/asm/sha512-armv4.pl |
| +++ b/crypto/sha/asm/sha512-armv4.pl |
| @@ -565,7 +565,7 @@ $code.=<<___; |
| bne .Loop_neon |
| |
| vldmia sp!,{d8-d15} @ epilogue |
| - bx lr |
| + ret @ bx lr |
| #endif |
| ___ |
| } |
| @@ -578,5 +578,6 @@ ___ |
| |
| $code =~ s/\`([^\`]*)\`/eval $1/gem; |
| $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 |
| +$code =~ s/\bret\b/bx lr/gm; |
| print $code; |
| close STDOUT; # enforce flush |
| diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl |
| new file mode 100644 |
| index 0000000..6935ed6 |
| --- /dev/null |
| +++ b/crypto/sha/asm/sha512-armv8.pl |
| @@ -0,0 +1,414 @@ |
| +#!/usr/bin/env perl |
| +# |
| +# ==================================================================== |
| +# Written by Andy Polyakov <[email protected]> for the OpenSSL |
| +# project. The module is, however, dual licensed under OpenSSL and |
| +# CRYPTOGAMS licenses depending on where you obtain it. For further |
| +# details see http://www.openssl.org/~appro/cryptogams/. |
| +# ==================================================================== |
| +# |
| +# SHA256/512 for ARMv8. |
| +# |
| +# Performance in cycles per processed byte and improvement coefficient |
| +# over code generated with "default" compiler: |
| +# |
| +# SHA256-hw SHA256(*) SHA512 |
| +# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) |
| +# Cortex-A5x n/a n/a n/a |
| +# |
| +# (*) Software SHA256 results are of lesser relevance, presented |
| +# mostly for informational purposes. |
| +# (**) The result is a trade-off: it's possible to improve it by |
| +# 10%, but at the cost of 20% loss on Cortex-A5x. |
| + |
| +$flavour=shift; |
| +$output=shift; |
| +open STDOUT,">$output"; |
| + |
| +if ($output =~ /512/) { |
| + $BITS=512; |
| + $SZ=8; |
| + @Sigma0=(28,34,39); |
| + @Sigma1=(14,18,41); |
| + @sigma0=(1, 8, 7); |
| + @sigma1=(19,61, 6); |
| + $rounds=80; |
| + $reg_t="x"; |
| +} else { |
| + $BITS=256; |
| + $SZ=4; |
| + @Sigma0=( 2,13,22); |
| + @Sigma1=( 6,11,25); |
| + @sigma0=( 7,18, 3); |
| + @sigma1=(17,19,10); |
| + $rounds=64; |
| + $reg_t="w"; |
| +} |
| + |
| +$func="sha${BITS}_block_data_order"; |
| + |
| +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); |
| + |
| +@X=map("$reg_t$_",(3..15,0..2)); |
| +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); |
| +($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); |
| + |
| +sub BODY_00_xx { |
| +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; |
| +my $j=($i+1)&15; |
| +my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); |
| + $T0=@X[$i+3] if ($i<11); |
| + |
| +$code.=<<___ if ($i<16); |
| +#ifndef __ARMEB__ |
| + rev @X[$i],@X[$i] // $i |
| +#endif |
| +___ |
| +$code.=<<___ if ($i<13 && ($i&1)); |
| + ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ |
| +___ |
| +$code.=<<___ if ($i==13); |
| + ldp @X[14],@X[15],[$inp] |
| +___ |
| +$code.=<<___ if ($i>=14); |
| + ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] |
| +___ |
| +$code.=<<___ if ($i>0 && $i<16); |
| + add $a,$a,$t1 // h+=Sigma0(a) |
| +___ |
| +$code.=<<___ if ($i>=11); |
| + str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] |
| +___ |
| +# While ARMv8 specifies merged rotate-n-logical operation such as |
| +# 'eor x,y,z,ror#n', it was found to negatively affect performance |
| +# on Apple A7. The reason seems to be that it requires even 'y' to |
| +# be available earlier. This means that such merged instruction is |
| +# not necessarily best choice on critical path... On the other hand |
| +# Cortex-A5x handles merged instructions much better than disjoint |
| +# rotate and logical... See (**) footnote above. |
| +$code.=<<___ if ($i<15); |
| + ror $t0,$e,#$Sigma1[0] |
| + add $h,$h,$t2 // h+=K[i] |
| + eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` |
| + and $t1,$f,$e |
| + bic $t2,$g,$e |
| + add $h,$h,@X[$i&15] // h+=X[i] |
| + orr $t1,$t1,$t2 // Ch(e,f,g) |
| + eor $t2,$a,$b // a^b, b^c in next round |
| + eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) |
| + ror $T0,$a,#$Sigma0[0] |
| + add $h,$h,$t1 // h+=Ch(e,f,g) |
| + eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` |
| + add $h,$h,$t0 // h+=Sigma1(e) |
| + and $t3,$t3,$t2 // (b^c)&=(a^b) |
| + add $d,$d,$h // d+=h |
| + eor $t3,$t3,$b // Maj(a,b,c) |
| + eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) |
| + add $h,$h,$t3 // h+=Maj(a,b,c) |
| + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round |
| + //add $h,$h,$t1 // h+=Sigma0(a) |
| +___ |
| +$code.=<<___ if ($i>=15); |
| + ror $t0,$e,#$Sigma1[0] |
| + add $h,$h,$t2 // h+=K[i] |
| + ror $T1,@X[($j+1)&15],#$sigma0[0] |
| + and $t1,$f,$e |
| + ror $T2,@X[($j+14)&15],#$sigma1[0] |
| + bic $t2,$g,$e |
| + ror $T0,$a,#$Sigma0[0] |
| + add $h,$h,@X[$i&15] // h+=X[i] |
| + eor $t0,$t0,$e,ror#$Sigma1[1] |
| + eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] |
| + orr $t1,$t1,$t2 // Ch(e,f,g) |
| + eor $t2,$a,$b // a^b, b^c in next round |
| + eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) |
| + eor $T0,$T0,$a,ror#$Sigma0[1] |
| + add $h,$h,$t1 // h+=Ch(e,f,g) |
| + and $t3,$t3,$t2 // (b^c)&=(a^b) |
| + eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] |
| + eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) |
| + add $h,$h,$t0 // h+=Sigma1(e) |
| + eor $t3,$t3,$b // Maj(a,b,c) |
| + eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) |
| + eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) |
| + add @X[$j],@X[$j],@X[($j+9)&15] |
| + add $d,$d,$h // d+=h |
| + add $h,$h,$t3 // h+=Maj(a,b,c) |
| + ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round |
| + add @X[$j],@X[$j],$T1 |
| + add $h,$h,$t1 // h+=Sigma0(a) |
| + add @X[$j],@X[$j],$T2 |
| +___ |
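| +	# the K word just loaded into $t3 becomes $t2 ("h+=K[i]") in the |
| +	# next round, while a^b left in $t2 becomes next round's $t3, the |
| +	# "b^c" accumulator for Maj |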
| + ($t2,$t3)=($t3,$t2); |
| +} |
| + |
| +$code.=<<___; |
| +#include "arm_arch.h" |
| + |
| +.text |
| + |
| +.globl $func |
| +.type $func,%function |
| +.align 6 |
| +$func: |
| +___ |
| +$code.=<<___ if ($SZ==4); |
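| +	// SHA-256 only: .LOPENSSL_armcap_P (near the end of the file) holds |
| +	// the PC-relative offset of OPENSSL_armcap_P; add it to the label |
| +	// address, load the capability word and take the hardware path below |
| +	// when the ARMV8_SHA256 bit is set. |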
| + ldr x16,.LOPENSSL_armcap_P |
| + adr x17,.LOPENSSL_armcap_P |
| + add x16,x16,x17 |
| + ldr w16,[x16] |
| + tst w16,#ARMV8_SHA256 |
| + b.ne .Lv8_entry |
| +___ |
| +$code.=<<___; |
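| +	// 128-byte frame: x29/x30 at sp, x19-x28 at #16..#80, ctx and end of |
| +	// input at #96, running input pointer at #112; a further 4*$SZ bytes |
| +	// below sp back the rotating X[] spill slots used by BODY_00_xx. |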
| + stp x29,x30,[sp,#-128]! |
| + add x29,sp,#0 |
| + |
| + stp x19,x20,[sp,#16] |
| + stp x21,x22,[sp,#32] |
| + stp x23,x24,[sp,#48] |
| + stp x25,x26,[sp,#64] |
| + stp x27,x28,[sp,#80] |
| + sub sp,sp,#4*$SZ |
| + |
| + ldp $A,$B,[$ctx] // load context |
| + ldp $C,$D,[$ctx,#2*$SZ] |
| + ldp $E,$F,[$ctx,#4*$SZ] |
| + add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input |
| + ldp $G,$H,[$ctx,#6*$SZ] |
| + adr $Ktbl,K$BITS |
| + stp $ctx,$num,[x29,#96] |
| + |
| +.Loop: |
| + ldp @X[0],@X[1],[$inp],#2*$SZ |
| + ldr $t2,[$Ktbl],#$SZ // *K++ |
| + eor $t3,$B,$C // magic seed |
| + str $inp,[x29,#112] |
| +___ |
| +for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } |
| +$code.=".Loop_16_xx:\n"; |
| +for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } |
| +$code.=<<___; |
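| +	// the K table is zero-terminated and $t2 holds the constant |
| +	// pre-loaded for the next round, so the 16-round block repeats |
| +	// until the terminator is fetched |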
| + cbnz $t2,.Loop_16_xx |
| + |
| + ldp $ctx,$num,[x29,#96] |
| + ldr $inp,[x29,#112] |
| + sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind |
| + |
| + ldp @X[0],@X[1],[$ctx] |
| + ldp @X[2],@X[3],[$ctx,#2*$SZ] |
| + add $inp,$inp,#14*$SZ // advance input pointer |
| + ldp @X[4],@X[5],[$ctx,#4*$SZ] |
| + add $A,$A,@X[0] |
| + ldp @X[6],@X[7],[$ctx,#6*$SZ] |
| + add $B,$B,@X[1] |
| + add $C,$C,@X[2] |
| + add $D,$D,@X[3] |
| + stp $A,$B,[$ctx] |
| + add $E,$E,@X[4] |
| + add $F,$F,@X[5] |
| + stp $C,$D,[$ctx,#2*$SZ] |
| + add $G,$G,@X[6] |
| + add $H,$H,@X[7] |
| + cmp $inp,$num |
| + stp $E,$F,[$ctx,#4*$SZ] |
| + stp $G,$H,[$ctx,#6*$SZ] |
| + b.ne .Loop |
| + |
| + ldp x19,x20,[x29,#16] |
| + add sp,sp,#4*$SZ |
| + ldp x21,x22,[x29,#32] |
| + ldp x23,x24,[x29,#48] |
| + ldp x25,x26,[x29,#64] |
| + ldp x27,x28,[x29,#80] |
| + ldp x29,x30,[sp],#128 |
| + ret |
| +.size $func,.-$func |
| + |
| +.align 6 |
| +.type K$BITS,%object |
| +K$BITS: |
| +___ |
| +$code.=<<___ if ($SZ==8); |
| + .quad 0x428a2f98d728ae22,0x7137449123ef65cd |
| + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc |
| + .quad 0x3956c25bf348b538,0x59f111f1b605d019 |
| + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 |
| + .quad 0xd807aa98a3030242,0x12835b0145706fbe |
| + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 |
| + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 |
| + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 |
| + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 |
| + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 |
| + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 |
| + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 |
| + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 |
| + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 |
| + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 |
| + .quad 0x06ca6351e003826f,0x142929670a0e6e70 |
| + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 |
| + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df |
| + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 |
| + .quad 0x81c2c92e47edaee6,0x92722c851482353b |
| + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 |
| + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 |
| + .quad 0xd192e819d6ef5218,0xd69906245565a910 |
| + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 |
| + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 |
| + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 |
| + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb |
| + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 |
| + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 |
| + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec |
| + .quad 0x90befffa23631e28,0xa4506cebde82bde9 |
| + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b |
| + .quad 0xca273eceea26619c,0xd186b8c721c0c207 |
| + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 |
| + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 |
| + .quad 0x113f9804bef90dae,0x1b710b35131c471b |
| + .quad 0x28db77f523047d84,0x32caab7b40c72493 |
| + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c |
| + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a |
| + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 |
| + .quad 0 // terminator |
| +___ |
| +$code.=<<___ if ($SZ==4); |
| + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
| + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
| + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 |
| + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 |
| + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc |
| + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da |
| + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 |
| + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 |
| + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 |
| + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 |
| + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 |
| + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 |
| + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 |
| + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 |
| + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 |
| + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
| + .long 0 // terminator |
| +___ |
| +$code.=<<___; |
| +.size K$BITS,.-K$BITS |
| +.align 3 |
| +.LOPENSSL_armcap_P: |
| + .quad OPENSSL_armcap_P-. |
| +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" |
| +.align 2 |
| +___ |
| + |
| +if ($SZ==4) { |
| +my $Ktbl="x3"; |
| + |
| +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); |
| +my @MSG=map("v$_.16b",(4..7)); |
| +my ($W0,$W1)=("v16.4s","v17.4s"); |
| +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); |
| + |
| +$code.=<<___; |
| +.type sha256_block_armv8,%function |
| +.align 6 |
| +sha256_block_armv8: |
| +.Lv8_entry: |
| + stp x29,x30,[sp,#-16]! |
| + add x29,sp,#0 |
| + |
| + ld1.32 {$ABCD,$EFGH},[$ctx] |
| + adr $Ktbl,K256 |
| + |
| +.Loop_hw: |
| + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 |
| + sub $num,$num,#1 |
| + ld1.32 {$W0},[$Ktbl],#16 |
| + rev32 @MSG[0],@MSG[0] |
| + rev32 @MSG[1],@MSG[1] |
| + rev32 @MSG[2],@MSG[2] |
| + rev32 @MSG[3],@MSG[3] |
| + orr $ABCD_SAVE,$ABCD,$ABCD // offload |
| + orr $EFGH_SAVE,$EFGH,$EFGH |
| +___ |
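| +# Each pass of the loop below retires four rounds: sha256su0/sha256su1 |
| +# extend the message schedule held in @MSG while sha256h/sha256h2 apply |
| +# the schedule-plus-constant vector in $W0 to the two halves of the state. |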
| +for($i=0;$i<12;$i++) { |
| +$code.=<<___; |
| + ld1.32 {$W1},[$Ktbl],#16 |
| + add.i32 $W0,$W0,@MSG[0] |
| + sha256su0 @MSG[0],@MSG[1] |
| + orr $abcd,$ABCD,$ABCD |
| + sha256h $ABCD,$EFGH,$W0 |
| + sha256h2 $EFGH,$abcd,$W0 |
| + sha256su1 @MSG[0],@MSG[2],@MSG[3] |
| +___ |
| + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); |
| +} |
| +$code.=<<___; |
| + ld1.32 {$W1},[$Ktbl],#16 |
| + add.i32 $W0,$W0,@MSG[0] |
| + orr $abcd,$ABCD,$ABCD |
| + sha256h $ABCD,$EFGH,$W0 |
| + sha256h2 $EFGH,$abcd,$W0 |
| + |
| + ld1.32 {$W0},[$Ktbl],#16 |
| + add.i32 $W1,$W1,@MSG[1] |
| + orr $abcd,$ABCD,$ABCD |
| + sha256h $ABCD,$EFGH,$W1 |
| + sha256h2 $EFGH,$abcd,$W1 |
| + |
| + ld1.32 {$W1},[$Ktbl] |
| + add.i32 $W0,$W0,@MSG[2] |
| + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind |
| + orr $abcd,$ABCD,$ABCD |
| + sha256h $ABCD,$EFGH,$W0 |
| + sha256h2 $EFGH,$abcd,$W0 |
| + |
| + add.i32 $W1,$W1,@MSG[3] |
| + orr $abcd,$ABCD,$ABCD |
| + sha256h $ABCD,$EFGH,$W1 |
| + sha256h2 $EFGH,$abcd,$W1 |
| + |
| + add.i32 $ABCD,$ABCD,$ABCD_SAVE |
| + add.i32 $EFGH,$EFGH,$EFGH_SAVE |
| + |
| + cbnz $num,.Loop_hw |
| + |
| + st1.32 {$ABCD,$EFGH},[$ctx] |
| + |
| + ldr x29,[sp],#16 |
| + ret |
| +.size sha256_block_armv8,.-sha256_block_armv8 |
| +___ |
| +} |
| + |
| +$code.=<<___; |
| +.comm OPENSSL_armcap_P,4,4 |
| +___ |
| + |
| +{ my %opcode = ( |
| + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, |
| + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); |
| + |
| + sub unsha256 { |
| + my ($mnemonic,$arg)=@_; |
| + |
| + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o |
| + && |
| + sprintf ".inst\t0x%08x\t//%s %s", |
| + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), |
| + $mnemonic,$arg; |
| + } |
| +} |
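| + |
| +# Worked example (an assumption for illustration, not from the original): |
| +# "sha256h v0.16b,v1.16b,v16.4s" (Rd=0, Rn=1, Rm=16) should come out as |
| +# ".inst 0x5e104020", i.e. 0x5e004000|(16<<16)|(1<<5)|0. |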
| + |
| +foreach(split("\n",$code)) { |
| + |
| + s/\`([^\`]*)\`/eval($1)/geo; |
| + |
| + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo; |
| + |
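| +	# drop the ad-hoc ".32"/".i32" size tags and, on those lines only, |
| +	# rewrite ".16b" as ".4s"; single-lane ld1/st1 forms become ".s" |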
| + s/\.\w?32\b//o and s/\.16b/\.4s/go; |
| + m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; |
| + |
| + print $_,"\n"; |
| +} |
| + |
| +close STDOUT; |