Diffstat (limited to 'src/crypto/modes/asm')
-rw-r--r--   src/crypto/modes/asm/ghash-armv4.pl  | 10
-rw-r--r--   src/crypto/modes/asm/ghash-x86.pl    |  2
-rw-r--r--   src/crypto/modes/asm/ghash-x86_64.pl |  8
-rw-r--r--   src/crypto/modes/asm/ghashv8-armx.pl | 24
4 files changed, 22 insertions, 22 deletions
diff --git a/src/crypto/modes/asm/ghash-armv4.pl b/src/crypto/modes/asm/ghash-armv4.pl
index 25a4e27..dc5b99e 100644
--- a/src/crypto/modes/asm/ghash-armv4.pl
+++ b/src/crypto/modes/asm/ghash-armv4.pl
@@ -45,7 +45,7 @@
  # processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
  # Snapdragon S4 - in 9.33.
  #
- # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+ # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
  # Polynomial Multiplication on ARM Processors using the NEON Engine.
  #
  # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
@@ -134,7 +134,7 @@ ___
  $code=<<___;
  #if defined(__arm__)
- #include "arm_arch.h"
+ #include <openssl/arm_arch.h>
  .syntax unified
@@ -457,12 +457,12 @@ gcm_ghash_neon:
  veor $IN,$Xl @ inp^=Xi
  .Lgmult_neon:
  ___
- &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
+ &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
  $code.=<<___;
  veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
  ___
- &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
- &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
+ &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
  $code.=<<___;
  veor $Xm,$Xm,$Xl @ Karatsuba post-processing
  veor $Xm,$Xm,$Xh
diff --git a/src/crypto/modes/asm/ghash-x86.pl b/src/crypto/modes/asm/ghash-x86.pl
index 23a5527..0269169 100644
--- a/src/crypto/modes/asm/ghash-x86.pl
+++ b/src/crypto/modes/asm/ghash-x86.pl
@@ -358,7 +358,7 @@ $S=12; # shift factor for rem_4bit
  # effective address calculation and finally merge of value to Z.hi.
  # Reference to rem_4bit is scheduled so late that I had to >>4
  # rem_4bit elements. This resulted in 20-45% procent improvement
- # on contemporary µ-archs.
+ # on contemporary µ-archs.
  {
  my $cnt;
  my $rem_4bit = "eax";
diff --git a/src/crypto/modes/asm/ghash-x86_64.pl b/src/crypto/modes/asm/ghash-x86_64.pl
index 6e656ca..5a7ce39 100644
--- a/src/crypto/modes/asm/ghash-x86_64.pl
+++ b/src/crypto/modes/asm/ghash-x86_64.pl
@@ -576,15 +576,15 @@ $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
  # experimental alternative. special thing about is that there
  # no dependency between the two multiplications...
  mov \$`0xE1<<1`,%eax
- mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
  mov \$0x07,%r11d
  movq %rax,$T1
  movq %r10,$T2
  movq %r11,$T3 # borrow $T3
  pand $Xi,$T3
- pshufb $T3,$T2 # ($Xi&7)·0xE0
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
  movq %rax,$T3
- pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
  pxor $Xi,$T2
  pslldq \$15,$T2
  paddd $T2,$T2 # <<(64+56+1)
@@ -657,7 +657,7 @@ $code.=<<___;
  je .Lskip4x
  sub \$0x30,$len
- mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
  movdqu 0x30($Htbl),$Hkey3
  movdqu 0x40($Htbl),$Hkey4
diff --git a/src/crypto/modes/asm/ghashv8-armx.pl b/src/crypto/modes/asm/ghashv8-armx.pl
index 686951f..3a7b8d8 100644
--- a/src/crypto/modes/asm/ghashv8-armx.pl
+++ b/src/crypto/modes/asm/ghashv8-armx.pl
@@ -54,7 +54,7 @@ my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
  my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
  $code=<<___;
- #include "arm_arch.h"
+ #include <openssl/arm_arch.h>
  .text
  ___
@@ -148,10 +148,10 @@ gcm_gmult_v8:
  #endif
  vext.8 $IN,$t1,$t1,#8
- vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
  veor $t1,$t1,$IN @ Karatsuba pre-processing
- vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
- vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
  vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  veor $t2,$Xl,$Xh
@@ -239,7 +239,7 @@ $code.=<<___;
  #endif
  vext.8 $In,$t1,$t1,#8
  veor $IN,$IN,$Xl @ I[i]^=Xi
- vpmull.p64 $Xln,$H,$In @ H·Ii+1
+ vpmull.p64 $Xln,$H,$In @ H·Ii+1
  veor $t1,$t1,$In @ Karatsuba pre-processing
  vpmull2.p64 $Xhn,$H,$In
  b .Loop_mod2x_v8
@@ -248,14 +248,14 @@
  .Loop_mod2x_v8:
  vext.8 $t2,$IN,$IN,#8
  subs $len,$len,#32 @ is there more data?
- vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
+ vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
  cclr $inc,lo @ is it time to zero $inc?
  vpmull.p64 $Xmn,$Hhl,$t1
  veor $t2,$t2,$IN @ Karatsuba pre-processing
- vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
+ vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
  veor $Xl,$Xl,$Xln @ accumulate
- vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
  vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
  veor $Xh,$Xh,$Xhn
@@ -280,7 +280,7 @@ $code.=<<___;
  vext.8 $In,$t1,$t1,#8
  vext.8 $IN,$t0,$t0,#8
  veor $Xl,$Xm,$t2
- vpmull.p64 $Xln,$H,$In @ H·Ii+1
+ vpmull.p64 $Xln,$H,$In @ H·Ii+1
  veor $IN,$IN,$Xh @ accumulate $IN early
  vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
@@ -304,10 +304,10 @@ $code.=<<___;
  veor $IN,$IN,$Xl @ inp^=Xi
  veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
- vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
+ vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
  veor $t1,$t1,$IN @ Karatsuba pre-processing
- vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
- vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+ vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
+ vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
  vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  veor $t2,$Xl,$Xh
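Note: the Karatsuba comments in the hunks above (H.lo·Xi.lo, H.hi·Xi.hi, (H.lo+H.hi)·(Xi.lo+Xi.hi)) all refer to the same trick: one 128x128-bit carry-less multiplication is built from three 64x64-bit multiplies instead of four, which is what the paired vpmull.p64/vpmull2.p64 (NEON), clmul64x64 (NEON fallback) and pclmulqdq (x86_64) operations compute. The following is a minimal portable C sketch of that decomposition only; the clmul64/clmul128 helpers and the test values are illustrative and are not part of these files, and the real kernels additionally fold in the GHASH reduction modulo x^128 + x^7 + x^2 + x + 1.

/* Illustrative sketch (not from the tree): Karatsuba split of a
 * 128x128-bit carry-less multiply into three 64x64-bit multiplies,
 * mirroring the H.lo*Xi.lo, H.hi*Xi.hi, (H.lo^H.hi)*(Xi.lo^Xi.hi)
 * comments in the diff. clmul64() stands in for PMULL/PCLMULQDQ. */
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t lo, hi; } u128;

/* 64x64 -> 128 carry-less (GF(2)[x]) multiplication, bit by bit. */
static u128 clmul64(uint64_t a, uint64_t b) {
    u128 r = {0, 0};
    for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            r.lo ^= a << i;
            if (i) r.hi ^= a >> (64 - i);
        }
    }
    return r;
}

/* 128x128 -> 256 via Karatsuba: three 64-bit multiplies, not four. */
static void clmul128(u128 x, u128 h, u128 *lo, u128 *hi) {
    u128 p_lo  = clmul64(h.lo, x.lo);               /* H.lo * Xi.lo */
    u128 p_hi  = clmul64(h.hi, x.hi);               /* H.hi * Xi.hi */
    u128 p_mid = clmul64(h.lo ^ h.hi, x.lo ^ x.hi); /* pre-processing */

    /* Karatsuba post-processing: recover the cross term, then add
     * it into the 256-bit result at bit offset 64. */
    p_mid.lo ^= p_lo.lo ^ p_hi.lo;
    p_mid.hi ^= p_lo.hi ^ p_hi.hi;

    lo->lo = p_lo.lo;
    lo->hi = p_lo.hi ^ p_mid.lo;
    hi->lo = p_hi.lo ^ p_mid.hi;
    hi->hi = p_hi.hi;
}

int main(void) {
    u128 x = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
    u128 h = {0x0f0f0f0f0f0f0f0fULL, 0xf0f0f0f0f0f0f0f0ULL};
    u128 lo, hi;
    clmul128(x, h, &lo, &hi);
    printf("%016llx%016llx %016llx%016llx\n",
           (unsigned long long)hi.hi, (unsigned long long)hi.lo,
           (unsigned long long)lo.hi, (unsigned long long)lo.lo);
    return 0;
}

The point of the split is only that the middle product reuses the low and high products (XOR is both addition and subtraction in GF(2)), saving one carry-less multiplication per processed block.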