diff options
Diffstat (limited to 'src/crypto/modes')
-rw-r--r-- | src/crypto/modes/CMakeLists.txt | 63 | ||||
-rw-r--r-- | src/crypto/modes/asm/aesni-gcm-x86_64.pl | 1057 | ||||
-rw-r--r-- | src/crypto/modes/asm/ghash-armv4.pl | 501 | ||||
-rw-r--r-- | src/crypto/modes/asm/ghash-x86.pl | 1393 | ||||
-rw-r--r-- | src/crypto/modes/asm/ghash-x86_64.pl | 1753 | ||||
-rw-r--r-- | src/crypto/modes/asm/ghashv8-armx.pl | 241 | ||||
-rw-r--r-- | src/crypto/modes/cbc.c | 203 | ||||
-rw-r--r-- | src/crypto/modes/cfb.c | 230 | ||||
-rw-r--r-- | src/crypto/modes/ctr.c | 219 | ||||
-rw-r--r-- | src/crypto/modes/gcm.c | 1245 | ||||
-rw-r--r-- | src/crypto/modes/gcm_test.c | 435 | ||||
-rw-r--r-- | src/crypto/modes/internal.h | 199 | ||||
-rw-r--r-- | src/crypto/modes/ofb.c | 107 |
13 files changed, 7646 insertions, 0 deletions
diff --git a/src/crypto/modes/CMakeLists.txt b/src/crypto/modes/CMakeLists.txt new file mode 100644 index 0000000..d50e97b --- /dev/null +++ b/src/crypto/modes/CMakeLists.txt @@ -0,0 +1,63 @@ +include_directories(. .. ../../include) + +if (${ARCH} STREQUAL "x86_64") + set( + MODES_ARCH_SOURCES + + aesni-gcm-x86_64.${ASM_EXT} + ghash-x86_64.${ASM_EXT} + ) +endif() + +if (${ARCH} STREQUAL "x86") + set( + MODES_ARCH_SOURCES + + ghash-x86.${ASM_EXT} + ) +endif() + +if (${ARCH} STREQUAL "arm") + set( + MODES_ARCH_SOURCES + + ghash-armv4.${ASM_EXT} + ghashv8-armx.${ASM_EXT} + ) +endif() + +if (${ARCH} STREQUAL "aarch64") + set( + MODES_ARCH_SOURCES + + ghashv8-armx.${ASM_EXT} + ) +endif() + +add_library( + modes + + OBJECT + + cbc.c + ctr.c + ofb.c + cfb.c + gcm.c + + ${MODES_ARCH_SOURCES} +) + +perlasm(aesni-gcm-x86_64.${ASM_EXT} asm/aesni-gcm-x86_64.pl) +perlasm(ghash-x86_64.${ASM_EXT} asm/ghash-x86_64.pl) +perlasm(ghash-x86.${ASM_EXT} asm/ghash-x86.pl) +perlasm(ghash-armv4.${ASM_EXT} asm/ghash-armv4.pl) +perlasm(ghashv8-armx.${ASM_EXT} asm/ghashv8-armx.pl) + +add_executable( + gcm_test + + gcm_test.c +) + +target_link_libraries(gcm_test crypto) diff --git a/src/crypto/modes/asm/aesni-gcm-x86_64.pl b/src/crypto/modes/asm/aesni-gcm-x86_64.pl new file mode 100644 index 0000000..7e4e04e --- /dev/null +++ b/src/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -0,0 +1,1057 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. +# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, and 0.74 - +# on Broadwell. [Mentioned results are raw profiled measurements for +# favourable packet size, one divisible by 96. Applications using the +# EVP interface will observe a few percent worse performance.] +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +if ($avx>1) {{{ + +($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); + +($Ii,$T1,$T2,$Hkey, + $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); + +($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); + +($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15"); + +$code=<<___; +.text + +.type _aesni_ctr32_ghash_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_ghash_6x: + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + sub \$6,$len + vpxor $Z0,$Z0,$Z0 # $Z0 = 0 + vmovdqu 0x00-0x80($key),$rndkey + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpaddb $T2,$inout2,$inout3 + vpaddb $T2,$inout3,$inout4 + vpaddb $T2,$inout4,$inout5 + vpxor $rndkey,$T1,$inout0 + vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 + jmp .Loop6x + +.align 32 +.Loop6x: + add \$`6<<24`,$counter + jc .Lhandle_ctr32 # discard $inout[1-5]? + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpaddb $T2,$inout5,$T1 # next counter value + vpxor $rndkey,$inout1,$inout1 + vpxor $rndkey,$inout2,$inout2 + +.Lresume_ctr32: + vmovdqu $T1,($ivp) # save next counter value + vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 + vpxor $rndkey,$inout3,$inout3 + vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey + vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 + xor %r12,%r12 + cmp $in0,$end0 + + vaesenc $T2,$inout0,$inout0 + vmovdqu 0x30+8(%rsp),$Ii # I[4] + vpxor $rndkey,$inout4,$inout4 + vpclmulqdq \$0x00,$Hkey,$Z3,$T1 + vaesenc $T2,$inout1,$inout1 + vpxor $rndkey,$inout5,$inout5 + setnc %r12b + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vaesenc $T2,$inout2,$inout2 + vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2 + neg %r12 + vaesenc $T2,$inout3,$inout3 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 + vpxor $Z0,$Xi,$Xi # modulo-scheduled + vaesenc $T2,$inout4,$inout4 + vpxor $Z1,$T1,$Z0 + and \$0x60,%r12 + vmovups 0x20-0x80($key),$rndkey + vpclmulqdq \$0x10,$Hkey,$Ii,$T1 + vaesenc $T2,$inout5,$inout5 + + vpclmulqdq \$0x01,$Hkey,$Ii,$T2 + lea ($in0,%r12),$in0 + vaesenc $rndkey,$inout0,$inout0 + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] + vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey + vmovdqu 0x40+8(%rsp),$Ii # I[3] + vaesenc $rndkey,$inout1,$inout1 + movbe 0x58($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x50($in0),%r12 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x20+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x28+8(%rsp) + vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x30-0x80($key),$rndkey + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x00,$Z1,$Ii,$T1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x10,$Z1,$Ii,$T2 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Hkey,$Z3,$Z3 + vpclmulqdq \$0x01,$Z1,$Ii,$Hkey + vaesenc $rndkey,$inout2,$inout2 + vpclmulqdq \$0x11,$Z1,$Ii,$Z1 + vmovdqu 0x50+8(%rsp),$Ii # I[2] + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vpxor $T1,$Z0,$Z0 + vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x40-0x80($key),$rndkey + vpxor $T2,$Z2,$Z2 + vpclmulqdq \$0x00,$T1,$Ii,$T2 + vaesenc $rndkey,$inout0,$inout0 + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x10,$T1,$Ii,$Hkey + vaesenc $rndkey,$inout1,$inout1 + movbe 0x48($in0),%r13 + vpxor $Z1,$Z3,$Z3 + vpclmulqdq \$0x01,$T1,$Ii,$Z1 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x40($in0),%r12 + vpclmulqdq \$0x11,$T1,$Ii,$T1 + vmovdqu 0x60+8(%rsp),$Ii # I[1] + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x30+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x38+8(%rsp) + vpxor $T2,$Z0,$Z0 + vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x50-0x80($key),$rndkey + vpxor $Hkey,$Z2,$Z2 + vpclmulqdq \$0x00,$T2,$Ii,$Hkey + vaesenc $rndkey,$inout0,$inout0 + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$T2,$Ii,$Z1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x38($in0),%r13 + vpxor $T1,$Z3,$Z3 + vpclmulqdq \$0x01,$T2,$Ii,$T1 + vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] + vaesenc $rndkey,$inout2,$inout2 + movbe 0x30($in0),%r12 + vpclmulqdq \$0x11,$T2,$Ii,$T2 + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x40+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x48+8(%rsp) + vpxor $Hkey,$Z0,$Z0 + vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6 + vaesenc $rndkey,$inout5,$inout5 + + vmovups 0x60-0x80($key),$rndkey + vpxor $Z1,$Z2,$Z2 + vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 + vaesenc $rndkey,$inout0,$inout0 + vpxor $T1,$Z2,$Z2 + vpclmulqdq \$0x01,$Hkey,$Xi,$T1 + vaesenc $rndkey,$inout1,$inout1 + movbe 0x28($in0),%r13 + vpxor $T2,$Z3,$Z3 + vpclmulqdq \$0x00,$Hkey,$Xi,$T2 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x20($in0),%r12 + vpclmulqdq \$0x11,$Hkey,$Xi,$Xi + vaesenc $rndkey,$inout3,$inout3 + mov %r13,0x50+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + mov %r12,0x58+8(%rsp) + vpxor $Z1,$Z2,$Z2 + vaesenc $rndkey,$inout5,$inout5 + vpxor $T1,$Z2,$Z2 + + vmovups 0x70-0x80($key),$rndkey + vpslldq \$8,$Z2,$Z1 + vpxor $T2,$Z0,$Z0 + vmovdqu 0x10($const),$Hkey # .Lpoly + + vaesenc $rndkey,$inout0,$inout0 + vpxor $Xi,$Z3,$Z3 + vaesenc $rndkey,$inout1,$inout1 + vpxor $Z1,$Z0,$Z0 + movbe 0x18($in0),%r13 + vaesenc $rndkey,$inout2,$inout2 + movbe 0x10($in0),%r12 + vpalignr \$8,$Z0,$Z0,$Ii # 1st phase + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + mov %r13,0x60+8(%rsp) + vaesenc $rndkey,$inout3,$inout3 + mov %r12,0x68+8(%rsp) + vaesenc $rndkey,$inout4,$inout4 + vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vmovups 0x90-0x80($key),$rndkey + vaesenc $T1,$inout1,$inout1 + vpsrldq \$8,$Z2,$Z2 + vaesenc $T1,$inout2,$inout2 + vpxor $Z2,$Z3,$Z3 + vaesenc $T1,$inout3,$inout3 + vpxor $Ii,$Z0,$Z0 + movbe 0x08($in0),%r13 + vaesenc $T1,$inout4,$inout4 + movbe 0x00($in0),%r12 + vaesenc $T1,$inout5,$inout5 + vmovups 0xa0-0x80($key),$T1 + cmp \$11,$rounds + jb .Lenc_tail # 128-bit key + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xb0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xc0-0x80($key),$T1 + je .Lenc_tail # 192-bit key + + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + + vaesenc $T1,$inout0,$inout0 + vaesenc $T1,$inout1,$inout1 + vaesenc $T1,$inout2,$inout2 + vaesenc $T1,$inout3,$inout3 + vaesenc $T1,$inout4,$inout4 + vmovups 0xd0-0x80($key),$rndkey + vaesenc $T1,$inout5,$inout5 + vmovups 0xe0-0x80($key),$T1 + jmp .Lenc_tail # 256-bit key + +.align 32 +.Lhandle_ctr32: + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd $Z1,$Z2,$inout2 + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $rndkey,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $rndkey,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpshufb $Ii,$inout5,$inout5 + vpshufb $Ii,$T1,$T1 # next counter value + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc $rndkey,$inout0,$inout0 + vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi + vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase + vaesenc $rndkey,$inout1,$inout1 + vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 + vpxor 0x00($inp),$T1,$T2 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x10($inp),$T1,$Ii + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x20($inp),$T1,$Z1 + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x30($inp),$T1,$Z2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x40($inp),$T1,$Z3 + vpxor 0x50($inp),$T1,$Hkey + vmovdqu ($ivp),$T1 # load next counter value + + vaesenclast $T2,$inout0,$inout0 + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + vaesenclast $Ii,$inout1,$inout1 + vpaddb $T2,$T1,$Ii + mov %r13,0x70+8(%rsp) + lea 0x60($inp),$inp + vaesenclast $Z1,$inout2,$inout2 + vpaddb $T2,$Ii,$Z1 + mov %r12,0x78+8(%rsp) + lea 0x60($out),$out + vmovdqu 0x00-0x80($key),$rndkey + vaesenclast $Z2,$inout3,$inout3 + vpaddb $T2,$Z1,$Z2 + vaesenclast $Z3, $inout4,$inout4 + vpaddb $T2,$Z2,$Z3 + vaesenclast $Hkey,$inout5,$inout5 + vpaddb $T2,$Z3,$Hkey + + add \$0x60,$ret + sub \$0x6,$len + jc .L6x_done + + vmovups $inout0,-0x60($out) # save output + vpxor $rndkey,$T1,$inout0 + vmovups $inout1,-0x50($out) + vmovdqa $Ii,$inout1 # 0 latency + vmovups $inout2,-0x40($out) + vmovdqa $Z1,$inout2 # 0 latency + vmovups $inout3,-0x30($out) + vmovdqa $Z2,$inout3 # 0 latency + vmovups $inout4,-0x20($out) + vmovdqa $Z3,$inout4 # 0 latency + vmovups $inout5,-0x10($out) + vmovdqa $Hkey,$inout5 # 0 latency + vmovdqu 0x20+8(%rsp),$Z3 # I[5] + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled + vpxor $Z0,$Xi,$Xi # modulo-scheduled + + ret +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +___ +###################################################################### +# +# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, +# const AES_KEY *key, unsigned char iv[16], +# struct { u128 Xi,H,Htbl[9]; } *Xip); +$code.=<<___; +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@function,6 +.align 32 +aesni_gcm_decrypt: + xor $ret,$ret + cmp \$0x60,$len # minimal accepted length + jb .Lgcm_dec_abort + + lea (%rsp),%rax # save stack pointer + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,-0xd8(%rax) + movaps %xmm7,-0xc8(%rax) + movaps %xmm8,-0xb8(%rax) + movaps %xmm9,-0xa8(%rax) + movaps %xmm10,-0x98(%rax) + movaps %xmm11,-0x88(%rax) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +.Lgcm_dec_body: +___ +$code.=<<___; + vzeroupper + + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + vmovdqu ($Xip),$Xi # load Xi + and \$-128,%rsp # ensure stack alignment + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + lea 0x80($key),$key # size optimization + lea 0x20+0x20($Xip),$Xip # size optimization + mov 0xf0-0x80($key),$rounds + vpshufb $Ii,$Xi,$Xi + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Ldec_no_key_aliasing + cmp \$768,$end0 + jnc .Ldec_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Ldec_no_key_aliasing: + + vmovdqu 0x50($inp),$Z3 # I[5] + lea ($inp),$in0 + vmovdqu 0x40($inp),$Z0 + lea -0xc0($inp,$len),$end0 + vmovdqu 0x30($inp),$Z1 + shr \$4,$len + xor $ret,$ret + vmovdqu 0x20($inp),$Z2 + vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu 0x10($inp),$T2 + vpshufb $Ii,$Z0,$Z0 + vmovdqu ($inp),$Hkey + vpshufb $Ii,$Z1,$Z1 + vmovdqu $Z0,0x30(%rsp) + vpshufb $Ii,$Z2,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$T2,$T2 + vmovdqu $Z2,0x50(%rsp) + vpshufb $Ii,$Hkey,$Hkey + vmovdqu $T2,0x60(%rsp) + vmovdqu $Hkey,0x70(%rsp) + + call _aesni_ctr32_ghash_6x + + vmovups $inout0,-0x60($out) # save output + vmovups $inout1,-0x50($out) + vmovups $inout2,-0x40($out) + vmovups $inout3,-0x30($out) + vmovups $inout4,-0x20($out) + vmovups $inout5,-0x10($out) + + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,-0x40($Xip) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xd8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp # restore %rsp +.Lgcm_dec_abort: + mov $ret,%rax # return value + ret +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ + +$code.=<<___; +.type _aesni_ctr32_6x,\@abi-omnipotent +.align 32 +_aesni_ctr32_6x: + vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey + vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb + lea -1($rounds),%r13 + vmovups 0x10-0x80($key),$rndkey + lea 0x20-0x80($key),%r12 + vpxor $Z0,$T1,$inout0 + add \$`6<<24`,$counter + jc .Lhandle_ctr32_2 + vpaddb $T2,$T1,$inout1 + vpaddb $T2,$inout1,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddb $T2,$inout2,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddb $T2,$inout3,$inout4 + vpxor $Z0,$inout3,$inout3 + vpaddb $T2,$inout4,$inout5 + vpxor $Z0,$inout4,$inout4 + vpaddb $T2,$inout5,$T1 + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc $rndkey,$inout0,$inout0 + vaesenc $rndkey,$inout1,$inout1 + vaesenc $rndkey,$inout2,$inout2 + vaesenc $rndkey,$inout3,$inout3 + vaesenc $rndkey,$inout4,$inout4 + vaesenc $rndkey,$inout5,$inout5 + vmovups (%r12),$rndkey + lea 0x10(%r12),%r12 + dec %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),$Hkey # last round key + vaesenc $rndkey,$inout0,$inout0 + vpxor 0x00($inp),$Hkey,$Z0 + vaesenc $rndkey,$inout1,$inout1 + vpxor 0x10($inp),$Hkey,$Z1 + vaesenc $rndkey,$inout2,$inout2 + vpxor 0x20($inp),$Hkey,$Z2 + vaesenc $rndkey,$inout3,$inout3 + vpxor 0x30($inp),$Hkey,$Xi + vaesenc $rndkey,$inout4,$inout4 + vpxor 0x40($inp),$Hkey,$T2 + vaesenc $rndkey,$inout5,$inout5 + vpxor 0x50($inp),$Hkey,$Hkey + lea 0x60($inp),$inp + + vaesenclast $Z0,$inout0,$inout0 + vaesenclast $Z1,$inout1,$inout1 + vaesenclast $Z2,$inout2,$inout2 + vaesenclast $Xi,$inout3,$inout3 + vaesenclast $T2,$inout4,$inout4 + vaesenclast $Hkey,$inout5,$inout5 + vmovups $inout0,0x00($out) + vmovups $inout1,0x10($out) + vmovups $inout2,0x20($out) + vmovups $inout3,0x30($out) + vmovups $inout4,0x40($out) + vmovups $inout5,0x50($out) + lea 0x60($out),$out + + ret +.align 32 +.Lhandle_ctr32_2: + vpshufb $Ii,$T1,$Z2 # byte-swap counter + vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb + vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb + vpaddd $Z1,$Z2,$inout2 + vpaddd $Z1,$inout1,$inout3 + vpshufb $Ii,$inout1,$inout1 + vpaddd $Z1,$inout2,$inout4 + vpshufb $Ii,$inout2,$inout2 + vpxor $Z0,$inout1,$inout1 + vpaddd $Z1,$inout3,$inout5 + vpshufb $Ii,$inout3,$inout3 + vpxor $Z0,$inout2,$inout2 + vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value + vpshufb $Ii,$inout4,$inout4 + vpxor $Z0,$inout3,$inout3 + vpshufb $Ii,$inout5,$inout5 + vpxor $Z0,$inout4,$inout4 + vpshufb $Ii,$T1,$T1 # next counter value + vpxor $Z0,$inout5,$inout5 + jmp .Loop_ctr32 +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@function,6 +.align 32 +aesni_gcm_encrypt: + xor $ret,$ret + cmp \$0x60*3,$len # minimal accepted length + jb .Lgcm_enc_abort + + lea (%rsp),%rax # save stack pointer + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,-0xd8(%rax) + movaps %xmm7,-0xc8(%rax) + movaps %xmm8,-0xb8(%rax) + movaps %xmm9,-0xa8(%rax) + movaps %xmm10,-0x98(%rax) + movaps %xmm11,-0x88(%rax) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +.Lgcm_enc_body: +___ +$code.=<<___; + vzeroupper + + vmovdqu ($ivp),$T1 # input counter value + add \$-128,%rsp + mov 12($ivp),$counter + lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 + lea 0x80($key),$key # size optimization + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + and \$-128,%rsp # ensure stack alignment + mov 0xf0-0x80($key),$rounds + + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Lenc_no_key_aliasing + cmp \$768,$end0 + jnc .Lenc_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Lenc_no_key_aliasing: + + lea ($out),$in0 + lea -0xc0($out,$len),$end0 + shr \$4,$len + + call _aesni_ctr32_6x + vpshufb $Ii,$inout0,$Xi # save bswapped output on stack + vpshufb $Ii,$inout1,$T2 + vmovdqu $Xi,0x70(%rsp) + vpshufb $Ii,$inout2,$Z0 + vmovdqu $T2,0x60(%rsp) + vpshufb $Ii,$inout3,$Z1 + vmovdqu $Z0,0x50(%rsp) + vpshufb $Ii,$inout4,$Z2 + vmovdqu $Z1,0x40(%rsp) + vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x + vmovdqu $Z2,0x30(%rsp) + + call _aesni_ctr32_6x + + vmovdqu ($Xip),$Xi # load Xi + lea 0x20+0x20($Xip),$Xip # size optimization + sub \$12,$len + mov \$0x60*2,$ret + vpshufb $Ii,$Xi,$Xi + + call _aesni_ctr32_ghash_6x + vmovdqu 0x20(%rsp),$Z3 # I[5] + vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpunpckhqdq $Z3,$Z3,$T1 + vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK + vmovups $inout0,-0x60($out) # save output + vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy + vpxor $Z3,$T1,$T1 + vmovups $inout1,-0x50($out) + vpshufb $Ii,$inout1,$inout1 + vmovups $inout2,-0x40($out) + vpshufb $Ii,$inout2,$inout2 + vmovups $inout3,-0x30($out) + vpshufb $Ii,$inout3,$inout3 + vmovups $inout4,-0x20($out) + vpshufb $Ii,$inout4,$inout4 + vmovups $inout5,-0x10($out) + vpshufb $Ii,$inout5,$inout5 + vmovdqu $inout0,0x10(%rsp) # free $inout0 +___ +{ my ($HK,$T3)=($rndkey,$inout0); + +$code.=<<___; + vmovdqu 0x30(%rsp),$Z2 # I[4] + vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 + vpunpckhqdq $Z2,$Z2,$T2 + vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 + vpxor $Z2,$T2,$T2 + vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + + vmovdqu 0x40(%rsp),$T3 # I[3] + vpclmulqdq \$0x00,$Ii,$Z2,$Z0 + vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $T3,$T3,$Z1 + vpclmulqdq \$0x11,$Ii,$Z2,$Z2 + vpxor $T3,$Z1,$Z1 + vpxor $Z3,$Z2,$Z2 + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Xip),$HK + vpxor $T1,$T2,$T2 + + vmovdqu 0x50(%rsp),$T1 # I[2] + vpclmulqdq \$0x00,$Hkey,$T3,$Z3 + vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z0,$Z3,$Z3 + vpunpckhqdq $T1,$T1,$Z0 + vpclmulqdq \$0x11,$Hkey,$T3,$T3 + vpxor $T1,$Z0,$Z0 + vpxor $Z2,$T3,$T3 + vpclmulqdq \$0x00,$HK,$Z1,$Z1 + vpxor $T2,$Z1,$Z1 + + vmovdqu 0x60(%rsp),$T2 # I[1] + vpclmulqdq \$0x00,$Ii,$T1,$Z2 + vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 + vpxor $Z3,$Z2,$Z2 + vpunpckhqdq $T2,$T2,$Z3 + vpclmulqdq \$0x11,$Ii,$T1,$T1 + vpxor $T2,$Z3,$Z3 + vpxor $T3,$T1,$T1 + vpclmulqdq \$0x10,$HK,$Z0,$Z0 + vmovdqu 0x80-0x20($Xip),$HK + vpxor $Z1,$Z0,$Z0 + + vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] + vpclmulqdq \$0x00,$Hkey,$T2,$Z1 + vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 + vpunpckhqdq $Xi,$Xi,$T3 + vpxor $Z2,$Z1,$Z1 + vpclmulqdq \$0x11,$Hkey,$T2,$T2 + vpxor $Xi,$T3,$T3 + vpxor $T1,$T2,$T2 + vpclmulqdq \$0x00,$HK,$Z3,$Z3 + vpxor $Z0,$Z3,$Z0 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z2 + vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 + vpunpckhqdq $inout5,$inout5,$T1 + vpclmulqdq \$0x11,$Ii,$Xi,$Xi + vpxor $inout5,$T1,$T1 + vpxor $Z1,$Z2,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$T3 + vmovdqu 0x20-0x20($Xip),$HK + vpxor $T2,$Xi,$Z3 + vpxor $Z0,$T3,$Z2 + + vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 + vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 + vpxor $T3,$Z2,$Z2 + vpunpckhqdq $inout4,$inout4,$T2 + vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 + vpxor $inout4,$T2,$T2 + vpslldq \$8,$Z2,$T3 + vpclmulqdq \$0x00,$HK,$T1,$T1 + vpxor $T3,$Z1,$Xi + vpsrldq \$8,$Z2,$Z2 + vpxor $Z2,$Z3,$Z3 + + vpclmulqdq \$0x00,$Ii,$inout4,$Z1 + vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout3,$inout3,$T3 + vpclmulqdq \$0x11,$Ii,$inout4,$inout4 + vpxor $inout3,$T3,$T3 + vpxor $inout5,$inout4,$inout4 + vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase + vpclmulqdq \$0x10,$HK,$T2,$T2 + vmovdqu 0x50-0x20($Xip),$HK + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 + vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $inout2,$inout2,$T1 + vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 + vpxor $inout2,$T1,$T1 + vpxor $inout4,$inout3,$inout3 + vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 + vpclmulqdq \$0x00,$HK,$T3,$T3 + vpxor $T2,$T3,$T3 + + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Ii,$inout2,$Z1 + vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 + vpxor $Z0,$Z1,$Z1 + vpunpckhqdq $inout1,$inout1,$T2 + vpclmulqdq \$0x11,$Ii,$inout2,$inout2 + vpxor $inout1,$T2,$T2 + vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase + vpxor $inout3,$inout2,$inout2 + vpclmulqdq \$0x10,$HK,$T1,$T1 + vmovdqu 0x80-0x20($Xip),$HK + vpxor $T3,$T1,$T1 + + vxorps $Z3,$inout5,$inout5 + vpclmulqdq \$0x10,0x10($const),$Xi,$Xi + vxorps $inout5,$Xi,$Xi + + vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 + vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 + vpxor $Z1,$Z0,$Z0 + vpunpckhqdq $Xi,$Xi,$T3 + vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 + vpxor $Xi,$T3,$T3 + vpxor $inout2,$inout1,$inout1 + vpclmulqdq \$0x00,$HK,$T2,$T2 + vpxor $T1,$T2,$T2 + + vpclmulqdq \$0x00,$Ii,$Xi,$Z1 + vpclmulqdq \$0x11,$Ii,$Xi,$Z3 + vpxor $Z0,$Z1,$Z1 + vpclmulqdq \$0x10,$HK,$T3,$Z2 + vpxor $inout1,$Z3,$Z3 + vpxor $T2,$Z2,$Z2 + + vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing + vpxor $Z0,$Z2,$Z2 + vpslldq \$8,$Z2,$T1 + vmovdqu 0x10($const),$Hkey # .Lpoly + vpsrldq \$8,$Z2,$Z2 + vpxor $T1,$Z1,$Xi + vpxor $Z2,$Z3,$Z3 + + vpalignr \$8,$Xi,$Xi,$T2 # 1st phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase + vpclmulqdq \$0x10,$Hkey,$Xi,$Xi + vpxor $Z3,$T2,$T2 + vpxor $T2,$Xi,$Xi +___ +} +$code.=<<___; + vpshufb ($const),$Xi,$Xi # .Lbswap_mask + vmovdqu $Xi,-0x40($Xip) # output Xi + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp # restore %rsp +.Lgcm_enc_abort: + mov $ret,%rax # return value + ret +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +___ + +$code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: + .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: + .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 64 +___ +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___ +.extern __imp_RtlVirtualUnwind +.type gcm_se_handler,\@abi-omnipotent +.align 16 +gcm_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 120($context),%rax # pull context->Rax + + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + mov %r15,240($context) + mov %r14,232($context) + mov %r13,224($context) + mov %r12,216($context) + mov %rbp,160($context) + mov %rbx,144($context) + + lea -0xd8(%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size gcm_se_handler,.-gcm_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_aesni_gcm_decrypt + .rva .LSEH_end_aesni_gcm_decrypt + .rva .LSEH_gcm_dec_info + + .rva .LSEH_begin_aesni_gcm_encrypt + .rva .LSEH_end_aesni_gcm_encrypt + .rva .LSEH_gcm_enc_info +.section .xdata +.align 8 +.LSEH_gcm_dec_info: + .byte 9,0,0,0 + .rva gcm_se_handler + .rva .Lgcm_dec_body,.Lgcm_dec_abort +.LSEH_gcm_enc_info: + .byte 9,0,0,0 + .rva gcm_se_handler + .rva .Lgcm_enc_body,.Lgcm_enc_abort +___ +} +}}} else {{{ +$code=<<___; # assembler is too old +.text + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,\@abi-omnipotent +aesni_gcm_encrypt: + xor %eax,%eax + ret +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,\@abi-omnipotent +aesni_gcm_decrypt: + xor %eax,%eax + ret +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +___ +}}} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/src/crypto/modes/asm/ghash-armv4.pl b/src/crypto/modes/asm/ghash-armv4.pl new file mode 100644 index 0000000..b324641 --- /dev/null +++ b/src/crypto/modes/asm/ghash-armv4.pl @@ -0,0 +1,501 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+32 bytes shared table]. There is no +# experimental performance data available yet. The only approximation +# that can be made at this point is based on code size. Inner loop is +# 32 instructions long and on single-issue core should execute in <40 +# cycles. Having verified that gcc 3.4 didn't unroll corresponding +# loop, this assembler loop body was found to be ~3x smaller than +# compiler-generated one... +# +# July 2010 +# +# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on +# Cortex A8 core and ~25 cycles per processed byte (which was observed +# to be ~3 times faster than gcc-generated code:-) +# +# February 2011 +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Cortex A8 core and ~23.5 cycles per byte. +# +# March 2011 +# +# Add NEON implementation featuring polynomial multiplication, i.e. no +# lookup tables involved. On Cortex A8 it was measured to process one +# byte in 15 cycles or 55% faster than integer-only code. +# +# April 2014 +# +# Switch to multiplication algorithm suggested in paper referred +# below and combine it with reduction algorithm from x86 module. +# Performance improvement over previous version varies from 65% on +# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8 +# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 - +# in 9.33. +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf + +# ==================================================================== +# Note about "528B" variant. In ARM case it makes lesser sense to +# implement it for following reasons: +# +# - performance improvement won't be anywhere near 50%, because 128- +# bit shift operation is neatly fused with 128-bit xor here, and +# "538B" variant would eliminate only 4-5 instructions out of 32 +# in the inner loop (meaning that estimated improvement is ~15%); +# - ARM-based systems are often embedded ones and extra memory +# consumption might be unappreciated (for so little improvement); +# +# Byte order [in]dependence. ========================================= +# +# Caller is expected to maintain specific *dword* order in Htable, +# namely with *least* significant dword of 128-bit value at *lower* +# address. This differs completely from C code and has everything to +# do with ldm instruction and order in which dwords are "consumed" by +# algorithm. *Byte* order within these dwords in turn is whatever +# *native* byte order on current platform. See gcm128.c for working +# example... + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$Xi="r0"; # argument block +$Htbl="r1"; +$inp="r2"; +$len="r3"; + +$Zll="r4"; # variables +$Zlh="r5"; +$Zhl="r6"; +$Zhh="r7"; +$Tll="r8"; +$Tlh="r9"; +$Thl="r10"; +$Thh="r11"; +$nlo="r12"; +################# r13 is stack pointer +$nhi="r14"; +################# r15 is program counter + +$rem_4bit=$inp; # used in gcm_gmult_4bit +$cnt=$len; + +sub Zsmash() { + my $i=12; + my @args=@_; + for ($Zll,$Zlh,$Zhl,$Zhh) { + $code.=<<___; +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev $_,$_ + str $_,[$Xi,#$i] +#elif defined(__ARMEB__) + str $_,[$Xi,#$i] +#else + mov $Tlh,$_,lsr#8 + strb $_,[$Xi,#$i+3] + mov $Thl,$_,lsr#16 + strb $Tlh,[$Xi,#$i+2] + mov $Thh,$_,lsr#24 + strb $Thl,[$Xi,#$i+1] + strb $Thh,[$Xi,#$i] +#endif +___ + $code.="\t".shift(@args)."\n"; + $i-=4; + } +} + +$code=<<___; +#if defined(__arm__) +#include "arm_arch.h" + +.syntax unified + +.text +.code 32 + +.type rem_4bit,%object +.align 5 +rem_4bit: +.short 0x0000,0x1C20,0x3840,0x2460 +.short 0x7080,0x6CA0,0x48C0,0x54E0 +.short 0xE100,0xFD20,0xD940,0xC560 +.short 0x9180,0x8DA0,0xA9C0,0xB5E0 +.size rem_4bit,.-rem_4bit + +.type rem_4bit_get,%function +rem_4bit_get: + sub $rem_4bit,pc,#8 + sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit + b .Lrem_4bit_got + nop +.size rem_4bit_get,.-rem_4bit_get + +.global gcm_ghash_4bit +.hidden gcm_ghash_4bit +.type gcm_ghash_4bit,%function +gcm_ghash_4bit: + sub r12,pc,#8 + add $len,$inp,$len @ $len to point at the end + stmdb sp!,{r3-r11,lr} @ save $len/end too + sub r12,r12,#48 @ &rem_4bit + + ldmia r12,{r4-r11} @ copy rem_4bit ... + stmdb sp!,{r4-r11} @ ... to stack + + ldrb $nlo,[$inp,#15] + ldrb $nhi,[$Xi,#15] +.Louter: + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + add $Thh,$Htbl,$nhi + ldrb $nlo,[$inp,#14] + + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + ldrb $nhi,[$Xi,#14] + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 + +.Linner: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 + ldrbpl $nlo,[$inp,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + ldrbpl $Tll,[$Xi,$cnt] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tlh,[sp,$nhi] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eorpl $nlo,$nlo,$Tll + eor $Zhh,$Thh,$Zhh,lsr#4 + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] + bpl .Linner + + ldr $len,[sp,#32] @ re-load $len/end + add $inp,$inp,#16 + mov $nhi,$Zll +___ + &Zsmash("cmp\t$inp,$len","ldrbne\t$nlo,[$inp,#15]"); +$code.=<<___; + bne .Louter + + add sp,sp,#36 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_ghash_4bit,.-gcm_ghash_4bit + +.global gcm_gmult_4bit +.hidden gcm_gmult_4bit +.type gcm_gmult_4bit,%function +gcm_gmult_4bit: + stmdb sp!,{r4-r11,lr} + ldrb $nlo,[$Xi,#15] + b rem_4bit_get +.Lrem_4bit_got: + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + ldrb $nlo,[$Xi,#14] + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + and $nhi,$nlo,#0xf0 + eor $Zhh,$Zhh,$Tll,lsl#16 + and $nlo,$nlo,#0x0f + +.Loop: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 + ldrbpl $nlo,[$Xi,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + bpl .Loop +___ + &Zsmash(); +$code.=<<___; +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3)=map("q$_",(8..12)); +my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31)); + +sub clmul64x64 { +my ($r,$a,$b)=@_; +$code.=<<___; + vext.8 $t0#lo, $a, $a, #1 @ A1 + vmull.p8 $t0, $t0#lo, $b @ F = A1*B + vext.8 $r#lo, $b, $b, #1 @ B1 + vmull.p8 $r, $a, $r#lo @ E = A*B1 + vext.8 $t1#lo, $a, $a, #2 @ A2 + vmull.p8 $t1, $t1#lo, $b @ H = A2*B + vext.8 $t3#lo, $b, $b, #2 @ B2 + vmull.p8 $t3, $a, $t3#lo @ G = A*B2 + vext.8 $t2#lo, $a, $a, #3 @ A3 + veor $t0, $t0, $r @ L = E + F + vmull.p8 $t2, $t2#lo, $b @ J = A3*B + vext.8 $r#lo, $b, $b, #3 @ B3 + veor $t1, $t1, $t3 @ M = G + H + vmull.p8 $r, $a, $r#lo @ I = A*B3 + veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8 + vand $t0#hi, $t0#hi, $k48 + vext.8 $t3#lo, $b, $b, #4 @ B4 + veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16 + vand $t1#hi, $t1#hi, $k32 + vmull.p8 $t3, $a, $t3#lo @ K = A*B4 + veor $t2, $t2, $r @ N = I + J + veor $t0#lo, $t0#lo, $t0#hi + veor $t1#lo, $t1#lo, $t1#hi + veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24 + vand $t2#hi, $t2#hi, $k16 + vext.8 $t0, $t0, $t0, #15 + veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32 + vmov.i64 $t3#hi, #0 + vext.8 $t1, $t1, $t1, #14 + veor $t2#lo, $t2#lo, $t2#hi + vmull.p8 $r, $a, $b @ D = A*B + vext.8 $t3, $t3, $t3, #12 + vext.8 $t2, $t2, $t2, #13 + veor $t0, $t0, $t1 + veor $t2, $t2, $t3 + veor $r, $r, $t0 + veor $r, $r, $t2 +___ +} + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.global gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 $IN#hi,[r1,:64]! @ load H + vmov.i8 $t0,#0xe1 + vld1.64 $IN#lo,[r1,:64] + vshl.i64 $t0#hi,#57 + vshr.u64 $t0#lo,#63 @ t0=0xc2....01 + vdup.8 $t1,$IN#hi[7] + vshr.u64 $Hlo,$IN#lo,#63 + vshr.s8 $t1,#7 @ broadcast carry bit + vshl.i64 $IN,$IN,#1 + vand $t0,$t0,$t1 + vorr $IN#hi,$Hlo @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vstmia r0,{$IN} + + bx lr +.size gcm_init_neon,.-gcm_init_neon + +.global gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + vld1.64 $IN#hi,[$Xi,:64]! @ load Xi + vld1.64 $IN#lo,[$Xi,:64]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + mov $len,#16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi + vld1.64 $Xl#lo,[$Xi,:64]! + vmov.i64 $k48,#0x0000ffffffffffff + vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H + vmov.i64 $k32,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 $Xl,$Xl +#endif + vmov.i64 $k16,#0x000000000000ffff + veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 $IN#hi,[$inp]! @ load inp + vld1.64 $IN#lo,[$inp]! +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $IN,$Xl @ inp^=Xi +.Lgmult_neon: +___ + &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo +$code.=<<___; + veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing +___ + &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi) + &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi +$code.=<<___; + veor $Xm,$Xm,$Xl @ Karatsuba post-processing + veor $Xm,$Xm,$Xh + veor $Xl#hi,$Xl#hi,$Xm#lo + veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 $t1,$Xl,#57 @ 1st phase + vshl.i64 $t2,$Xl,#62 + veor $t2,$t2,$t1 @ + vshl.i64 $t1,$Xl,#63 + veor $t2, $t2, $t1 @ + veor $Xl#hi,$Xl#hi,$t2#lo @ + veor $Xh#lo,$Xh#lo,$t2#hi + + vshr.u64 $t2,$Xl,#1 @ 2nd phase + veor $Xh,$Xh,$Xl + veor $Xl,$Xl,$t2 @ + vshr.u64 $t2,$t2,#6 + vshr.u64 $Xl,$Xl,#1 @ + veor $Xl,$Xl,$Xh @ + veor $Xl,$Xl,$t2 @ + + subs $len,#16 + bne .Loop_neon + +#ifdef __ARMEL__ + vrev64.8 $Xl,$Xl +#endif + sub $Xi,#16 + vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi + vst1.64 $Xl#lo,[$Xi,:64] + + bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +___ +} +$code.=<<___; +.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 + +#endif +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/src/crypto/modes/asm/ghash-x86.pl b/src/crypto/modes/asm/ghash-x86.pl new file mode 100644 index 0000000..eb6d55e --- /dev/null +++ b/src/crypto/modes/asm/ghash-x86.pl @@ -0,0 +1,1393 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, May, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two +# code paths: vanilla x86 and vanilla SSE. Former will be executed on +# 486 and Pentium, latter on all others. SSE GHASH features so called +# "528B" variant of "4-bit" method utilizing additional 256+16 bytes +# of per-key storage [+512 bytes shared table]. Performance results +# are for streamed GHASH subroutine and are expressed in cycles per +# processed byte, less is better: +# +# gcc 2.95.3(*) SSE assembler x86 assembler +# +# Pentium 105/111(**) - 50 +# PIII 68 /75 12.2 24 +# P4 125/125 17.8 84(***) +# Opteron 66 /70 10.1 30 +# Core2 54 /67 8.4 18 +# Atom 105/105 16.8 53 +# VIA Nano 69 /71 13.0 27 +# +# (*) gcc 3.4.x was observed to generate few percent slower code, +# which is one of reasons why 2.95.3 results were chosen, +# another reason is lack of 3.4.x results for older CPUs; +# comparison with SSE results is not completely fair, because C +# results are for vanilla "256B" implementation, while +# assembler results are for "528B";-) +# (**) second number is result for code compiled with -fPIC flag, +# which is actually more relevant, because assembler code is +# position-independent; +# (***) see comment in non-MMX routine for further details; +# +# To summarize, it's >2-5 times faster than gcc-generated code. To +# anchor it to something else SHA1 assembler processes one byte in +# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE +# in particular, see comment at the end of the file... + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.10 cycles per processed byte. +# The question is how close is it to theoretical limit? The pclmulqdq +# instruction latency appears to be 14 cycles and there can't be more +# than 2 of them executing at any given time. This means that single +# Karatsuba multiplication would take 28 cycles *plus* few cycles for +# pre- and post-processing. Then multiplication has to be followed by +# modulo-reduction. Given that aggregated reduction method [see +# "Carry-less Multiplication and Its Usage for Computing the GCM Mode" +# white paper by Intel] allows you to perform reduction only once in +# a while we can assume that asymptotic performance can be estimated +# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction +# and Naggr is the aggregation factor. +# +# Before we proceed to this implementation let's have closer look at +# the best-performing code suggested by Intel in their white paper. +# By tracing inter-register dependencies Tmod is estimated as ~19 +# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per +# processed byte. As implied, this is quite optimistic estimate, +# because it does not account for Karatsuba pre- and post-processing, +# which for a single multiplication is ~5 cycles. Unfortunately Intel +# does not provide performance data for GHASH alone. But benchmarking +# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt +# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that +# the result accounts even for pre-computing of degrees of the hash +# key H, but its portion is negligible at 16KB buffer size. +# +# Moving on to the implementation in question. Tmod is estimated as +# ~13 cycles and Naggr is 2, giving asymptotic performance of ... +# 2.16. How is it possible that measured performance is better than +# optimistic theoretical estimate? There is one thing Intel failed +# to recognize. By serializing GHASH with CTR in same subroutine +# former's performance is really limited to above (Tmul + Tmod/Naggr) +# equation. But if GHASH procedure is detached, the modulo-reduction +# can be interleaved with Naggr-1 multiplications at instruction level +# and under ideal conditions even disappear from the equation. So that +# optimistic theoretical estimate for this implementation is ... +# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic, +# at least for such small Naggr. I'd argue that (28+Tproc/Naggr), +# where Tproc is time required for Karatsuba pre- and post-processing, +# is more realistic estimate. In this case it gives ... 1.91 cycles. +# Or in other words, depending on how well we can interleave reduction +# and one of the two multiplications the performance should be betwen +# 1.91 and 2.16. As already mentioned, this implementation processes +# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart +# - in 2.02. x86_64 performance is better, because larger register +# bank allows to interleave reduction and multiplication better. +# +# Does it make sense to increase Naggr? To start with it's virtually +# impossible in 32-bit mode, because of limited register bank +# capacity. Otherwise improvement has to be weighed agiainst slower +# setup, as well as code size and complexity increase. As even +# optimistic estimate doesn't promise 30% performance improvement, +# there are currently no plans to increase Naggr. +# +# Special thanks to David Woodhouse <dwmw2@infradead.org> for +# providing access to a Westmere-based system on behalf of Intel +# Open Source Technology Centre. + +# January 2010 +# +# Tweaked to optimize transitions between integer and FP operations +# on same XMM register, PCLMULQDQ subroutine was measured to process +# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. +# The minor regression on Westmere is outweighed by ~15% improvement +# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in +# similar manner resulted in almost 20% degradation on Sandy Bridge, +# where original 64-bit code processes one byte in 1.95 cycles. + +##################################################################### +# For reference, AMD Bulldozer processes one byte in 1.98 cycles in +# 32-bit mode and 1.89 in 64-bit. + +# February 2013 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9. Resulting performance is 1.96 cycles per byte on +# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); + +$sse2=1; +#for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); +$inp = "edi"; +$Htbl = "esi"; + +$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse + # than unrolled, which has to be weighted against + # 2.5x x86-specific code size reduction. + +sub x86_loop { + my $off = shift; + my $rem = "eax"; + + &mov ($Zhh,&DWP(4,$Htbl,$Zll)); + &mov ($Zhl,&DWP(0,$Htbl,$Zll)); + &mov ($Zlh,&DWP(12,$Htbl,$Zll)); + &mov ($Zll,&DWP(8,$Htbl,$Zll)); + &xor ($rem,$rem); # avoid partial register stalls on PIII + + # shrd practically kills P4, 2.5x deterioration, but P4 has + # MMX code-path to execute. shrd runs tad faster [than twice + # the shifts, move's and or's] on pre-MMX Pentium (as well as + # PIII and Core2), *but* minimizes code size, spares register + # and thus allows to fold the loop... + if (!$unroll) { + my $cnt = $inp; + &mov ($cnt,15); + &jmp (&label("x86_loop")); + &set_label("x86_loop",16); + for($i=1;$i<=2;$i++) { + &mov (&LB($rem),&LB($Zll)); + &shrd ($Zll,$Zlh,4); + &and (&LB($rem),0xf); + &shrd ($Zlh,$Zhl,4); + &shrd ($Zhl,$Zhh,4); + &shr ($Zhh,4); + &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); + + &mov (&LB($rem),&BP($off,"esp",$cnt)); + if ($i&1) { + &and (&LB($rem),0xf0); + } else { + &shl (&LB($rem),4); + } + + &xor ($Zll,&DWP(8,$Htbl,$rem)); + &xor ($Zlh,&DWP(12,$Htbl,$rem)); + &xor ($Zhl,&DWP(0,$Htbl,$rem)); + &xor ($Zhh,&DWP(4,$Htbl,$rem)); + + if ($i&1) { + &dec ($cnt); + &js (&label("x86_break")); + } else { + &jmp (&label("x86_loop")); + } + } + &set_label("x86_break",16); + } else { + for($i=1;$i<32;$i++) { + &comment($i); + &mov (&LB($rem),&LB($Zll)); + &shrd ($Zll,$Zlh,4); + &and (&LB($rem),0xf); + &shrd ($Zlh,$Zhl,4); + &shrd ($Zhl,$Zhh,4); + &shr ($Zhh,4); + &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); + + if ($i&1) { + &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); + &and (&LB($rem),0xf0); + } else { + &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); + &shl (&LB($rem),4); + } + + &xor ($Zll,&DWP(8,$Htbl,$rem)); + &xor ($Zlh,&DWP(12,$Htbl,$rem)); + &xor ($Zhl,&DWP(0,$Htbl,$rem)); + &xor ($Zhh,&DWP(4,$Htbl,$rem)); + } + } + &bswap ($Zll); + &bswap ($Zlh); + &bswap ($Zhl); + if (!$x86only) { + &bswap ($Zhh); + } else { + &mov ("eax",$Zhh); + &bswap ("eax"); + &mov ($Zhh,"eax"); + } +} + +if ($unroll) { + &function_begin_B("_x86_gmult_4bit_inner"); + &x86_loop(4); + &ret (); + &function_end_B("_x86_gmult_4bit_inner"); +} + +sub deposit_rem_4bit { + my $bias = shift; + + &mov (&DWP($bias+0, "esp"),0x0000<<16); + &mov (&DWP($bias+4, "esp"),0x1C20<<16); + &mov (&DWP($bias+8, "esp"),0x3840<<16); + &mov (&DWP($bias+12,"esp"),0x2460<<16); + &mov (&DWP($bias+16,"esp"),0x7080<<16); + &mov (&DWP($bias+20,"esp"),0x6CA0<<16); + &mov (&DWP($bias+24,"esp"),0x48C0<<16); + &mov (&DWP($bias+28,"esp"),0x54E0<<16); + &mov (&DWP($bias+32,"esp"),0xE100<<16); + &mov (&DWP($bias+36,"esp"),0xFD20<<16); + &mov (&DWP($bias+40,"esp"),0xD940<<16); + &mov (&DWP($bias+44,"esp"),0xC560<<16); + &mov (&DWP($bias+48,"esp"),0x9180<<16); + &mov (&DWP($bias+52,"esp"),0x8DA0<<16); + &mov (&DWP($bias+56,"esp"),0xA9C0<<16); + &mov (&DWP($bias+60,"esp"),0xB5E0<<16); +} + +$suffix = $x86only ? "" : "_x86"; + +&function_begin("gcm_gmult_4bit".$suffix); + &stack_push(16+4+1); # +1 for stack alignment + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &mov ($Zhh,&DWP(0,$inp)); # load Xi[16] + &mov ($Zhl,&DWP(4,$inp)); + &mov ($Zlh,&DWP(8,$inp)); + &mov ($Zll,&DWP(12,$inp)); + + &deposit_rem_4bit(16); + + &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(12,"esp"),$Zll); + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(0)); + } + + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_gmult_4bit".$suffix); + +&function_begin("gcm_ghash_4bit".$suffix); + &stack_push(16+4+1); # +1 for 64-bit alignment + &mov ($Zll,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ("ecx",&wparam(3)); # load len + &add ("ecx",$inp); + &mov (&wparam(3),"ecx"); + + &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zll)); + &mov ($Zlh,&DWP(8,$Zll)); + &mov ($Zll,&DWP(12,$Zll)); + + &deposit_rem_4bit(16); + + &set_label("x86_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); # xor with input + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&DWP(12,"esp"),$Zll); # dump it on stack + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(0,"esp"),$Zhh); + + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(2)); + } + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &mov (&wparam(2),$inp) if (!$unroll); + &jb (&label("x86_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_ghash_4bit".$suffix); + +if (!$x86only) {{{ + +&static_label("rem_4bit"); + +if (!$sse2) {{ # pure-MMX "May" version... + +$S=12; # shift factor for rem_4bit + +&function_begin_B("_mmx_gmult_4bit_inner"); +# MMX version performs 3.5 times better on P4 (see comment in non-MMX +# routine for further details), 100% better on Opteron, ~70% better +# on Core2 and PIII... In other words effort is considered to be well +# spent... Since initial release the loop was unrolled in order to +# "liberate" register previously used as loop counter. Instead it's +# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. +# The path involves move of Z.lo from MMX to integer register, +# effective address calculation and finally merge of value to Z.hi. +# Reference to rem_4bit is scheduled so late that I had to >>4 +# rem_4bit elements. This resulted in 20-45% procent improvement +# on contemporary µ-archs. +{ + my $cnt; + my $rem_4bit = "eax"; + my @rem = ($Zhh,$Zll); + my $nhi = $Zhl; + my $nlo = $Zlh; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem[0],$Zlo); + + for ($cnt=28;$cnt>=-2;$cnt--) { + my $odd = $cnt&1; + my $nix = $odd ? $nlo : $nhi; + + &shl (&LB($nlo),4) if ($odd); + &psrlq ($Zlo,4); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nix)); + &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); + &psllq ($tmp,60); + &and ($nhi,0xf0) if ($odd); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); + &and ($rem[0],0xf); + &pxor ($Zhi,&QWP(0,$Htbl,$nix)); + &mov ($nhi,$nlo) if (!$odd && $cnt>=0); + &movd ($rem[1],$Zlo); + &pxor ($Zlo,$tmp); + + push (@rem,shift(@rem)); # "rotate" registers + } + + &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + &shl ($inp,4); # compensate for rem_4bit[i] being >>4 + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &xor ($Zhh,$inp); + &bswap ($Zhh); + + &ret (); +} +&function_end_B("_mmx_gmult_4bit_inner"); + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &movz ($Zll,&BP(15,$inp)); + + &call ("_mmx_gmult_4bit_inner"); + + &mov ($inp,&wparam(0)); # load Xi + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +# Streamed version performs 20% better on P4, 7% on Opteron, +# 10% on Core2 and PIII... +&function_begin("gcm_ghash_4bit_mmx"); + &mov ($Zhh,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ($Zlh,&wparam(3)); # load len + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &add ($Zlh,$inp); + &mov (&wparam(3),$Zlh); # len to point at the end of input + &stack_push(4+1); # +1 for stack alignment + + &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zhh)); + &mov ($Zlh,&DWP(8,$Zhh)); + &mov ($Zhh,&DWP(0,$Zhh)); + &jmp (&label("mmx_outer_loop")); + + &set_label("mmx_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&wparam(2),$inp); + &mov (&DWP(12,"esp"),$Zll); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(0,"esp"),$Zhh); + + &mov ($inp,"esp"); + &shr ($Zll,24); + + &call ("_mmx_gmult_4bit_inner"); + + &mov ($inp,&wparam(2)); + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &jb (&label("mmx_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); + + &stack_pop(4+1); +&function_end("gcm_ghash_4bit_mmx"); + +}} else {{ # "June" MMX version... + # ... has slower "April" gcm_gmult_4bit_mmx with folded + # loop. This is done to conserve code size... +$S=16; # shift factor for rem_4bit + +sub mmx_loop() { +# MMX version performs 2.8 times better on P4 (see comment in non-MMX +# routine for further details), 40% better on Opteron and Core2, 50% +# better on PIII... In other words effort is considered to be well +# spent... + my $inp = shift; + my $rem_4bit = shift; + my $cnt = $Zhh; + my $nhi = $Zhl; + my $nlo = $Zlh; + my $rem = $Zll; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &mov ($cnt,14); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem,$Zlo); + &jmp (&label("mmx_loop")); + + &set_label("mmx_loop",16); + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &mov (&LB($nlo),&BP(0,$inp,$cnt)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &dec ($cnt); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &mov ($nhi,$nlo); + &pxor ($Zlo,$tmp); + &js (&label("mmx_break")); + + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + &jmp (&label("mmx_loop")); + + &set_label("mmx_break",16); + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &bswap ($Zhh); +} + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &movz ($Zll,&BP(15,$inp)); + + &mmx_loop($inp,"eax"); + + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +###################################################################### +# Below subroutine is "528B" variant of "4-bit" GCM GHASH function +# (see gcm128.c for details). It provides further 20-40% performance +# improvement over above mentioned "May" version. + +&static_label("rem_8bit"); + +&function_begin("gcm_ghash_4bit_mmx"); +{ my ($Zlo,$Zhi) = ("mm7","mm6"); + my $rem_8bit = "esi"; + my $Htbl = "ebx"; + + # parameter block + &mov ("eax",&wparam(0)); # Xi + &mov ("ebx",&wparam(1)); # Htable + &mov ("ecx",&wparam(2)); # inp + &mov ("edx",&wparam(3)); # len + &mov ("ebp","esp"); # original %esp + &call (&label("pic_point")); + &set_label ("pic_point"); + &blindpop ($rem_8bit); + &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit)); + + &sub ("esp",512+16+16); # allocate stack frame... + &and ("esp",-64); # ...and align it + &sub ("esp",16); # place for (u8)(H[]<<4) + + &add ("edx","ecx"); # pointer to the end of input + &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi + &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len + &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp + + { my @lo = ("mm0","mm1","mm2"); + my @hi = ("mm3","mm4","mm5"); + my @tmp = ("mm6","mm7"); + my ($off1,$off2,$i) = (0,0,); + + &add ($Htbl,128); # optimize for size + &lea ("edi",&DWP(16+128,"esp")); + &lea ("ebp",&DWP(16+256+128,"esp")); + + # decompose Htable (low and high parts are kept separately), + # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... + for ($i=0;$i<18;$i++) { + + &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); + &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); + &psllq ($tmp[1],60) if ($i>1); + &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); + &por ($lo[2],$tmp[1]) if ($i>1); + &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); + &psrlq ($lo[1],4) if ($i>0 && $i<17); + &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); + &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); + &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); + &psrlq ($hi[1],4) if ($i>0 && $i<17); + &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); + &shl ("edx",4) if ($i<16); + &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); + + unshift (@lo,pop(@lo)); # "rotate" registers + unshift (@hi,pop(@hi)); + unshift (@tmp,pop(@tmp)); + $off1 += 8 if ($i>0); + $off2 += 8 if ($i>1); + } + } + + &movq ($Zhi,&QWP(0,"eax")); + &mov ("ebx",&DWP(8,"eax")); + &mov ("edx",&DWP(12,"eax")); # load Xi + +&set_label("outer",16); + { my $nlo = "eax"; + my $dat = "edx"; + my @nhi = ("edi","ebp"); + my @rem = ("ebx","ecx"); + my @red = ("mm0","mm1","mm2"); + my $tmp = "mm3"; + + &xor ($dat,&DWP(12,"ecx")); # merge input data + &xor ("ebx",&DWP(8,"ecx")); + &pxor ($Zhi,&QWP(0,"ecx")); + &lea ("ecx",&DWP(16,"ecx")); # inp+=16 + #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi + &mov (&DWP(528+8,"esp"),"ebx"); + &movq (&QWP(528+0,"esp"),$Zhi); + &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp + + &xor ($nlo,$nlo); + &rol ($dat,8); + &mov (&LB($nlo),&LB($dat)); + &mov ($nhi[1],$nlo); + &and (&LB($nlo),0x0f); + &shr ($nhi[1],4); + &pxor ($red[0],$red[0]); + &rol ($dat,8); # next byte + &pxor ($red[1],$red[1]); + &pxor ($red[2],$red[2]); + + # Just like in "May" verson modulo-schedule for critical path in + # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' + # is scheduled so late that rem_8bit[] has to be shifted *right* + # by 16, which is why last argument to pinsrw is 2, which + # corresponds to <<32=<<48>>16... + for ($j=11,$i=0;$i<15;$i++) { + + if ($i>0) { + &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] + &rol ($dat,8); # next byte + &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); + &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) + } else { + &movq ($Zlo,&QWP(16,"esp",$nlo,8)); + &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); + } + + &mov (&LB($nlo),&LB($dat)); + &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0); + + &movd ($rem[0],$Zlo); + &movz ($rem[1],&LB($rem[1])) if ($i>0); + &psrlq ($Zlo,8); # Z>>=8 + + &movq ($tmp,$Zhi); + &mov ($nhi[0],$nlo); + &psrlq ($Zhi,8); + + &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 + &and (&LB($nlo),0x0f); + &psllq ($tmp,56); + + &pxor ($Zhi,$red[1]) if ($i>1); + &shr ($nhi[0],4); + &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); + + unshift (@red,pop(@red)); # "rotate" registers + unshift (@rem,pop(@rem)); + unshift (@nhi,pop(@nhi)); + } + + &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] + &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); + &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); + &movz ($rem[1],&LB($rem[1])); + + &pxor ($red[2],$red[2]); # clear 2nd word + &psllq ($red[1],4); + + &movd ($rem[0],$Zlo); + &psrlq ($Zlo,4); # Z>>=4 + + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &shl ($rem[0],4); # rem<<4 + + &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] + &psllq ($tmp,60); + &movz ($rem[0],&LB($rem[0])); + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); + + &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); + &pxor ($Zhi,$red[1]); + + &movd ($dat,$Zlo); + &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 + + &psllq ($red[0],12); # correct by <<16>>4 + &pxor ($Zhi,$red[0]); + &psrlq ($Zlo,32); + &pxor ($Zhi,$red[2]); + + &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp + &movd ("ebx",$Zlo); + &movq ($tmp,$Zhi); # 01234567 + &psllw ($Zhi,8); # 1.3.5.7. + &psrlw ($tmp,8); # .0.2.4.6 + &por ($Zhi,$tmp); # 10325476 + &bswap ($dat); + &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 + &bswap ("ebx"); + + &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? + &jne (&label("outer")); + } + + &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi + &mov (&DWP(12,"eax"),"edx"); + &mov (&DWP(8,"eax"),"ebx"); + &movq (&QWP(0,"eax"),$Zhi); + + &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp + &emms (); +} +&function_end("gcm_ghash_4bit_mmx"); +}} + +if ($sse2) {{ +###################################################################### +# PCLMULQDQ version. + +$Xip="eax"; +$Htbl="edx"; +$const="ecx"; +$inp="esi"; +$len="ebx"; + +($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; +($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); +($Xn,$Xhn)=("xmm6","xmm7"); + +&static_label("bswap"); + +sub clmul64x64_T2 { # minimal "register" pressure +my ($Xhi,$Xi,$Hkey,$HK)=@_; + + &movdqa ($Xhi,$Xi); # + &pshufd ($T1,$Xi,0b01001110); + &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK)); + &pxor ($T1,$Xi); # + &pxor ($T2,$Hkey) if (!defined($HK)); + $HK=$T2 if (!defined($HK)); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T1,$HK,0x00); ####### + &xorps ($T1,$Xi); # + &xorps ($T1,$Xhi); # + + &movdqa ($T2,$T1); # + &psrldq ($T1,8); + &pslldq ($T2,8); # + &pxor ($Xhi,$T1); + &pxor ($Xi,$T2); # +} + +sub clmul64x64_T3 { +# Even though this subroutine offers visually better ILP, it +# was empirically found to be a tad slower than above version. +# At least in gcm_ghash_clmul context. But it's just as well, +# because loop modulo-scheduling is possible only thanks to +# minimized "register" pressure... +my ($Xhi,$Xi,$Hkey)=@_; + + &movdqa ($T1,$Xi); # + &movdqa ($Xhi,$Xi); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pshufd ($T2,$T1,0b01001110); # + &pshufd ($T3,$Hkey,0b01001110); + &pxor ($T2,$T1); # + &pxor ($T3,$Hkey); + &pclmulqdq ($T2,$T3,0x00); ####### + &pxor ($T2,$Xi); # + &pxor ($T2,$Xhi); # + + &movdqa ($T3,$T2); # + &psrldq ($T2,8); + &pslldq ($T3,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # +} + +if (1) { # Algorithm 9 with <<1 twist. + # Reduction is shorter and uses only two + # temporary registers, which makes it better + # candidate for interleaving with 64x64 + # multiplication. Pre-modulo-scheduled loop + # was found to be ~20% faster than Algorithm 5 + # below. Algorithm 9 was therefore chosen for + # further optimization... + +sub reduction_alg9 { # 17/11 times faster than Intel version +my ($Xhi,$Xi) = @_; + + # 1st phase + &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,57); # + &movdqa ($T1,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + + # 2nd phase + &movdqa ($T2,$Xi); + &psrlq ($Xi,1); + &pxor ($Xhi,$T2); # + &pxor ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$Xhi) # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # <<1 twist + &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword + &movdqa ($T1,$Hkey); + &psllq ($Hkey,1); + &pxor ($T3,$T3); # + &psrlq ($T1,63); + &pcmpgtd ($T3,$T2); # broadcast carry bit + &pslldq ($T1,8); + &por ($Hkey,$T1); # H<<=1 + + # magic reduction + &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial + &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &pshufd ($T1,$Hkey,0b01001110); + &pshufd ($T2,$Xi,0b01001110); + &pxor ($T1,$Hkey); # Karatsuba pre-processing + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &pxor ($T2,$Xi); # Karatsuba pre-processing + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + &palignr ($T2,$T1,8); # low part is H.lo^H.hi + &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt" + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movups ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + &movups ($T2,&QWP(32,$Htbl)); + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); + &reduction_alg9 ($Xhi,$Xi); + + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &movdqu ($T3,&QWP(32,$Htbl)); + &pxor ($Xi,$T1); # Ii+Xi + + &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($T1,$Xn); # + &lea ($inp,&DWP(32,$inp)); # i+=2 + + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &pclmulqdq ($T1,$T3,0x00); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &nop (); + + &sub ($len,0x20); + &jbe (&label("even_tail")); + &jmp (&label("mod_loop")); + +&set_label("mod_loop",32); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + &nop (); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movups ($Hkey,&QWP(0,$Htbl)); # load H + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &movdqa ($T3,&QWP(0,$const)); + &xorps ($Xhi,$Xhn); + &movdqu ($Xhn,&QWP(0,$inp)); # Ii + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pxor ($T1,$Xhi); # + + &pshufb ($Xhn,$T3); + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + &pshufb ($Xn,$T3); + &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early + + &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &movups ($T3,&QWP(32,$Htbl)); + &psllq ($Xi,57); # + &movdqa ($T1,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + &pshufd ($T1,$Xhn,0b01001110); + &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,1); + &pxor ($T1,$Xhn); + &pxor ($Xhi,$T2); # + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &pxor ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$Xhi) # + &pclmulqdq ($T1,$T3,0x00); ####### + + &lea ($inp,&DWP(32,$inp)); + &sub ($len,0x20); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movdqa ($T3,&QWP(0,$const)); + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &xorps ($Xhi,$Xhn); + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &pxor ($T1,$Xhi); # + + &pxor ($T2,$T1); # + + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + + &reduction_alg9 ($Xhi,$Xi); + + &test ($len,$len); + &jnz (&label("done")); + + &movups ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); + +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} else { # Algorith 5. Kept for reference purposes. + +sub reduction_alg5 { # 19/16 times faster than Intel version +my ($Xhi,$Xi)=@_; + + # <<1 + &movdqa ($T1,$Xi); # + &movdqa ($T2,$Xhi); + &pslld ($Xi,1); + &pslld ($Xhi,1); # + &psrld ($T1,31); + &psrld ($T2,31); # + &movdqa ($T3,$T1); + &pslldq ($T1,4); + &psrldq ($T3,12); # + &pslldq ($T2,4); + &por ($Xhi,$T3); # + &por ($Xi,$T1); + &por ($Xhi,$T2); # + + # 1st phase + &movdqa ($T1,$Xi); + &movdqa ($T2,$Xi); + &movdqa ($T3,$Xi); # + &pslld ($T1,31); + &pslld ($T2,30); + &pslld ($Xi,25); # + &pxor ($T1,$T2); + &pxor ($T1,$Xi); # + &movdqa ($T2,$T1); # + &pslldq ($T1,12); + &psrldq ($T2,4); # + &pxor ($T3,$T1); + + # 2nd phase + &pxor ($Xhi,$T3); # + &movdqa ($Xi,$T3); + &movdqa ($T1,$T3); + &psrld ($Xi,1); # + &psrld ($T1,2); + &psrld ($T3,7); # + &pxor ($Xi,$T1); + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # + &pxor ($Xi,$Xhi); # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($Xn,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$Xn); + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &pshufb ($Xi,$Xn); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); # i+=2 + &jbe (&label("even_tail")); + +&set_label("mod_loop"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + ####### + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + &movdqa ($T3,&QWP(0,$const)); + &test ($len,$len); + &jnz (&label("done")); + + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg5 ($Xhi,$Xi); + + &movdqa ($T3,&QWP(0,$const)); +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} + +&set_label("bswap",64); + &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial +&set_label("rem_8bit",64); + &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); + &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); + &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); + &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); + &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); + &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); + &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); + &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); + &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); + &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); + &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); + &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); + &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); + &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); + &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); + &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); + &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); + &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); + &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); + &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); + &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); + &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); + &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); + &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); + &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); + &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); + &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); + &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); + &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); + &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); + &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); + &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); +}} # $sse2 + +&set_label("rem_4bit",64); + &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); + &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); + &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); + &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); +}}} # !$x86only + +&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); +&asm_finish(); + +# A question was risen about choice of vanilla MMX. Or rather why wasn't +# SSE2 chosen instead? In addition to the fact that MMX runs on legacy +# CPUs such as PIII, "4-bit" MMX version was observed to provide better +# performance than *corresponding* SSE2 one even on contemporary CPUs. +# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 +# implementation featuring full range of lookup-table sizes, but with +# per-invocation lookup table setup. Latter means that table size is +# chosen depending on how much data is to be hashed in every given call, +# more data - larger table. Best reported result for Core2 is ~4 cycles +# per processed byte out of 64KB block. This number accounts even for +# 64KB table setup overhead. As discussed in gcm128.c we choose to be +# more conservative in respect to lookup table sizes, but how do the +# results compare? Minimalistic "256B" MMX version delivers ~11 cycles +# on same platform. As also discussed in gcm128.c, next in line "8-bit +# Shoup's" or "4KB" method should deliver twice the performance of +# "256B" one, in other words not worse than ~6 cycles per byte. It +# should be also be noted that in SSE2 case improvement can be "super- +# linear," i.e. more than twice, mostly because >>8 maps to single +# instruction on SSE2 register. This is unlike "4-bit" case when >>4 +# maps to same amount of instructions in both MMX and SSE2 cases. +# Bottom line is that switch to SSE2 is considered to be justifiable +# only in case we choose to implement "8-bit" method... diff --git a/src/crypto/modes/asm/ghash-x86_64.pl b/src/crypto/modes/asm/ghash-x86_64.pl new file mode 100644 index 0000000..6e656ca --- /dev/null +++ b/src/crypto/modes/asm/ghash-x86_64.pl @@ -0,0 +1,1753 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH +# function features so called "528B" variant utilizing additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# Atom 31.6 16.8 +88% +# VIA Nano 21.8 10.1 +115% +# +# (*) comparison is not completely fair, because C results are +# for vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's mystery [to me] why Core2 result is not same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. +# +# Special thanks to David Woodhouse <dwmw2@infradead.org> for +# providing access to a Westmere-based system on behalf of Intel +# Open Source Technology Centre. + +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) +# Haswell 0.55(+93%) (if system doesn't support AVX) +# Broadwell 0.45(+110%)(if system doesn't support AVX) +# Bulldozer 1.49(+27%) +# Silvermont 2.88(+13%) + +# March 2013 +# +# ... 8x aggregate factor AVX code path is using reduction algorithm +# suggested by Shay Gueron[1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to above mentioned version. But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs in 0.41 cycles per byte on Haswell processor, and in +# 0.29 on Broadwell. +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$do4xaggr=1; + +# common register layout +$nlo="%rax"; +$nhi="%rbx"; +$Zlo="%r8"; +$Zhi="%r9"; +$tmp="%r10"; +$rem_4bit = "%r11"; + +$Xi="%rdi"; +$Htbl="%rsi"; + +# per-function register layout +$cnt="%rcx"; +$rem="%rdx"; + +sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or + $r =~ s/%[er]([sd]i)/%\1l/ or + $r =~ s/%[er](bp)/%\1l/ or + $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +{ my $N; + sub loop() { + my $inp = shift; + + $N++; +$code.=<<___; + xor $nlo,$nlo + xor $nhi,$nhi + mov `&LB("$Zlo")`,`&LB("$nlo")` + mov `&LB("$Zlo")`,`&LB("$nhi")` + shl \$4,`&LB("$nlo")` + mov \$14,$cnt + mov 8($Htbl,$nlo),$Zlo + mov ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + mov $Zlo,$rem + jmp .Loop$N + +.align 16 +.Loop$N: + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + mov ($inp,$cnt),`&LB("$nlo")` + shr \$4,$Zhi + xor 8($Htbl,$nhi),$Zlo + shl \$60,$tmp + xor ($Htbl,$nhi),$Zhi + mov `&LB("$nlo")`,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + shl \$4,`&LB("$nlo")` + xor $tmp,$Zlo + dec $cnt + js .Lbreak$N + + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nlo),$Zlo + shl \$60,$tmp + xor ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + xor $tmp,$Zlo + jmp .Loop$N + +.align 16 +.Lbreak$N: + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nlo),$Zlo + shl \$60,$tmp + xor ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + xor $tmp,$Zlo + + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nhi),$Zlo + shl \$60,$tmp + xor ($Htbl,$nhi),$Zhi + xor $tmp,$Zlo + xor ($rem_4bit,$rem,8),$Zhi + + bswap $Zlo + bswap $Zhi +___ +}} + +$code=<<___; +.text +.extern OPENSSL_ia32cap_P + +.globl gcm_gmult_4bit +.type gcm_gmult_4bit,\@function,2 +.align 16 +gcm_gmult_4bit: + push %rbx + push %rbp # %rbp and %r12 are pushed exclusively in + push %r12 # order to reuse Win64 exception handler... +.Lgmult_prologue: + + movzb 15($Xi),$Zlo + lea .Lrem_4bit(%rip),$rem_4bit +___ + &loop ($Xi); +$code.=<<___; + mov $Zlo,8($Xi) + mov $Zhi,($Xi) + + mov 16(%rsp),%rbx + lea 24(%rsp),%rsp +.Lgmult_epilogue: + ret +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ + +# per-function register layout +$inp="%rdx"; +$len="%rcx"; +$rem_8bit=$rem_4bit; + +$code.=<<___; +.globl gcm_ghash_4bit +.type gcm_ghash_4bit,\@function,4 +.align 16 +gcm_ghash_4bit: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub \$280,%rsp +.Lghash_prologue: + mov $inp,%r14 # reassign couple of args + mov $len,%r15 +___ +{ my $inp="%r14"; + my $dat="%edx"; + my $len="%r15"; + my @nhi=("%ebx","%ecx"); + my @rem=("%r12","%r13"); + my $Hshr4="%rbp"; + + &sub ($Htbl,-128); # size optimization + &lea ($Hshr4,"16+128(%rsp)"); + { my @lo =($nlo,$nhi); + my @hi =($Zlo,$Zhi); + + &xor ($dat,$dat); + for ($i=0,$j=-2;$i<18;$i++,$j++) { + &mov ("$j(%rsp)",&LB($dat)) if ($i>1); + &or ($lo[0],$tmp) if ($i>1); + &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); + &shr ($lo[1],4) if ($i>0 && $i<17); + &mov ($tmp,$hi[1]) if ($i>0 && $i<17); + &shr ($hi[1],4) if ($i>0 && $i<17); + &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); + &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); + &shl (&LB($dat),4) if ($i>0 && $i<17); + &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); + &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); + &shl ($tmp,60) if ($i>0 && $i<17); + + push (@lo,shift(@lo)); + push (@hi,shift(@hi)); + } + } + &add ($Htbl,-128); + &mov ($Zlo,"8($Xi)"); + &mov ($Zhi,"0($Xi)"); + &add ($len,$inp); # pointer to the end of data + &lea ($rem_8bit,".Lrem_8bit(%rip)"); + &jmp (".Louter_loop"); + +$code.=".align 16\n.Louter_loop:\n"; + &xor ($Zhi,"($inp)"); + &mov ("%rdx","8($inp)"); + &lea ($inp,"16($inp)"); + &xor ("%rdx",$Zlo); + &mov ("($Xi)",$Zhi); + &mov ("8($Xi)","%rdx"); + &shr ("%rdx",32); + + &xor ($nlo,$nlo); + &rol ($dat,8); + &mov (&LB($nlo),&LB($dat)); + &movz ($nhi[0],&LB($dat)); + &shl (&LB($nlo),4); + &shr ($nhi[0],4); + + for ($j=11,$i=0;$i<15;$i++) { + &rol ($dat,8); + &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); + &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); + &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); + &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); + + &mov (&LB($nlo),&LB($dat)); + &xor ($Zlo,$tmp) if ($i>0); + &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); + + &movz ($nhi[1],&LB($dat)); + &shl (&LB($nlo),4); + &movzb ($rem[0],"(%rsp,$nhi[0])"); + + &shr ($nhi[1],4) if ($i<14); + &and ($nhi[1],0xf0) if ($i==14); + &shl ($rem[1],48) if ($i>0); + &xor ($rem[0],$Zlo); + + &mov ($tmp,$Zhi); + &xor ($Zhi,$rem[1]) if ($i>0); + &shr ($Zlo,8); + + &movz ($rem[0],&LB($rem[0])); + &mov ($dat,"$j($Xi)") if (--$j%4==0); + &shr ($Zhi,8); + + &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); + &shl ($tmp,56); + &xor ($Zhi,"($Hshr4,$nhi[0],8)"); + + unshift (@nhi,pop(@nhi)); # "rotate" registers + unshift (@rem,pop(@rem)); + } + &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); + &xor ($Zlo,"8($Htbl,$nlo)"); + &xor ($Zhi,"($Htbl,$nlo)"); + + &shl ($rem[1],48); + &xor ($Zlo,$tmp); + + &xor ($Zhi,$rem[1]); + &movz ($rem[0],&LB($Zlo)); + &shr ($Zlo,4); + + &mov ($tmp,$Zhi); + &shl (&LB($rem[0]),4); + &shr ($Zhi,4); + + &xor ($Zlo,"8($Htbl,$nhi[0])"); + &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); + &shl ($tmp,60); + + &xor ($Zhi,"($Htbl,$nhi[0])"); + &xor ($Zlo,$tmp); + &shl ($rem[0],48); + + &bswap ($Zlo); + &xor ($Zhi,$rem[0]); + + &bswap ($Zhi); + &cmp ($inp,$len); + &jb (".Louter_loop"); +} +$code.=<<___; + mov $Zlo,8($Xi) + mov $Zhi,($Xi) + + lea 280(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lghash_epilogue: + ret +.size gcm_ghash_4bit,.-gcm_ghash_4bit +___ + +###################################################################### +# PCLMULQDQ version. + +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; +($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); + +sub clmul64x64_T2 { # minimal register pressure +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey,$T2 + pxor $Xi,$T1 # + pxor $Hkey,$T2 +___ +} else { +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 # +___ +} +$code.=<<___; + pclmulqdq \$0x00,$Hkey,$Xi ####### + pclmulqdq \$0x11,$Hkey,$Xhi ####### + pclmulqdq \$0x00,$HK,$T1 ####### + pxor $Xi,$T1 # + pxor $Xhi,$T1 # + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ +} + +sub reduction_alg9 { # 17/11 times faster than Intel version +my ($Xhi,$Xi) = @_; + +$code.=<<___; + # 1st phase + movdqa $Xi,$T2 # + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T1 # + pslldq \$8,$Xi + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # + + # 2nd phase + movdqa $Xi,$T2 + psrlq \$1,$Xi + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $Xhi,$Xi # +___ +} + +{ my ($Htbl,$Xip)=@_4args; + my $HK="%xmm6"; + +$code.=<<___; +.globl gcm_init_clmul +.type gcm_init_clmul,\@abi-omnipotent +.align 16 +gcm_init_clmul: +.L_init_clmul: +___ +$code.=<<___ if ($win64); +.LSEH_begin_gcm_init_clmul: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) +___ +$code.=<<___; + movdqu ($Xip),$Hkey + pshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + movdqa $Hkey,$T1 + psllq \$1,$Hkey + pxor $T3,$T3 # + psrlq \$63,$T1 + pcmpgtd $T2,$T3 # broadcast carry bit + pslldq \$8,$T1 + por $T1,$Hkey # H<<=1 + + # magic reduction + pand .L0x1c2_polynomial(%rip),$T3 + pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + pshufd \$0b01001110,$Hkey,$HK + movdqa $Hkey,$Xi + pxor $Hkey,$HK +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$Hkey,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $Hkey,$T1 # Karatsuba pre-processing + movdqu $Hkey,0x00($Htbl) # save H + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x10($Htbl) # save H^2 + palignr \$8,$T1,$T2 # low part is H.lo^H.hi... + movdqu $T2,0x20($Htbl) # save Karatsuba "salt" +___ +if ($do4xaggr) { + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqa $Xi,$T3 +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$T3,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $T3,$T1 # Karatsuba pre-processing + movdqu $T3,0x30($Htbl) # save H^3 + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x40($Htbl) # save H^4 + palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... + movdqu $T2,0x50($Htbl) # save Karatsuba "salt" +___ +} +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +.LSEH_end_gcm_init_clmul: +___ +$code.=<<___; + ret +.size gcm_init_clmul,.-gcm_init_clmul +___ +} + +{ my ($Xip,$Htbl)=@_4args; + +$code.=<<___; +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,\@abi-omnipotent +.align 16 +gcm_gmult_clmul: +.L_gmult_clmul: + movdqu ($Xip),$Xi + movdqa .Lbswap_mask(%rip),$T3 + movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$T2 + pshufb $T3,$Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); +$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); + # experimental alternative. special thing about is that there + # no dependency between the two multiplications... + mov \$`0xE1<<1`,%eax + mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff + mov \$0x07,%r11d + movq %rax,$T1 + movq %r10,$T2 + movq %r11,$T3 # borrow $T3 + pand $Xi,$T3 + pshufb $T3,$T2 # ($Xi&7)·0xE0 + movq %rax,$T3 + pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) + pxor $Xi,$T2 + pslldq \$15,$T2 + paddd $T2,$T2 # <<(64+56+1) + pxor $T2,$Xi + pclmulqdq \$0x01,$T3,$Xi + movdqa .Lbswap_mask(%rip),$T3 # reload $T3 + psrldq \$1,$T1 + pxor $T1,$Xhi + pslldq \$7,$Xi + pxor $Xhi,$Xi +___ +$code.=<<___; + pshufb $T3,$Xi + movdqu $Xi,($Xip) + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +___ +} + +{ my ($Xip,$Htbl,$inp,$len)=@_4args; + my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); + my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); + +$code.=<<___; +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,\@abi-omnipotent +.align 32 +gcm_ghash_clmul: +.L_ghash_clmul: +___ +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax +.LSEH_begin_gcm_ghash_clmul: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) +___ +$code.=<<___; + movdqa .Lbswap_mask(%rip),$T3 + + movdqu ($Xip),$Xi + movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$HK + pshufb $T3,$Xi + + sub \$0x10,$len + jz .Lodd_tail + + movdqu 0x10($Htbl),$Hkey2 +___ +if ($do4xaggr) { +my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); + +$code.=<<___; + mov OPENSSL_ia32cap_P+4(%rip),%eax + cmp \$0x30,$len + jb .Lskip4x + + and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE + cmp \$`1<<22`,%eax # check for MOVBE without XSAVE + je .Lskip4x + + sub \$0x30,$len + mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff + movdqu 0x30($Htbl),$Hkey3 + movdqu 0x40($Htbl),$Hkey4 + + ####### + # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P + # + movdqu 0x30($inp),$Xln + movdqu 0x20($inp),$Xl + pshufb $T3,$Xln + pshufb $T3,$Xl + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey2,$Xl + pclmulqdq \$0x11,$Hkey2,$Xh + pclmulqdq \$0x10,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + xorps $Xm,$Xmn + + movdqu 0x10($inp),$Xl + movdqu 0($inp),$T1 + pshufb $T3,$Xl + pshufb $T3,$T1 + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $T1,$Xi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + pclmulqdq \$0x11,$Hkey3,$Xh + pclmulqdq \$0x00,$HK,$Xm + xorps $Xl,$Xln + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: + pclmulqdq \$0x00,$Hkey4,$Xi + xorps $Xm,$Xmn + movdqu 0x30($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x11,$Hkey4,$Xhi + xorps $Xln,$Xi + movdqu 0x20($inp),$Xln + movdqa $Xl,$Xh + pclmulqdq \$0x10,$HK,$T1 + pshufd \$0b01001110,$Xl,$Xm + xorps $Xhn,$Xhi + pxor $Xl,$Xm + pshufb $T3,$Xln + movups 0x20($Htbl),$HK + xorps $Xmn,$T1 + pclmulqdq \$0x00,$Hkey,$Xl + pshufd \$0b01001110,$Xln,$Xmn + + pxor $Xi,$T1 # aggregated Karatsuba post-processing + movdqa $Xln,$Xhn + pxor $Xhi,$T1 # + pxor $Xln,$Xmn + movdqa $T1,$T2 # + pclmulqdq \$0x11,$Hkey,$Xh + pslldq \$8,$T1 + psrldq \$8,$T2 # + pxor $T1,$Xi + movdqa .L7_mask(%rip),$T1 + pxor $T2,$Xhi # + movq %rax,$T2 + + pand $Xi,$T1 # 1st phase + pshufb $T1,$T2 # + pxor $Xi,$T2 # + pclmulqdq \$0x00,$HK,$Xm + psllq \$57,$T2 # + movdqa $T2,$T1 # + pslldq \$8,$T2 + pclmulqdq \$0x00,$Hkey2,$Xln + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # + movdqu 0($inp),$T1 + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhn + xorps $Xl,$Xln + movdqu 0x10($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x10,$HK,$Xmn + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + pshufb $T3,$T1 + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + + movdqa $Xl,$Xh + pxor $Xm,$Xmn + pshufd \$0b01001110,$Xl,$Xm + pxor $T2,$Xi # + pxor $T1,$Xhi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + psrlq \$1,$Xi # + pxor $Xhi,$Xi # + movdqa $Xi,$Xhi + pclmulqdq \$0x11,$Hkey3,$Xh + xorps $Xl,$Xln + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + + pclmulqdq \$0x00,$HK,$Xm + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jnc .Lmod4_loop + +.Ltail4x: + pclmulqdq \$0x00,$Hkey4,$Xi + pclmulqdq \$0x11,$Hkey4,$Xhi + pclmulqdq \$0x10,$HK,$T1 + xorps $Xm,$Xmn + xorps $Xln,$Xi + xorps $Xhn,$Xhi + pxor $Xi,$Xhi # aggregated Karatsuba post-processing + pxor $Xmn,$T1 + + pxor $Xhi,$T1 # + pxor $Xi,$Xhi + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ + &reduction_alg9($Xhi,$Xi); +$code.=<<___; + add \$0x40,$len + jz .Ldone + movdqu 0x20($Htbl),$HK + sub \$0x10,$len + jz .Lodd_tail +.Lskip4x: +___ +} +$code.=<<___; + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + movdqu ($inp),$T1 # Ii + movdqu 16($inp),$Xln # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xln + pxor $T1,$Xi # Ii+Xi + + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + lea 32($inp),$inp # i+=2 + nop + sub \$0x20,$len + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + movdqu ($inp),$T2 # Ii + pxor $Xi,$T1 # aggregated Karatsuba post-processing + pshufb $T3,$T2 + movdqu 16($inp),$Xln # Ii+1 + + pxor $Xhi,$T1 + pxor $T2,$Xhi # "Ii+Xi", consume early + pxor $T1,$Xmn + pshufb $T3,$Xln + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # + + movdqa $Xln,$Xhn # + + movdqa $Xi,$T2 # 1st phase + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # + pclmulqdq \$0x00,$Hkey,$Xln ####### + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T1 # + pslldq \$8,$Xi + psrldq \$8,$T1 # + pxor $T2,$Xi + pshufd \$0b01001110,$Xhn,$Xmn + pxor $T1,$Xhi # + pxor $Xhn,$Xmn # + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey,$Xhn ####### + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + lea 32($inp),$inp + psrlq \$1,$Xi # + pclmulqdq \$0x00,$HK,$Xmn ####### + pxor $Xhi,$Xi # + + sub \$0x20,$len + ja .Lmod_loop + +.Leven_tail: + movdqa $Xi,$Xhi + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # + + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$Xmn + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + pxor $Xi,$T1 + pxor $Xhi,$T1 + pxor $T1,$Xmn + movdqa $Xmn,$T1 # + psrldq \$8,$T1 + pslldq \$8,$Xmn # + pxor $T1,$Xhi + pxor $Xmn,$Xi # +___ + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + test $len,$len + jnz .Ldone + +.Lodd_tail: + movdqu ($inp),$T1 # Ii + pshufb $T3,$T1 + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; +.Ldone: + pshufb $T3,$Xi + movdqu $Xi,($Xip) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +.LSEH_end_gcm_ghash_clmul: +___ +$code.=<<___; + ret +.size gcm_ghash_clmul,.-gcm_ghash_clmul +___ +} + +$code.=<<___; +.globl gcm_init_avx +.type gcm_init_avx,\@abi-omnipotent +.align 32 +gcm_init_avx: +___ +if ($avx) { +my ($Htbl,$Xip)=@_4args; +my $HK="%xmm6"; + +$code.=<<___ if ($win64); +.LSEH_begin_gcm_init_avx: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Hkey + vpshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + vpsrlq \$63,$Hkey,$T1 + vpsllq \$1,$Hkey,$Hkey + vpxor $T3,$T3,$T3 # + vpcmpgtd $T2,$T3,$T3 # broadcast carry bit + vpslldq \$8,$T1,$T1 + vpor $T1,$Hkey,$Hkey # H<<=1 + + # magic reduction + vpand .L0x1c2_polynomial(%rip),$T3,$T3 + vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial + + vpunpckhqdq $Hkey,$Hkey,$HK + vmovdqa $Hkey,$Xi + vpxor $Hkey,$HK,$HK + mov \$4,%r10 # up to H^8 + jmp .Linit_start_avx +___ + +sub clmul64x64_avx { +my ($Xhi,$Xi,$Hkey,$HK)=@_; + +if (!defined($HK)) { $HK = $T2; +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpunpckhqdq $Hkey,$Hkey,$T2 + vpxor $Xi,$T1,$T1 # + vpxor $Hkey,$T2,$T2 +___ +} else { +$code.=<<___; + vpunpckhqdq $Xi,$Xi,$T1 + vpxor $Xi,$T1,$T1 # +___ +} +$code.=<<___; + vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### + vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### + vpclmulqdq \$0x00,$HK,$T1,$T1 ####### + vpxor $Xi,$Xhi,$T2 # + vpxor $T2,$T1,$T1 # + + vpslldq \$8,$T1,$T2 # + vpsrldq \$8,$T1,$T1 + vpxor $T2,$Xi,$Xi # + vpxor $T1,$Xhi,$Xhi +___ +} + +sub reduction_avx { +my ($Xhi,$Xi) = @_; + +$code.=<<___; + vpsllq \$57,$Xi,$T1 # 1st phase + vpsllq \$62,$Xi,$T2 + vpxor $T1,$T2,$T2 # + vpsllq \$63,$Xi,$T1 + vpxor $T1,$T2,$T2 # + vpslldq \$8,$T2,$T1 # + vpsrldq \$8,$T2,$T2 + vpxor $T1,$Xi,$Xi # + vpxor $T2,$Xhi,$Xhi + + vpsrlq \$1,$Xi,$T2 # 2nd phase + vpxor $Xi,$Xhi,$Xhi + vpxor $T2,$Xi,$Xi # + vpsrlq \$5,$T2,$T2 + vpxor $T2,$Xi,$Xi # + vpsrlq \$1,$Xi,$Xi # + vpxor $Xhi,$Xi,$Xi # +___ +} + +$code.=<<___; +.align 32 +.Linit_loop_avx: + vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... + vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; +.Linit_start_avx: + vmovdqa $Xi,$T3 +___ + &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 + &reduction_avx ($Xhi,$Xi); +$code.=<<___; + vpshufd \$0b01001110,$T3,$T1 + vpshufd \$0b01001110,$Xi,$T2 + vpxor $T3,$T1,$T1 # Karatsuba pre-processing + vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 + vpxor $Xi,$T2,$T2 # Karatsuba pre-processing + vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 + lea 0x30($Htbl),$Htbl + sub \$1,%r10 + jnz .Linit_loop_avx + + vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped + vmovdqu $T3,-0x10($Htbl) + + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + lea 0x18(%rsp),%rsp +.LSEH_end_gcm_init_avx: +___ +$code.=<<___; + ret +.size gcm_init_avx,.-gcm_init_avx +___ +} else { +$code.=<<___; + jmp .L_init_clmul +.size gcm_init_avx,.-gcm_init_avx +___ +} + +$code.=<<___; +.globl gcm_gmult_avx +.type gcm_gmult_avx,\@abi-omnipotent +.align 32 +gcm_gmult_avx: + jmp .L_gmult_clmul +.size gcm_gmult_avx,.-gcm_gmult_avx +___ + +$code.=<<___; +.globl gcm_ghash_avx +.type gcm_ghash_avx,\@abi-omnipotent +.align 32 +gcm_ghash_avx: +___ +if ($avx) { +my ($Xip,$Htbl,$inp,$len)=@_4args; +my ($Xlo,$Xhi,$Xmi, + $Zlo,$Zhi,$Zmi, + $Hkey,$HK,$T1,$T2, + $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); + +$code.=<<___ if ($win64); + lea -0x88(%rsp),%rax +.LSEH_begin_gcm_ghash_avx: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) +___ +$code.=<<___; + vzeroupper + + vmovdqu ($Xip),$Xi # load $Xi + lea .L0x1c2_polynomial(%rip),%r10 + lea 0x40($Htbl),$Htbl # size optimization + vmovdqu .Lbswap_mask(%rip),$bswap + vpshufb $bswap,$Xi,$Xi + cmp \$0x80,$len + jb .Lshort_avx + sub \$0x80,$len + + vmovdqu 0x70($inp),$Ii # I[7] + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpshufb $bswap,$Ii,$Ii + vmovdqu 0x20-0x40($Htbl),$HK + + vpunpckhqdq $Ii,$Ii,$T2 + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Ii,$T2,$T2 + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x50($inp),$Ii # I[5] + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpxor $Ii,$T2,$T2 + vmovdqu 0x40($inp),$Ij # I[4] + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + + vpshufb $bswap,$Ij,$Ij + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x30($inp),$Ii # I[3] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x20($inp),$Ij # I[2] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpxor $Xmi,$Zmi,$Zmi + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK,$T2,$Xmi + vpxor $Ij,$T1,$T1 + + vmovdqu 0x10($inp),$Ii # I[1] + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Zhi,$Xhi,$Xhi + vpshufb $bswap,$Ii,$Ii + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpxor $Zmi,$Xmi,$Xmi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK,$T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu ($inp),$Ij # I[0] + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Xhi,$Zhi,$Zhi + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x10,$HK,$T2,$Xmi + + lea 0x80($inp),$inp + cmp \$0x80,$len + jb .Ltail_avx + + vpxor $Xi,$Ij,$Ij # accumulate $Xi + sub \$0x80,$len + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vmovdqu 0x70($inp),$Ii # I[7] + vpxor $Xlo,$Zlo,$Zlo + vpxor $Ij,$T1,$T1 + vpclmulqdq \$0x00,$Hkey,$Ij,$Xi + vpshufb $bswap,$Ii,$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xo + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Tred + vmovdqu 0x20-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + + vmovdqu 0x60($inp),$Ij # I[6] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpxor $Zlo,$Xi,$Xi # collect result + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vxorps $Zhi,$Xo,$Xo + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpunpckhqdq $Ij,$Ij,$T1 + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Zmi,$Tred,$Tred + vxorps $Ij,$T1,$T1 + + vmovdqu 0x50($inp),$Ii # I[5] + vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpxor $Xo,$Tred,$Tred + vpslldq \$8,$Tred,$T2 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vpsrldq \$8,$Tred,$Tred + vpxor $T2, $Xi, $Xi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ii + vxorps $Tred,$Xo, $Xo + vpxor $Xhi,$Zhi,$Zhi + vpunpckhqdq $Ii,$Ii,$T2 + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x50-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x40($inp),$Ij # I[4] + vpalignr \$8,$Xi,$Xi,$Tred # 1st phase + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vxorps $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + + vmovdqu 0x30($inp),$Ii # I[3] + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0x80-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu 0x20($inp),$Ij # I[2] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpxor $Zlo,$Xlo,$Xlo + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Zhi,$Xhi,$Xhi + vpclmulqdq \$0x00,$HK, $T2,$Xmi + vpxor $Ij,$T1,$T1 + vpxor $Zmi,$Xmi,$Xmi + vxorps $Tred,$Xi,$Xi + + vmovdqu 0x10($inp),$Ii # I[1] + vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase + vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo + vpshufb $bswap,$Ii,$Ii + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpclmulqdq \$0x10,(%r10),$Xi,$Xi + vxorps $Xo,$Tred,$Tred + vpunpckhqdq $Ii,$Ii,$T2 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x10,$HK, $T1,$Zmi + vmovdqu 0xb0-0x40($Htbl),$HK + vpxor $Ii,$T2,$T2 + vpxor $Xmi,$Zmi,$Zmi + + vmovdqu ($inp),$Ij # I[0] + vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo + vpshufb $bswap,$Ij,$Ij + vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi + vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 + vpxor $Tred,$Ij,$Ij + vpclmulqdq \$0x10,$HK, $T2,$Xmi + vpxor $Xi,$Ij,$Ij # accumulate $Xi + + lea 0x80($inp),$inp + sub \$0x80,$len + jnc .Loop8x_avx + + add \$0x80,$len + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -0x10($inp,$len),$Ii # very last word + lea ($inp,$len),$inp + vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 + vmovdqu 0x20-0x40($Htbl),$HK + vpshufb $bswap,$Ii,$Ij + + vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, + vmovdqa $Xhi,$Zhi # $Zhi and + vmovdqa $Xmi,$Zmi # $Zmi + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x20($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x30($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x50-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x40($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x50($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovdqu 0x80-0x40($Htbl),$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x60($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vpsrldq \$8,$HK,$HK + sub \$0x10,$len + jz .Ltail_avx + + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vmovdqu -0x70($inp),$Ii + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 + vpshufb $bswap,$Ii,$Ij + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + vmovq 0xb8-0x40($Htbl),$HK + sub \$0x10,$len + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor $Xi,$Ij,$Ij # accumulate $Xi +.Ltail_no_xor_avx: + vpunpckhqdq $Ij,$Ij,$T1 + vpxor $Xlo,$Zlo,$Zlo + vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo + vpxor $Ij,$T1,$T1 + vpxor $Xhi,$Zhi,$Zhi + vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi + vpxor $Xmi,$Zmi,$Zmi + vpclmulqdq \$0x00,$HK,$T1,$Xmi + + vmovdqu (%r10),$Tred + + vpxor $Xlo,$Zlo,$Xi + vpxor $Xhi,$Zhi,$Xo + vpxor $Xmi,$Zmi,$Zmi + + vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing + vpxor $Xo, $Zmi,$Zmi + vpslldq \$8, $Zmi,$T2 + vpsrldq \$8, $Zmi,$Zmi + vpxor $T2, $Xi, $Xi + vpxor $Zmi,$Xo, $Xo + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase + vpalignr \$8,$Xi,$Xi,$Xi + vpxor $Xo,$Xi,$Xi + vpxor $T2,$Xi,$Xi + + cmp \$0,$len + jne .Lshort_avx + + vpshufb $bswap,$Xi,$Xi + vmovdqu $Xi,($Xip) + vzeroupper +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp +.LSEH_end_gcm_ghash_avx: +___ +$code.=<<___; + ret +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} else { +$code.=<<___; + jmp .L_ghash_clmul +.size gcm_ghash_avx,.-gcm_ghash_avx +___ +} + +$code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: + .long 7,0,7,0 +.L7_mask_poly: + .long 7,0,`0xE1<<1`,0 +.align 64 +.type .Lrem_4bit,\@object +.Lrem_4bit: + .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` + .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` + .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` + .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` +.type .Lrem_8bit,\@object +.Lrem_8bit: + .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E + .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E + .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E + .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E + .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E + .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E + .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E + .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E + .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE + .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE + .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE + .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE + .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E + .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E + .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE + .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE + .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E + .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E + .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E + .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E + .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E + .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E + .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E + .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E + .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE + .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE + .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE + .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE + .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E + .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E + .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE + .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE + +.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>" +.align 64 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 24(%rax),%rax # adjust "rsp" + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_gcm_gmult_4bit + .rva .LSEH_end_gcm_gmult_4bit + .rva .LSEH_info_gcm_gmult_4bit + + .rva .LSEH_begin_gcm_ghash_4bit + .rva .LSEH_end_gcm_ghash_4bit + .rva .LSEH_info_gcm_ghash_4bit + + .rva .LSEH_begin_gcm_init_clmul + .rva .LSEH_end_gcm_init_clmul + .rva .LSEH_info_gcm_init_clmul + + .rva .LSEH_begin_gcm_ghash_clmul + .rva .LSEH_end_gcm_ghash_clmul + .rva .LSEH_info_gcm_ghash_clmul +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_gcm_init_avx + .rva .LSEH_end_gcm_init_avx + .rva .LSEH_info_gcm_init_clmul + + .rva .LSEH_begin_gcm_ghash_avx + .rva .LSEH_end_gcm_ghash_avx + .rva .LSEH_info_gcm_ghash_clmul +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_gcm_gmult_4bit: + .byte 9,0,0,0 + .rva se_handler + .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData +.LSEH_info_gcm_ghash_4bit: + .byte 9,0,0,0 + .rva se_handler + .rva .Lghash_prologue,.Lghash_epilogue # HandlerData +.LSEH_info_gcm_init_clmul: + .byte 0x01,0x08,0x03,0x00 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18 +.LSEH_info_gcm_ghash_clmul: + .byte 0x01,0x33,0x16,0x00 + .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 + .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 + .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 + .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 + .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 + .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/src/crypto/modes/asm/ghashv8-armx.pl b/src/crypto/modes/asm/ghashv8-armx.pl new file mode 100644 index 0000000..54a1ac4 --- /dev/null +++ b/src/crypto/modes/asm/ghashv8-armx.pl @@ -0,0 +1,241 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication. +# +# June 2014 +# +# Initial version was developed in tight cooperation with Ard +# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from +# other assembly modules. Just like aesv8-armx.pl this module +# supports both AArch32 and AArch64 execution modes. +# +# Current performance in cycles per processed byte: +# +# PMULL[2] 32-bit NEON(*) +# Apple A7 1.76 5.62 +# Cortex-A53 1.45 8.39 +# Cortex-A57 2.22 7.61 +# +# (*) presented for reference/comparison purposes; + +$flavour = shift; +open STDOUT,">".shift; + +$Xi="x0"; # argument block +$Htbl="x1"; +$inp="x2"; +$len="x3"; + +$inc="x12"; + +{ +my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); +my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14)); + +$code=<<___; +#include "arm_arch.h" + +.text +___ +$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); +$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); + +$code.=<<___; +.global gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + vld1.64 {$t1},[x1] @ load H + vmov.i8 $t0,#0xe1 + vext.8 $IN,$t1,$t1,#8 + vshl.i64 $t0,$t0,#57 + vshr.u64 $t2,$t0,#63 + vext.8 $t0,$t2,$t0,#8 @ t0=0xc2....01 + vdup.32 $t1,${t1}[1] + vshr.u64 $t3,$IN,#63 + vshr.s32 $t1,$t1,#31 @ broadcast carry bit + vand $t3,$t3,$t0 + vshl.i64 $IN,$IN,#1 + vext.8 $t3,$t3,$t3,#8 + vand $t0,$t0,$t1 + vorr $IN,$IN,$t3 @ H<<<=1 + veor $IN,$IN,$t0 @ twisted H + vst1.64 {$IN},[x0] + + ret +.size gcm_init_v8,.-gcm_init_v8 + +.global gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + vld1.64 {$t1},[$Xi] @ load Xi + vmov.i8 $t3,#0xe1 + vld1.64 {$H},[$Htbl] @ load twisted H + vshl.u64 $t3,$t3,#57 +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + vext.8 $Hhl,$H,$H,#8 + mov $len,#0 + vext.8 $IN,$t1,$t1,#8 + mov $inc,#0 + veor $Hhl,$Hhl,$H @ Karatsuba pre-processing + mov $inp,$Xi + b .Lgmult_v8 +.size gcm_gmult_v8,.-gcm_gmult_v8 + +.global gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: + vld1.64 {$Xl},[$Xi] @ load [rotated] Xi + subs $len,$len,#16 + vmov.i8 $t3,#0xe1 + mov $inc,#16 + vld1.64 {$H},[$Htbl] @ load twisted H + cclr $inc,eq + vext.8 $Xl,$Xl,$Xl,#8 + vshl.u64 $t3,$t3,#57 + vld1.64 {$t1},[$inp],$inc @ load [rotated] inp + vext.8 $Hhl,$H,$H,#8 +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl + vrev64.8 $t1,$t1 +#endif + veor $Hhl,$Hhl,$H @ Karatsuba pre-processing + vext.8 $IN,$t1,$t1,#8 + b .Loop_v8 + +.align 4 +.Loop_v8: + vext.8 $t2,$Xl,$Xl,#8 + veor $IN,$IN,$Xl @ inp^=Xi + veor $t1,$t1,$t2 @ $t1 is rotated inp^Xi + +.Lgmult_v8: + vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo + veor $t1,$t1,$IN @ Karatsuba pre-processing + vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi + subs $len,$len,#16 + vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + cclr $inc,eq + + vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing + veor $t2,$Xl,$Xh + veor $Xm,$Xm,$t1 + vld1.64 {$t1},[$inp],$inc @ load [rotated] inp + veor $Xm,$Xm,$t2 + vpmull.p64 $t2,$Xl,$t3 @ 1st phase + + vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result + vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl +#ifndef __ARMEB__ + vrev64.8 $t1,$t1 +#endif + veor $Xl,$Xm,$t2 + vext.8 $IN,$t1,$t1,#8 + + vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase + vpmull.p64 $Xl,$Xl,$t3 + veor $t2,$t2,$Xh + veor $Xl,$Xl,$t2 + b.hs .Loop_v8 + +#ifndef __ARMEB__ + vrev64.8 $Xl,$Xl +#endif + vext.8 $Xl,$Xl,$Xl,#8 + vst1.64 {$Xl},[$Xi] @ write out Xi + + ret +.size gcm_ghash_v8,.-gcm_ghash_v8 +___ +} +$code.=<<___; +.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +___ + +if ($flavour =~ /64/) { ######## 64-bit code + sub unvmov { + my $arg=shift; + + $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && + sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1; + } + foreach(split("\n",$code)) { + s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vmov\s+(.*)/unvmov($1)/geo or + s/vext\.8/ext/o or + s/vshr\.s/sshr\.s/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix + s/\bbx\s+lr\b/ret/o; + + s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers + s/@\s/\/\//o; # old->new style commentary + + # fix up remainig legacy suffixes + s/\.[ui]?8(\s)/$1/o; + s/\.[uis]?32//o and s/\.16b/\.4s/go; + m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument + m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments + s/\.[uisp]?64//o and s/\.16b/\.2d/go; + s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; + + print $_,"\n"; + } +} else { ######## 32-bit code + sub unvdup32 { + my $arg=shift; + + $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + sub unvpmullp64 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { + my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + $word |= 0x00010001 if ($mnemonic =~ "2"); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } + + foreach(split("\n",$code)) { + s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers + s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers + s/\/\/\s?/@ /o; # new->old style commentary + + # fix up remainig new-style suffixes + s/\],#[0-9]+/]!/o; + + s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or + s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or + s/^(\s+)b\./$1b/o or + s/^(\s+)ret/$1bx\tlr/o; + + print $_,"\n"; + } +} + +close STDOUT; # enforce flush diff --git a/src/crypto/modes/cbc.c b/src/crypto/modes/cbc.c new file mode 100644 index 0000000..a2ad26c --- /dev/null +++ b/src/crypto/modes/cbc.c @@ -0,0 +1,203 @@ +/* ==================================================================== + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== */ +#include <openssl/modes.h> + +#include <assert.h> +#include <string.h> + +#include "internal.h" + + +#ifndef STRICT_ALIGNMENT +# define STRICT_ALIGNMENT 0 +#endif + +void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const void *key, uint8_t ivec[16], + block128_f block) { + size_t n; + const uint8_t *iv = ivec; + + assert(in && out && key && ivec); + + if (STRICT_ALIGNMENT && + ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + while (len >= 16) { + for (n = 0; n < 16; ++n) { + out[n] = in[n] ^ iv[n]; + } + (*block)(out, out, key); + iv = out; + len -= 16; + in += 16; + out += 16; + } + } else { + while (len >= 16) { + for (n = 0; n < 16; n += sizeof(size_t)) { + *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(iv + n); + } + (*block)(out, out, key); + iv = out; + len -= 16; + in += 16; + out += 16; + } + } + + while (len) { + for (n = 0; n < 16 && n < len; ++n) { + out[n] = in[n] ^ iv[n]; + } + for (; n < 16; ++n) { + out[n] = iv[n]; + } + (*block)(out, out, key); + iv = out; + if (len <= 16) { + break; + } + len -= 16; + in += 16; + out += 16; + } + + memcpy(ivec, iv, 16); +} + +void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len, + const void *key, uint8_t ivec[16], + block128_f block) { + size_t n; + union { + size_t t[16 / sizeof(size_t)]; + uint8_t c[16]; + } tmp; + + assert(in && out && key && ivec); + + if (in != out) { + const uint8_t *iv = ivec; + + if (STRICT_ALIGNMENT && + ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + while (len >= 16) { + (*block)(in, out, key); + for (n = 0; n < 16; ++n) + out[n] ^= iv[n]; + iv = in; + len -= 16; + in += 16; + out += 16; + } + } else if (16 % sizeof(size_t) == 0) { /* always true */ + while (len >= 16) { + size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv; + + (*block)(in, out, key); + for (n = 0; n < 16 / sizeof(size_t); n++) + out_t[n] ^= iv_t[n]; + iv = in; + len -= 16; + in += 16; + out += 16; + } + } + memcpy(ivec, iv, 16); + } else { + if (STRICT_ALIGNMENT && + ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + uint8_t c; + while (len >= 16) { + (*block)(in, tmp.c, key); + for (n = 0; n < 16; ++n) { + c = in[n]; + out[n] = tmp.c[n] ^ ivec[n]; + ivec[n] = c; + } + len -= 16; + in += 16; + out += 16; + } + } else if (16 % sizeof(size_t) == 0) { /* always true */ + while (len >= 16) { + size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec; + const size_t *in_t = (const size_t *)in; + + (*block)(in, tmp.c, key); + for (n = 0; n < 16 / sizeof(size_t); n++) { + c = in_t[n]; + out_t[n] = tmp.t[n] ^ ivec_t[n]; + ivec_t[n] = c; + } + len -= 16; + in += 16; + out += 16; + } + } + } + + while (len) { + uint8_t c; + (*block)(in, tmp.c, key); + for (n = 0; n < 16 && n < len; ++n) { + c = in[n]; + out[n] = tmp.c[n] ^ ivec[n]; + ivec[n] = c; + } + if (len <= 16) { + for (; n < 16; ++n) { + ivec[n] = in[n]; + } + break; + } + len -= 16; + in += 16; + out += 16; + } +} diff --git a/src/crypto/modes/cfb.c b/src/crypto/modes/cfb.c new file mode 100644 index 0000000..738a438 --- /dev/null +++ b/src/crypto/modes/cfb.c @@ -0,0 +1,230 @@ +/* ==================================================================== + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== */ + +#include <openssl/modes.h> + +#include <assert.h> +#include <string.h> + +#include "internal.h" + + +void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const void *key, uint8_t ivec[16], int *num, int enc, + block128_f block) { + unsigned int n; + size_t l = 0; + + assert(in && out && key && ivec && num); + assert((16 % sizeof(size_t)) == 0); + + n = *num; + + if (enc) { + while (n && len) { + *(out++) = ivec[n] ^= *(in++); + --len; + n = (n + 1) % 16; + } +#if STRICT_ALIGNMENT + if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + while (l < len) { + if (n == 0) { + (*block)(ivec, ivec, key); + } + out[l] = ivec[n] ^= in[l]; + ++l; + n = (n + 1) % 16; + } + *num = n; + return; + } +#endif + while (len >= 16) { + (*block)(ivec, ivec, key); + for (; n < 16; n += sizeof(size_t)) { + *(size_t *)(out + n) = *(size_t *)(ivec + n) ^= *(size_t *)(in + n); + } + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block)(ivec, ivec, key); + while (len--) { + out[n] = ivec[n] ^= in[n]; + ++n; + } + } + *num = n; + return; + } else { + while (n && len) { + uint8_t c; + *(out++) = ivec[n] ^ (c = *(in++)); + ivec[n] = c; + --len; + n = (n + 1) % 16; + } + if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + while (l < len) { + unsigned char c; + if (n == 0) { + (*block)(ivec, ivec, key); + } + out[l] = ivec[n] ^ (c = in[l]); + ivec[n] = c; + ++l; + n = (n + 1) % 16; + } + *num = n; + return; + } + while (len >= 16) { + (*block)(ivec, ivec, key); + for (; n < 16; n += sizeof(size_t)) { + size_t t = *(size_t *)(in + n); + *(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t; + *(size_t *)(ivec + n) = t; + } + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block)(ivec, ivec, key); + while (len--) { + uint8_t c; + out[n] = ivec[n] ^ (c = in[n]); + ivec[n] = c; + ++n; + } + } + *num = n; + return; + } +} + + +/* This expects a single block of size nbits for both in and out. Note that + it corrupts any extra bits in the last byte of out */ +static void cfbr_encrypt_block(const uint8_t *in, uint8_t *out, unsigned nbits, + const void *key, uint8_t ivec[16], int enc, + block128_f block) { + int n, rem, num; + uint8_t ovec[16 * 2 + 1]; /* +1 because we dererefence (but don't use) one + byte off the end */ + + if (nbits <= 0 || nbits > 128) { + return; + } + + /* fill in the first half of the new IV with the current IV */ + memcpy(ovec, ivec, 16); + /* construct the new IV */ + (*block)(ivec, ivec, key); + num = (nbits + 7) / 8; + if (enc) { + /* encrypt the input */ + for (n = 0; n < num; ++n) { + out[n] = (ovec[16 + n] = in[n] ^ ivec[n]); + } + } else { + /* decrypt the input */ + for (n = 0; n < num; ++n) { + out[n] = (ovec[16 + n] = in[n]) ^ ivec[n]; + } + } + /* shift ovec left... */ + rem = nbits % 8; + num = nbits / 8; + if (rem == 0) { + memcpy(ivec, ovec + num, 16); + } else { + for (n = 0; n < 16; ++n) { + ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem); + } + } + + /* it is not necessary to cleanse ovec, since the IV is not secret */ +} + +/* N.B. This expects the input to be packed, MS bit first */ +void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits, + const void *key, uint8_t ivec[16], int *num, + int enc, block128_f block) { + size_t n; + uint8_t c[1], d[1]; + + assert(in && out && key && ivec && num); + assert(*num == 0); + + for (n = 0; n < bits; ++n) { + c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0; + cfbr_encrypt_block(c, d, 1, key, ivec, enc, block); + out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) | + ((d[0] & 0x80) >> (unsigned int)(n % 8)); + } +} + +void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const void *key, + unsigned char ivec[16], int *num, int enc, + block128_f block) { + size_t n; + + assert(in && out && key && ivec && num); + assert(*num == 0); + + for (n = 0; n < length; ++n) { + cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block); + } +} + diff --git a/src/crypto/modes/ctr.c b/src/crypto/modes/ctr.c new file mode 100644 index 0000000..61832ba --- /dev/null +++ b/src/crypto/modes/ctr.c @@ -0,0 +1,219 @@ +/* ==================================================================== + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== */ +#include <openssl/modes.h> + +#include <assert.h> +#include <string.h> + +#include "internal.h" + + +/* NOTE: the IV/counter CTR mode is big-endian. The code itself + * is endian-neutral. */ + +/* increment counter (128-bit int) by 1 */ +static void ctr128_inc(uint8_t *counter) { + uint32_t n = 16; + uint8_t c; + + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) { + return; + } + } while (n); +} + +/* The input encrypted as though 128bit counter mode is being used. The extra + * state information to record how much of the 128bit block we have used is + * contained in *num, and the encrypted counter is kept in ecount_buf. Both + * *num and ecount_buf must be initialised with zeros before the first call to + * CRYPTO_ctr128_encrypt(). + * + * This algorithm assumes that the counter is in the x lower bits of the IV + * (ivec), and that the application has full control over overflow and the rest + * of the IV. This implementation takes NO responsibility for checking that + * the counter doesn't overflow into the rest of the IV when incremented. */ +void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const void *key, uint8_t ivec[16], + uint8_t ecount_buf[16], unsigned int *num, + block128_f block) { + unsigned int n; + + assert(in && out && key && ecount_buf && num); + assert(*num < 16); + assert((16 % sizeof(size_t)) == 0); + + n = *num; + + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n + 1) % 16; + } + +#if STRICT_ALIGNMENT + if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + size_t l = 0; + while (l < len) { + if (n == 0) { + (*block)(ivec, ecount_buf, key); + ctr128_inc(ivec); + } + out[l] = in[l] ^ ecount_buf[n]; + ++l; + n = (n + 1) % 16; + } + + *num = n; + return; + } +#endif + + while (len >= 16) { + (*block)(ivec, ecount_buf, key); + ctr128_inc(ivec); + for (; n < 16; n += sizeof(size_t)) + *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n); + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block)(ivec, ecount_buf, key); + ctr128_inc(ivec); + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } + *num = n; +} + +/* increment upper 96 bits of 128-bit counter by 1 */ +static void ctr96_inc(uint8_t *counter) { + uint32_t n = 12; + uint8_t c; + + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) { + return; + } + } while (n); +} + +void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, + size_t len, const void *key, + uint8_t ivec[16], + uint8_t ecount_buf[16], + unsigned int *num, ctr128_f func) { + unsigned int n, ctr32; + + assert(in && out && key && ecount_buf && num); + assert(*num < 16); + + n = *num; + + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n + 1) % 16; + } + + ctr32 = GETU32(ivec + 12); + while (len >= 16) { + size_t blocks = len / 16; + /* 1<<28 is just a not-so-small yet not-so-large number... + * Below condition is practically never met, but it has to + * be checked for code correctness. */ + if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28)) + blocks = (1U << 28); + /* As (*func) operates on 32-bit counter, caller + * has to handle overflow. 'if' below detects the + * overflow, which is then handled by limiting the + * amount of blocks to the exact overflow point... */ + ctr32 += (uint32_t)blocks; + if (ctr32 < blocks) { + blocks -= ctr32; + ctr32 = 0; + } + (*func)(in, out, blocks, key, ivec); + /* (*func) does not update ivec, caller does: */ + PUTU32(ivec + 12, ctr32); + /* ... overflow was detected, propogate carry. */ + if (ctr32 == 0) + ctr96_inc(ivec); + blocks *= 16; + len -= blocks; + out += blocks; + in += blocks; + } + if (len) { + memset(ecount_buf, 0, 16); + (*func)(ecount_buf, ecount_buf, 1, key, ivec); + ++ctr32; + PUTU32(ivec + 12, ctr32); + if (ctr32 == 0) { + ctr96_inc(ivec); + } + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } + + *num = n; +} diff --git a/src/crypto/modes/gcm.c b/src/crypto/modes/gcm.c new file mode 100644 index 0000000..eeaeeff --- /dev/null +++ b/src/crypto/modes/gcm.c @@ -0,0 +1,1245 @@ +/* ==================================================================== + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== */ + +#include <openssl/modes.h> + +#include <assert.h> +#include <string.h> + +#include <openssl/mem.h> +#include <openssl/cpu.h> + +#include "internal.h" +#include "../internal.h" + + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ + defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) +#define GHASH_ASM +#endif + +#if defined(BSWAP4) && STRICT_ALIGNMENT == 1 +/* redefine, because alignment is ensured */ +#undef GETU32 +#define GETU32(p) BSWAP4(*(const uint32_t *)(p)) +#undef PUTU32 +#define PUTU32(p, v) *(uint32_t *)(p) = BSWAP4(v) +#endif + +#define PACK(s) ((size_t)(s) << (sizeof(size_t) * 8 - 16)) +#define REDUCE1BIT(V) \ + do { \ + if (sizeof(size_t) == 8) { \ + uint64_t T = OPENSSL_U64(0xe100000000000000) & (0 - (V.lo & 1)); \ + V.lo = (V.hi << 63) | (V.lo >> 1); \ + V.hi = (V.hi >> 1) ^ T; \ + } else { \ + uint32_t T = 0xe1000000U & (0 - (uint32_t)(V.lo & 1)); \ + V.lo = (V.hi << 63) | (V.lo >> 1); \ + V.hi = (V.hi >> 1) ^ ((uint64_t)T << 32); \ + } \ + } while (0) + + +static void gcm_init_4bit(u128 Htable[16], uint64_t H[2]) { + u128 V; + + Htable[0].hi = 0; + Htable[0].lo = 0; + V.hi = H[0]; + V.lo = H[1]; + + Htable[8] = V; + REDUCE1BIT(V); + Htable[4] = V; + REDUCE1BIT(V); + Htable[2] = V; + REDUCE1BIT(V); + Htable[1] = V; + Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo; + V = Htable[4]; + Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo; + Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo; + Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo; + V = Htable[8]; + Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo; + Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo; + Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo; + Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo; + Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo; + Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo; + Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo; + +#if defined(GHASH_ASM) && defined(OPENSSL_ARM) + /* ARM assembler expects specific dword order in Htable. */ + { + int j; + const union { + long one; + char little; + } is_endian = {1}; + + if (is_endian.little) { + for (j = 0; j < 16; ++j) { + V = Htable[j]; + Htable[j].hi = V.lo; + Htable[j].lo = V.hi; + } + } else { + for (j = 0; j < 16; ++j) { + V = Htable[j]; + Htable[j].hi = V.lo << 32 | V.lo >> 32; + Htable[j].lo = V.hi << 32 | V.hi >> 32; + } + } + } +#endif +} + +#if !defined(GHASH_ASM) || defined(OPENSSL_AARCH64) +static const size_t rem_4bit[16] = { + PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460), + PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0), + PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560), + PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)}; + +static void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]) { + u128 Z; + int cnt = 15; + size_t rem, nlo, nhi; + const union { + long one; + char little; + } is_endian = {1}; + + nlo = ((const uint8_t *)Xi)[15]; + nhi = nlo >> 4; + nlo &= 0xf; + + Z.hi = Htable[nlo].hi; + Z.lo = Htable[nlo].lo; + + while (1) { + rem = (size_t)Z.lo & 0xf; + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + if (sizeof(size_t) == 8) { + Z.hi ^= rem_4bit[rem]; + } else { + Z.hi ^= (uint64_t)rem_4bit[rem] << 32; + } + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + + if (--cnt < 0) { + break; + } + + nlo = ((const uint8_t *)Xi)[cnt]; + nhi = nlo >> 4; + nlo &= 0xf; + + rem = (size_t)Z.lo & 0xf; + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + if (sizeof(size_t) == 8) { + Z.hi ^= rem_4bit[rem]; + } else { + Z.hi ^= (uint64_t)rem_4bit[rem] << 32; + } + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + } + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + uint8_t *p = (uint8_t *)Xi; + uint32_t v; + v = (uint32_t)(Z.hi >> 32); + PUTU32(p, v); + v = (uint32_t)(Z.hi); + PUTU32(p + 4, v); + v = (uint32_t)(Z.lo >> 32); + PUTU32(p + 8, v); + v = (uint32_t)(Z.lo); + PUTU32(p + 12, v); +#endif + } else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} + +/* Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for + * details... Compiler-generated code doesn't seem to give any + * performance improvement, at least not on x86[_64]. It's here + * mostly as reference and a placeholder for possible future + * non-trivial optimization[s]... */ +static void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len) { + u128 Z; + int cnt; + size_t rem, nlo, nhi; + const union { + long one; + char little; + } is_endian = {1}; + + do { + cnt = 15; + nlo = ((const uint8_t *)Xi)[15]; + nlo ^= inp[15]; + nhi = nlo >> 4; + nlo &= 0xf; + + Z.hi = Htable[nlo].hi; + Z.lo = Htable[nlo].lo; + + while (1) { + rem = (size_t)Z.lo & 0xf; + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + if (sizeof(size_t) == 8) { + Z.hi ^= rem_4bit[rem]; + } else { + Z.hi ^= (uint64_t)rem_4bit[rem] << 32; + } + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + + if (--cnt < 0) { + break; + } + + nlo = ((const uint8_t *)Xi)[cnt]; + nlo ^= inp[cnt]; + nhi = nlo >> 4; + nlo &= 0xf; + + rem = (size_t)Z.lo & 0xf; + Z.lo = (Z.hi << 60) | (Z.lo >> 4); + Z.hi = (Z.hi >> 4); + if (sizeof(size_t) == 8) { + Z.hi ^= rem_4bit[rem]; + } else { + Z.hi ^= (uint64_t)rem_4bit[rem] << 32; + } + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + } + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + uint8_t *p = (uint8_t *)Xi; + uint32_t v; + v = (uint32_t)(Z.hi >> 32); + PUTU32(p, v); + v = (uint32_t)(Z.hi); + PUTU32(p + 4, v); + v = (uint32_t)(Z.lo >> 32); + PUTU32(p + 8, v); + v = (uint32_t)(Z.lo); + PUTU32(p + 12, v); +#endif + } else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } + } while (inp += 16, len -= 16); +} +#else /* GHASH_ASM */ +void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len); +#endif + +#define GCM_MUL(ctx, Xi) gcm_gmult_4bit(ctx->Xi.u, ctx->Htable) +#if defined(GHASH_ASM) +#define GHASH(ctx, in, len) gcm_ghash_4bit((ctx)->Xi.u, (ctx)->Htable, in, len) +/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache + * trashing effect. In other words idea is to hash data while it's + * still in L1 cache after encryption pass... */ +#define GHASH_CHUNK (3 * 1024) +#endif + + +#if defined(GHASH_ASM) +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +#define GHASH_ASM_X86_OR_64 +#define GCM_FUNCREF_4BIT +void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]); +void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]); +void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len); + +#if defined(OPENSSL_X86) +#define gcm_init_avx gcm_init_clmul +#define gcm_gmult_avx gcm_gmult_clmul +#define gcm_ghash_avx gcm_ghash_clmul +#else +void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]); +void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]); +void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, size_t len); +#endif + +#if defined(OPENSSL_X86) +#define GHASH_ASM_X86 +void gcm_gmult_4bit_mmx(uint64_t Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit_mmx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len); + +void gcm_gmult_4bit_x86(uint64_t Xi[2], const u128 Htable[16]); +void gcm_ghash_4bit_x86(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len); +#endif +#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) +#include "../arm_arch.h" +#if __ARM_ARCH__ >= 7 +#define GHASH_ASM_ARM +#define GCM_FUNCREF_4BIT + +static int pmull_capable() { + return (OPENSSL_armcap_P & ARMV8_PMULL) != 0; +} + +void gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]); +void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]); +void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len); + +#if defined(OPENSSL_ARM) +/* 32-bit ARM also has support for doing GCM with NEON instructions. */ +static int neon_capable() { + return CRYPTO_is_NEON_capable(); +} + +void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]); +void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]); +void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len); +#else +/* AArch64 only has the ARMv8 versions of functions. */ +static int neon_capable() { + return 0; +} +void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) { + abort(); +} +void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) { + abort(); +} +void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len) { + abort(); +} +#endif + +#endif +#endif +#endif + +#ifdef GCM_FUNCREF_4BIT +#undef GCM_MUL +#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)(ctx->Xi.u, ctx->Htable) +#ifdef GHASH +#undef GHASH +#define GHASH(ctx, in, len) (*gcm_ghash_p)(ctx->Xi.u, ctx->Htable, in, len) +#endif +#endif + +GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block) { + GCM128_CONTEXT *ret; + + ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT)); + if (ret != NULL) { + CRYPTO_gcm128_init(ret, key, block); + } + + return ret; +} + +void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block) { + const union { + long one; + char little; + } is_endian = {1}; + + memset(ctx, 0, sizeof(*ctx)); + ctx->block = block; + ctx->key = key; + + (*block)(ctx->H.c, ctx->H.c, key); + + if (is_endian.little) { +/* H is stored in host byte order */ +#ifdef BSWAP8 + ctx->H.u[0] = BSWAP8(ctx->H.u[0]); + ctx->H.u[1] = BSWAP8(ctx->H.u[1]); +#else + uint8_t *p = ctx->H.c; + uint64_t hi, lo; + hi = (uint64_t)GETU32(p) << 32 | GETU32(p + 4); + lo = (uint64_t)GETU32(p + 8) << 32 | GETU32(p + 12); + ctx->H.u[0] = hi; + ctx->H.u[1] = lo; +#endif + } + +#if defined(GHASH_ASM_X86_OR_64) + if (crypto_gcm_clmul_enabled()) { + if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */ + gcm_init_avx(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_avx; + ctx->ghash = gcm_ghash_avx; + } else { + gcm_init_clmul(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_clmul; + ctx->ghash = gcm_ghash_clmul; + } + return; + } + gcm_init_4bit(ctx->Htable, ctx->H.u); +#if defined(GHASH_ASM_X86) /* x86 only */ + if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */ + ctx->gmult = gcm_gmult_4bit_mmx; + ctx->ghash = gcm_ghash_4bit_mmx; + } else { + ctx->gmult = gcm_gmult_4bit_x86; + ctx->ghash = gcm_ghash_4bit_x86; + } +#else + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; +#endif +#elif defined(GHASH_ASM_ARM) + if (pmull_capable()) { + gcm_init_v8(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_v8; + ctx->ghash = gcm_ghash_v8; + } else if (neon_capable()) { + gcm_init_neon(ctx->Htable,ctx->H.u); + ctx->gmult = gcm_gmult_neon; + ctx->ghash = gcm_ghash_neon; + } else { + gcm_init_4bit(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; + } +#else + gcm_init_4bit(ctx->Htable, ctx->H.u); + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; +#endif +} + +void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const uint8_t *iv, size_t len) { + const union { + long one; + char little; + } is_endian = {1}; + unsigned int ctr; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult; +#endif + + ctx->Yi.u[0] = 0; + ctx->Yi.u[1] = 0; + ctx->Xi.u[0] = 0; + ctx->Xi.u[1] = 0; + ctx->len.u[0] = 0; /* AAD length */ + ctx->len.u[1] = 0; /* message length */ + ctx->ares = 0; + ctx->mres = 0; + + if (len == 12) { + memcpy(ctx->Yi.c, iv, 12); + ctx->Yi.c[15] = 1; + ctr = 1; + } else { + size_t i; + uint64_t len0 = len; + + while (len >= 16) { + for (i = 0; i < 16; ++i) { + ctx->Yi.c[i] ^= iv[i]; + } + GCM_MUL(ctx, Yi); + iv += 16; + len -= 16; + } + if (len) { + for (i = 0; i < len; ++i) { + ctx->Yi.c[i] ^= iv[i]; + } + GCM_MUL(ctx, Yi); + } + len0 <<= 3; + if (is_endian.little) { +#ifdef BSWAP8 + ctx->Yi.u[1] ^= BSWAP8(len0); +#else + ctx->Yi.c[8] ^= (uint8_t)(len0 >> 56); + ctx->Yi.c[9] ^= (uint8_t)(len0 >> 48); + ctx->Yi.c[10] ^= (uint8_t)(len0 >> 40); + ctx->Yi.c[11] ^= (uint8_t)(len0 >> 32); + ctx->Yi.c[12] ^= (uint8_t)(len0 >> 24); + ctx->Yi.c[13] ^= (uint8_t)(len0 >> 16); + ctx->Yi.c[14] ^= (uint8_t)(len0 >> 8); + ctx->Yi.c[15] ^= (uint8_t)(len0); +#endif + } else { + ctx->Yi.u[1] ^= len0; + } + + GCM_MUL(ctx, Yi); + + if (is_endian.little) { + ctr = GETU32(ctx->Yi.c + 12); + } else { + ctr = ctx->Yi.d[3]; + } + } + + (*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } +} + +int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) { + size_t i; + unsigned int n; + uint64_t alen = ctx->len.u[0]; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult; +#ifdef GHASH + void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len) = ctx->ghash; +#endif +#endif + + if (ctx->len.u[1]) { + return 0; + } + + alen += len; + if (alen > (OPENSSL_U64(1) << 61) || (sizeof(len) == 8 && alen < len)) { + return 0; + } + ctx->len.u[0] = alen; + + n = ctx->ares; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(aad++); + --len; + n = (n + 1) % 16; + } + if (n == 0) { + GCM_MUL(ctx, Xi); + } else { + ctx->ares = n; + return 1; + } + } + +#ifdef GHASH + if ((i = (len & (size_t) - 16))) { + GHASH(ctx, aad, i); + aad += i; + len -= i; + } +#else + while (len >= 16) { + for (i = 0; i < 16; ++i) { + ctx->Xi.c[i] ^= aad[i]; + } + GCM_MUL(ctx, Xi); + aad += 16; + len -= 16; + } +#endif + if (len) { + n = (unsigned int)len; + for (i = 0; i < len; ++i) + ctx->Xi.c[i] ^= aad[i]; + } + + ctx->ares = n; + return 1; +} + +int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const unsigned char *in, + unsigned char *out, size_t len) { + const union { + long one; + char little; + } is_endian = {1}; + unsigned int n, ctr; + size_t i; + uint64_t mlen = ctx->len.u[1]; + block128_f block = ctx->block; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult; +#ifdef GHASH + void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len) = ctx->ghash; +#endif +#endif + + mlen += len; + if (mlen > ((OPENSSL_U64(1) << 36) - 32) || + (sizeof(len) == 8 && mlen < len)) { + return 0; + } + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to encrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx, Xi); + ctx->ares = 0; + } + + if (is_endian.little) { + ctr = GETU32(ctx->Yi.c + 12); + } else { + ctr = ctx->Yi.d[3]; + } + + n = ctx->mres; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n]; + --len; + n = (n + 1) % 16; + } + if (n == 0) { + GCM_MUL(ctx, Xi); + } else { + ctx->mres = n; + return 1; + } + } + if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out) % sizeof(size_t) != 0) { + for (i = 0; i < len; ++i) { + if (n == 0) { + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + } + ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n]; + n = (n + 1) % 16; + if (n == 0) { + GCM_MUL(ctx, Xi); + } + } + + ctx->mres = n; + return 1; + } +#if defined(GHASH) && defined(GHASH_CHUNK) + while (len >= GHASH_CHUNK) { + size_t j = GHASH_CHUNK; + + while (j) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + for (i = 0; i < 16 / sizeof(size_t); ++i) { + out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + } + out += 16; + in += 16; + j -= 16; + } + GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK); + len -= GHASH_CHUNK; + } + if ((i = (len & (size_t) - 16))) { + size_t j = i; + + while (len >= 16) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + for (i = 0; i < 16 / sizeof(size_t); ++i) { + out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + } + out += 16; + in += 16; + len -= 16; + } + GHASH(ctx, out - j, j); + } +#else + while (len >= 16) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + for (i = 0; i < 16 / sizeof(size_t); ++i) { + ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + } + GCM_MUL(ctx, Xi); + out += 16; + in += 16; + len -= 16; + } +#endif + if (len) { + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + while (len--) { + ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 1; +} + +int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const unsigned char *in, + unsigned char *out, size_t len) { + const union { + long one; + char little; + } is_endian = {1}; + unsigned int n, ctr; + size_t i; + uint64_t mlen = ctx->len.u[1]; + block128_f block = ctx->block; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult; +#ifdef GHASH + void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len) = ctx->ghash; +#endif +#endif + + mlen += len; + if (mlen > ((OPENSSL_U64(1) << 36) - 32) || + (sizeof(len) == 8 && mlen < len)) { + return 0; + } + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to decrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx, Xi); + ctx->ares = 0; + } + + if (is_endian.little) { + ctr = GETU32(ctx->Yi.c + 12); + } else { + ctr = ctx->Yi.d[3]; + } + + n = ctx->mres; + if (n) { + while (n && len) { + uint8_t c = *(in++); + *(out++) = c ^ ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + --len; + n = (n + 1) % 16; + } + if (n == 0) { + GCM_MUL(ctx, Xi); + } else { + ctx->mres = n; + return 1; + } + } + if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out) % sizeof(size_t) != 0) { + for (i = 0; i < len; ++i) { + uint8_t c; + if (n == 0) { + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + } + c = in[i]; + out[i] = c ^ ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + n = (n + 1) % 16; + if (n == 0) { + GCM_MUL(ctx, Xi); + } + } + + ctx->mres = n; + return 1; + } +#if defined(GHASH) && defined(GHASH_CHUNK) + while (len >= GHASH_CHUNK) { + size_t j = GHASH_CHUNK; + + GHASH(ctx, in, GHASH_CHUNK); + while (j) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + for (i = 0; i < 16 / sizeof(size_t); ++i) { + out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + } + out += 16; + in += 16; + j -= 16; + } + len -= GHASH_CHUNK; + } + if ((i = (len & (size_t) - 16))) { + GHASH(ctx, in, i); + while (len >= 16) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + for (i = 0; i < 16 / sizeof(size_t); ++i) { + out_t[i] = in_t[i] ^ ctx->EKi.t[i]; + } + out += 16; + in += 16; + len -= 16; + } + } +#else + while (len >= 16) { + size_t *out_t = (size_t *)out; + const size_t *in_t = (const size_t *)in; + + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + for (i = 0; i < 16 / sizeof(size_t); ++i) { + size_t c = in_t[i]; + out_t[i] = c ^ ctx->EKi.t[i]; + ctx->Xi.t[i] ^= c; + } + GCM_MUL(ctx, Xi); + out += 16; + in += 16; + len -= 16; + } +#endif + if (len) { + (*block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + while (len--) { + uint8_t c = in[n]; + ctx->Xi.c[n] ^= c; + out[n] = c ^ ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 1; +} + +int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const uint8_t *in, + uint8_t *out, size_t len, ctr128_f stream) { + const union { + long one; + char little; + } is_endian = {1}; + unsigned int n, ctr; + size_t i; + uint64_t mlen = ctx->len.u[1]; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult; +#ifdef GHASH + void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len) = ctx->ghash; +#endif +#endif + + mlen += len; + if (mlen > ((OPENSSL_U64(1) << 36) - 32) || + (sizeof(len) == 8 && mlen < len)) { + return 0; + } + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to encrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx, Xi); + ctx->ares = 0; + } + + if (is_endian.little) { + ctr = GETU32(ctx->Yi.c + 12); + } else { + ctr = ctx->Yi.d[3]; + } + + n = ctx->mres; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n]; + --len; + n = (n + 1) % 16; + } + if (n == 0) { + GCM_MUL(ctx, Xi); + } else { + ctx->mres = n; + return 1; + } + } +#if defined(GHASH) + while (len >= GHASH_CHUNK) { + (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c); + ctr += GHASH_CHUNK / 16; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + GHASH(ctx, out, GHASH_CHUNK); + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } +#endif + if ((i = (len & (size_t) - 16))) { + size_t j = i / 16; + + (*stream)(in, out, j, key, ctx->Yi.c); + ctr += (unsigned int)j; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + in += i; + len -= i; +#if defined(GHASH) + GHASH(ctx, out, i); + out += i; +#else + while (j--) { + for (i = 0; i < 16; ++i) { + ctx->Xi.c[i] ^= out[i]; + } + GCM_MUL(ctx, Xi); + out += 16; + } +#endif + } + if (len) { + (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) { + PUTU32(ctx->Yi.c + 12, ctr); + } else { + ctx->Yi.d[3] = ctr; + } + while (len--) { + ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 1; +} + +int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const uint8_t *in, + uint8_t *out, size_t len, + ctr128_f stream) { + const union { + long one; + char little; + } is_endian = {1}; + unsigned int n, ctr; + size_t i; + uint64_t mlen = ctx->len.u[1]; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult; +#ifdef GHASH + void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len) = ctx->ghash; +#endif +#endif + + mlen += len; + if (mlen > ((OPENSSL_U64(1) << 36) - 32) || + (sizeof(len) == 8 && mlen < len)) { + return 0; + } + ctx->len.u[1] = mlen; + + if (ctx->ares) { + /* First call to decrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx, Xi); + ctx->ares = 0; + } + + if (is_endian.little) { + ctr = GETU32(ctx->Yi.c + 12); + } else { + ctr = ctx->Yi.d[3]; + } + + n = ctx->mres; + if (n) { + while (n && len) { + uint8_t c = *(in++); + *(out++) = c ^ ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + --len; + n = (n + 1) % 16; + } + if (n == 0) { + GCM_MUL(ctx, Xi); + } else { + ctx->mres = n; + return 1; + } + } +#if defined(GHASH) + while (len >= GHASH_CHUNK) { + GHASH(ctx, in, GHASH_CHUNK); + (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c); + ctr += GHASH_CHUNK / 16; + if (is_endian.little) + PUTU32(ctx->Yi.c + 12, ctr); + else + ctx->Yi.d[3] = ctr; + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } +#endif + if ((i = (len & (size_t) - 16))) { + size_t j = i / 16; + +#if defined(GHASH) + GHASH(ctx, in, i); +#else + while (j--) { + size_t k; + for (k = 0; k < 16; ++k) + ctx->Xi.c[k] ^= in[k]; + GCM_MUL(ctx, Xi); + in += 16; + } + j = i / 16; + in -= i; +#endif + (*stream)(in, out, j, key, ctx->Yi.c); + ctr += (unsigned int)j; + if (is_endian.little) + PUTU32(ctx->Yi.c + 12, ctr); + else + ctx->Yi.d[3] = ctr; + out += i; + in += i; + len -= i; + } + if (len) { + (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c + 12, ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + uint8_t c = in[n]; + ctx->Xi.c[n] ^= c; + out[n] = c ^ ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 1; +} + +int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) { + const union { + long one; + char little; + } is_endian = {1}; + uint64_t alen = ctx->len.u[0] << 3; + uint64_t clen = ctx->len.u[1] << 3; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult; +#endif + + if (ctx->mres || ctx->ares) { + GCM_MUL(ctx, Xi); + } + + if (is_endian.little) { +#ifdef BSWAP8 + alen = BSWAP8(alen); + clen = BSWAP8(clen); +#else + uint8_t *p = ctx->len.c; + + ctx->len.u[0] = alen; + ctx->len.u[1] = clen; + + alen = (uint64_t)GETU32(p) << 32 | GETU32(p + 4); + clen = (uint64_t)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + } + + ctx->Xi.u[0] ^= alen; + ctx->Xi.u[1] ^= clen; + GCM_MUL(ctx, Xi); + + ctx->Xi.u[0] ^= ctx->EK0.u[0]; + ctx->Xi.u[1] ^= ctx->EK0.u[1]; + + if (tag && len <= sizeof(ctx->Xi)) { + return CRYPTO_memcmp(ctx->Xi.c, tag, len) == 0; + } else { + return 0; + } +} + +void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) { + CRYPTO_gcm128_finish(ctx, NULL, 0); + memcpy(tag, ctx->Xi.c, len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c)); +} + +void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) { + if (ctx) { + OPENSSL_cleanse(ctx, sizeof(*ctx)); + OPENSSL_free(ctx); + } +} + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +int crypto_gcm_clmul_enabled(void) { +#ifdef GHASH_ASM + return OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */ + OPENSSL_ia32cap_P[1] & (1 << 1); /* check PCLMULQDQ bit */ +#else + return 0; +#endif +} +#endif diff --git a/src/crypto/modes/gcm_test.c b/src/crypto/modes/gcm_test.c new file mode 100644 index 0000000..29f9d95 --- /dev/null +++ b/src/crypto/modes/gcm_test.c @@ -0,0 +1,435 @@ +/* ==================================================================== + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== */ + +#include <stdio.h> +#include <string.h> + +#include <openssl/aes.h> +#include <openssl/crypto.h> +#include <openssl/mem.h> +#include <openssl/modes.h> + +#include "internal.h" + + +struct test_case { + const char *key; + const char *plaintext; + const char *additional_data; + const char *nonce; + const char *ciphertext; + const char *tag; +}; + +static const struct test_case test_cases[] = { + { + "00000000000000000000000000000000", + NULL, + NULL, + "000000000000000000000000", + NULL, + "58e2fccefa7e3061367f1d57a4e7455a", + }, + { + "00000000000000000000000000000000", + "00000000000000000000000000000000", + NULL, + "000000000000000000000000", + "0388dace60b6a392f328c2b971b2fe78", + "ab6e47d42cec13bdf53a67b21257bddf", + }, + { + "feffe9928665731c6d6a8f9467308308", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", + NULL, + "cafebabefacedbaddecaf888", + "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091473f5985", + "4d5c2af327cd64a62cf35abd2ba6fab4", + }, + { + "feffe9928665731c6d6a8f9467308308", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "cafebabefacedbaddecaf888", + "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091", + "5bc94fbc3221a5db94fae95ae7121a47", + }, + { + "feffe9928665731c6d6a8f9467308308", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "cafebabefacedbad", + "61353b4c2806934a777ff51fa22a4755699b2a714fcdc6f83766e5f97b6c742373806900e49f24b22b097544d4896b424989b5e1ebac0f07c23f4598", + "3612d2e79e3b0785561be14aaca2fccb", + }, + { + "feffe9928665731c6d6a8f9467308308", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", + "8ce24998625615b603a033aca13fb894be9112a5c3a211a8ba262a3cca7e2ca701e4a9a4fba43c90ccdcb281d48c7c6fd62875d2aca417034c34aee5", + "619cc5aefffe0bfa462af43c1699d050", + }, + { + "000000000000000000000000000000000000000000000000", + NULL, + NULL, + "000000000000000000000000", + NULL, + "cd33b28ac773f74ba00ed1f312572435", + }, + { + "000000000000000000000000000000000000000000000000", + "00000000000000000000000000000000", + NULL, + "000000000000000000000000", + "98e7247c07f0fe411c267e4384b0f600", + "2ff58d80033927ab8ef4d4587514f0fb", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", + NULL, + "cafebabefacedbaddecaf888", + "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710acade256", + "9924a7c8587336bfb118024db8674a14", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "cafebabefacedbaddecaf888", + "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710", + "2519498e80f1478f37ba55bd6d27618c", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "cafebabefacedbad", + "0f10f599ae14a154ed24b36e25324db8c566632ef2bbb34f8347280fc4507057fddc29df9a471f75c66541d4d4dad1c9e93a19a58e8b473fa0f062f7", + "65dcc57fcf623a24094fcca40d3533f8", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "cafebabefacedbad", + "0f10f599ae14a154ed24b36e25324db8c566632ef2bbb34f8347280fc4507057fddc29df9a471f75c66541d4d4dad1c9e93a19a58e8b473fa0f062f7", + "65dcc57fcf623a24094fcca40d3533f8", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", + "d27e88681ce3243c4830165a8fdcf9ff1de9a1d8e6b447ef6ef7b79828666e4581e79012af34ddd9e2f037589b292db3e67c036745fa22e7e9b7373b", + "dcf566ff291c25bbb8568fc3d376a6d9", + }, + { + "0000000000000000000000000000000000000000000000000000000000000000", + NULL, + NULL, + "000000000000000000000000", + NULL, + "530f8afbc74536b9a963b4f1c4cb738b", + }, + { + "0000000000000000000000000000000000000000000000000000000000000000", + "00000000000000000000000000000000", + NULL, + "000000000000000000000000", + "cea7403d4d606b6e074ec5d3baf39d18", + "d0d1c8a799996bf0265b98b5d48ab919", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255", + NULL, + "cafebabefacedbaddecaf888", + "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad", + "b094dac5d93471bdec1a502270e3cc6c", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "cafebabefacedbaddecaf888", + "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662", + "76fc6ece0f4e1768cddf8853bb2d551b", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "cafebabefacedbad", + "c3762df1ca787d32ae47c13bf19844cbaf1ae14d0b976afac52ff7d79bba9de0feb582d33934a4f0954cc2363bc73f7862ac430e64abe499f47c9b1f", + "3a337dbf46a792c45e454913fe2ea8f2", + }, + { + "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308", + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39", + "feedfacedeadbeeffeedfacedeadbeefabaddad2", + "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b", + "5a8def2f0c9e53f1f75d7853659e2a20eeb2b22aafde6419a058ab4f6f746bf40fc0c3b780f244452da3ebf1c5d82cdea2418997200ef82e44ae7e3f", + "a44a8266ee1c8eb0c8b5d4cf5ae9f19a", + }, + { + "00000000000000000000000000000000", + NULL, + "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad", + "000000000000000000000000", + NULL, + "5fea793a2d6f974d37e68e0cb8ff9492", + }, + { + "00000000000000000000000000000000", + "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", + NULL, + /* This nonce results in 0xfff in counter LSB. */ + "ffffffff000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", + "56b3373ca9ef6e4a2b64fe1e9a17b61425f10d47a75a5fce13efc6bc784af24f4141bdd48cf7c770887afd573cca5418a9aeffcd7c5ceddfc6a78397b9a85b499da558257267caab2ad0b23ca476a53cb17fb41c4b8b475cb4f3f7165094c229c9e8c4dc0a2a5ff1903e501511221376a1cdb8364c5061a20cae74bc4acd76ceb0abc9fd3217ef9f8c90be402ddf6d8697f4f880dff15bfb7a6b28241ec8fe183c2d59e3f9dfff653c7126f0acb9e64211f42bae12af462b1070bef1ab5e3606872ca10dee15b3249b1a1b958f23134c4bccb7d03200bce420a2f8eb66dcf3644d1423c1b5699003c13ecef4bf38a3b60eedc34033bac1902783dc6d89e2e774188a439c7ebcc0672dbda4ddcfb2794613b0be41315ef778708a70ee7d75165c", + "8b307f6b33286d0ab026a9ed3fe1e85f", + }, +}; + +static int from_hex(uint8_t *out, char in) { + if (in >= '0' && in <= '9') { + *out = in - '0'; + return 1; + } + if (in >= 'a' && in <= 'f') { + *out = in - 'a' + 10; + return 1; + } + if (in >= 'A' && in <= 'F') { + *out = in - 'A' + 10; + return 1; + } + + return 0; +} + +static int decode_hex(uint8_t **out, size_t *out_len, const char *in, + unsigned test_num, const char *description) { + uint8_t *buf = NULL; + size_t i; + + if (in == NULL) { + *out = NULL; + *out_len = 0; + return 1; + } + + size_t len = strlen(in); + if (len & 1) { + fprintf(stderr, "%u: Odd-length %s input.\n", test_num, description); + goto err; + } + + buf = OPENSSL_malloc(len / 2); + if (buf == NULL) { + fprintf(stderr, "%u: malloc failure.\n", test_num); + goto err; + } + + for (i = 0; i < len; i += 2) { + uint8_t v, v2; + if (!from_hex(&v, in[i]) || + !from_hex(&v2, in[i+1])) { + fprintf(stderr, "%u: invalid hex digit in %s around offset %u.\n", + test_num, description, (unsigned)i); + goto err; + } + buf[i/2] = (v << 4) | v2; + } + + *out = buf; + *out_len = len/2; + return 1; + +err: + if (buf) { + OPENSSL_free(buf); + } + return 0; +} + +void hexdump(const char *msg, const void *in, size_t len) { + const uint8_t *data = in; + size_t i; + + fprintf(stderr, "%s: ", msg); + for (i = 0; i < len; i++) { + fprintf(stderr, "%02x", data[i]); + } + fprintf(stderr, "\n"); +} + +static int run_test_case(unsigned test_num, const struct test_case *test) { + size_t key_len, plaintext_len, additional_data_len, nonce_len, ciphertext_len, + tag_len; + uint8_t *key = NULL, *plaintext = NULL, *additional_data = NULL, + *nonce = NULL, *ciphertext = NULL, *tag = NULL, *out = NULL; + int ret = 0; + AES_KEY aes_key; + GCM128_CONTEXT ctx; + + if (!decode_hex(&key, &key_len, test->key, test_num, "key") || + !decode_hex(&plaintext, &plaintext_len, test->plaintext, test_num, + "plaintext") || + !decode_hex(&additional_data, &additional_data_len, test->additional_data, + test_num, "additional_data") || + !decode_hex(&nonce, &nonce_len, test->nonce, test_num, "nonce") || + !decode_hex(&ciphertext, &ciphertext_len, test->ciphertext, test_num, + "ciphertext") || + !decode_hex(&tag, &tag_len, test->tag, test_num, "tag")) { + goto out; + } + + if (plaintext_len != ciphertext_len) { + fprintf(stderr, "%u: plaintext and ciphertext have differing lengths.\n", + test_num); + goto out; + } + + if (key_len != 16 && key_len != 24 && key_len != 32) { + fprintf(stderr, "%u: bad key length.\n", test_num); + goto out; + } + + if (tag_len != 16) { + fprintf(stderr, "%u: bad tag length.\n", test_num); + goto out; + } + + out = OPENSSL_malloc(plaintext_len); + if (AES_set_encrypt_key(key, key_len*8, &aes_key)) { + fprintf(stderr, "%u: AES_set_encrypt_key failed.\n", test_num); + goto out; + } + + CRYPTO_gcm128_init(&ctx, &aes_key, (block128_f) AES_encrypt); + CRYPTO_gcm128_setiv(&ctx, nonce, nonce_len); + memset(out, 0, plaintext_len); + if (additional_data) { + CRYPTO_gcm128_aad(&ctx, additional_data, additional_data_len); + } + if (plaintext) { + CRYPTO_gcm128_encrypt(&ctx, plaintext, out, plaintext_len); + } + if (!CRYPTO_gcm128_finish(&ctx, tag, tag_len) || + (ciphertext && memcmp(out, ciphertext, plaintext_len) != 0)) { + fprintf(stderr, "%u: encrypt failed.\n", test_num); + hexdump("got ", out, plaintext_len); + hexdump("want", ciphertext, plaintext_len); + goto out; + } + + CRYPTO_gcm128_setiv(&ctx, nonce, nonce_len); + memset(out, 0, plaintext_len); + if (additional_data) { + CRYPTO_gcm128_aad(&ctx, additional_data, additional_data_len); + } + if (ciphertext) { + CRYPTO_gcm128_decrypt(&ctx, ciphertext, out, plaintext_len); + } + if (!CRYPTO_gcm128_finish(&ctx, tag, tag_len)) { + fprintf(stderr, "%u: decrypt failed.\n", test_num); + goto out; + } + if (plaintext && memcmp(out, plaintext, plaintext_len)) { + fprintf(stderr, "%u: plaintext doesn't match.\n", test_num); + goto out; + } + + ret = 1; + +out: + if (key) { + OPENSSL_free(key); + } + if (plaintext) { + OPENSSL_free(plaintext); + } + if (additional_data) { + OPENSSL_free(additional_data); + } + if (nonce) { + OPENSSL_free(nonce); + } + if (ciphertext) { + OPENSSL_free(ciphertext); + } + if (tag) { + OPENSSL_free(tag); + } + if (out) { + OPENSSL_free(out); + } + return ret; +} + +int main(void) { + int ret = 0; + unsigned i; + + CRYPTO_library_init(); + + for (i = 0; i < sizeof(test_cases) / sizeof(struct test_case); i++) { + if (!run_test_case(i, &test_cases[i])) { + ret = 1; + } + } + + if (ret == 0) { + printf("PASS\n"); + } + + return ret; +} diff --git a/src/crypto/modes/internal.h b/src/crypto/modes/internal.h new file mode 100644 index 0000000..9662e0d --- /dev/null +++ b/src/crypto/modes/internal.h @@ -0,0 +1,199 @@ +/* ==================================================================== + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== */ + +#ifndef OPENSSL_HEADER_MODES_INTERNAL_H +#define OPENSSL_HEADER_MODES_INTERNAL_H + +#include <openssl/base.h> + +#if defined(__cplusplus) +extern "C" { +#endif + + +#define asm __asm__ + +#define STRICT_ALIGNMENT 1 +#if defined(OPENSSL_X86_64) || defined(OPENSSL_X86) || defined(OPENSSL_AARCH64) +#undef STRICT_ALIGNMENT +#define STRICT_ALIGNMENT 0 +#endif + +#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) +#if defined(__GNUC__) && __GNUC__ >= 2 +#if defined(OPENSSL_X86_64) +#define BSWAP8(x) \ + ({ \ + uint64_t ret = (x); \ + asm("bswapq %0" : "+r"(ret)); \ + ret; \ + }) +#define BSWAP4(x) \ + ({ \ + uint32_t ret = (x); \ + asm("bswapl %0" : "+r"(ret)); \ + ret; \ + }) +#elif defined(OPENSSL_X86) +#define BSWAP8(x) \ + ({ \ + uint32_t lo = (uint64_t)(x) >> 32, hi = (x); \ + asm("bswapl %0; bswapl %1" : "+r"(hi), "+r"(lo)); \ + (uint64_t) hi << 32 | lo; \ + }) +#define BSWAP4(x) \ + ({ \ + uint32_t ret = (x); \ + asm("bswapl %0" : "+r"(ret)); \ + ret; \ + }) +#elif defined(OPENSSL_AARCH64) +#define BSWAP8(x) \ + ({ \ + uint64_t ret; \ + asm("rev %0,%1" : "=r"(ret) : "r"(x)); \ + ret; \ + }) +#define BSWAP4(x) \ + ({ \ + uint32_t ret; \ + asm("rev %w0,%w1" : "=r"(ret) : "r"(x)); \ + ret; \ + }) +#elif defined(OPENSSL_ARM) && !defined(STRICT_ALIGNMENT) +#define BSWAP8(x) \ + ({ \ + uint32_t lo = (uint64_t)(x) >> 32, hi = (x); \ + asm("rev %0,%0; rev %1,%1" : "+r"(hi), "+r"(lo)); \ + (uint64_t) hi << 32 | lo; \ + }) +#define BSWAP4(x) \ + ({ \ + uint32_t ret; \ + asm("rev %0,%1" : "=r"(ret) : "r"((uint32_t)(x))); \ + ret; \ + }) +#endif +#elif defined(_MSC_VER) +#if _MSC_VER >= 1300 +#pragma intrinsic(_byteswap_uint64, _byteswap_ulong) +#define BSWAP8(x) _byteswap_uint64((uint64_t)(x)) +#define BSWAP4(x) _byteswap_ulong((uint32_t)(x)) +#elif defined(OPENSSL_X86) +__inline uint32_t _bswap4(uint32_t val) { + _asm mov eax, val + _asm bswap eax +} +#define BSWAP4(x) _bswap4(x) +#endif +#endif +#endif + +#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) +#define GETU32(p) BSWAP4(*(const uint32_t *)(p)) +#define PUTU32(p, v) *(uint32_t *)(p) = BSWAP4(v) +#else +#define GETU32(p) \ + ((uint32_t)(p)[0] << 24 | (uint32_t)(p)[1] << 16 | (uint32_t)(p)[2] << 8 | (uint32_t)(p)[3]) +#define PUTU32(p, v) \ + ((p)[0] = (uint8_t)((v) >> 24), (p)[1] = (uint8_t)((v) >> 16), \ + (p)[2] = (uint8_t)((v) >> 8), (p)[3] = (uint8_t)(v)) +#endif + + +/* GCM definitions */ +typedef struct { uint64_t hi,lo; } u128; + +struct gcm128_context { + /* Following 6 names follow names in GCM specification */ + union { + uint64_t u[2]; + uint32_t d[4]; + uint8_t c[16]; + size_t t[16 / sizeof(size_t)]; + } Yi, EKi, EK0, len, Xi, H; + + /* Relative position of Xi, H and pre-computed Htable is used in some + * assembler modules, i.e. don't change the order! */ + u128 Htable[16]; + void (*gmult)(uint64_t Xi[2], const u128 Htable[16]); + void (*ghash)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, + size_t len); + + unsigned int mres, ares; + block128_f block; + void *key; +}; + +struct xts128_context { + void *key1, *key2; + block128_f block1, block2; +}; + +struct ccm128_context { + union { + uint64_t u[2]; + uint8_t c[16]; + } nonce, cmac; + uint64_t blocks; + block128_f block; + void *key; +}; + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +/* crypto_gcm_clmul_enabled returns one if the CLMUL implementation of GCM is + * used. */ +int crypto_gcm_clmul_enabled(void); +#endif + + +#if defined(__cplusplus) +} /* extern C */ +#endif + +#endif /* OPENSSL_HEADER_MODES_INTERNAL_H */ diff --git a/src/crypto/modes/ofb.c b/src/crypto/modes/ofb.c new file mode 100644 index 0000000..5836a9f --- /dev/null +++ b/src/crypto/modes/ofb.c @@ -0,0 +1,107 @@ +/* ==================================================================== + * Copyright (c) 2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== */ + +#include <openssl/modes.h> + +#include <assert.h> + +#include "internal.h" + + +void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const void *key, uint8_t ivec[16], int *num, + block128_f block) { + unsigned int n; + + assert(in && out && key && ivec && num); + assert((16 % sizeof(size_t)) == 0); + + n = *num; + + while (n && len) { + *(out++) = *(in++) ^ ivec[n]; + --len; + n = (n + 1) % 16; + } + +#if STRICT_ALIGNMENT + if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) { + size_t l = 0; + while (l < len) { + if (n == 0) { + (*block)(ivec, ivec, key); + } + out[l] = in[l] ^ ivec[n]; + ++l; + n = (n + 1) % 16; + } + + *num = n; + return; + } +#endif + + while (len >= 16) { + (*block)(ivec, ivec, key); + for (; n < 16; n += sizeof(size_t)) { + *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(ivec + n); + } + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block)(ivec, ivec, key); + while (len--) { + out[n] = in[n] ^ ivec[n]; + ++n; + } + } + *num = n; +} |