13 files changed, 7646 insertions, 0 deletions
diff --git a/src/crypto/modes/CMakeLists.txt b/src/crypto/modes/CMakeLists.txt
new file mode 100644
index 0000000..d50e97b
--- /dev/null
+++ b/src/crypto/modes/CMakeLists.txt
@@ -0,0 +1,63 @@
+include_directories(. .. ../../include)
+
+if (${ARCH} STREQUAL "x86_64")
+  set(
+    MODES_ARCH_SOURCES
+
+    aesni-gcm-x86_64.${ASM_EXT}
+    ghash-x86_64.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "x86")
+  set(
+    MODES_ARCH_SOURCES
+
+    ghash-x86.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "arm")
+  set(
+    MODES_ARCH_SOURCES
+
+    ghash-armv4.${ASM_EXT}
+    ghashv8-armx.${ASM_EXT}
+  )
+endif()
+
+if (${ARCH} STREQUAL "aarch64")
+  set(
+    MODES_ARCH_SOURCES
+
+    ghashv8-armx.${ASM_EXT}
+  )
+endif()
+
+add_library(
+  modes
+
+  OBJECT
+
+  cbc.c
+  ctr.c
+  ofb.c
+  cfb.c
+  gcm.c
+
+  ${MODES_ARCH_SOURCES}
+)
+
+perlasm(aesni-gcm-x86_64.${ASM_EXT} asm/aesni-gcm-x86_64.pl)
+perlasm(ghash-x86_64.${ASM_EXT} asm/ghash-x86_64.pl)
+perlasm(ghash-x86.${ASM_EXT} asm/ghash-x86.pl)
+perlasm(ghash-armv4.${ASM_EXT} asm/ghash-armv4.pl)
+perlasm(ghashv8-armx.${ASM_EXT} asm/ghashv8-armx.pl)
+
+add_executable(
+  gcm_test
+
+  gcm_test.c
+)
+
+target_link_libraries(gcm_test crypto)
diff --git a/src/crypto/modes/asm/aesni-gcm-x86_64.pl b/src/crypto/modes/asm/aesni-gcm-x86_64.pl
new file mode 100644
index 0000000..7e4e04e
--- /dev/null
+++ b/src/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -0,0 +1,1057 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# OpenSSL GCM implementation is organized in such way that its
+# performance is rather close to the sum of its streamed components,
+# in the context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on combination of Intel submissions,
+# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp. who verified that it reduces shuffles
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, and 0.74 -
+# on Broadwell. [Mentioned results are raw profiled measurements for
+# favourable packet size, one divisible by 96. Applications using the
+# EVP interface will observe a few percent worse performance.]
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+	$avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if ($avx>1) {{{
+
+($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+($Ii,$T1,$T2,$Hkey,
+ $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
+
+($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
+
+($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
+
+$code=<<___;
+.text
+
+.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
+.align	32
+_aesni_ctr32_ghash_6x:
+	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
+	sub		\$6,$len
+	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
+	vmovdqu		0x00-0x80($key),$rndkey
+	vpaddb		$T2,$T1,$inout1
+	vpaddb		$T2,$inout1,$inout2
+	vpaddb		$T2,$inout2,$inout3
+	vpaddb		$T2,$inout3,$inout4
+	vpaddb		$T2,$inout4,$inout5
+	vpxor		$rndkey,$T1,$inout0
+	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
+	jmp		.Loop6x
+
+.align	32
+.Loop6x:
+	add		\$`6<<24`,$counter
+	jc		.Lhandle_ctr32		# discard $inout[1-5]?
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	  vpaddb	$T2,$inout5,$T1		# next counter value
+	  vpxor		$rndkey,$inout1,$inout1
+	  vpxor		$rndkey,$inout2,$inout2
+
+.Lresume_ctr32:
+	vmovdqu		$T1,($ivp)		# save next counter value
+	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
+	  vpxor		$rndkey,$inout3,$inout3
+	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
+	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
+	xor		%r12,%r12
+	cmp		$in0,$end0
+
+	  vaesenc	$T2,$inout0,$inout0
+	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
+	  vpxor		$rndkey,$inout4,$inout4
+	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
+	  vaesenc	$T2,$inout1,$inout1
+	  vpxor		$rndkey,$inout5,$inout5
+	setnc		%r12b
+	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
+	  vaesenc	$T2,$inout2,$inout2
+	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
+	neg		%r12
+	  vaesenc	$T2,$inout3,$inout3
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
+	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
+	  vaesenc	$T2,$inout4,$inout4
+	 vpxor		$Z1,$T1,$Z0
+	and		\$0x60,%r12
+	  vmovups	0x20-0x80($key),$rndkey
+	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
+	  vaesenc	$T2,$inout5,$inout5
+
+	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
+	lea		($in0,%r12),$in0
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
+	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x58($in0),%r13
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x50($in0),%r12
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x20+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x28+8(%rsp)
+	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x30-0x80($key),$rndkey
+	 vpxor		$T1,$Z2,$Z2
+	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$T2,$Z2,$Z2
+	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
+	  vaesenc	$rndkey,$inout1,$inout1
+	 vpxor		$Hkey,$Z3,$Z3
+	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout2,$inout2
+	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
+	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	 vpxor		$T1,$Z0,$Z0
+	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x40-0x80($key),$rndkey
+	 vpxor		$T2,$Z2,$Z2
+	vpclmulqdq	\$0x00,$T1,$Ii,$T2
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$Hkey,$Z2,$Z2
+	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x48($in0),%r13
+	 vpxor		$Z1,$Z3,$Z3
+	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x40($in0),%r12
+	vpclmulqdq	\$0x11,$T1,$Ii,$T1
+	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x30+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x38+8(%rsp)
+	 vpxor		$T2,$Z0,$Z0
+	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x50-0x80($key),$rndkey
+	 vpxor		$Hkey,$Z2,$Z2
+	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x38($in0),%r13
+	 vpxor		$T1,$Z3,$Z3
+	vpclmulqdq	\$0x01,$T2,$Ii,$T1
+	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x30($in0),%r12
+	vpclmulqdq	\$0x11,$T2,$Ii,$T2
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x40+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x48+8(%rsp)
+	 vpxor		$Hkey,$Z0,$Z0
+	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vmovups	0x60-0x80($key),$rndkey
+	 vpxor		$Z1,$Z2,$Z2
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
+	  vaesenc	$rndkey,$inout0,$inout0
+	 vpxor		$T1,$Z2,$Z2
+	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
+	  vaesenc	$rndkey,$inout1,$inout1
+	movbe		0x28($in0),%r13
+	 vpxor		$T2,$Z3,$Z3
+	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x20($in0),%r12
+	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r13,0x50+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	mov		%r12,0x58+8(%rsp)
+	vpxor		$Z1,$Z2,$Z2
+	  vaesenc	$rndkey,$inout5,$inout5
+	vpxor		$T1,$Z2,$Z2
+
+	  vmovups	0x70-0x80($key),$rndkey
+	vpslldq		\$8,$Z2,$Z1
+	vpxor		$T2,$Z0,$Z0
+	vmovdqu		0x10($const),$Hkey	# .Lpoly
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	vpxor		$Xi,$Z3,$Z3
+	  vaesenc	$rndkey,$inout1,$inout1
+	vpxor		$Z1,$Z0,$Z0
+	movbe		0x18($in0),%r13
+	  vaesenc	$rndkey,$inout2,$inout2
+	movbe		0x10($in0),%r12
+	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
+	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
+	mov		%r13,0x60+8(%rsp)
+	  vaesenc	$rndkey,$inout3,$inout3
+	mov		%r12,0x68+8(%rsp)
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vmovups	0x90-0x80($key),$rndkey
+	  vaesenc	$T1,$inout1,$inout1
+	vpsrldq		\$8,$Z2,$Z2
+	  vaesenc	$T1,$inout2,$inout2
+	vpxor		$Z2,$Z3,$Z3
+	  vaesenc	$T1,$inout3,$inout3
+	vpxor		$Ii,$Z0,$Z0
+	movbe		0x08($in0),%r13
+	  vaesenc	$T1,$inout4,$inout4
+	movbe		0x00($in0),%r12
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xa0-0x80($key),$T1
+	  cmp		\$11,$rounds
+	  jb		.Lenc_tail		# 128-bit key
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	  vaesenc	$rndkey,$inout1,$inout1
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vaesenc	$T1,$inout1,$inout1
+	  vaesenc	$T1,$inout2,$inout2
+	  vaesenc	$T1,$inout3,$inout3
+	  vaesenc	$T1,$inout4,$inout4
+	  vmovups	0xb0-0x80($key),$rndkey
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xc0-0x80($key),$T1
+	  je		.Lenc_tail		# 192-bit key
+
+	  vaesenc	$rndkey,$inout0,$inout0
+	  vaesenc	$rndkey,$inout1,$inout1
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vaesenc	$rndkey,$inout5,$inout5
+
+	  vaesenc	$T1,$inout0,$inout0
+	  vaesenc	$T1,$inout1,$inout1
+	  vaesenc	$T1,$inout2,$inout2
+	  vaesenc	$T1,$inout3,$inout3
+	  vaesenc	$T1,$inout4,$inout4
+	  vmovups	0xd0-0x80($key),$rndkey
+	  vaesenc	$T1,$inout5,$inout5
+	  vmovups	0xe0-0x80($key),$T1
+	  jmp		.Lenc_tail		# 256-bit key
+
+.align	32
+.Lhandle_ctr32:
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
+	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
+	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
+	  vpaddd	$Z1,$Z2,$inout2
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	  vpaddd	$Z1,$inout1,$inout3
+	  vpshufb	$Ii,$inout1,$inout1
+	  vpaddd	$Z1,$inout2,$inout4
+	  vpshufb	$Ii,$inout2,$inout2
+	  vpxor		$rndkey,$inout1,$inout1
+	  vpaddd	$Z1,$inout3,$inout5
+	  vpshufb	$Ii,$inout3,$inout3
+	  vpxor		$rndkey,$inout2,$inout2
+	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
+	  vpshufb	$Ii,$inout4,$inout4
+	  vpshufb	$Ii,$inout5,$inout5
+	  vpshufb	$Ii,$T1,$T1		# next counter value
+	jmp		.Lresume_ctr32
+
+.align	32
+.Lenc_tail:
+	  vaesenc	$rndkey,$inout0,$inout0
+	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
+	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
+	  vaesenc	$rndkey,$inout1,$inout1
+	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
+	  vpxor		0x00($inp),$T1,$T2
+	  vaesenc	$rndkey,$inout2,$inout2
+	  vpxor		0x10($inp),$T1,$Ii
+	  vaesenc	$rndkey,$inout3,$inout3
+	  vpxor		0x20($inp),$T1,$Z1
+	  vaesenc	$rndkey,$inout4,$inout4
+	  vpxor		0x30($inp),$T1,$Z2
+	  vaesenc	$rndkey,$inout5,$inout5
+	  vpxor		0x40($inp),$T1,$Z3
+	  vpxor		0x50($inp),$T1,$Hkey
+	  vmovdqu	($ivp),$T1		# load next counter value
+
+	  vaesenclast	$T2,$inout0,$inout0
+	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
+	  vaesenclast	$Ii,$inout1,$inout1
+	 vpaddb		$T2,$T1,$Ii
+	mov		%r13,0x70+8(%rsp)
+	lea		0x60($inp),$inp
+	  vaesenclast	$Z1,$inout2,$inout2
+	 vpaddb		$T2,$Ii,$Z1
+	mov		%r12,0x78+8(%rsp)
+	lea		0x60($out),$out
+	  vmovdqu	0x00-0x80($key),$rndkey
+	  vaesenclast	$Z2,$inout3,$inout3
+	 vpaddb		$T2,$Z1,$Z2
+	  vaesenclast	$Z3, $inout4,$inout4
+	 vpaddb		$T2,$Z2,$Z3
+	  vaesenclast	$Hkey,$inout5,$inout5
+	 vpaddb		$T2,$Z3,$Hkey
+
+	add		\$0x60,$ret
+	sub		\$0x6,$len
+	jc		.L6x_done
+
+	  vmovups	$inout0,-0x60($out)	# save output
+	 vpxor		$rndkey,$T1,$inout0
+	  vmovups	$inout1,-0x50($out)
+	 vmovdqa	$Ii,$inout1		# 0 latency
+	  vmovups	$inout2,-0x40($out)
+	 vmovdqa	$Z1,$inout2		# 0 latency
+	  vmovups	$inout3,-0x30($out)
+	 vmovdqa	$Z2,$inout3		# 0 latency
+	  vmovups	$inout4,-0x20($out)
+	 vmovdqa	$Z3,$inout4		# 0 latency
+	  vmovups	$inout5,-0x10($out)
+	 vmovdqa	$Hkey,$inout5		# 0 latency
+	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
+	jmp		.Loop6x
+
+.L6x_done:
+	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
+	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
+
+	ret
+.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+___
+######################################################################
+#
+# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
+#		const AES_KEY *key, unsigned char iv[16],
+#		struct { u128 Xi,H,Htbl[9]; } *Xip);
+$code.=<<___;
+.globl	aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,\@function,6
+.align	32
+aesni_gcm_decrypt:
+	xor	$ret,$ret
+	cmp	\$0x60,$len			# minimal accepted length
+	jb	.Lgcm_dec_abort
+
+	lea	(%rsp),%rax			# save stack pointer
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0xa8(%rsp),%rsp
+	movaps	%xmm6,-0xd8(%rax)
+	movaps	%xmm7,-0xc8(%rax)
+	movaps	%xmm8,-0xb8(%rax)
+	movaps	%xmm9,-0xa8(%rax)
+	movaps	%xmm10,-0x98(%rax)
+	movaps	%xmm11,-0x88(%rax)
+	movaps	%xmm12,-0x78(%rax)
+	movaps	%xmm13,-0x68(%rax)
+	movaps	%xmm14,-0x58(%rax)
+	movaps	%xmm15,-0x48(%rax)
+.Lgcm_dec_body:
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($ivp),$T1		# input counter value
+	add		\$-128,%rsp
+	mov		12($ivp),$counter
+	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
+	vmovdqu		($Xip),$Xi		# load Xi
+	and		\$-128,%rsp		# ensure stack alignment
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	lea		0x80($key),$key		# size optimization
+	lea		0x20+0x20($Xip),$Xip	# size optimization
+	mov		0xf0-0x80($key),$rounds
+	vpshufb		$Ii,$Xi,$Xi
+
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Ldec_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Ldec_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Ldec_no_key_aliasing:
+
+	vmovdqu		0x50($inp),$Z3		# I[5]
+	lea		($inp),$in0
+	vmovdqu		0x40($inp),$Z0
+	lea		-0xc0($inp,$len),$end0
+	vmovdqu		0x30($inp),$Z1
+	shr		\$4,$len
+	xor		$ret,$ret
+	vmovdqu		0x20($inp),$Z2
+	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
+	vmovdqu		0x10($inp),$T2
+	 vpshufb	$Ii,$Z0,$Z0
+	vmovdqu		($inp),$Hkey
+	 vpshufb	$Ii,$Z1,$Z1
+	vmovdqu		$Z0,0x30(%rsp)
+	 vpshufb	$Ii,$Z2,$Z2
+	vmovdqu		$Z1,0x40(%rsp)
+	 vpshufb	$Ii,$T2,$T2
+	vmovdqu		$Z2,0x50(%rsp)
+	 vpshufb	$Ii,$Hkey,$Hkey
+	vmovdqu		$T2,0x60(%rsp)
+	vmovdqu		$Hkey,0x70(%rsp)
+
+	call		_aesni_ctr32_ghash_6x
+
+	vmovups		$inout0,-0x60($out)	# save output
+	vmovups		$inout1,-0x50($out)
+	vmovups		$inout2,-0x40($out)
+	vmovups		$inout3,-0x30($out)
+	vmovups		$inout4,-0x20($out)
+	vmovups		$inout5,-0x10($out)
+
+	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
+	vmovdqu		$Xi,-0x40($Xip)		# output Xi
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-0xd8(%rax),%xmm6
+	movaps	-0xd8(%rax),%xmm7
+	movaps	-0xb8(%rax),%xmm8
+	movaps	-0xa8(%rax),%xmm9
+	movaps	-0x98(%rax),%xmm10
+	movaps	-0x88(%rax),%xmm11
+	movaps	-0x78(%rax),%xmm12
+	movaps	-0x68(%rax),%xmm13
+	movaps	-0x58(%rax),%xmm14
+	movaps	-0x48(%rax),%xmm15
+___
+$code.=<<___;
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp		# restore %rsp
+.Lgcm_dec_abort:
+	mov	$ret,%rax		# return value
+	ret
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+
+$code.=<<___;
+.type	_aesni_ctr32_6x,\@abi-omnipotent
+.align	32
+_aesni_ctr32_6x:
+	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
+	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
+	lea		-1($rounds),%r13
+	vmovups		0x10-0x80($key),$rndkey
+	lea		0x20-0x80($key),%r12
+	vpxor		$Z0,$T1,$inout0
+	add		\$`6<<24`,$counter
+	jc		.Lhandle_ctr32_2
+	vpaddb		$T2,$T1,$inout1
+	vpaddb		$T2,$inout1,$inout2
+	vpxor		$Z0,$inout1,$inout1
+	vpaddb		$T2,$inout2,$inout3
+	vpxor		$Z0,$inout2,$inout2
+	vpaddb		$T2,$inout3,$inout4
+	vpxor		$Z0,$inout3,$inout3
+	vpaddb		$T2,$inout4,$inout5
+	vpxor		$Z0,$inout4,$inout4
+	vpaddb		$T2,$inout5,$T1
+	vpxor		$Z0,$inout5,$inout5
+	jmp		.Loop_ctr32
+
+.align	16
+.Loop_ctr32:
+	vaesenc		$rndkey,$inout0,$inout0
+	vaesenc		$rndkey,$inout1,$inout1
+	vaesenc		$rndkey,$inout2,$inout2
+	vaesenc		$rndkey,$inout3,$inout3
+	vaesenc		$rndkey,$inout4,$inout4
+	vaesenc		$rndkey,$inout5,$inout5
+	vmovups		(%r12),$rndkey
+	lea		0x10(%r12),%r12
+	dec		%r13d
+	jnz		.Loop_ctr32
+
+	vmovdqu		(%r12),$Hkey		# last round key
+	vaesenc		$rndkey,$inout0,$inout0
+	vpxor		0x00($inp),$Hkey,$Z0
+	vaesenc		$rndkey,$inout1,$inout1
+	vpxor		0x10($inp),$Hkey,$Z1
+	vaesenc		$rndkey,$inout2,$inout2
+	vpxor		0x20($inp),$Hkey,$Z2
+	vaesenc		$rndkey,$inout3,$inout3
+	vpxor		0x30($inp),$Hkey,$Xi
+	vaesenc		$rndkey,$inout4,$inout4
+	vpxor		0x40($inp),$Hkey,$T2
+	vaesenc		$rndkey,$inout5,$inout5
+	vpxor		0x50($inp),$Hkey,$Hkey
+	lea		0x60($inp),$inp
+
+	vaesenclast	$Z0,$inout0,$inout0
+	vaesenclast	$Z1,$inout1,$inout1
+	vaesenclast	$Z2,$inout2,$inout2
+	vaesenclast	$Xi,$inout3,$inout3
+	vaesenclast	$T2,$inout4,$inout4
+	vaesenclast	$Hkey,$inout5,$inout5
+	vmovups		$inout0,0x00($out)
+	vmovups		$inout1,0x10($out)
+	vmovups		$inout2,0x20($out)
+	vmovups		$inout3,0x30($out)
+	vmovups		$inout4,0x40($out)
+	vmovups		$inout5,0x50($out)
+	lea		0x60($out),$out
+
+	ret
+.align	32
+.Lhandle_ctr32_2:
+	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
+	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
+	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
+	vpaddd		$Z1,$Z2,$inout2
+	vpaddd		$Z1,$inout1,$inout3
+	vpshufb		$Ii,$inout1,$inout1
+	vpaddd		$Z1,$inout2,$inout4
+	vpshufb		$Ii,$inout2,$inout2
+	vpxor		$Z0,$inout1,$inout1
+	vpaddd		$Z1,$inout3,$inout5
+	vpshufb		$Ii,$inout3,$inout3
+	vpxor		$Z0,$inout2,$inout2
+	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
+	vpshufb		$Ii,$inout4,$inout4
+	vpxor		$Z0,$inout3,$inout3
+	vpshufb		$Ii,$inout5,$inout5
+	vpxor		$Z0,$inout4,$inout4
+	vpshufb		$Ii,$T1,$T1		# next counter value
+	vpxor		$Z0,$inout5,$inout5
+	jmp	.Loop_ctr32
+.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl	aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,\@function,6
+.align	32
+aesni_gcm_encrypt:
+	xor	$ret,$ret
+	cmp	\$0x60*3,$len			# minimal accepted length
+	jb	.Lgcm_enc_abort
+
+	lea	(%rsp),%rax			# save stack pointer
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0xa8(%rsp),%rsp
+	movaps	%xmm6,-0xd8(%rax)
+	movaps	%xmm7,-0xc8(%rax)
+	movaps	%xmm8,-0xb8(%rax)
+	movaps	%xmm9,-0xa8(%rax)
+	movaps	%xmm10,-0x98(%rax)
+	movaps	%xmm11,-0x88(%rax)
+	movaps	%xmm12,-0x78(%rax)
+	movaps	%xmm13,-0x68(%rax)
+	movaps	%xmm14,-0x58(%rax)
+	movaps	%xmm15,-0x48(%rax)
+.Lgcm_enc_body:
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($ivp),$T1		# input counter value
+	add		\$-128,%rsp
+	mov		12($ivp),$counter
+	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
+	lea		0x80($key),$key		# size optimization
+	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
+	and		\$-128,%rsp		# ensure stack alignment
+	mov		0xf0-0x80($key),$rounds
+
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Lenc_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Lenc_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Lenc_no_key_aliasing:
+
+	lea		($out),$in0
+	lea		-0xc0($out,$len),$end0
+	shr		\$4,$len
+
+	call		_aesni_ctr32_6x
+	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
+	vpshufb		$Ii,$inout1,$T2
+	vmovdqu		$Xi,0x70(%rsp)
+	vpshufb		$Ii,$inout2,$Z0
+	vmovdqu		$T2,0x60(%rsp)
+	vpshufb		$Ii,$inout3,$Z1
+	vmovdqu		$Z0,0x50(%rsp)
+	vpshufb		$Ii,$inout4,$Z2
+	vmovdqu		$Z1,0x40(%rsp)
+	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
+	vmovdqu		$Z2,0x30(%rsp)
+
+	call		_aesni_ctr32_6x
+
+	vmovdqu		($Xip),$Xi		# load Xi
+	lea		0x20+0x20($Xip),$Xip	# size optimization
+	sub		\$12,$len
+	mov		\$0x60*2,$ret
+	vpshufb		$Ii,$Xi,$Xi
+
+	call		_aesni_ctr32_ghash_6x
+	vmovdqu		0x20(%rsp),$Z3		# I[5]
+	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
+	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
+	vpunpckhqdq	$Z3,$Z3,$T1
+	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
+	 vmovups	$inout0,-0x60($out)	# save output
+	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
+	vpxor		$Z3,$T1,$T1
+	 vmovups	$inout1,-0x50($out)
+	 vpshufb	$Ii,$inout1,$inout1
+	 vmovups	$inout2,-0x40($out)
+	 vpshufb	$Ii,$inout2,$inout2
+	 vmovups	$inout3,-0x30($out)
+	 vpshufb	$Ii,$inout3,$inout3
+	 vmovups	$inout4,-0x20($out)
+	 vpshufb	$Ii,$inout4,$inout4
+	 vmovups	$inout5,-0x10($out)
+	 vpshufb	$Ii,$inout5,$inout5
+	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
+___
+{ my ($HK,$T3)=($rndkey,$inout0);
+
+$code.=<<___;
+	 vmovdqu	0x30(%rsp),$Z2		# I[4]
+	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
+	 vpunpckhqdq	$Z2,$Z2,$T2
+	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
+	 vpxor		$Z2,$T2,$T2
+	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
+	vpclmulqdq	\$0x00,$HK,$T1,$T1
+
+	 vmovdqu	0x40(%rsp),$T3		# I[3]
+	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
+	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$T3,$T3,$Z1
+	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
+	 vpxor		$T3,$Z1,$Z1
+	vpxor		$Z3,$Z2,$Z2
+	vpclmulqdq	\$0x10,$HK,$T2,$T2
+	 vmovdqu	0x50-0x20($Xip),$HK
+	vpxor		$T1,$T2,$T2
+
+	 vmovdqu	0x50(%rsp),$T1		# I[2]
+	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
+	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
+	vpxor		$Z0,$Z3,$Z3
+	 vpunpckhqdq	$T1,$T1,$Z0
+	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
+	 vpxor		$T1,$Z0,$Z0
+	vpxor		$Z2,$T3,$T3
+	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
+	vpxor		$T2,$Z1,$Z1
+
+	 vmovdqu	0x60(%rsp),$T2		# I[1]
+	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
+	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
+	vpxor		$Z3,$Z2,$Z2
+	 vpunpckhqdq	$T2,$T2,$Z3
+	vpclmulqdq	\$0x11,$Ii,$T1,$T1
+	 vpxor		$T2,$Z3,$Z3
+	vpxor		$T3,$T1,$T1
+	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
+	 vmovdqu	0x80-0x20($Xip),$HK
+	vpxor		$Z1,$Z0,$Z0
+
+	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
+	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
+	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
+	 vpunpckhqdq	$Xi,$Xi,$T3
+	vpxor		$Z2,$Z1,$Z1
+	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
+	 vpxor		$Xi,$T3,$T3
+	vpxor		$T1,$T2,$T2
+	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
+	vpxor		$Z0,$Z3,$Z0
+
+	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
+	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
+	 vpunpckhqdq	$inout5,$inout5,$T1
+	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
+	 vpxor		$inout5,$T1,$T1
+	vpxor		$Z1,$Z2,$Z1
+	vpclmulqdq	\$0x10,$HK,$T3,$T3
+	 vmovdqu	0x20-0x20($Xip),$HK
+	vpxor		$T2,$Xi,$Z3
+	vpxor		$Z0,$T3,$Z2
+
+	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
+	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
+	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
+	  vpxor		$T3,$Z2,$Z2
+	 vpunpckhqdq	$inout4,$inout4,$T2
+	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
+	 vpxor		$inout4,$T2,$T2
+	  vpslldq	\$8,$Z2,$T3
+	vpclmulqdq	\$0x00,$HK,$T1,$T1
+	  vpxor		$T3,$Z1,$Xi
+	  vpsrldq	\$8,$Z2,$Z2
+	  vpxor		$Z2,$Z3,$Z3
+
+	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
+	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
+	vpxor		$Z0,$Z1,$Z1
+	 vpunpckhqdq	$inout3,$inout3,$T3
+	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
+	 vpxor		$inout3,$T3,$T3
+	vpxor		$inout5,$inout4,$inout4
+	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
+	vpclmulqdq	\$0x10,$HK,$T2,$T2
+	 vmovdqu	0x50-0x20($Xip),$HK
+	vpxor		$T1,$T2,$T2
+
+	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
+	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$inout2,$inout2,$T1
+	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
+	 vpxor		$inout2,$T1,$T1
+	vpxor		$inout4,$inout3,$inout3
+	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
+	vpclmulqdq	\$0x00,$HK,$T3,$T3
+	vpxor		$T2,$T3,$T3
+
+	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
+	  vxorps	$inout5,$Xi,$Xi
+
+	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
+	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
+	vpxor		$Z0,$Z1,$Z1
+	 vpunpckhqdq	$inout1,$inout1,$T2
+	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
+	 vpxor		$inout1,$T2,$T2
+	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
+	vpxor		$inout3,$inout2,$inout2
+	vpclmulqdq	\$0x10,$HK,$T1,$T1
+	 vmovdqu	0x80-0x20($Xip),$HK
+	vpxor		$T3,$T1,$T1
+
+	  vxorps	$Z3,$inout5,$inout5
+	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
+	  vxorps	$inout5,$Xi,$Xi
+
+	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
+	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
+	vpxor		$Z1,$Z0,$Z0
+	 vpunpckhqdq	$Xi,$Xi,$T3
+	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
+	 vpxor		$Xi,$T3,$T3
+	vpxor		$inout2,$inout1,$inout1
+	vpclmulqdq	\$0x00,$HK,$T2,$T2
+	vpxor		$T1,$T2,$T2
+
+	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
+	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
+	vpxor		$Z0,$Z1,$Z1
+	vpclmulqdq	\$0x10,$HK,$T3,$Z2
+	vpxor		$inout1,$Z3,$Z3
+	vpxor		$T2,$Z2,$Z2
+
+	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
+	vpxor		$Z0,$Z2,$Z2
+	vpslldq		\$8,$Z2,$T1
+	vmovdqu		0x10($const),$Hkey	# .Lpoly
+	vpsrldq		\$8,$Z2,$Z2
+	vpxor		$T1,$Z1,$Xi
+	vpxor		$Z2,$Z3,$Z3
+
+	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
+	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
+	vpxor		$Z3,$T2,$T2
+	vpxor		$T2,$Xi,$Xi
+___
+}
+$code.=<<___;
+	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
+	vmovdqu		$Xi,-0x40($Xip)		# output Xi
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	-0xd8(%rax),%xmm6
+	movaps	-0xc8(%rax),%xmm7
+	movaps	-0xb8(%rax),%xmm8
+	movaps	-0xa8(%rax),%xmm9
+	movaps	-0x98(%rax),%xmm10
+	movaps	-0x88(%rax),%xmm11
+	movaps	-0x78(%rax),%xmm12
+	movaps	-0x68(%rax),%xmm13
+	movaps	-0x58(%rax),%xmm14
+	movaps	-0x48(%rax),%xmm15
+___
+$code.=<<___;
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp		# restore %rsp
+.Lgcm_enc_abort:
+	mov	$ret,%rax		# return value
+	ret
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+___
+
+$code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern	__imp_RtlVirtualUnwind
+.type	gcm_se_handler,\@abi-omnipotent
+.align	16
+gcm_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	120($context),%rax	# pull context->Rax
+
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	mov	%r15,240($context)
+	mov	%r14,232($context)
+	mov	%r13,224($context)
+	mov	%r12,216($context)
+	mov	%rbp,160($context)
+	mov	%rbx,144($context)
+
+	lea	-0xd8(%rax),%rsi	# %xmm save area
+	lea	512($context),%rdi	# & context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	gcm_se_handler,.-gcm_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_aesni_gcm_decrypt
+	.rva	.LSEH_end_aesni_gcm_decrypt
+	.rva	.LSEH_gcm_dec_info
+
+	.rva	.LSEH_begin_aesni_gcm_encrypt
+	.rva	.LSEH_end_aesni_gcm_encrypt
+	.rva	.LSEH_gcm_enc_info
+.section	.xdata
+.align	8
+.LSEH_gcm_dec_info:
+	.byte	9,0,0,0
+	.rva	gcm_se_handler
+	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
+.LSEH_gcm_enc_info:
+	.byte	9,0,0,0
+	.rva	gcm_se_handler
+	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
+___
+}
+}}} else {{{
+$code=<<___;	# assembler is too old
+.text
+
+.globl	aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,\@abi-omnipotent
+aesni_gcm_encrypt:
+	xor	%eax,%eax
+	ret
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+.globl	aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,\@abi-omnipotent
+aesni_gcm_decrypt:
+	xor	%eax,%eax
+	ret
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+___
+}}}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/src/crypto/modes/asm/ghash-armv4.pl b/src/crypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 0000000..b324641
--- /dev/null
+++ b/src/crypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,501 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to multiplication algorithm suggested in paper referred
+# below and combine it with reduction algorithm from x86 module.
+# Performance improvement over previous version varies from 65% on
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
+# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
+# in 9.33.
+#
+# C�mara, D.; Gouv�a, C. P. L.; L�pez, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+# 
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
+
+# ====================================================================
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+#   bit shift operation is neatly fused with 128-bit xor here, and
+#   "538B" variant would eliminate only 4-5 instructions out of 32
+#   in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+#   consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords in turn is whatever
+# *native* byte order on current platform. See gcm128.c for working
+# example...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$Xi="r0";	# argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4";	# variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp;	# used in gcm_gmult_4bit
+$cnt=$len;
+
+sub Zsmash() {
+  my $i=12;
+  my @args=@_;
+  for ($Zll,$Zlh,$Zhl,$Zhh) {
+    $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	$_,$_
+	str	$_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+	str	$_,[$Xi,#$i]
+#else
+	mov	$Tlh,$_,lsr#8
+	strb	$_,[$Xi,#$i+3]
+	mov	$Thl,$_,lsr#16
+	strb	$Tlh,[$Xi,#$i+2]
+	mov	$Thh,$_,lsr#24
+	strb	$Thl,[$Xi,#$i+1]
+	strb	$Thh,[$Xi,#$i]
+#endif
+___
+    $code.="\t".shift(@args)."\n";
+    $i-=4;
+  }
+}
+
+$code=<<___;
+#if defined(__arm__)
+#include "arm_arch.h"
+
+.syntax unified
+
+.text
+.code	32
+
+.type	rem_4bit,%object
+.align	5
+rem_4bit:
+.short	0x0000,0x1C20,0x3840,0x2460
+.short	0x7080,0x6CA0,0x48C0,0x54E0
+.short	0xE100,0xFD20,0xD940,0xC560
+.short	0x9180,0x8DA0,0xA9C0,0xB5E0
+.size	rem_4bit,.-rem_4bit
+
+.type	rem_4bit_get,%function
+rem_4bit_get:
+	sub	$rem_4bit,pc,#8
+	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
+	b	.Lrem_4bit_got
+	nop
+.size	rem_4bit_get,.-rem_4bit_get
+
+.global	gcm_ghash_4bit
+.hidden	gcm_ghash_4bit
+.type	gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+	sub	r12,pc,#8
+	add	$len,$inp,$len		@ $len to point at the end
+	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
+	sub	r12,r12,#48		@ &rem_4bit
+
+	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
+	stmdb	sp!,{r4-r11}		@ ... to stack
+
+	ldrb	$nlo,[$inp,#15]
+	ldrb	$nhi,[$Xi,#15]
+.Louter:
+	eor	$nlo,$nlo,$nhi
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	mov	$cnt,#14
+
+	add	$Zhh,$Htbl,$nlo,lsl#4
+	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	add	$Thh,$Htbl,$nhi
+	ldrb	$nlo,[$inp,#14]
+
+	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	add	$nhi,$nhi,$nhi
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	ldrb	$nhi,[$Xi,#14]
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	eor	$nlo,$nlo,$nhi
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+	add	$Thh,$Htbl,$nlo,lsl#4
+	and	$nlo,$Zll,#0xf		@ rem
+	subs	$cnt,$cnt,#1
+	add	$nlo,$nlo,$nlo
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	ldrbpl	$nlo,[$inp,$cnt]
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	add	$nhi,$nhi,$nhi
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrbpl	$Tll,[$Xi,$cnt]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrh	$Tlh,[sp,$nhi]
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eorpl	$nlo,$nlo,$Tll
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	andpl	$nhi,$nlo,#0xf0
+	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Linner
+
+	ldr	$len,[sp,#32]		@ re-load $len/end
+	add	$inp,$inp,#16
+	mov	$nhi,$Zll
+___
+	&Zsmash("cmp\t$inp,$len","ldrbne\t$nlo,[$inp,#15]");
+$code.=<<___;
+	bne	.Louter
+
+	add	sp,sp,#36
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global	gcm_gmult_4bit
+.hidden	gcm_gmult_4bit
+.type	gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+	stmdb	sp!,{r4-r11,lr}
+	ldrb	$nlo,[$Xi,#15]
+	b	rem_4bit_get
+.Lrem_4bit_got:
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	mov	$cnt,#14
+
+	add	$Zhh,$Htbl,$nlo,lsl#4
+	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	ldrb	$nlo,[$Xi,#14]
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	add	$nhi,$nhi,$nhi
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	and	$nhi,$nlo,#0xf0
+	eor	$Zhh,$Zhh,$Tll,lsl#16
+	and	$nlo,$nlo,#0x0f
+
+.Loop:
+	add	$Thh,$Htbl,$nlo,lsl#4
+	and	$nlo,$Zll,#0xf		@ rem
+	subs	$cnt,$cnt,#1
+	add	$nlo,$nlo,$nlo
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	ldrbpl	$nlo,[$Xi,$cnt]
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	add	$nhi,$nhi,$nhi
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	andpl	$nhi,$nlo,#0xf0
+	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Loop
+___
+	&Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
+
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+	vext.8		$t0#lo, $a, $a, #1	@ A1
+	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
+	vext.8		$r#lo, $b, $b, #1	@ B1
+	vmull.p8	$r, $a, $r#lo		@ E = A*B1
+	vext.8		$t1#lo, $a, $a, #2	@ A2
+	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
+	vext.8		$t3#lo, $b, $b, #2	@ B2
+	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
+	vext.8		$t2#lo, $a, $a, #3	@ A3
+	veor		$t0, $t0, $r		@ L = E + F
+	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
+	vext.8		$r#lo, $b, $b, #3	@ B3
+	veor		$t1, $t1, $t3		@ M = G + H
+	vmull.p8	$r, $a, $r#lo		@ I = A*B3
+	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
+	vand		$t0#hi, $t0#hi, $k48
+	vext.8		$t3#lo, $b, $b, #4	@ B4
+	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
+	vand		$t1#hi, $t1#hi, $k32
+	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
+	veor		$t2, $t2, $r		@ N = I + J
+	veor		$t0#lo, $t0#lo, $t0#hi
+	veor		$t1#lo, $t1#lo, $t1#hi
+	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
+	vand		$t2#hi, $t2#hi, $k16
+	vext.8		$t0, $t0, $t0, #15
+	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	$t3#hi, #0
+	vext.8		$t1, $t1, $t1, #14
+	veor		$t2#lo, $t2#lo, $t2#hi
+	vmull.p8	$r, $a, $b		@ D = A*B
+	vext.8		$t3, $t3, $t3, #12
+	vext.8		$t2, $t2, $t2, #13
+	veor		$t0, $t0, $t1
+	veor		$t2, $t2, $t3
+	veor		$r, $r, $t0
+	veor		$r, $r, $t2
+___
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.global	gcm_init_neon
+.hidden	gcm_init_neon
+.type	gcm_init_neon,%function
+.align	4
+gcm_init_neon:
+	vld1.64		$IN#hi,[r1,:64]!	@ load H
+	vmov.i8		$t0,#0xe1
+	vld1.64		$IN#lo,[r1,:64]
+	vshl.i64	$t0#hi,#57
+	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
+	vdup.8		$t1,$IN#hi[7]
+	vshr.u64	$Hlo,$IN#lo,#63
+	vshr.s8		$t1,#7			@ broadcast carry bit
+	vshl.i64	$IN,$IN,#1
+	vand		$t0,$t0,$t1
+	vorr		$IN#hi,$Hlo		@ H<<<=1
+	veor		$IN,$IN,$t0		@ twisted H
+	vstmia		r0,{$IN}
+
+	bx	lr
+.size	gcm_init_neon,.-gcm_init_neon
+
+.global	gcm_gmult_neon
+.hidden	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	vld1.64		$IN#hi,[$Xi,:64]!	@ load Xi
+	vld1.64		$IN#lo,[$Xi,:64]!
+	vmov.i64	$k48,#0x0000ffffffffffff
+	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
+	vmov.i64	$k32,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	$IN,$IN
+#endif
+	vmov.i64	$k16,#0x000000000000ffff
+	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
+	mov		$len,#16
+	b		.Lgmult_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.global	gcm_ghash_neon
+.hidden	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	vld1.64		$Xl#hi,[$Xi,:64]!	@ load Xi
+	vld1.64		$Xl#lo,[$Xi,:64]!
+	vmov.i64	$k48,#0x0000ffffffffffff
+	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
+	vmov.i64	$k32,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	$Xl,$Xl
+#endif
+	vmov.i64	$k16,#0x000000000000ffff
+	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
+
+.Loop_neon:
+	vld1.64		$IN#hi,[$inp]!		@ load inp
+	vld1.64		$IN#lo,[$inp]!
+#ifdef __ARMEL__
+	vrev64.8	$IN,$IN
+#endif
+	veor		$IN,$Xl			@ inp^=Xi
+.Lgmult_neon:
+___
+	&clmul64x64	($Xl,$Hlo,"$IN#lo");	# H.lo�Xi.lo
+$code.=<<___;
+	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
+___
+	&clmul64x64	($Xm,$Hhl,"$IN#lo");	# (H.lo+H.hi)�(Xi.lo+Xi.hi)
+	&clmul64x64	($Xh,$Hhi,"$IN#hi");	# H.hi�Xi.hi
+$code.=<<___;
+	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
+	veor		$Xm,$Xm,$Xh
+	veor		$Xl#hi,$Xl#hi,$Xm#lo
+	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result
+
+	@ equivalent of reduction_avx from ghash-x86_64.pl
+	vshl.i64	$t1,$Xl,#57		@ 1st phase
+	vshl.i64	$t2,$Xl,#62
+	veor		$t2,$t2,$t1		@
+	vshl.i64	$t1,$Xl,#63
+	veor		$t2, $t2, $t1		@
+ 	veor		$Xl#hi,$Xl#hi,$t2#lo	@
+	veor		$Xh#lo,$Xh#lo,$t2#hi
+
+	vshr.u64	$t2,$Xl,#1		@ 2nd phase
+	veor		$Xh,$Xh,$Xl
+	veor		$Xl,$Xl,$t2		@
+	vshr.u64	$t2,$t2,#6
+	vshr.u64	$Xl,$Xl,#1		@
+	veor		$Xl,$Xl,$Xh		@
+	veor		$Xl,$Xl,$t2		@
+
+	subs		$len,#16
+	bne		.Loop_neon
+
+#ifdef __ARMEL__
+	vrev64.8	$Xl,$Xl
+#endif
+	sub		$Xi,#16	
+	vst1.64		$Xl#hi,[$Xi,:64]!	@ write out Xi
+	vst1.64		$Xl#lo,[$Xi,:64]
+
+	bx	lr
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+
+#endif
+___
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+close STDOUT; # enforce flush
diff --git a/src/crypto/modes/asm/ghash-x86.pl b/src/crypto/modes/asm/ghash-x86.pl
new file mode 100644
index 0000000..eb6d55e
--- /dev/null
+++ b/src/crypto/modes/asm/ghash-x86.pl
@@ -0,0 +1,1393 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, May, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
+# code paths: vanilla x86 and vanilla SSE. Former will be executed on
+# 486 and Pentium, latter on all others. SSE GHASH features so called
+# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
+# of per-key storage [+512 bytes shared table]. Performance results
+# are for streamed GHASH subroutine and are expressed in cycles per
+# processed byte, less is better:
+#
+#		gcc 2.95.3(*)	SSE assembler	x86 assembler
+#
+# Pentium	105/111(**)	-		50
+# PIII		68 /75		12.2		24
+# P4		125/125		17.8		84(***)
+# Opteron	66 /70		10.1		30
+# Core2		54 /67		8.4		18
+# Atom		105/105		16.8		53
+# VIA Nano	69 /71		13.0		27
+#
+# (*)	gcc 3.4.x was observed to generate few percent slower code,
+#	which is one of reasons why 2.95.3 results were chosen,
+#	another reason is lack of 3.4.x results for older CPUs;
+#	comparison with SSE results is not completely fair, because C
+#	results are for vanilla "256B" implementation, while
+#	assembler results are for "528B";-)
+# (**)	second number is result for code compiled with -fPIC flag,
+#	which is actually more relevant, because assembler code is
+#	position-independent;
+# (***)	see comment in non-MMX routine for further details;
+#
+# To summarize, it's >2-5 times faster than gcc-generated code. To
+# anchor it to something else SHA1 assembler processes one byte in
+# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+# in particular, see comment at the end of the file...
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+# The question is how close is it to theoretical limit? The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that single
+# Karatsuba multiplication would take 28 cycles *plus* few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
+# and Naggr is the aggregation factor.
+#
+# Before we proceed to this implementation let's have closer look at
+# the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies Tmod is estimated as ~19
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in same subroutine
+# former's performance is really limited to above (Tmul + Tmod/Naggr)
+# equation. But if GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So that
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
+# where Tproc is time required for Karatsuba pre- and post-processing,
+# is more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications the performance should be betwen
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better, because larger register
+# bank allows to interleave reduction and multiplication better.
+#
+# Does it make sense to increase Naggr? To start with it's virtually
+# impossible in 32-bit mode, because of limited register bank
+# capacity. Otherwise improvement has to be weighed agiainst slower
+# setup, as well as code size and complexity increase. As even
+# optimistic estimate doesn't promise 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on same XMM register, PCLMULQDQ subroutine was measured to process
+# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
+# The minor regression on Westmere is outweighed by ~15% improvement
+# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
+# similar manner resulted in almost 20% degradation on Sandy Bridge,
+# where original 64-bit code processes one byte in 1.95 cycles.
+
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=1;
+#for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
+$inp  = "edi";
+$Htbl = "esi";
+
+$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
+		# than unrolled, which has to be weighted against
+		# 2.5x x86-specific code size reduction.
+
+sub x86_loop {
+    my $off = shift;
+    my $rem = "eax";
+
+	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
+	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
+	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
+	&mov	($Zll,&DWP(8,$Htbl,$Zll));
+	&xor	($rem,$rem);	# avoid partial register stalls on PIII
+
+	# shrd practically kills P4, 2.5x deterioration, but P4 has
+	# MMX code-path to execute. shrd runs tad faster [than twice
+	# the shifts, move's and or's] on pre-MMX Pentium (as well as
+	# PIII and Core2), *but* minimizes code size, spares register
+	# and thus allows to fold the loop...
+	if (!$unroll) {
+	my $cnt = $inp;
+	&mov	($cnt,15);
+	&jmp	(&label("x86_loop"));
+	&set_label("x86_loop",16);
+	    for($i=1;$i<=2;$i++) {
+		&mov	(&LB($rem),&LB($Zll));
+		&shrd	($Zll,$Zlh,4);
+		&and	(&LB($rem),0xf);
+		&shrd	($Zlh,$Zhl,4);
+		&shrd	($Zhl,$Zhh,4);
+		&shr	($Zhh,4);
+		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
+
+		&mov	(&LB($rem),&BP($off,"esp",$cnt));
+		if ($i&1) {
+			&and	(&LB($rem),0xf0);
+		} else {
+			&shl	(&LB($rem),4);
+		}
+
+		&xor	($Zll,&DWP(8,$Htbl,$rem));
+		&xor	($Zlh,&DWP(12,$Htbl,$rem));
+		&xor	($Zhl,&DWP(0,$Htbl,$rem));
+		&xor	($Zhh,&DWP(4,$Htbl,$rem));
+
+		if ($i&1) {
+			&dec	($cnt);
+			&js	(&label("x86_break"));
+		} else {
+			&jmp	(&label("x86_loop"));
+		}
+	    }
+	&set_label("x86_break",16);
+	} else {
+	    for($i=1;$i<32;$i++) {
+		&comment($i);
+		&mov	(&LB($rem),&LB($Zll));
+		&shrd	($Zll,$Zlh,4);
+		&and	(&LB($rem),0xf);
+		&shrd	($Zlh,$Zhl,4);
+		&shrd	($Zhl,$Zhh,4);
+		&shr	($Zhh,4);
+		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
+
+		if ($i&1) {
+			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
+			&and	(&LB($rem),0xf0);
+		} else {
+			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
+			&shl	(&LB($rem),4);
+		}
+
+		&xor	($Zll,&DWP(8,$Htbl,$rem));
+		&xor	($Zlh,&DWP(12,$Htbl,$rem));
+		&xor	($Zhl,&DWP(0,$Htbl,$rem));
+		&xor	($Zhh,&DWP(4,$Htbl,$rem));
+	    }
+	}
+	&bswap	($Zll);
+	&bswap	($Zlh);
+	&bswap	($Zhl);
+	if (!$x86only) {
+		&bswap	($Zhh);
+	} else {
+		&mov	("eax",$Zhh);
+		&bswap	("eax");
+		&mov	($Zhh,"eax");
+	}
+}
+
+if ($unroll) {
+    &function_begin_B("_x86_gmult_4bit_inner");
+	&x86_loop(4);
+	&ret	();
+    &function_end_B("_x86_gmult_4bit_inner");
+}
+
+sub deposit_rem_4bit {
+    my $bias = shift;
+
+	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
+	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
+	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
+	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
+	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
+	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
+	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
+	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
+	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
+	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
+	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
+	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
+	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
+	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
+	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
+	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
+}
+
+$suffix = $x86only ? "" : "_x86";
+
+&function_begin("gcm_gmult_4bit".$suffix);
+	&stack_push(16+4+1);			# +1 for stack alignment
+	&mov	($inp,&wparam(0));		# load Xi
+	&mov	($Htbl,&wparam(1));		# load Htable
+
+	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
+	&mov	($Zhl,&DWP(4,$inp));
+	&mov	($Zlh,&DWP(8,$inp));
+	&mov	($Zll,&DWP(12,$inp));
+
+	&deposit_rem_4bit(16);
+
+	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(12,"esp"),$Zll);
+	&shr	($Zll,20);
+	&and	($Zll,0xf0);
+
+	if ($unroll) {
+		&call	("_x86_gmult_4bit_inner");
+	} else {
+		&x86_loop(0);
+		&mov	($inp,&wparam(0));
+	}
+
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(0,$inp),$Zhh);
+	&stack_pop(16+4+1);
+&function_end("gcm_gmult_4bit".$suffix);
+
+&function_begin("gcm_ghash_4bit".$suffix);
+	&stack_push(16+4+1);			# +1 for 64-bit alignment
+	&mov	($Zll,&wparam(0));		# load Xi
+	&mov	($Htbl,&wparam(1));		# load Htable
+	&mov	($inp,&wparam(2));		# load in
+	&mov	("ecx",&wparam(3));		# load len
+	&add	("ecx",$inp);
+	&mov	(&wparam(3),"ecx");
+
+	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
+	&mov	($Zhl,&DWP(4,$Zll));
+	&mov	($Zlh,&DWP(8,$Zll));
+	&mov	($Zll,&DWP(12,$Zll));
+
+	&deposit_rem_4bit(16);
+
+    &set_label("x86_outer_loop",16);
+	&xor	($Zll,&DWP(12,$inp));		# xor with input
+	&xor	($Zlh,&DWP(8,$inp));
+	&xor	($Zhl,&DWP(4,$inp));
+	&xor	($Zhh,&DWP(0,$inp));
+	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(0,"esp"),$Zhh);
+
+	&shr	($Zll,20);
+	&and	($Zll,0xf0);
+
+	if ($unroll) {
+		&call	("_x86_gmult_4bit_inner");
+	} else {
+		&x86_loop(0);
+		&mov	($inp,&wparam(2));
+	}
+	&lea	($inp,&DWP(16,$inp));
+	&cmp	($inp,&wparam(3));
+	&mov	(&wparam(2),$inp)	if (!$unroll);
+	&jb	(&label("x86_outer_loop"));
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(0,$inp),$Zhh);
+	&stack_pop(16+4+1);
+&function_end("gcm_ghash_4bit".$suffix);
+
+if (!$x86only) {{{
+
+&static_label("rem_4bit");
+
+if (!$sse2) {{	# pure-MMX "May" version...
+
+$S=12;		# shift factor for rem_4bit
+
+&function_begin_B("_mmx_gmult_4bit_inner");
+# MMX version performs 3.5 times better on P4 (see comment in non-MMX
+# routine for further details), 100% better on Opteron, ~70% better
+# on Core2 and PIII... In other words effort is considered to be well
+# spent... Since initial release the loop was unrolled in order to
+# "liberate" register previously used as loop counter. Instead it's
+# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
+# The path involves move of Z.lo from MMX to integer register,
+# effective address calculation and finally merge of value to Z.hi.
+# Reference to rem_4bit is scheduled so late that I had to >>4
+# rem_4bit elements. This resulted in 20-45% procent improvement
+# on contemporary �-archs.
+{
+    my $cnt;
+    my $rem_4bit = "eax";
+    my @rem = ($Zhh,$Zll);
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
+	&mov	($nhi,$Zll);
+	&mov	(&LB($nlo),&LB($nhi));
+	&shl	(&LB($nlo),4);
+	&and	($nhi,0xf0);
+	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
+	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
+	&movd	($rem[0],$Zlo);
+
+	for ($cnt=28;$cnt>=-2;$cnt--) {
+	    my $odd = $cnt&1;
+	    my $nix = $odd ? $nlo : $nhi;
+
+		&shl	(&LB($nlo),4)			if ($odd);
+		&psrlq	($Zlo,4);
+		&movq	($tmp,$Zhi);
+		&psrlq	($Zhi,4);
+		&pxor	($Zlo,&QWP(8,$Htbl,$nix));
+		&mov	(&LB($nlo),&BP($cnt/2,$inp))	if (!$odd && $cnt>=0);
+		&psllq	($tmp,60);
+		&and	($nhi,0xf0)			if ($odd);
+		&pxor	($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
+		&and	($rem[0],0xf);
+		&pxor	($Zhi,&QWP(0,$Htbl,$nix));
+		&mov	($nhi,$nlo)			if (!$odd && $cnt>=0);
+		&movd	($rem[1],$Zlo);
+		&pxor	($Zlo,$tmp);
+
+		push	(@rem,shift(@rem));		# "rotate" registers
+	}
+
+	&mov	($inp,&DWP(4,$rem_4bit,$rem[1],8));	# last rem_4bit[rem]
+
+	&psrlq	($Zlo,32);	# lower part of Zlo is already there
+	&movd	($Zhl,$Zhi);
+	&psrlq	($Zhi,32);
+	&movd	($Zlh,$Zlo);
+	&movd	($Zhh,$Zhi);
+	&shl	($inp,4);	# compensate for rem_4bit[i] being >>4
+
+	&bswap	($Zll);
+	&bswap	($Zhl);
+	&bswap	($Zlh);
+	&xor	($Zhh,$inp);
+	&bswap	($Zhh);
+
+	&ret	();
+}
+&function_end_B("_mmx_gmult_4bit_inner");
+
+&function_begin("gcm_gmult_4bit_mmx");
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&movz	($Zll,&BP(15,$inp));
+
+	&call	("_mmx_gmult_4bit_inner");
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+# Streamed version performs 20% better on P4, 7% on Opteron,
+# 10% on Core2 and PIII...
+&function_begin("gcm_ghash_4bit_mmx");
+	&mov	($Zhh,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+	&mov	($inp,&wparam(2));	# load in
+	&mov	($Zlh,&wparam(3));	# load len
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&add	($Zlh,$inp);
+	&mov	(&wparam(3),$Zlh);	# len to point at the end of input
+	&stack_push(4+1);		# +1 for stack alignment
+
+	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
+	&mov	($Zhl,&DWP(4,$Zhh));
+	&mov	($Zlh,&DWP(8,$Zhh));
+	&mov	($Zhh,&DWP(0,$Zhh));
+	&jmp	(&label("mmx_outer_loop"));
+
+    &set_label("mmx_outer_loop",16);
+	&xor	($Zll,&DWP(12,$inp));
+	&xor	($Zhl,&DWP(4,$inp));
+	&xor	($Zlh,&DWP(8,$inp));
+	&xor	($Zhh,&DWP(0,$inp));
+	&mov	(&wparam(2),$inp);
+	&mov	(&DWP(12,"esp"),$Zll);
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(0,"esp"),$Zhh);
+
+	&mov	($inp,"esp");
+	&shr	($Zll,24);
+
+	&call	("_mmx_gmult_4bit_inner");
+
+	&mov	($inp,&wparam(2));
+	&lea	($inp,&DWP(16,$inp));
+	&cmp	($inp,&wparam(3));
+	&jb	(&label("mmx_outer_loop"));
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+
+	&stack_pop(4+1);
+&function_end("gcm_ghash_4bit_mmx");
+
+}} else {{	# "June" MMX version...
+		# ... has slower "April" gcm_gmult_4bit_mmx with folded
+		# loop. This is done to conserve code size...
+$S=16;		# shift factor for rem_4bit
+
+sub mmx_loop() {
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron and Core2, 50%
+# better on PIII... In other words effort is considered to be well
+# spent...
+    my $inp = shift;
+    my $rem_4bit = shift;
+    my $cnt = $Zhh;
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+    my $rem = $Zll;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
+	&mov	($nhi,$Zll);
+	&mov	(&LB($nlo),&LB($nhi));
+	&mov	($cnt,14);
+	&shl	(&LB($nlo),4);
+	&and	($nhi,0xf0);
+	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
+	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
+	&movd	($rem,$Zlo);
+	&jmp	(&label("mmx_loop"));
+
+    &set_label("mmx_loop",16);
+	&psrlq	($Zlo,4);
+	&and	($rem,0xf);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
+	&mov	(&LB($nlo),&BP(0,$inp,$cnt));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&dec	($cnt);
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&mov	($nhi,$nlo);
+	&pxor	($Zlo,$tmp);
+	&js	(&label("mmx_break"));
+
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
+	&pxor	($Zlo,$tmp);
+	&jmp	(&label("mmx_loop"));
+
+    &set_label("mmx_break",16);
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
+	&pxor	($Zlo,$tmp);
+
+	&psrlq	($Zlo,4);
+	&and	($rem,0xf);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&pxor	($Zlo,$tmp);
+
+	&psrlq	($Zlo,32);	# lower part of Zlo is already there
+	&movd	($Zhl,$Zhi);
+	&psrlq	($Zhi,32);
+	&movd	($Zlh,$Zlo);
+	&movd	($Zhh,$Zhi);
+
+	&bswap	($Zll);
+	&bswap	($Zhl);
+	&bswap	($Zlh);
+	&bswap	($Zhh);
+}
+
+&function_begin("gcm_gmult_4bit_mmx");
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&movz	($Zll,&BP(15,$inp));
+
+	&mmx_loop($inp,"eax");
+
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+######################################################################
+# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
+# (see gcm128.c for details). It provides further 20-40% performance
+# improvement over above mentioned "May" version.
+
+&static_label("rem_8bit");
+
+&function_begin("gcm_ghash_4bit_mmx");
+{ my ($Zlo,$Zhi) = ("mm7","mm6");
+  my $rem_8bit = "esi";
+  my $Htbl = "ebx";
+
+    # parameter block
+    &mov	("eax",&wparam(0));		# Xi
+    &mov	("ebx",&wparam(1));		# Htable
+    &mov	("ecx",&wparam(2));		# inp
+    &mov	("edx",&wparam(3));		# len
+    &mov	("ebp","esp");			# original %esp
+    &call	(&label("pic_point"));
+    &set_label	("pic_point");
+    &blindpop	($rem_8bit);
+    &lea	($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
+
+    &sub	("esp",512+16+16);		# allocate stack frame...
+    &and	("esp",-64);			# ...and align it
+    &sub	("esp",16);			# place for (u8)(H[]<<4)
+
+    &add	("edx","ecx");			# pointer to the end of input
+    &mov	(&DWP(528+16+0,"esp"),"eax");	# save Xi
+    &mov	(&DWP(528+16+8,"esp"),"edx");	# save inp+len
+    &mov	(&DWP(528+16+12,"esp"),"ebp");	# save original %esp
+
+    { my @lo  = ("mm0","mm1","mm2");
+      my @hi  = ("mm3","mm4","mm5");
+      my @tmp = ("mm6","mm7");
+      my ($off1,$off2,$i) = (0,0,);
+
+      &add	($Htbl,128);			# optimize for size
+      &lea	("edi",&DWP(16+128,"esp"));
+      &lea	("ebp",&DWP(16+256+128,"esp"));
+
+      # decompose Htable (low and high parts are kept separately),
+      # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
+      for ($i=0;$i<18;$i++) {
+
+	&mov	("edx",&DWP(16*$i+8-128,$Htbl))		if ($i<16);
+	&movq	($lo[0],&QWP(16*$i+8-128,$Htbl))	if ($i<16);
+	&psllq	($tmp[1],60)				if ($i>1);
+	&movq	($hi[0],&QWP(16*$i+0-128,$Htbl))	if ($i<16);
+	&por	($lo[2],$tmp[1])			if ($i>1);
+	&movq	(&QWP($off1-128,"edi"),$lo[1])		if ($i>0 && $i<17);
+	&psrlq	($lo[1],4)				if ($i>0 && $i<17);
+	&movq	(&QWP($off1,"edi"),$hi[1])		if ($i>0 && $i<17);
+	&movq	($tmp[0],$hi[1])			if ($i>0 && $i<17);
+	&movq	(&QWP($off2-128,"ebp"),$lo[2])		if ($i>1);
+	&psrlq	($hi[1],4)				if ($i>0 && $i<17);
+	&movq	(&QWP($off2,"ebp"),$hi[2])		if ($i>1);
+	&shl	("edx",4)				if ($i<16);
+	&mov	(&BP($i,"esp"),&LB("edx"))		if ($i<16);
+
+	unshift	(@lo,pop(@lo));			# "rotate" registers
+	unshift	(@hi,pop(@hi));
+	unshift	(@tmp,pop(@tmp));
+	$off1 += 8	if ($i>0);
+	$off2 += 8	if ($i>1);
+      }
+    }
+
+    &movq	($Zhi,&QWP(0,"eax"));
+    &mov	("ebx",&DWP(8,"eax"));
+    &mov	("edx",&DWP(12,"eax"));		# load Xi
+
+&set_label("outer",16);
+  { my $nlo = "eax";
+    my $dat = "edx";
+    my @nhi = ("edi","ebp");
+    my @rem = ("ebx","ecx");
+    my @red = ("mm0","mm1","mm2");
+    my $tmp = "mm3";
+
+    &xor	($dat,&DWP(12,"ecx"));		# merge input data
+    &xor	("ebx",&DWP(8,"ecx"));
+    &pxor	($Zhi,&QWP(0,"ecx"));
+    &lea	("ecx",&DWP(16,"ecx"));		# inp+=16
+    #&mov	(&DWP(528+12,"esp"),$dat);	# save inp^Xi
+    &mov	(&DWP(528+8,"esp"),"ebx");
+    &movq	(&QWP(528+0,"esp"),$Zhi);
+    &mov	(&DWP(528+16+4,"esp"),"ecx");	# save inp
+
+    &xor	($nlo,$nlo);
+    &rol	($dat,8);
+    &mov	(&LB($nlo),&LB($dat));
+    &mov	($nhi[1],$nlo);
+    &and	(&LB($nlo),0x0f);
+    &shr	($nhi[1],4);
+    &pxor	($red[0],$red[0]);
+    &rol	($dat,8);			# next byte
+    &pxor	($red[1],$red[1]);
+    &pxor	($red[2],$red[2]);
+
+    # Just like in "May" verson modulo-schedule for critical path in
+    # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
+    # is scheduled so late that rem_8bit[] has to be shifted *right*
+    # by 16, which is why last argument to pinsrw is 2, which
+    # corresponds to <<32=<<48>>16...
+    for ($j=11,$i=0;$i<15;$i++) {
+
+      if ($i>0) {
+	&pxor	($Zlo,&QWP(16,"esp",$nlo,8));		# Z^=H[nlo]
+	&rol	($dat,8);				# next byte
+	&pxor	($Zhi,&QWP(16+128,"esp",$nlo,8));
+
+	&pxor	($Zlo,$tmp);
+	&pxor	($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+	&xor	(&LB($rem[1]),&BP(0,"esp",$nhi[0]));	# rem^(H[nhi]<<4)
+      } else {
+	&movq	($Zlo,&QWP(16,"esp",$nlo,8));
+	&movq	($Zhi,&QWP(16+128,"esp",$nlo,8));
+      }
+
+	&mov	(&LB($nlo),&LB($dat));
+	&mov	($dat,&DWP(528+$j,"esp"))		if (--$j%4==0);
+
+	&movd	($rem[0],$Zlo);
+	&movz	($rem[1],&LB($rem[1]))			if ($i>0);
+	&psrlq	($Zlo,8);				# Z>>=8
+
+	&movq	($tmp,$Zhi);
+	&mov	($nhi[0],$nlo);
+	&psrlq	($Zhi,8);
+
+	&pxor	($Zlo,&QWP(16+256+0,"esp",$nhi[1],8));	# Z^=H[nhi]>>4
+	&and	(&LB($nlo),0x0f);
+	&psllq	($tmp,56);
+
+	&pxor	($Zhi,$red[1])				if ($i>1);
+	&shr	($nhi[0],4);
+	&pinsrw	($red[0],&WP(0,$rem_8bit,$rem[1],2),2)	if ($i>0);
+
+	unshift	(@red,pop(@red));			# "rotate" registers
+	unshift	(@rem,pop(@rem));
+	unshift	(@nhi,pop(@nhi));
+    }
+
+    &pxor	($Zlo,&QWP(16,"esp",$nlo,8));		# Z^=H[nlo]
+    &pxor	($Zhi,&QWP(16+128,"esp",$nlo,8));
+    &xor	(&LB($rem[1]),&BP(0,"esp",$nhi[0]));	# rem^(H[nhi]<<4)
+
+    &pxor	($Zlo,$tmp);
+    &pxor	($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+    &movz	($rem[1],&LB($rem[1]));
+
+    &pxor	($red[2],$red[2]);			# clear 2nd word
+    &psllq	($red[1],4);
+
+    &movd	($rem[0],$Zlo);
+    &psrlq	($Zlo,4);				# Z>>=4
+
+    &movq	($tmp,$Zhi);
+    &psrlq	($Zhi,4);
+    &shl	($rem[0],4);				# rem<<4
+
+    &pxor	($Zlo,&QWP(16,"esp",$nhi[1],8));	# Z^=H[nhi]
+    &psllq	($tmp,60);
+    &movz	($rem[0],&LB($rem[0]));
+
+    &pxor	($Zlo,$tmp);
+    &pxor	($Zhi,&QWP(16+128,"esp",$nhi[1],8));
+
+    &pinsrw	($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
+    &pxor	($Zhi,$red[1]);
+
+    &movd	($dat,$Zlo);
+    &pinsrw	($red[2],&WP(0,$rem_8bit,$rem[0],2),3);	# last is <<48
+
+    &psllq	($red[0],12);				# correct by <<16>>4
+    &pxor	($Zhi,$red[0]);
+    &psrlq	($Zlo,32);
+    &pxor	($Zhi,$red[2]);
+
+    &mov	("ecx",&DWP(528+16+4,"esp"));	# restore inp
+    &movd	("ebx",$Zlo);
+    &movq	($tmp,$Zhi);			# 01234567
+    &psllw	($Zhi,8);			# 1.3.5.7.
+    &psrlw	($tmp,8);			# .0.2.4.6
+    &por	($Zhi,$tmp);			# 10325476
+    &bswap	($dat);
+    &pshufw	($Zhi,$Zhi,0b00011011);		# 76543210
+    &bswap	("ebx");
+    
+    &cmp	("ecx",&DWP(528+16+8,"esp"));	# are we done?
+    &jne	(&label("outer"));
+  }
+
+    &mov	("eax",&DWP(528+16+0,"esp"));	# restore Xi
+    &mov	(&DWP(12,"eax"),"edx");
+    &mov	(&DWP(8,"eax"),"ebx");
+    &movq	(&QWP(0,"eax"),$Zhi);
+
+    &mov	("esp",&DWP(528+16+12,"esp"));	# restore original %esp
+    &emms	();
+}
+&function_end("gcm_ghash_4bit_mmx");
+}}
+
+if ($sse2) {{
+######################################################################
+# PCLMULQDQ version.
+
+$Xip="eax";
+$Htbl="edx";
+$const="ecx";
+$inp="esi";
+$len="ebx";
+
+($Xi,$Xhi)=("xmm0","xmm1");	$Hkey="xmm2";
+($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
+($Xn,$Xhn)=("xmm6","xmm7");
+
+&static_label("bswap");
+
+sub clmul64x64_T2 {	# minimal "register" pressure
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+	&movdqa		($Xhi,$Xi);		#
+	&pshufd		($T1,$Xi,0b01001110);
+	&pshufd		($T2,$Hkey,0b01001110)	if (!defined($HK));
+	&pxor		($T1,$Xi);		#
+	&pxor		($T2,$Hkey)		if (!defined($HK));
+			$HK=$T2			if (!defined($HK));
+
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T1,$HK,0x00);		#######
+	&xorps		($T1,$Xi);		#
+	&xorps		($T1,$Xhi);		#
+
+	&movdqa		($T2,$T1);		#
+	&psrldq		($T1,8);
+	&pslldq		($T2,8);		#
+	&pxor		($Xhi,$T1);
+	&pxor		($Xi,$T2);		#
+}
+
+sub clmul64x64_T3 {
+# Even though this subroutine offers visually better ILP, it
+# was empirically found to be a tad slower than above version.
+# At least in gcm_ghash_clmul context. But it's just as well,
+# because loop modulo-scheduling is possible only thanks to
+# minimized "register" pressure...
+my ($Xhi,$Xi,$Hkey)=@_;
+
+	&movdqa		($T1,$Xi);		#
+	&movdqa		($Xhi,$Xi);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pshufd		($T2,$T1,0b01001110);	#
+	&pshufd		($T3,$Hkey,0b01001110);
+	&pxor		($T2,$T1);		#
+	&pxor		($T3,$Hkey);
+	&pclmulqdq	($T2,$T3,0x00);		#######
+	&pxor		($T2,$Xi);		#
+	&pxor		($T2,$Xhi);		#
+
+	&movdqa		($T3,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T3,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T3);		#
+}
+
+if (1) {		# Algorithm 9 with <<1 twist.
+			# Reduction is shorter and uses only two
+			# temporary registers, which makes it better
+			# candidate for interleaving with 64x64
+			# multiplication. Pre-modulo-scheduled loop
+			# was found to be ~20% faster than Algorithm 5
+			# below. Algorithm 9 was therefore chosen for
+			# further optimization...
+
+sub reduction_alg9 {	# 17/11 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+	# 1st phase
+	&movdqa		($T2,$Xi);		#
+	&movdqa		($T1,$Xi);
+	&psllq		($Xi,5);
+	&pxor		($T1,$Xi);		#
+	&psllq		($Xi,1);
+	&pxor		($Xi,$T1);		#
+	&psllq		($Xi,57);		#
+	&movdqa		($T1,$Xi);		#
+	&pslldq		($Xi,8);
+	&psrldq		($T1,8);		#	
+	&pxor		($Xi,$T2);
+	&pxor		($Xhi,$T1);		#
+
+	# 2nd phase
+	&movdqa		($T2,$Xi);
+	&psrlq		($Xi,1);
+	&pxor		($Xhi,$T2);		#
+	&pxor		($T2,$Xi);
+	&psrlq		($Xi,5);
+	&pxor		($Xi,$T2);		#
+	&psrlq		($Xi,1);		#
+	&pxor		($Xi,$Xhi)		#
+}
+
+&function_begin_B("gcm_init_clmul");
+	&mov		($Htbl,&wparam(0));
+	&mov		($Xip,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Hkey,&QWP(0,$Xip));
+	&pshufd		($Hkey,$Hkey,0b01001110);# dword swap
+
+	# <<1 twist
+	&pshufd		($T2,$Hkey,0b11111111);	# broadcast uppermost dword
+	&movdqa		($T1,$Hkey);
+	&psllq		($Hkey,1);
+	&pxor		($T3,$T3);		#
+	&psrlq		($T1,63);
+	&pcmpgtd	($T3,$T2);		# broadcast carry bit
+	&pslldq		($T1,8);
+	&por		($Hkey,$T1);		# H<<=1
+
+	# magic reduction
+	&pand		($T3,&QWP(16,$const));	# 0x1c2_polynomial
+	&pxor		($Hkey,$T3);		# if(carry) H^=0x1c2_polynomial
+
+	# calculate H^2
+	&movdqa		($Xi,$Hkey);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+
+	&pshufd		($T1,$Hkey,0b01001110);
+	&pshufd		($T2,$Xi,0b01001110);
+	&pxor		($T1,$Hkey);		# Karatsuba pre-processing
+	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&pxor		($T2,$Xi);		# Karatsuba pre-processing
+	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+	&palignr	($T2,$T1,8);		# low part is H.lo^H.hi
+	&movdqu		(&QWP(32,$Htbl),$T2);	# save Karatsuba "salt"
+
+	&ret		();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movups		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+	&movups		($T2,&QWP(32,$Htbl));
+
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
+	&reduction_alg9	($Xhi,$Xi);
+
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+
+	&ret	();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+	&mov		($inp,&wparam(2));
+	&mov		($len,&wparam(3));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+
+	&sub		($len,0x10);
+	&jz		(&label("odd_tail"));
+
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&movdqu		($T3,&QWP(32,$Htbl));
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&pshufd		($T1,$Xn,0b01001110);	# H*Ii+1
+	&movdqa		($Xhn,$Xn);
+	&pxor		($T1,$Xn);		#
+	&lea		($inp,&DWP(32,$inp));	# i+=2
+
+	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
+	&pclmulqdq	($T1,$T3,0x00);		#######
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	&nop		();
+
+	&sub		($len,0x20);
+	&jbe		(&label("even_tail"));
+	&jmp		(&label("mod_loop"));
+
+&set_label("mod_loop",32);
+	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
+	&movdqa		($Xhi,$Xi);
+	&pxor		($T2,$Xi);		#
+	&nop		();
+
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T2,$T3,0x10);		#######
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+
+	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&movdqa		($T3,&QWP(0,$const));
+	&xorps		($Xhi,$Xhn);
+	 &movdqu	($Xhn,&QWP(0,$inp));	# Ii
+	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
+	 &movdqu	($Xn,&QWP(16,$inp));	# Ii+1
+	&pxor		($T1,$Xhi);		#
+
+	 &pshufb	($Xhn,$T3);
+	&pxor		($T2,$T1);		#
+
+	&movdqa		($T1,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T1,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T1);		#
+	 &pshufb	($Xn,$T3);
+	 &pxor		($Xhi,$Xhn);		# "Ii+Xi", consume early
+
+	&movdqa		($Xhn,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
+	  &movdqa	($T2,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase
+	  &movdqa	($T1,$Xi);
+	  &psllq	($Xi,5);
+	  &pxor		($T1,$Xi);		#
+	  &psllq	($Xi,1);
+	  &pxor		($Xi,$T1);		#
+	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	&movups		($T3,&QWP(32,$Htbl));
+	  &psllq	($Xi,57);		#
+	  &movdqa	($T1,$Xi);		#
+	  &pslldq	($Xi,8);
+	  &psrldq	($T1,8);		#	
+	  &pxor		($Xi,$T2);
+	  &pxor		($Xhi,$T1);		#
+	&pshufd		($T1,$Xhn,0b01001110);
+	  &movdqa	($T2,$Xi);		# 2nd phase
+	  &psrlq	($Xi,1);
+	&pxor		($T1,$Xhn);
+	  &pxor		($Xhi,$T2);		#
+	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	  &pxor		($T2,$Xi);
+	  &psrlq	($Xi,5);
+	  &pxor		($Xi,$T2);		#
+	  &psrlq	($Xi,1);		#
+	  &pxor		($Xi,$Xhi)		#
+	&pclmulqdq	($T1,$T3,0x00);		#######
+
+	&lea		($inp,&DWP(32,$inp));
+	&sub		($len,0x20);
+	&ja		(&label("mod_loop"));
+
+&set_label("even_tail");
+	&pshufd		($T2,$Xi,0b01001110);	# H^2*(Ii+Xi)
+	&movdqa		($Xhi,$Xi);
+	&pxor		($T2,$Xi);		#
+
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T2,$T3,0x10);		#######
+	&movdqa		($T3,&QWP(0,$const));
+
+	&xorps		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&xorps		($Xhi,$Xhn);
+	&pxor		($T1,$Xi);		# aggregated Karatsuba post-processing
+	&pxor		($T1,$Xhi);		#
+
+	&pxor		($T2,$T1);		#
+
+	&movdqa		($T1,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T1,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T1);		#
+
+	&reduction_alg9	($Xhi,$Xi);
+
+	&test		($len,$len);
+	&jnz		(&label("done"));
+
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("odd_tail");
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&pshufb		($T1,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&reduction_alg9	($Xhi,$Xi);
+
+&set_label("done");
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+} else {		# Algorith 5. Kept for reference purposes.
+
+sub reduction_alg5 {	# 19/16 times faster than Intel version
+my ($Xhi,$Xi)=@_;
+
+	# <<1
+	&movdqa		($T1,$Xi);		#
+	&movdqa		($T2,$Xhi);
+	&pslld		($Xi,1);
+	&pslld		($Xhi,1);		#
+	&psrld		($T1,31);
+	&psrld		($T2,31);		#
+	&movdqa		($T3,$T1);
+	&pslldq		($T1,4);
+	&psrldq		($T3,12);		#
+	&pslldq		($T2,4);
+	&por		($Xhi,$T3);		#
+	&por		($Xi,$T1);
+	&por		($Xhi,$T2);		#
+
+	# 1st phase
+	&movdqa		($T1,$Xi);
+	&movdqa		($T2,$Xi);
+	&movdqa		($T3,$Xi);		#
+	&pslld		($T1,31);
+	&pslld		($T2,30);
+	&pslld		($Xi,25);		#
+	&pxor		($T1,$T2);
+	&pxor		($T1,$Xi);		#
+	&movdqa		($T2,$T1);		#
+	&pslldq		($T1,12);
+	&psrldq		($T2,4);		#
+	&pxor		($T3,$T1);
+
+	# 2nd phase
+	&pxor		($Xhi,$T3);		#
+	&movdqa		($Xi,$T3);
+	&movdqa		($T1,$T3);
+	&psrld		($Xi,1);		#
+	&psrld		($T1,2);
+	&psrld		($T3,7);		#
+	&pxor		($Xi,$T1);
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T3);		#
+	&pxor		($Xi,$Xhi);		#
+}
+
+&function_begin_B("gcm_init_clmul");
+	&mov		($Htbl,&wparam(0));
+	&mov		($Xip,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Hkey,&QWP(0,$Xip));
+	&pshufd		($Hkey,$Hkey,0b01001110);# dword swap
+
+	# calculate H^2
+	&movdqa		($Xi,$Hkey);
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+
+	&ret		();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($Xn,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$Xn);
+
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);
+	&reduction_alg5	($Xhi,$Xi);
+
+	&pshufb		($Xi,$Xn);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+
+	&ret	();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+	&mov		($inp,&wparam(2));
+	&mov		($len,&wparam(3));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+
+	&sub		($len,0x10);
+	&jz		(&label("odd_tail"));
+
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&movdqu		($Hkey,&QWP(16,$Htbl));	# load H^2
+
+	&sub		($len,0x20);
+	&lea		($inp,&DWP(32,$inp));	# i+=2
+	&jbe		(&label("even_tail"));
+
+&set_label("mod_loop");
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+	&movdqu		($Hkey,&QWP(0,$Htbl));	# load H
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&reduction_alg5	($Xhi,$Xi);
+
+	#######
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&movdqu		($Hkey,&QWP(16,$Htbl));	# load H^2
+
+	&sub		($len,0x20);
+	&lea		($inp,&DWP(32,$inp));
+	&ja		(&label("mod_loop"));
+
+&set_label("even_tail");
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqa		($T3,&QWP(0,$const));
+	&test		($len,$len);
+	&jnz		(&label("done"));
+
+	&movdqu		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("odd_tail");
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&pshufb		($T1,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqa		($T3,&QWP(0,$const));
+&set_label("done");
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+}
+
+&set_label("bswap",64);
+	&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+	&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2);	# 0x1c2_polynomial
+&set_label("rem_8bit",64);
+	&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
+	&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
+	&data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
+	&data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
+	&data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
+	&data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
+	&data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
+	&data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
+	&data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
+	&data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
+	&data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
+	&data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
+	&data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
+	&data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
+	&data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
+	&data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
+	&data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
+	&data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
+	&data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
+	&data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
+	&data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
+	&data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
+	&data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
+	&data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
+	&data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
+	&data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
+	&data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
+	&data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
+	&data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
+	&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
+	&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
+	&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}	# $sse2
+
+&set_label("rem_4bit",64);
+	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
+}}}	# !$x86only
+
+&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
+
+# A question was risen about choice of vanilla MMX. Or rather why wasn't
+# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
+# CPUs such as PIII, "4-bit" MMX version was observed to provide better
+# performance than *corresponding* SSE2 one even on contemporary CPUs.
+# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
+# implementation featuring full range of lookup-table sizes, but with
+# per-invocation lookup table setup. Latter means that table size is
+# chosen depending on how much data is to be hashed in every given call,
+# more data - larger table. Best reported result for Core2 is ~4 cycles
+# per processed byte out of 64KB block. This number accounts even for
+# 64KB table setup overhead. As discussed in gcm128.c we choose to be
+# more conservative in respect to lookup table sizes, but how do the
+# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
+# on same platform. As also discussed in gcm128.c, next in line "8-bit
+# Shoup's" or "4KB" method should deliver twice the performance of
+# "256B" one, in other words not worse than ~6 cycles per byte. It
+# should be also be noted that in SSE2 case improvement can be "super-
+# linear," i.e. more than twice, mostly because >>8 maps to single
+# instruction on SSE2 register. This is unlike "4-bit" case when >>4
+# maps to same amount of instructions in both MMX and SSE2 cases.
+# Bottom line is that switch to SSE2 is considered to be justifiable
+# only in case we choose to implement "8-bit" method...
diff --git a/src/crypto/modes/asm/ghash-x86_64.pl b/src/crypto/modes/asm/ghash-x86_64.pl
new file mode 100644
index 0000000..6e656ca
--- /dev/null
+++ b/src/crypto/modes/asm/ghash-x86_64.pl
@@ -0,0 +1,1753 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+#		gcc 3.4.x(*)	assembler
+#
+# P4		28.6		14.0		+100%
+# Opteron	19.3		7.7		+150%
+# Core2		17.8		8.1(**)		+120%
+# Atom		31.6		16.8		+88%
+# VIA Nano	21.8		10.1		+115%
+#
+# (*)	comparison is not completely fair, because C results are
+#	for vanilla "256B" implementation, while assembler results
+#	are for "528B";-)
+# (**)	it's mystery [to me] why Core2 result is not same as for
+#	Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter. ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere	1.78(+13%)
+# Sandy Bridge	1.80(+8%)
+# Ivy Bridge	1.80(+7%)
+# Haswell	0.55(+93%) (if system doesn't support AVX)
+# Broadwell	0.45(+110%)(if system doesn't support AVX)
+# Bulldozer	1.49(+27%)
+# Silvermont	2.88(+13%)
+
+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, and in
+# 0.29 on Broadwell.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+	$avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$do4xaggr=1;
+
+# common register layout
+$nlo="%rax";
+$nhi="%rbx";
+$Zlo="%r8";
+$Zhi="%r9";
+$tmp="%r10";
+$rem_4bit = "%r11";
+
+$Xi="%rdi";
+$Htbl="%rsi";
+
+# per-function register layout
+$cnt="%rcx";
+$rem="%rdx";
+
+sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/	or
+			$r =~ s/%[er]([sd]i)/%\1l/	or
+			$r =~ s/%[er](bp)/%\1l/		or
+			$r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+{ my $N;
+  sub loop() {
+  my $inp = shift;
+
+	$N++;
+$code.=<<___;
+	xor	$nlo,$nlo
+	xor	$nhi,$nhi
+	mov	`&LB("$Zlo")`,`&LB("$nlo")`
+	mov	`&LB("$Zlo")`,`&LB("$nhi")`
+	shl	\$4,`&LB("$nlo")`
+	mov	\$14,$cnt
+	mov	8($Htbl,$nlo),$Zlo
+	mov	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	mov	$Zlo,$rem
+	jmp	.Loop$N
+
+.align	16
+.Loop$N:
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	mov	($inp,$cnt),`&LB("$nlo")`
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nhi),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nhi),$Zhi
+	mov	`&LB("$nlo")`,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	shl	\$4,`&LB("$nlo")`
+	xor	$tmp,$Zlo
+	dec	$cnt
+	js	.Lbreak$N
+
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nlo),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	xor	$tmp,$Zlo
+	jmp	.Loop$N
+
+.align	16
+.Lbreak$N:
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nlo),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	xor	$tmp,$Zlo
+
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nhi),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nhi),$Zhi
+	xor	$tmp,$Zlo
+	xor	($rem_4bit,$rem,8),$Zhi
+
+	bswap	$Zlo
+	bswap	$Zhi
+___
+}}
+
+$code=<<___;
+.text
+.extern	OPENSSL_ia32cap_P
+
+.globl	gcm_gmult_4bit
+.type	gcm_gmult_4bit,\@function,2
+.align	16
+gcm_gmult_4bit:
+	push	%rbx
+	push	%rbp		# %rbp and %r12 are pushed exclusively in
+	push	%r12		# order to reuse Win64 exception handler...
+.Lgmult_prologue:
+
+	movzb	15($Xi),$Zlo
+	lea	.Lrem_4bit(%rip),$rem_4bit
+___
+	&loop	($Xi);
+$code.=<<___;
+	mov	$Zlo,8($Xi)
+	mov	$Zhi,($Xi)
+
+	mov	16(%rsp),%rbx
+	lea	24(%rsp),%rsp
+.Lgmult_epilogue:
+	ret
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+
+# per-function register layout
+$inp="%rdx";
+$len="%rcx";
+$rem_8bit=$rem_4bit;
+
+$code.=<<___;
+.globl	gcm_ghash_4bit
+.type	gcm_ghash_4bit,\@function,4
+.align	16
+gcm_ghash_4bit:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$280,%rsp
+.Lghash_prologue:
+	mov	$inp,%r14		# reassign couple of args
+	mov	$len,%r15
+___
+{ my $inp="%r14";
+  my $dat="%edx";
+  my $len="%r15";
+  my @nhi=("%ebx","%ecx");
+  my @rem=("%r12","%r13");
+  my $Hshr4="%rbp";
+
+	&sub	($Htbl,-128);		# size optimization
+	&lea	($Hshr4,"16+128(%rsp)");
+	{ my @lo =($nlo,$nhi);
+          my @hi =($Zlo,$Zhi);
+
+	  &xor	($dat,$dat);
+	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
+	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
+	    &or		($lo[0],$tmp)			if ($i>1);
+	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
+	    &shr	($lo[1],4)			if ($i>0 && $i<17);
+	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
+	    &shr	($hi[1],4)			if ($i>0 && $i<17);
+	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
+	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
+	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
+	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
+	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
+	    &shl	($tmp,60)			if ($i>0 && $i<17);
+
+	    push	(@lo,shift(@lo));
+	    push	(@hi,shift(@hi));
+	  }
+	}
+	&add	($Htbl,-128);
+	&mov	($Zlo,"8($Xi)");
+	&mov	($Zhi,"0($Xi)");
+	&add	($len,$inp);		# pointer to the end of data
+	&lea	($rem_8bit,".Lrem_8bit(%rip)");
+	&jmp	(".Louter_loop");
+
+$code.=".align	16\n.Louter_loop:\n";
+	&xor	($Zhi,"($inp)");
+	&mov	("%rdx","8($inp)");
+	&lea	($inp,"16($inp)");
+	&xor	("%rdx",$Zlo);
+	&mov	("($Xi)",$Zhi);
+	&mov	("8($Xi)","%rdx");
+	&shr	("%rdx",32);
+
+	&xor	($nlo,$nlo);
+	&rol	($dat,8);
+	&mov	(&LB($nlo),&LB($dat));
+	&movz	($nhi[0],&LB($dat));
+	&shl	(&LB($nlo),4);
+	&shr	($nhi[0],4);
+
+	for ($j=11,$i=0;$i<15;$i++) {
+	    &rol	($dat,8);
+	    &xor	($Zlo,"8($Htbl,$nlo)")			if ($i>0);
+	    &xor	($Zhi,"($Htbl,$nlo)")			if ($i>0);
+	    &mov	($Zlo,"8($Htbl,$nlo)")			if ($i==0);
+	    &mov	($Zhi,"($Htbl,$nlo)")			if ($i==0);
+
+	    &mov	(&LB($nlo),&LB($dat));
+	    &xor	($Zlo,$tmp)				if ($i>0);
+	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);
+
+	    &movz	($nhi[1],&LB($dat));
+	    &shl	(&LB($nlo),4);
+	    &movzb	($rem[0],"(%rsp,$nhi[0])");
+
+	    &shr	($nhi[1],4)				if ($i<14);
+	    &and	($nhi[1],0xf0)				if ($i==14);
+	    &shl	($rem[1],48)				if ($i>0);
+	    &xor	($rem[0],$Zlo);
+
+	    &mov	($tmp,$Zhi);
+	    &xor	($Zhi,$rem[1])				if ($i>0);
+	    &shr	($Zlo,8);
+
+	    &movz	($rem[0],&LB($rem[0]));
+	    &mov	($dat,"$j($Xi)")			if (--$j%4==0);
+	    &shr	($Zhi,8);
+
+	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
+	    &shl	($tmp,56);
+	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");
+
+	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
+	    unshift	(@rem,pop(@rem));
+	}
+	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
+	&xor	($Zlo,"8($Htbl,$nlo)");
+	&xor	($Zhi,"($Htbl,$nlo)");
+
+	&shl	($rem[1],48);
+	&xor	($Zlo,$tmp);
+
+	&xor	($Zhi,$rem[1]);
+	&movz	($rem[0],&LB($Zlo));
+	&shr	($Zlo,4);
+
+	&mov	($tmp,$Zhi);
+	&shl	(&LB($rem[0]),4);
+	&shr	($Zhi,4);
+
+	&xor	($Zlo,"8($Htbl,$nhi[0])");
+	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
+	&shl	($tmp,60);
+
+	&xor	($Zhi,"($Htbl,$nhi[0])");
+	&xor	($Zlo,$tmp);
+	&shl	($rem[0],48);
+
+	&bswap	($Zlo);
+	&xor	($Zhi,$rem[0]);
+
+	&bswap	($Zhi);
+	&cmp	($inp,$len);
+	&jb	(".Louter_loop");
+}
+$code.=<<___;
+	mov	$Zlo,8($Xi)
+	mov	$Zhi,($Xi)
+
+	lea	280(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lghash_epilogue:
+	ret
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+___
+
+######################################################################
+# PCLMULQDQ version.
+
+@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+		("%rdi","%rsi","%rdx","%rcx");	# Unix order
+
+($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
+($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
+sub clmul64x64_T2 {	# minimal register pressure
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) {	$HK = $T2;
+$code.=<<___;
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pshufd		\$0b01001110,$Hkey,$T2
+	pxor		$Xi,$T1			#
+	pxor		$Hkey,$T2
+___
+} else {
+$code.=<<___;
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1			#
+___
+}
+$code.=<<___;
+	pclmulqdq	\$0x00,$Hkey,$Xi	#######
+	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
+	pclmulqdq	\$0x00,$HK,$T1		#######
+	pxor		$Xi,$T1			#
+	pxor		$Xhi,$T1		#
+
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#
+___
+}
+
+sub reduction_alg9 {	# 17/11 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+	# 1st phase
+	movdqa		$Xi,$T2			#
+	movdqa		$Xi,$T1
+	psllq		\$5,$Xi
+	pxor		$Xi,$T1			#
+	psllq		\$1,$Xi
+	pxor		$T1,$Xi			#
+	psllq		\$57,$Xi		#
+	movdqa		$Xi,$T1			#
+	pslldq		\$8,$Xi
+	psrldq		\$8,$T1			#	
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#
+
+	# 2nd phase
+	movdqa		$Xi,$T2
+	psrlq		\$1,$Xi
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
+	psrlq		\$5,$Xi
+	pxor		$T2,$Xi			#
+	psrlq		\$1,$Xi			#
+	pxor		$Xhi,$Xi		#
+___
+}
+
+{ my ($Htbl,$Xip)=@_4args;
+  my $HK="%xmm6";
+
+$code.=<<___;
+.globl	gcm_init_clmul
+.type	gcm_init_clmul,\@abi-omnipotent
+.align	16
+gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+___
+$code.=<<___;
+	movdqu		($Xip),$Hkey
+	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
+
+	# <<1 twist
+	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
+	movdqa		$Hkey,$T1
+	psllq		\$1,$Hkey
+	pxor		$T3,$T3			#
+	psrlq		\$63,$T1
+	pcmpgtd		$T2,$T3			# broadcast carry bit
+	pslldq		\$8,$T1
+	por		$T1,$Hkey		# H<<=1
+
+	# magic reduction
+	pand		.L0x1c2_polynomial(%rip),$T3
+	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial
+
+	# calculate H^2
+	pshufd		\$0b01001110,$Hkey,$HK
+	movdqa		$Hkey,$Xi
+	pxor		$Hkey,$HK
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	pshufd		\$0b01001110,$Hkey,$T1
+	pshufd		\$0b01001110,$Xi,$T2
+	pxor		$Hkey,$T1		# Karatsuba pre-processing
+	movdqu		$Hkey,0x00($Htbl)	# save H
+	pxor		$Xi,$T2			# Karatsuba pre-processing
+	movdqu		$Xi,0x10($Htbl)		# save H^2
+	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
+	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
+___
+if ($do4xaggr) {
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	movdqa		$Xi,$T3
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	pshufd		\$0b01001110,$T3,$T1
+	pshufd		\$0b01001110,$Xi,$T2
+	pxor		$T3,$T1			# Karatsuba pre-processing
+	movdqu		$T3,0x30($Htbl)		# save H^3
+	pxor		$Xi,$T2			# Karatsuba pre-processing
+	movdqu		$Xi,0x40($Htbl)		# save H^4
+	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
+	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	lea	0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
+	ret
+.size	gcm_init_clmul,.-gcm_init_clmul
+___
+}
+
+{ my ($Xip,$Htbl)=@_4args;
+
+$code.=<<___;
+.globl	gcm_gmult_clmul
+.type	gcm_gmult_clmul,\@abi-omnipotent
+.align	16
+gcm_gmult_clmul:
+.L_gmult_clmul:
+	movdqu		($Xip),$Xi
+	movdqa		.Lbswap_mask(%rip),$T3
+	movdqu		($Htbl),$Hkey
+	movdqu		0x20($Htbl),$T2
+	pshufb		$T3,$Xi
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+	# experimental alternative. special thing about is that there
+	# no dependency between the two multiplications... 
+	mov		\$`0xE1<<1`,%eax
+	mov		\$0xA040608020C0E000,%r10	# ((7..0)�0xE0)&0xff
+	mov		\$0x07,%r11d
+	movq		%rax,$T1
+	movq		%r10,$T2
+	movq		%r11,$T3		# borrow $T3
+	pand		$Xi,$T3
+	pshufb		$T3,$T2			# ($Xi&7)�0xE0
+	movq		%rax,$T3
+	pclmulqdq	\$0x00,$Xi,$T1		# �(0xE1<<1)
+	pxor		$Xi,$T2
+	pslldq		\$15,$T2
+	paddd		$T2,$T2			# <<(64+56+1)
+	pxor		$T2,$Xi
+	pclmulqdq	\$0x01,$T3,$Xi
+	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
+	psrldq		\$1,$T1
+	pxor		$T1,$Xhi
+	pslldq		\$7,$Xi
+	pxor		$Xhi,$Xi
+___
+$code.=<<___;
+	pshufb		$T3,$Xi
+	movdqu		$Xi,($Xip)
+	ret
+.size	gcm_gmult_clmul,.-gcm_gmult_clmul
+___
+}
+
+{ my ($Xip,$Htbl,$inp,$len)=@_4args;
+  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
+
+$code.=<<___;
+.globl	gcm_ghash_clmul
+.type	gcm_ghash_clmul,\@abi-omnipotent
+.align	32
+gcm_ghash_clmul:
+.L_ghash_clmul:
+___
+$code.=<<___ if ($win64);
+	lea	-0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_clmul:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
+	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
+	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
+	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
+	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
+	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
+	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
+	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
+	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
+___
+$code.=<<___;
+	movdqa		.Lbswap_mask(%rip),$T3
+
+	movdqu		($Xip),$Xi
+	movdqu		($Htbl),$Hkey
+	movdqu		0x20($Htbl),$HK
+	pshufb		$T3,$Xi
+
+	sub		\$0x10,$len
+	jz		.Lodd_tail
+
+	movdqu		0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+	mov		OPENSSL_ia32cap_P+4(%rip),%eax
+	cmp		\$0x30,$len
+	jb		.Lskip4x
+
+	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
+	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
+	je		.Lskip4x
+
+	sub		\$0x30,$len
+	mov		\$0xA040608020C0E000,%rax	# ((7..0)�0xE0)&0xff
+	movdqu		0x30($Htbl),$Hkey3
+	movdqu		0x40($Htbl),$Hkey4
+
+	#######
+	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+	#
+	movdqu		0x30($inp),$Xln
+	 movdqu		0x20($inp),$Xl
+	pshufb		$T3,$Xln
+	 pshufb		$T3,$Xl
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
+	pclmulqdq	\$0x00,$Hkey,$Xln
+	pclmulqdq	\$0x11,$Hkey,$Xhn
+	pclmulqdq	\$0x00,$HK,$Xmn
+
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey2,$Xl
+	pclmulqdq	\$0x11,$Hkey2,$Xh
+	pclmulqdq	\$0x10,$HK,$Xm
+	xorps		$Xl,$Xln
+	xorps		$Xh,$Xhn
+	movups		0x50($Htbl),$HK
+	xorps		$Xm,$Xmn
+
+	movdqu		0x10($inp),$Xl
+	 movdqu		0($inp),$T1
+	pshufb		$T3,$Xl
+	 pshufb		$T3,$T1
+	movdqa		$Xl,$Xh
+	pshufd		\$0b01001110,$Xl,$Xm
+	 pxor		$T1,$Xi
+	pxor		$Xl,$Xm
+	pclmulqdq	\$0x00,$Hkey3,$Xl
+	 movdqa		$Xi,$Xhi
+	 pshufd		\$0b01001110,$Xi,$T1
+	 pxor		$Xi,$T1
+	pclmulqdq	\$0x11,$Hkey3,$Xh
+	pclmulqdq	\$0x00,$HK,$Xm
+	xorps		$Xl,$Xln
+	xorps		$Xh,$Xhn
+
+	lea	0x40($inp),$inp
+	sub	\$0x40,$len
+	jc	.Ltail4x
+
+	jmp	.Lmod4_loop
+.align	32
+.Lmod4_loop:
+	pclmulqdq	\$0x00,$Hkey4,$Xi
+	xorps		$Xm,$Xmn
+	 movdqu		0x30($inp),$Xl
+	 pshufb		$T3,$Xl
+	pclmulqdq	\$0x11,$Hkey4,$Xhi
+	xorps		$Xln,$Xi
+	 movdqu		0x20($inp),$Xln
+	 movdqa		$Xl,$Xh
+	pclmulqdq	\$0x10,$HK,$T1
+	 pshufd		\$0b01001110,$Xl,$Xm
+	xorps		$Xhn,$Xhi
+	 pxor		$Xl,$Xm
+	 pshufb		$T3,$Xln
+	movups		0x20($Htbl),$HK
+	xorps		$Xmn,$T1
+	 pclmulqdq	\$0x00,$Hkey,$Xl
+	 pshufd		\$0b01001110,$Xln,$Xmn
+
+	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
+	 movdqa		$Xln,$Xhn
+	pxor		$Xhi,$T1		#
+	 pxor		$Xln,$Xmn
+	movdqa		$T1,$T2			#
+	 pclmulqdq	\$0x11,$Hkey,$Xh
+	pslldq		\$8,$T1
+	psrldq		\$8,$T2			#
+	pxor		$T1,$Xi
+	movdqa		.L7_mask(%rip),$T1
+	pxor		$T2,$Xhi		#
+	movq		%rax,$T2
+
+	pand		$Xi,$T1			# 1st phase
+	pshufb		$T1,$T2			#
+	pxor		$Xi,$T2			#
+	 pclmulqdq	\$0x00,$HK,$Xm
+	psllq		\$57,$T2		#
+	movdqa		$T2,$T1			#
+	pslldq		\$8,$T2
+	 pclmulqdq	\$0x00,$Hkey2,$Xln
+	psrldq		\$8,$T1			#	
+	pxor		$T2,$Xi
+	pxor		$T1,$Xhi		#
+	movdqu		0($inp),$T1
+
+	movdqa		$Xi,$T2			# 2nd phase
+	psrlq		\$1,$Xi
+	 pclmulqdq	\$0x11,$Hkey2,$Xhn
+	 xorps		$Xl,$Xln
+	 movdqu		0x10($inp),$Xl
+	 pshufb		$T3,$Xl
+	 pclmulqdq	\$0x10,$HK,$Xmn
+	 xorps		$Xh,$Xhn
+	 movups		0x50($Htbl),$HK
+	pshufb		$T3,$T1
+	pxor		$T2,$Xhi		#
+	pxor		$Xi,$T2
+	psrlq		\$5,$Xi
+
+	 movdqa		$Xl,$Xh
+	 pxor		$Xm,$Xmn
+	 pshufd		\$0b01001110,$Xl,$Xm
+	pxor		$T2,$Xi			#
+	pxor		$T1,$Xhi
+	 pxor		$Xl,$Xm
+	 pclmulqdq	\$0x00,$Hkey3,$Xl
+	psrlq		\$1,$Xi			#
+	pxor		$Xhi,$Xi		#
+	movdqa		$Xi,$Xhi
+	 pclmulqdq	\$0x11,$Hkey3,$Xh
+	 xorps		$Xl,$Xln
+	pshufd		\$0b01001110,$Xi,$T1
+	pxor		$Xi,$T1
+
+	 pclmulqdq	\$0x00,$HK,$Xm
+	 xorps		$Xh,$Xhn
+
+	lea	0x40($inp),$inp
+	sub	\$0x40,$len
+	jnc	.Lmod4_loop
+
+.Ltail4x:
+	pclmulqdq	\$0x00,$Hkey4,$Xi
+	pclmulqdq	\$0x11,$Hkey4,$Xhi
+	pclmulqdq	\$0x10,$HK,$T1
+	xorps		$Xm,$Xmn
+	xorps		$Xln,$Xi
+	xorps		$Xhn,$Xhi
+	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
+	pxor		$Xmn,$T1
+
+	pxor		$Xhi,$T1		#
+	pxor		$Xi,$Xhi
+
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#
+___
+	&reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+	add	\$0x40,$len
+	jz	.Ldone
+	movdqu	0x20($Htbl),$HK
+	sub	\$0x10,$len
+	jz	.Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	movdqu		($inp),$T1		# Ii
+	movdqu		16($inp),$Xln		# Ii+1
+	pshufb		$T3,$T1
+	pshufb		$T3,$Xln
+	pxor		$T1,$Xi			# Ii+Xi
+
+	movdqa		$Xln,$Xhn
+	pshufd		\$0b01001110,$Xln,$Xmn
+	pxor		$Xln,$Xmn
+	pclmulqdq	\$0x00,$Hkey,$Xln
+	pclmulqdq	\$0x11,$Hkey,$Xhn
+	pclmulqdq	\$0x00,$HK,$Xmn
+
+	lea		32($inp),$inp		# i+=2
+	nop
+	sub		\$0x20,$len
+	jbe		.Leven_tail
+	nop
+	jmp		.Lmod_loop
+
+.align	32
+.Lmod_loop:
+	movdqa		$Xi,$Xhi
+	movdqa		$Xmn,$T1
+	pshufd		\$0b01001110,$Xi,$Xmn	#
+	pxor		$Xi,$Xmn		#
+
+	pclmulqdq	\$0x00,$Hkey2,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhi
+	pclmulqdq	\$0x10,$HK,$Xmn
+
+	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
+	pxor		$Xhn,$Xhi
+	  movdqu	($inp),$T2		# Ii
+	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
+	  pshufb	$T3,$T2
+	  movdqu	16($inp),$Xln		# Ii+1
+
+	pxor		$Xhi,$T1
+	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
+	pxor		$T1,$Xmn
+	 pshufb		$T3,$Xln
+	movdqa		$Xmn,$T1		#
+	psrldq		\$8,$T1
+	pslldq		\$8,$Xmn		#
+	pxor		$T1,$Xhi
+	pxor		$Xmn,$Xi		#
+
+	movdqa		$Xln,$Xhn		#
+
+	  movdqa	$Xi,$T2			# 1st phase
+	  movdqa	$Xi,$T1
+	  psllq		\$5,$Xi
+	  pxor		$Xi,$T1			#
+	pclmulqdq	\$0x00,$Hkey,$Xln	#######
+	  psllq		\$1,$Xi
+	  pxor		$T1,$Xi			#
+	  psllq		\$57,$Xi		#
+	  movdqa	$Xi,$T1			#
+	  pslldq	\$8,$Xi
+	  psrldq	\$8,$T1			#	
+	  pxor		$T2,$Xi
+	pshufd		\$0b01001110,$Xhn,$Xmn
+	  pxor		$T1,$Xhi		#
+	pxor		$Xhn,$Xmn		#
+
+	  movdqa	$Xi,$T2			# 2nd phase
+	  psrlq		\$1,$Xi
+	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
+	  pxor		$T2,$Xhi		#
+	  pxor		$Xi,$T2
+	  psrlq		\$5,$Xi
+	  pxor		$T2,$Xi			#
+	lea		32($inp),$inp
+	  psrlq		\$1,$Xi			#
+	pclmulqdq	\$0x00,$HK,$Xmn		#######
+	  pxor		$Xhi,$Xi		#
+
+	sub		\$0x20,$len
+	ja		.Lmod_loop
+
+.Leven_tail:
+	 movdqa		$Xi,$Xhi
+	 movdqa		$Xmn,$T1
+	 pshufd		\$0b01001110,$Xi,$Xmn	#
+	 pxor		$Xi,$Xmn		#
+
+	pclmulqdq	\$0x00,$Hkey2,$Xi
+	pclmulqdq	\$0x11,$Hkey2,$Xhi
+	pclmulqdq	\$0x10,$HK,$Xmn
+
+	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
+	pxor		$Xhn,$Xhi
+	pxor		$Xi,$T1
+	pxor		$Xhi,$T1
+	pxor		$T1,$Xmn
+	movdqa		$Xmn,$T1		#
+	psrldq		\$8,$T1
+	pslldq		\$8,$Xmn		#
+	pxor		$T1,$Xhi
+	pxor		$Xmn,$Xi		#
+___
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	test		$len,$len
+	jnz		.Ldone
+
+.Lodd_tail:
+	movdqu		($inp),$T1		# Ii
+	pshufb		$T3,$T1
+	pxor		$T1,$Xi			# Ii+Xi
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+.Ldone:
+	pshufb		$T3,$Xi
+	movdqu		$Xi,($Xip)
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
+___
+$code.=<<___;
+	ret
+.size	gcm_ghash_clmul,.-gcm_ghash_clmul
+___
+}
+
+$code.=<<___;
+.globl	gcm_init_avx
+.type	gcm_init_avx,\@abi-omnipotent
+.align	32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($Xip),$Hkey
+	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
+
+	# <<1 twist
+	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
+	vpsrlq		\$63,$Hkey,$T1
+	vpsllq		\$1,$Hkey,$Hkey
+	vpxor		$T3,$T3,$T3		#
+	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
+	vpslldq		\$8,$T1,$T1
+	vpor		$T1,$Hkey,$Hkey		# H<<=1
+
+	# magic reduction
+	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
+	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial
+
+	vpunpckhqdq	$Hkey,$Hkey,$HK
+	vmovdqa		$Hkey,$Xi
+	vpxor		$Hkey,$HK,$HK
+	mov		\$4,%r10		# up to H^8
+	jmp		.Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) {	$HK = $T2;
+$code.=<<___;
+	vpunpckhqdq	$Xi,$Xi,$T1
+	vpunpckhqdq	$Hkey,$Hkey,$T2
+	vpxor		$Xi,$T1,$T1		#
+	vpxor		$Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+	vpunpckhqdq	$Xi,$Xi,$T1
+	vpxor		$Xi,$T1,$T1		#
+___
+}
+$code.=<<___;
+	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
+	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
+	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
+	vpxor		$Xi,$Xhi,$T2		#
+	vpxor		$T2,$T1,$T1		#
+
+	vpslldq		\$8,$T1,$T2		#
+	vpsrldq		\$8,$T1,$T1
+	vpxor		$T2,$Xi,$Xi		#
+	vpxor		$T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+	vpsllq		\$57,$Xi,$T1		# 1st phase
+	vpsllq		\$62,$Xi,$T2
+	vpxor		$T1,$T2,$T2		#
+	vpsllq		\$63,$Xi,$T1
+	vpxor		$T1,$T2,$T2		#
+	vpslldq		\$8,$T2,$T1		#
+	vpsrldq		\$8,$T2,$T2
+	vpxor		$T1,$Xi,$Xi		#
+	vpxor		$T2,$Xhi,$Xhi
+
+	vpsrlq		\$1,$Xi,$T2		# 2nd phase
+	vpxor		$Xi,$Xhi,$Xhi
+	vpxor		$T2,$Xi,$Xi		#
+	vpsrlq		\$5,$T2,$T2
+	vpxor		$T2,$Xi,$Xi		#
+	vpsrlq		\$1,$Xi,$Xi		#
+	vpxor		$Xhi,$Xi,$Xi		#
+___
+}
+
+$code.=<<___;
+.align	32
+.Linit_loop_avx:
+	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
+	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
+___
+	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
+	&reduction_avx	($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+	vmovdqa		$Xi,$T3
+___
+	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
+	&reduction_avx	($Xhi,$Xi);
+$code.=<<___;
+	vpshufd		\$0b01001110,$T3,$T1
+	vpshufd		\$0b01001110,$Xi,$T2
+	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
+	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
+	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
+	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
+	lea		0x30($Htbl),$Htbl
+	sub		\$1,%r10
+	jnz		.Linit_loop_avx
+
+	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
+	vmovdqu		$T3,-0x10($Htbl)
+
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	lea	0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+	ret
+.size	gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+	jmp	.L_init_clmul
+.size	gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl	gcm_gmult_avx
+.type	gcm_gmult_avx,\@abi-omnipotent
+.align	32
+gcm_gmult_avx:
+	jmp	.L_gmult_clmul
+.size	gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl	gcm_ghash_avx
+.type	gcm_ghash_avx,\@abi-omnipotent
+.align	32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+    $Zlo,$Zhi,$Zmi,
+    $Hkey,$HK,$T1,$T2,
+    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+	lea	-0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
+	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
+	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
+	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
+	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
+	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
+	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
+	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
+	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
+	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
+___
+$code.=<<___;
+	vzeroupper
+
+	vmovdqu		($Xip),$Xi		# load $Xi
+	lea		.L0x1c2_polynomial(%rip),%r10
+	lea		0x40($Htbl),$Htbl	# size optimization
+	vmovdqu		.Lbswap_mask(%rip),$bswap
+	vpshufb		$bswap,$Xi,$Xi
+	cmp		\$0x80,$len
+	jb		.Lshort_avx
+	sub		\$0x80,$len
+
+	vmovdqu		0x70($inp),$Ii		# I[7]
+	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	vpshufb		$bswap,$Ii,$Ii
+	vmovdqu		0x20-0x40($Htbl),$HK
+
+	vpunpckhqdq	$Ii,$Ii,$T2
+	 vmovdqu	0x60($inp),$Ij		# I[6]
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Ii,$T2,$T2
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vmovdqu	0x50($inp),$Ii		# I[5]
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	 vpxor		$Ii,$T2,$T2
+	 vmovdqu	0x40($inp),$Ij		# I[4]
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0x50-0x40($Htbl),$HK
+
+	 vpshufb	$bswap,$Ij,$Ij
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vmovdqu	0x30($inp),$Ii		# I[3]
+	vpxor		$Zlo,$Xlo,$Xlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Zhi,$Xhi,$Xhi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	vpxor		$Zmi,$Xmi,$Xmi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0x80-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	 vmovdqu	0x20($inp),$Ij		# I[2]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	vpxor		$Xmi,$Zmi,$Zmi
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+
+	 vmovdqu	0x10($inp),$Ii		# I[1]
+	vpxor		$Zlo,$Xlo,$Xlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Zhi,$Xhi,$Xhi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	vpxor		$Zmi,$Xmi,$Xmi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
+	 vmovdqu	0xb0-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	 vmovdqu	($inp),$Ij		# I[0]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Xhi,$Zhi,$Zhi
+	 vpshufb	$bswap,$Ij,$Ij
+	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x10,$HK,$T2,$Xmi
+
+	lea		0x80($inp),$inp
+	cmp		\$0x80,$len
+	jb		.Ltail_avx
+
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+	sub		\$0x80,$len
+	jmp		.Loop8x_avx
+
+.align	32
+.Loop8x_avx:
+	vpunpckhqdq	$Ij,$Ij,$T1
+	 vmovdqu	0x70($inp),$Ii		# I[7]
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpxor		$Ij,$T1,$T1
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
+	 vpshufb	$bswap,$Ii,$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
+	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Tred
+	 vmovdqu	0x20-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+
+	  vmovdqu	0x60($inp),$Ij		# I[6]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	vpxor		$Zlo,$Xi,$Xi		# collect result
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	vxorps		$Zhi,$Xo,$Xo
+	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	vpxor		$Zmi,$Tred,$Tred
+	 vxorps		$Ij,$T1,$T1
+
+	  vmovdqu	0x50($inp),$Ii		# I[5]
+	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	vpxor		$Xo,$Tred,$Tred
+	vpslldq		\$8,$Tred,$T2
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	vpsrldq		\$8,$Tred,$Tred
+	vpxor		$T2, $Xi, $Xi
+	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	  vpshufb	$bswap,$Ii,$Ii
+	vxorps		$Tred,$Xo, $Xo
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0x50-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	0x40($inp),$Ij		# I[4]
+	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpxor		$Zlo,$Xlo,$Xlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpxor		$Zhi,$Xhi,$Xhi
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	 vxorps		$Ij,$T1,$T1
+	 vpxor		$Zmi,$Xmi,$Xmi
+
+	  vmovdqu	0x30($inp),$Ii		# I[3]
+	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	  vpshufb	$bswap,$Ii,$Ii
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0x80-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	0x20($inp),$Ij		# I[2]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpxor		$Zlo,$Xlo,$Xlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	 vpunpckhqdq	$Ij,$Ij,$T1
+	 vpxor		$Zhi,$Xhi,$Xhi
+	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
+	 vpxor		$Ij,$T1,$T1
+	 vpxor		$Zmi,$Xmi,$Xmi
+	vxorps		$Tred,$Xi,$Xi
+
+	  vmovdqu	0x10($inp),$Ii		# I[1]
+	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
+	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
+	  vpshufb	$bswap,$Ii,$Ii
+	 vpxor		$Xlo,$Zlo,$Zlo
+	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
+	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
+	vxorps		$Xo,$Tred,$Tred
+	 vpunpckhqdq	$Ii,$Ii,$T2
+	 vpxor		$Xhi,$Zhi,$Zhi
+	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
+	  vmovdqu	0xb0-0x40($Htbl),$HK
+	 vpxor		$Ii,$T2,$T2
+	 vpxor		$Xmi,$Zmi,$Zmi
+
+	  vmovdqu	($inp),$Ij		# I[0]
+	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
+	  vpshufb	$bswap,$Ij,$Ij
+	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
+	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
+	vpxor		$Tred,$Ij,$Ij
+	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+
+	lea		0x80($inp),$inp
+	sub		\$0x80,$len
+	jnc		.Loop8x_avx
+
+	add		\$0x80,$len
+	jmp		.Ltail_no_xor_avx
+
+.align	32
+.Lshort_avx:
+	vmovdqu		-0x10($inp,$len),$Ii	# very last word
+	lea		($inp,$len),$inp
+	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
+	vmovdqu		0x20-0x40($Htbl),$HK
+	vpshufb		$bswap,$Ii,$Ij
+
+	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
+	vmovdqa		$Xhi,$Zhi		# $Zhi and
+	vmovdqa		$Xmi,$Zmi		# $Zmi
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x20($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x30($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovdqu		0x50-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x40($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x50($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovdqu		0x80-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x60($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vpsrldq		\$8,$HK,$HK
+	sub		\$0x10,$len
+	jz		.Ltail_avx
+
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	 vmovdqu	-0x70($inp),$Ii
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
+	 vpshufb	$bswap,$Ii,$Ij
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+	vmovq		0xb8-0x40($Htbl),$HK
+	sub		\$0x10,$len
+	jmp		.Ltail_avx
+
+.align	32
+.Ltail_avx:
+	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
+.Ltail_no_xor_avx:
+	vpunpckhqdq	$Ij,$Ij,$T1
+	vpxor		$Xlo,$Zlo,$Zlo
+	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
+	vpxor		$Ij,$T1,$T1
+	vpxor		$Xhi,$Zhi,$Zhi
+	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
+	vpxor		$Xmi,$Zmi,$Zmi
+	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
+
+	vmovdqu		(%r10),$Tred
+
+	vpxor		$Xlo,$Zlo,$Xi
+	vpxor		$Xhi,$Zhi,$Xo
+	vpxor		$Xmi,$Zmi,$Zmi
+
+	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
+	vpxor		$Xo, $Zmi,$Zmi
+	vpslldq		\$8, $Zmi,$T2
+	vpsrldq		\$8, $Zmi,$Zmi
+	vpxor		$T2, $Xi, $Xi
+	vpxor		$Zmi,$Xo, $Xo
+
+	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
+	vpalignr	\$8,$Xi,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
+	vpalignr	\$8,$Xi,$Xi,$Xi
+	vpxor		$Xo,$Xi,$Xi
+	vpxor		$T2,$Xi,$Xi
+
+	cmp		\$0,$len
+	jne		.Lshort_avx
+
+	vpshufb		$bswap,$Xi,$Xi
+	vmovdqu		$Xi,($Xip)
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+	ret
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+	jmp	.L_ghash_clmul
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
+$code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+	.long	7,0,7,0
+.L7_mask_poly:
+	.long	7,0,`0xE1<<1`,0
+.align	64
+.type	.Lrem_4bit,\@object
+.Lrem_4bit:
+	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+.type	.Lrem_8bit,\@object
+.Lrem_8bit:
+	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	lea	24(%rax),%rax		# adjust "rsp"
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_gcm_gmult_4bit
+	.rva	.LSEH_end_gcm_gmult_4bit
+	.rva	.LSEH_info_gcm_gmult_4bit
+
+	.rva	.LSEH_begin_gcm_ghash_4bit
+	.rva	.LSEH_end_gcm_ghash_4bit
+	.rva	.LSEH_info_gcm_ghash_4bit
+
+	.rva	.LSEH_begin_gcm_init_clmul
+	.rva	.LSEH_end_gcm_init_clmul
+	.rva	.LSEH_info_gcm_init_clmul
+
+	.rva	.LSEH_begin_gcm_ghash_clmul
+	.rva	.LSEH_end_gcm_ghash_clmul
+	.rva	.LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___	if ($avx);
+	.rva	.LSEH_begin_gcm_init_avx
+	.rva	.LSEH_end_gcm_init_avx
+	.rva	.LSEH_info_gcm_init_clmul
+
+	.rva	.LSEH_begin_gcm_ghash_avx
+	.rva	.LSEH_end_gcm_ghash_avx
+	.rva	.LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_gcm_gmult_4bit:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
+.LSEH_info_gcm_ghash_4bit:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
+.LSEH_info_gcm_init_clmul:
+	.byte	0x01,0x08,0x03,0x00
+	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
+	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
+.LSEH_info_gcm_ghash_clmul:
+	.byte	0x01,0x33,0x16,0x00
+	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
+	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
+	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
+	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
+	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
+	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
+	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
+	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
+	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/src/crypto/modes/asm/ghashv8-armx.pl b/src/crypto/modes/asm/ghashv8-armx.pl
new file mode 100644
index 0000000..54a1ac4
--- /dev/null
+++ b/src/crypto/modes/asm/ghashv8-armx.pl
@@ -0,0 +1,241 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# Current performance in cycles per processed byte:
+#
+#		PMULL[2]	32-bit NEON(*)
+# Apple A7	1.76		5.62
+# Cortex-A53	1.45		8.39
+# Cortex-A57	2.22		7.61
+#
+# (*)	presented for reference/comparison purposes;
+
+$flavour = shift;
+open STDOUT,">".shift;
+
+$Xi="x0";	# argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3,$H,$Hhl)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
+
+$code.=<<___;
+.global	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	vld1.64		{$t1},[x1]		@ load H
+	vmov.i8		$t0,#0xe1
+	vext.8		$IN,$t1,$t1,#8
+	vshl.i64	$t0,$t0,#57
+	vshr.u64	$t2,$t0,#63
+	vext.8		$t0,$t2,$t0,#8		@ t0=0xc2....01
+	vdup.32		$t1,${t1}[1]
+	vshr.u64	$t3,$IN,#63
+	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
+	vand		$t3,$t3,$t0
+	vshl.i64	$IN,$IN,#1
+	vext.8		$t3,$t3,$t3,#8
+	vand		$t0,$t0,$t1
+	vorr		$IN,$IN,$t3		@ H<<<=1
+	veor		$IN,$IN,$t0		@ twisted H
+	vst1.64		{$IN},[x0]
+
+	ret
+.size	gcm_init_v8,.-gcm_init_v8
+
+.global	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	vld1.64		{$t1},[$Xi]		@ load Xi
+	vmov.i8		$t3,#0xe1
+	vld1.64		{$H},[$Htbl]		@ load twisted H
+	vshl.u64	$t3,$t3,#57
+#ifndef __ARMEB__
+	vrev64.8	$t1,$t1
+#endif
+	vext.8		$Hhl,$H,$H,#8
+	mov		$len,#0
+	vext.8		$IN,$t1,$t1,#8
+	mov		$inc,#0
+	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing
+	mov		$inp,$Xi
+	b		.Lgmult_v8
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+
+.global	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
+	subs		$len,$len,#16
+	vmov.i8		$t3,#0xe1
+	mov		$inc,#16
+	vld1.64		{$H},[$Htbl]		@ load twisted H
+	cclr		$inc,eq
+	vext.8		$Xl,$Xl,$Xl,#8
+	vshl.u64	$t3,$t3,#57
+	vld1.64		{$t1},[$inp],$inc	@ load [rotated] inp
+	vext.8		$Hhl,$H,$H,#8
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+	vrev64.8	$t1,$t1
+#endif
+	veor		$Hhl,$Hhl,$H		@ Karatsuba pre-processing
+	vext.8		$IN,$t1,$t1,#8
+	b		.Loop_v8
+
+.align	4
+.Loop_v8:
+	vext.8		$t2,$Xl,$Xl,#8
+	veor		$IN,$IN,$Xl		@ inp^=Xi
+	veor		$t1,$t1,$t2		@ $t1 is rotated inp^Xi
+
+.Lgmult_v8:
+	vpmull.p64	$Xl,$H,$IN		@ H.lo�Xi.lo
+	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi�Xi.hi
+	subs		$len,$len,#16
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)�(Xi.lo+Xi.hi)
+	cclr		$inc,eq
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] inp
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$t3		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+#ifndef __ARMEB__
+	 vrev64.8	$t1,$t1
+#endif
+	veor		$Xl,$Xm,$t2
+	 vext.8		$IN,$t1,$t1,#8
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$t3
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+	b.hs		.Loop_v8
+
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$Xl,$Xl,$Xl,#8
+	vst1.64		{$Xl},[$Xi]		@ write out Xi
+
+	ret
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+if ($flavour =~ /64/) {			######## 64-bit code
+    sub unvmov {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+	sprintf	"ins	v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+    }
+    foreach(split("\n",$code)) {
+	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
+	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
+	s/vmov\s+(.*)/unvmov($1)/geo	or
+	s/vext\.8/ext/o			or
+	s/vshr\.s/sshr\.s/o		or
+	s/vshr/ushr/o			or
+	s/^(\s+)v/$1/o			or	# strip off v prefix
+	s/\bbx\s+lr\b/ret/o;
+
+	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
+	s/@\s/\/\//o;				# old->new style commentary
+
+	# fix up remainig legacy suffixes
+	s/\.[ui]?8(\s)/$1/o;
+	s/\.[uis]?32//o and s/\.16b/\.4s/go;
+	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
+	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
+	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+	print $_,"\n";
+    }
+} else {				######## 32-bit code
+    sub unvdup32 {
+	my $arg=shift;
+
+	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+    }
+    sub unvpmullp64 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+				 |(($2&7)<<17)|(($2&8)<<4)
+				 |(($3&7)<<1) |(($3&8)<<2);
+	    $word |= 0x00010001	 if ($mnemonic =~ "2");
+	    # since ARMv7 instructions are always encoded little-endian.
+	    # correct solution is to use .inst directive, but older
+	    # assemblers don't implement it:-(
+	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
+			$mnemonic,$arg;
+	}
+    }
+
+    foreach(split("\n",$code)) {
+	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
+	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
+        s/\/\/\s?/@ /o;				# new->old style commentary
+
+	# fix up remainig new-style suffixes
+	s/\],#[0-9]+/]!/o;
+
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
+	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
+	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/^(\s+)b\./$1b/o						or
+	s/^(\s+)ret/$1bx\tlr/o;
+
+        print $_,"\n";
+    }
+}
+
+close STDOUT; # enforce flush
diff --git a/src/crypto/modes/cbc.c b/src/crypto/modes/cbc.c
new file mode 100644
index 0000000..a2ad26c
--- /dev/null
+++ b/src/crypto/modes/cbc.c
@@ -0,0 +1,203 @@
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+#include <openssl/modes.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include "internal.h"
+
+
+#ifndef STRICT_ALIGNMENT
+#  define STRICT_ALIGNMENT 0
+#endif
+
+void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
+                           const void *key, uint8_t ivec[16],
+                           block128_f block) {
+  size_t n;
+  const uint8_t *iv = ivec;
+
+  assert(in && out && key && ivec);
+
+  if (STRICT_ALIGNMENT &&
+      ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+    while (len >= 16) {
+      for (n = 0; n < 16; ++n) {
+        out[n] = in[n] ^ iv[n];
+      }
+      (*block)(out, out, key);
+      iv = out;
+      len -= 16;
+      in += 16;
+      out += 16;
+    }
+  } else {
+    while (len >= 16) {
+      for (n = 0; n < 16; n += sizeof(size_t)) {
+        *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(iv + n);
+      }
+      (*block)(out, out, key);
+      iv = out;
+      len -= 16;
+      in += 16;
+      out += 16;
+    }
+  }
+
+  while (len) {
+    for (n = 0; n < 16 && n < len; ++n) {
+      out[n] = in[n] ^ iv[n];
+    }
+    for (; n < 16; ++n) {
+      out[n] = iv[n];
+    }
+    (*block)(out, out, key);
+    iv = out;
+    if (len <= 16) {
+      break;
+    }
+    len -= 16;
+    in += 16;
+    out += 16;
+  }
+
+  memcpy(ivec, iv, 16);
+}
+
+void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len,
+                           const void *key, uint8_t ivec[16],
+                           block128_f block) {
+  size_t n;
+  union {
+    size_t t[16 / sizeof(size_t)];
+    uint8_t c[16];
+  } tmp;
+
+  assert(in && out && key && ivec);
+
+  if (in != out) {
+    const uint8_t *iv = ivec;
+
+    if (STRICT_ALIGNMENT &&
+        ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+      while (len >= 16) {
+        (*block)(in, out, key);
+        for (n = 0; n < 16; ++n)
+          out[n] ^= iv[n];
+        iv = in;
+        len -= 16;
+        in += 16;
+        out += 16;
+      }
+    } else if (16 % sizeof(size_t) == 0) { /* always true */
+      while (len >= 16) {
+        size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv;
+
+        (*block)(in, out, key);
+        for (n = 0; n < 16 / sizeof(size_t); n++)
+          out_t[n] ^= iv_t[n];
+        iv = in;
+        len -= 16;
+        in += 16;
+        out += 16;
+      }
+    }
+    memcpy(ivec, iv, 16);
+  } else {
+    if (STRICT_ALIGNMENT &&
+        ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+      uint8_t c;
+      while (len >= 16) {
+        (*block)(in, tmp.c, key);
+        for (n = 0; n < 16; ++n) {
+          c = in[n];
+          out[n] = tmp.c[n] ^ ivec[n];
+          ivec[n] = c;
+        }
+        len -= 16;
+        in += 16;
+        out += 16;
+      }
+    } else if (16 % sizeof(size_t) == 0) { /* always true */
+      while (len >= 16) {
+        size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec;
+        const size_t *in_t = (const size_t *)in;
+
+        (*block)(in, tmp.c, key);
+        for (n = 0; n < 16 / sizeof(size_t); n++) {
+          c = in_t[n];
+          out_t[n] = tmp.t[n] ^ ivec_t[n];
+          ivec_t[n] = c;
+        }
+        len -= 16;
+        in += 16;
+        out += 16;
+      }
+    }
+  }
+
+  while (len) {
+    uint8_t c;
+    (*block)(in, tmp.c, key);
+    for (n = 0; n < 16 && n < len; ++n) {
+      c = in[n];
+      out[n] = tmp.c[n] ^ ivec[n];
+      ivec[n] = c;
+    }
+    if (len <= 16) {
+      for (; n < 16; ++n) {
+        ivec[n] = in[n];
+      }
+      break;
+    }
+    len -= 16;
+    in += 16;
+    out += 16;
+  }
+}
diff --git a/src/crypto/modes/cfb.c b/src/crypto/modes/cfb.c
new file mode 100644
index 0000000..738a438
--- /dev/null
+++ b/src/crypto/modes/cfb.c
@@ -0,0 +1,230 @@
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+
+#include <openssl/modes.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include "internal.h"
+
+
+void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
+                           const void *key, uint8_t ivec[16], int *num, int enc,
+                           block128_f block) {
+  unsigned int n;
+  size_t l = 0;
+
+  assert(in && out && key && ivec && num);
+  assert((16 % sizeof(size_t)) == 0);
+
+  n = *num;
+
+  if (enc) {
+    while (n && len) {
+      *(out++) = ivec[n] ^= *(in++);
+      --len;
+      n = (n + 1) % 16;
+    }
+#if STRICT_ALIGNMENT
+    if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+      while (l < len) {
+        if (n == 0) {
+          (*block)(ivec, ivec, key);
+        }
+        out[l] = ivec[n] ^= in[l];
+        ++l;
+        n = (n + 1) % 16;
+      }
+      *num = n;
+      return;
+    }
+#endif
+    while (len >= 16) {
+      (*block)(ivec, ivec, key);
+      for (; n < 16; n += sizeof(size_t)) {
+        *(size_t *)(out + n) = *(size_t *)(ivec + n) ^= *(size_t *)(in + n);
+      }
+      len -= 16;
+      out += 16;
+      in += 16;
+      n = 0;
+    }
+    if (len) {
+      (*block)(ivec, ivec, key);
+      while (len--) {
+        out[n] = ivec[n] ^= in[n];
+        ++n;
+      }
+    }
+    *num = n;
+    return;
+  } else {
+    while (n && len) {
+      uint8_t c;
+      *(out++) = ivec[n] ^ (c = *(in++));
+      ivec[n] = c;
+      --len;
+      n = (n + 1) % 16;
+    }
+    if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+      while (l < len) {
+        unsigned char c;
+        if (n == 0) {
+          (*block)(ivec, ivec, key);
+        }
+        out[l] = ivec[n] ^ (c = in[l]);
+        ivec[n] = c;
+        ++l;
+        n = (n + 1) % 16;
+      }
+      *num = n;
+      return;
+    }
+    while (len >= 16) {
+      (*block)(ivec, ivec, key);
+      for (; n < 16; n += sizeof(size_t)) {
+        size_t t = *(size_t *)(in + n);
+        *(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t;
+        *(size_t *)(ivec + n) = t;
+      }
+      len -= 16;
+      out += 16;
+      in += 16;
+      n = 0;
+    }
+    if (len) {
+      (*block)(ivec, ivec, key);
+      while (len--) {
+        uint8_t c;
+        out[n] = ivec[n] ^ (c = in[n]);
+        ivec[n] = c;
+        ++n;
+      }
+    }
+    *num = n;
+    return;
+  }
+}
+
+
+/* This expects a single block of size nbits for both in and out. Note that
+   it corrupts any extra bits in the last byte of out */
+static void cfbr_encrypt_block(const uint8_t *in, uint8_t *out, unsigned nbits,
+                               const void *key, uint8_t ivec[16], int enc,
+                               block128_f block) {
+  int n, rem, num;
+  uint8_t ovec[16 * 2 + 1]; /* +1 because we dererefence (but don't use) one
+                               byte off the end */
+
+  if (nbits <= 0 || nbits > 128) {
+    return;
+  }
+
+  /* fill in the first half of the new IV with the current IV */
+  memcpy(ovec, ivec, 16);
+  /* construct the new IV */
+  (*block)(ivec, ivec, key);
+  num = (nbits + 7) / 8;
+  if (enc) {
+    /* encrypt the input */
+    for (n = 0; n < num; ++n) {
+      out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
+    }
+  } else {
+    /* decrypt the input */
+    for (n = 0; n < num; ++n) {
+      out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
+    }
+  }
+  /* shift ovec left... */
+  rem = nbits % 8;
+  num = nbits / 8;
+  if (rem == 0) {
+    memcpy(ivec, ovec + num, 16);
+  } else {
+    for (n = 0; n < 16; ++n) {
+      ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
+    }
+  }
+
+  /* it is not necessary to cleanse ovec, since the IV is not secret */
+}
+
+/* N.B. This expects the input to be packed, MS bit first */
+void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits,
+                             const void *key, uint8_t ivec[16], int *num,
+                             int enc, block128_f block) {
+  size_t n;
+  uint8_t c[1], d[1];
+
+  assert(in && out && key && ivec && num);
+  assert(*num == 0);
+
+  for (n = 0; n < bits; ++n) {
+    c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
+    cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
+    out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) |
+                 ((d[0] & 0x80) >> (unsigned int)(n % 8));
+  }
+}
+
+void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t length, const void *key,
+                             unsigned char ivec[16], int *num, int enc,
+                             block128_f block) {
+  size_t n;
+
+  assert(in && out && key && ivec && num);
+  assert(*num == 0);
+
+  for (n = 0; n < length; ++n) {
+    cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
+  }
+}
+
diff --git a/src/crypto/modes/ctr.c b/src/crypto/modes/ctr.c
new file mode 100644
index 0000000..61832ba
--- /dev/null
+++ b/src/crypto/modes/ctr.c
@@ -0,0 +1,219 @@
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+#include <openssl/modes.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include "internal.h"
+
+
+/* NOTE: the IV/counter CTR mode is big-endian.  The code itself
+ * is endian-neutral. */
+
+/* increment counter (128-bit int) by 1 */
+static void ctr128_inc(uint8_t *counter) {
+  uint32_t n = 16;
+  uint8_t c;
+
+  do {
+    --n;
+    c = counter[n];
+    ++c;
+    counter[n] = c;
+    if (c) {
+      return;
+    }
+  } while (n);
+}
+
+/* The input encrypted as though 128bit counter mode is being used.  The extra
+ * state information to record how much of the 128bit block we have used is
+ * contained in *num, and the encrypted counter is kept in ecount_buf.  Both
+ * *num and ecount_buf must be initialised with zeros before the first call to
+ * CRYPTO_ctr128_encrypt().
+ *
+ * This algorithm assumes that the counter is in the x lower bits of the IV
+ * (ivec), and that the application has full control over overflow and the rest
+ * of the IV.  This implementation takes NO responsibility for checking that
+ * the counter doesn't overflow into the rest of the IV when incremented. */
+void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
+                           const void *key, uint8_t ivec[16],
+                           uint8_t ecount_buf[16], unsigned int *num,
+                           block128_f block) {
+  unsigned int n;
+
+  assert(in && out && key && ecount_buf && num);
+  assert(*num < 16);
+  assert((16 % sizeof(size_t)) == 0);
+
+  n = *num;
+
+  while (n && len) {
+    *(out++) = *(in++) ^ ecount_buf[n];
+    --len;
+    n = (n + 1) % 16;
+  }
+
+#if STRICT_ALIGNMENT
+  if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+    size_t l = 0;
+    while (l < len) {
+      if (n == 0) {
+        (*block)(ivec, ecount_buf, key);
+        ctr128_inc(ivec);
+      }
+      out[l] = in[l] ^ ecount_buf[n];
+      ++l;
+      n = (n + 1) % 16;
+    }
+
+    *num = n;
+    return;
+  }
+#endif
+
+  while (len >= 16) {
+    (*block)(ivec, ecount_buf, key);
+    ctr128_inc(ivec);
+    for (; n < 16; n += sizeof(size_t))
+      *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
+    len -= 16;
+    out += 16;
+    in += 16;
+    n = 0;
+  }
+  if (len) {
+    (*block)(ivec, ecount_buf, key);
+    ctr128_inc(ivec);
+    while (len--) {
+      out[n] = in[n] ^ ecount_buf[n];
+      ++n;
+    }
+  }
+  *num = n;
+}
+
+/* increment upper 96 bits of 128-bit counter by 1 */
+static void ctr96_inc(uint8_t *counter) {
+  uint32_t n = 12;
+  uint8_t c;
+
+  do {
+    --n;
+    c = counter[n];
+    ++c;
+    counter[n] = c;
+    if (c) {
+      return;
+    }
+  } while (n);
+}
+
+void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out,
+                                 size_t len, const void *key,
+                                 uint8_t ivec[16],
+                                 uint8_t ecount_buf[16],
+                                 unsigned int *num, ctr128_f func) {
+  unsigned int n, ctr32;
+
+  assert(in && out && key && ecount_buf && num);
+  assert(*num < 16);
+
+  n = *num;
+
+  while (n && len) {
+    *(out++) = *(in++) ^ ecount_buf[n];
+    --len;
+    n = (n + 1) % 16;
+  }
+
+  ctr32 = GETU32(ivec + 12);
+  while (len >= 16) {
+    size_t blocks = len / 16;
+    /* 1<<28 is just a not-so-small yet not-so-large number...
+     * Below condition is practically never met, but it has to
+     * be checked for code correctness. */
+    if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28))
+      blocks = (1U << 28);
+    /* As (*func) operates on 32-bit counter, caller
+     * has to handle overflow. 'if' below detects the
+     * overflow, which is then handled by limiting the
+     * amount of blocks to the exact overflow point... */
+    ctr32 += (uint32_t)blocks;
+    if (ctr32 < blocks) {
+      blocks -= ctr32;
+      ctr32 = 0;
+    }
+    (*func)(in, out, blocks, key, ivec);
+    /* (*func) does not update ivec, caller does: */
+    PUTU32(ivec + 12, ctr32);
+    /* ... overflow was detected, propogate carry. */
+    if (ctr32 == 0)
+      ctr96_inc(ivec);
+    blocks *= 16;
+    len -= blocks;
+    out += blocks;
+    in += blocks;
+  }
+  if (len) {
+    memset(ecount_buf, 0, 16);
+    (*func)(ecount_buf, ecount_buf, 1, key, ivec);
+    ++ctr32;
+    PUTU32(ivec + 12, ctr32);
+    if (ctr32 == 0) {
+      ctr96_inc(ivec);
+    }
+    while (len--) {
+      out[n] = in[n] ^ ecount_buf[n];
+      ++n;
+    }
+  }
+
+  *num = n;
+}
diff --git a/src/crypto/modes/gcm.c b/src/crypto/modes/gcm.c
new file mode 100644
index 0000000..eeaeeff
--- /dev/null
+++ b/src/crypto/modes/gcm.c
@@ -0,0 +1,1245 @@
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+
+#include <openssl/modes.h>
+
+#include <assert.h>
+#include <string.h>
+
+#include <openssl/mem.h>
+#include <openssl/cpu.h>
+
+#include "internal.h"
+#include "../internal.h"
+
+
+#if !defined(OPENSSL_NO_ASM) &&                         \
+    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
+     defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
+#define GHASH_ASM
+#endif
+
+#if defined(BSWAP4) && STRICT_ALIGNMENT == 1
+/* redefine, because alignment is ensured */
+#undef GETU32
+#define GETU32(p) BSWAP4(*(const uint32_t *)(p))
+#undef PUTU32
+#define PUTU32(p, v) *(uint32_t *)(p) = BSWAP4(v)
+#endif
+
+#define PACK(s) ((size_t)(s) << (sizeof(size_t) * 8 - 16))
+#define REDUCE1BIT(V)                                                  \
+  do {                                                                 \
+    if (sizeof(size_t) == 8) {                                         \
+      uint64_t T = OPENSSL_U64(0xe100000000000000) & (0 - (V.lo & 1)); \
+      V.lo = (V.hi << 63) | (V.lo >> 1);                               \
+      V.hi = (V.hi >> 1) ^ T;                                          \
+    } else {                                                           \
+      uint32_t T = 0xe1000000U & (0 - (uint32_t)(V.lo & 1));           \
+      V.lo = (V.hi << 63) | (V.lo >> 1);                               \
+      V.hi = (V.hi >> 1) ^ ((uint64_t)T << 32);                        \
+    }                                                                  \
+  } while (0)
+
+
+static void gcm_init_4bit(u128 Htable[16], uint64_t H[2]) {
+  u128 V;
+
+  Htable[0].hi = 0;
+  Htable[0].lo = 0;
+  V.hi = H[0];
+  V.lo = H[1];
+
+  Htable[8] = V;
+  REDUCE1BIT(V);
+  Htable[4] = V;
+  REDUCE1BIT(V);
+  Htable[2] = V;
+  REDUCE1BIT(V);
+  Htable[1] = V;
+  Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
+  V = Htable[4];
+  Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
+  Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
+  Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
+  V = Htable[8];
+  Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
+  Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
+  Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
+  Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
+  Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
+  Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
+  Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
+
+#if defined(GHASH_ASM) && defined(OPENSSL_ARM)
+  /* ARM assembler expects specific dword order in Htable. */
+  {
+    int j;
+    const union {
+      long one;
+      char little;
+    } is_endian = {1};
+
+    if (is_endian.little) {
+      for (j = 0; j < 16; ++j) {
+        V = Htable[j];
+        Htable[j].hi = V.lo;
+        Htable[j].lo = V.hi;
+      }
+    } else {
+      for (j = 0; j < 16; ++j) {
+        V = Htable[j];
+        Htable[j].hi = V.lo << 32 | V.lo >> 32;
+        Htable[j].lo = V.hi << 32 | V.hi >> 32;
+      }
+    }
+  }
+#endif
+}
+
+#if !defined(GHASH_ASM) || defined(OPENSSL_AARCH64)
+static const size_t rem_4bit[16] = {
+    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
+    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
+    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
+    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)};
+
+static void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]) {
+  u128 Z;
+  int cnt = 15;
+  size_t rem, nlo, nhi;
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+
+  nlo = ((const uint8_t *)Xi)[15];
+  nhi = nlo >> 4;
+  nlo &= 0xf;
+
+  Z.hi = Htable[nlo].hi;
+  Z.lo = Htable[nlo].lo;
+
+  while (1) {
+    rem = (size_t)Z.lo & 0xf;
+    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+    Z.hi = (Z.hi >> 4);
+    if (sizeof(size_t) == 8) {
+      Z.hi ^= rem_4bit[rem];
+    } else {
+      Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
+    }
+
+    Z.hi ^= Htable[nhi].hi;
+    Z.lo ^= Htable[nhi].lo;
+
+    if (--cnt < 0) {
+      break;
+    }
+
+    nlo = ((const uint8_t *)Xi)[cnt];
+    nhi = nlo >> 4;
+    nlo &= 0xf;
+
+    rem = (size_t)Z.lo & 0xf;
+    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+    Z.hi = (Z.hi >> 4);
+    if (sizeof(size_t) == 8) {
+      Z.hi ^= rem_4bit[rem];
+    } else {
+      Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
+    }
+
+    Z.hi ^= Htable[nlo].hi;
+    Z.lo ^= Htable[nlo].lo;
+  }
+
+  if (is_endian.little) {
+#ifdef BSWAP8
+    Xi[0] = BSWAP8(Z.hi);
+    Xi[1] = BSWAP8(Z.lo);
+#else
+    uint8_t *p = (uint8_t *)Xi;
+    uint32_t v;
+    v = (uint32_t)(Z.hi >> 32);
+    PUTU32(p, v);
+    v = (uint32_t)(Z.hi);
+    PUTU32(p + 4, v);
+    v = (uint32_t)(Z.lo >> 32);
+    PUTU32(p + 8, v);
+    v = (uint32_t)(Z.lo);
+    PUTU32(p + 12, v);
+#endif
+  } else {
+    Xi[0] = Z.hi;
+    Xi[1] = Z.lo;
+  }
+}
+
+/* Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
+ * details... Compiler-generated code doesn't seem to give any
+ * performance improvement, at least not on x86[_64]. It's here
+ * mostly as reference and a placeholder for possible future
+ * non-trivial optimization[s]... */
+static void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                           size_t len) {
+  u128 Z;
+  int cnt;
+  size_t rem, nlo, nhi;
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+
+  do {
+    cnt = 15;
+    nlo = ((const uint8_t *)Xi)[15];
+    nlo ^= inp[15];
+    nhi = nlo >> 4;
+    nlo &= 0xf;
+
+    Z.hi = Htable[nlo].hi;
+    Z.lo = Htable[nlo].lo;
+
+    while (1) {
+      rem = (size_t)Z.lo & 0xf;
+      Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+      Z.hi = (Z.hi >> 4);
+      if (sizeof(size_t) == 8) {
+        Z.hi ^= rem_4bit[rem];
+      } else {
+        Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
+      }
+
+      Z.hi ^= Htable[nhi].hi;
+      Z.lo ^= Htable[nhi].lo;
+
+      if (--cnt < 0) {
+        break;
+      }
+
+      nlo = ((const uint8_t *)Xi)[cnt];
+      nlo ^= inp[cnt];
+      nhi = nlo >> 4;
+      nlo &= 0xf;
+
+      rem = (size_t)Z.lo & 0xf;
+      Z.lo = (Z.hi << 60) | (Z.lo >> 4);
+      Z.hi = (Z.hi >> 4);
+      if (sizeof(size_t) == 8) {
+        Z.hi ^= rem_4bit[rem];
+      } else {
+        Z.hi ^= (uint64_t)rem_4bit[rem] << 32;
+      }
+
+      Z.hi ^= Htable[nlo].hi;
+      Z.lo ^= Htable[nlo].lo;
+    }
+
+    if (is_endian.little) {
+#ifdef BSWAP8
+      Xi[0] = BSWAP8(Z.hi);
+      Xi[1] = BSWAP8(Z.lo);
+#else
+      uint8_t *p = (uint8_t *)Xi;
+      uint32_t v;
+      v = (uint32_t)(Z.hi >> 32);
+      PUTU32(p, v);
+      v = (uint32_t)(Z.hi);
+      PUTU32(p + 4, v);
+      v = (uint32_t)(Z.lo >> 32);
+      PUTU32(p + 8, v);
+      v = (uint32_t)(Z.lo);
+      PUTU32(p + 12, v);
+#endif
+    } else {
+      Xi[0] = Z.hi;
+      Xi[1] = Z.lo;
+    }
+  } while (inp += 16, len -= 16);
+}
+#else /* GHASH_ASM */
+void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                    size_t len);
+#endif
+
+#define GCM_MUL(ctx, Xi) gcm_gmult_4bit(ctx->Xi.u, ctx->Htable)
+#if defined(GHASH_ASM)
+#define GHASH(ctx, in, len) gcm_ghash_4bit((ctx)->Xi.u, (ctx)->Htable, in, len)
+/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
+ * trashing effect. In other words idea is to hash data while it's
+ * still in L1 cache after encryption pass... */
+#define GHASH_CHUNK (3 * 1024)
+#endif
+
+
+#if defined(GHASH_ASM)
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+#define GHASH_ASM_X86_OR_64
+#define GCM_FUNCREF_4BIT
+void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                     size_t len);
+
+#if defined(OPENSSL_X86)
+#define gcm_init_avx gcm_init_clmul
+#define gcm_gmult_avx gcm_gmult_clmul
+#define gcm_ghash_avx gcm_ghash_clmul
+#else
+void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, size_t len);
+#endif
+
+#if defined(OPENSSL_X86)
+#define GHASH_ASM_X86
+void gcm_gmult_4bit_mmx(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit_mmx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                        size_t len);
+
+void gcm_gmult_4bit_x86(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_4bit_x86(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                        size_t len);
+#endif
+#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
+#include "../arm_arch.h"
+#if __ARM_ARCH__ >= 7
+#define GHASH_ASM_ARM
+#define GCM_FUNCREF_4BIT
+
+static int pmull_capable() {
+  return (OPENSSL_armcap_P & ARMV8_PMULL) != 0;
+}
+
+void gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                  size_t len);
+
+#if defined(OPENSSL_ARM)
+/* 32-bit ARM also has support for doing GCM with NEON instructions. */
+static int neon_capable() {
+  return CRYPTO_is_NEON_capable();
+}
+
+void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
+void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
+void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                    size_t len);
+#else
+/* AArch64 only has the ARMv8 versions of functions. */
+static int neon_capable() {
+  return 0;
+}
+void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) {
+  abort();
+}
+void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) {
+  abort();
+}
+void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                    size_t len) {
+  abort();
+}
+#endif
+
+#endif
+#endif
+#endif
+
+#ifdef GCM_FUNCREF_4BIT
+#undef GCM_MUL
+#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)(ctx->Xi.u, ctx->Htable)
+#ifdef GHASH
+#undef GHASH
+#define GHASH(ctx, in, len) (*gcm_ghash_p)(ctx->Xi.u, ctx->Htable, in, len)
+#endif
+#endif
+
+GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block) {
+  GCM128_CONTEXT *ret;
+
+  ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT));
+  if (ret != NULL) {
+    CRYPTO_gcm128_init(ret, key, block);
+  }
+
+  return ret;
+}
+
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block) {
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+
+  memset(ctx, 0, sizeof(*ctx));
+  ctx->block = block;
+  ctx->key = key;
+
+  (*block)(ctx->H.c, ctx->H.c, key);
+
+  if (is_endian.little) {
+/* H is stored in host byte order */
+#ifdef BSWAP8
+    ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
+    ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
+#else
+    uint8_t *p = ctx->H.c;
+    uint64_t hi, lo;
+    hi = (uint64_t)GETU32(p) << 32 | GETU32(p + 4);
+    lo = (uint64_t)GETU32(p + 8) << 32 | GETU32(p + 12);
+    ctx->H.u[0] = hi;
+    ctx->H.u[1] = lo;
+#endif
+  }
+
+#if defined(GHASH_ASM_X86_OR_64)
+  if (crypto_gcm_clmul_enabled()) {
+    if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
+      gcm_init_avx(ctx->Htable, ctx->H.u);
+      ctx->gmult = gcm_gmult_avx;
+      ctx->ghash = gcm_ghash_avx;
+    } else {
+      gcm_init_clmul(ctx->Htable, ctx->H.u);
+      ctx->gmult = gcm_gmult_clmul;
+      ctx->ghash = gcm_ghash_clmul;
+    }
+    return;
+  }
+  gcm_init_4bit(ctx->Htable, ctx->H.u);
+#if defined(GHASH_ASM_X86) /* x86 only */
+  if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
+    ctx->gmult = gcm_gmult_4bit_mmx;
+    ctx->ghash = gcm_ghash_4bit_mmx;
+  } else {
+    ctx->gmult = gcm_gmult_4bit_x86;
+    ctx->ghash = gcm_ghash_4bit_x86;
+  }
+#else
+  ctx->gmult = gcm_gmult_4bit;
+  ctx->ghash = gcm_ghash_4bit;
+#endif
+#elif defined(GHASH_ASM_ARM)
+  if (pmull_capable()) {
+    gcm_init_v8(ctx->Htable, ctx->H.u);
+    ctx->gmult = gcm_gmult_v8;
+    ctx->ghash = gcm_ghash_v8;
+  } else if (neon_capable()) {
+    gcm_init_neon(ctx->Htable,ctx->H.u);
+    ctx->gmult = gcm_gmult_neon;
+    ctx->ghash = gcm_ghash_neon;
+  } else {
+    gcm_init_4bit(ctx->Htable, ctx->H.u);
+    ctx->gmult = gcm_gmult_4bit;
+    ctx->ghash = gcm_ghash_4bit;
+  }
+#else
+  gcm_init_4bit(ctx->Htable, ctx->H.u);
+  ctx->gmult = gcm_gmult_4bit;
+  ctx->ghash = gcm_ghash_4bit;
+#endif
+}
+
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const uint8_t *iv, size_t len) {
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+  unsigned int ctr;
+#ifdef GCM_FUNCREF_4BIT
+  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
+#endif
+
+  ctx->Yi.u[0] = 0;
+  ctx->Yi.u[1] = 0;
+  ctx->Xi.u[0] = 0;
+  ctx->Xi.u[1] = 0;
+  ctx->len.u[0] = 0; /* AAD length */
+  ctx->len.u[1] = 0; /* message length */
+  ctx->ares = 0;
+  ctx->mres = 0;
+
+  if (len == 12) {
+    memcpy(ctx->Yi.c, iv, 12);
+    ctx->Yi.c[15] = 1;
+    ctr = 1;
+  } else {
+    size_t i;
+    uint64_t len0 = len;
+
+    while (len >= 16) {
+      for (i = 0; i < 16; ++i) {
+        ctx->Yi.c[i] ^= iv[i];
+      }
+      GCM_MUL(ctx, Yi);
+      iv += 16;
+      len -= 16;
+    }
+    if (len) {
+      for (i = 0; i < len; ++i) {
+        ctx->Yi.c[i] ^= iv[i];
+      }
+      GCM_MUL(ctx, Yi);
+    }
+    len0 <<= 3;
+    if (is_endian.little) {
+#ifdef BSWAP8
+      ctx->Yi.u[1] ^= BSWAP8(len0);
+#else
+      ctx->Yi.c[8] ^= (uint8_t)(len0 >> 56);
+      ctx->Yi.c[9] ^= (uint8_t)(len0 >> 48);
+      ctx->Yi.c[10] ^= (uint8_t)(len0 >> 40);
+      ctx->Yi.c[11] ^= (uint8_t)(len0 >> 32);
+      ctx->Yi.c[12] ^= (uint8_t)(len0 >> 24);
+      ctx->Yi.c[13] ^= (uint8_t)(len0 >> 16);
+      ctx->Yi.c[14] ^= (uint8_t)(len0 >> 8);
+      ctx->Yi.c[15] ^= (uint8_t)(len0);
+#endif
+    } else {
+      ctx->Yi.u[1] ^= len0;
+    }
+
+    GCM_MUL(ctx, Yi);
+
+    if (is_endian.little) {
+      ctr = GETU32(ctx->Yi.c + 12);
+    } else {
+      ctr = ctx->Yi.d[3];
+    }
+  }
+
+  (*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
+  ++ctr;
+  if (is_endian.little) {
+    PUTU32(ctx->Yi.c + 12, ctr);
+  } else {
+    ctx->Yi.d[3] = ctr;
+  }
+}
+
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
+  size_t i;
+  unsigned int n;
+  uint64_t alen = ctx->len.u[0];
+#ifdef GCM_FUNCREF_4BIT
+  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
+#ifdef GHASH
+  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                      size_t len) = ctx->ghash;
+#endif
+#endif
+
+  if (ctx->len.u[1]) {
+    return 0;
+  }
+
+  alen += len;
+  if (alen > (OPENSSL_U64(1) << 61) || (sizeof(len) == 8 && alen < len)) {
+    return 0;
+  }
+  ctx->len.u[0] = alen;
+
+  n = ctx->ares;
+  if (n) {
+    while (n && len) {
+      ctx->Xi.c[n] ^= *(aad++);
+      --len;
+      n = (n + 1) % 16;
+    }
+    if (n == 0) {
+      GCM_MUL(ctx, Xi);
+    } else {
+      ctx->ares = n;
+      return 1;
+    }
+  }
+
+#ifdef GHASH
+  if ((i = (len & (size_t) - 16))) {
+    GHASH(ctx, aad, i);
+    aad += i;
+    len -= i;
+  }
+#else
+  while (len >= 16) {
+    for (i = 0; i < 16; ++i) {
+      ctx->Xi.c[i] ^= aad[i];
+    }
+    GCM_MUL(ctx, Xi);
+    aad += 16;
+    len -= 16;
+  }
+#endif
+  if (len) {
+    n = (unsigned int)len;
+    for (i = 0; i < len; ++i)
+      ctx->Xi.c[i] ^= aad[i];
+  }
+
+  ctx->ares = n;
+  return 1;
+}
+
+int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const unsigned char *in,
+                          unsigned char *out, size_t len) {
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+  unsigned int n, ctr;
+  size_t i;
+  uint64_t mlen = ctx->len.u[1];
+  block128_f block = ctx->block;
+  void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
+#ifdef GHASH
+  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                      size_t len) = ctx->ghash;
+#endif
+#endif
+
+  mlen += len;
+  if (mlen > ((OPENSSL_U64(1) << 36) - 32) ||
+      (sizeof(len) == 8 && mlen < len)) {
+    return 0;
+  }
+  ctx->len.u[1] = mlen;
+
+  if (ctx->ares) {
+    /* First call to encrypt finalizes GHASH(AAD) */
+    GCM_MUL(ctx, Xi);
+    ctx->ares = 0;
+  }
+
+  if (is_endian.little) {
+    ctr = GETU32(ctx->Yi.c + 12);
+  } else {
+    ctr = ctx->Yi.d[3];
+  }
+
+  n = ctx->mres;
+  if (n) {
+    while (n && len) {
+      ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+      --len;
+      n = (n + 1) % 16;
+    }
+    if (n == 0) {
+      GCM_MUL(ctx, Xi);
+    } else {
+      ctx->mres = n;
+      return 1;
+    }
+  }
+  if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out) % sizeof(size_t) != 0) {
+    for (i = 0; i < len; ++i) {
+      if (n == 0) {
+        (*block)(ctx->Yi.c, ctx->EKi.c, key);
+        ++ctr;
+        if (is_endian.little) {
+          PUTU32(ctx->Yi.c + 12, ctr);
+        } else {
+          ctx->Yi.d[3] = ctr;
+        }
+      }
+      ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
+      n = (n + 1) % 16;
+      if (n == 0) {
+        GCM_MUL(ctx, Xi);
+      }
+    }
+
+    ctx->mres = n;
+    return 1;
+  }
+#if defined(GHASH) && defined(GHASH_CHUNK)
+  while (len >= GHASH_CHUNK) {
+    size_t j = GHASH_CHUNK;
+
+    while (j) {
+      size_t *out_t = (size_t *)out;
+      const size_t *in_t = (const size_t *)in;
+
+      (*block)(ctx->Yi.c, ctx->EKi.c, key);
+      ++ctr;
+      if (is_endian.little) {
+        PUTU32(ctx->Yi.c + 12, ctr);
+      } else {
+        ctx->Yi.d[3] = ctr;
+      }
+      for (i = 0; i < 16 / sizeof(size_t); ++i) {
+        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+      }
+      out += 16;
+      in += 16;
+      j -= 16;
+    }
+    GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
+    len -= GHASH_CHUNK;
+  }
+  if ((i = (len & (size_t) - 16))) {
+    size_t j = i;
+
+    while (len >= 16) {
+      size_t *out_t = (size_t *)out;
+      const size_t *in_t = (const size_t *)in;
+
+      (*block)(ctx->Yi.c, ctx->EKi.c, key);
+      ++ctr;
+      if (is_endian.little) {
+        PUTU32(ctx->Yi.c + 12, ctr);
+      } else {
+        ctx->Yi.d[3] = ctr;
+      }
+      for (i = 0; i < 16 / sizeof(size_t); ++i) {
+        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+      }
+      out += 16;
+      in += 16;
+      len -= 16;
+    }
+    GHASH(ctx, out - j, j);
+  }
+#else
+  while (len >= 16) {
+    size_t *out_t = (size_t *)out;
+    const size_t *in_t = (const size_t *)in;
+
+    (*block)(ctx->Yi.c, ctx->EKi.c, key);
+    ++ctr;
+    if (is_endian.little) {
+      PUTU32(ctx->Yi.c + 12, ctr);
+    } else {
+      ctx->Yi.d[3] = ctr;
+    }
+    for (i = 0; i < 16 / sizeof(size_t); ++i) {
+      ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+    }
+    GCM_MUL(ctx, Xi);
+    out += 16;
+    in += 16;
+    len -= 16;
+  }
+#endif
+  if (len) {
+    (*block)(ctx->Yi.c, ctx->EKi.c, key);
+    ++ctr;
+    if (is_endian.little) {
+      PUTU32(ctx->Yi.c + 12, ctr);
+    } else {
+      ctx->Yi.d[3] = ctr;
+    }
+    while (len--) {
+      ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+      ++n;
+    }
+  }
+
+  ctx->mres = n;
+  return 1;
+}
+
+int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const unsigned char *in,
+                          unsigned char *out, size_t len) {
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+  unsigned int n, ctr;
+  size_t i;
+  uint64_t mlen = ctx->len.u[1];
+  block128_f block = ctx->block;
+  void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
+#ifdef GHASH
+  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                      size_t len) = ctx->ghash;
+#endif
+#endif
+
+  mlen += len;
+  if (mlen > ((OPENSSL_U64(1) << 36) - 32) ||
+      (sizeof(len) == 8 && mlen < len)) {
+    return 0;
+  }
+  ctx->len.u[1] = mlen;
+
+  if (ctx->ares) {
+    /* First call to decrypt finalizes GHASH(AAD) */
+    GCM_MUL(ctx, Xi);
+    ctx->ares = 0;
+  }
+
+  if (is_endian.little) {
+    ctr = GETU32(ctx->Yi.c + 12);
+  } else {
+    ctr = ctx->Yi.d[3];
+  }
+
+  n = ctx->mres;
+  if (n) {
+    while (n && len) {
+      uint8_t c = *(in++);
+      *(out++) = c ^ ctx->EKi.c[n];
+      ctx->Xi.c[n] ^= c;
+      --len;
+      n = (n + 1) % 16;
+    }
+    if (n == 0) {
+      GCM_MUL(ctx, Xi);
+    } else {
+      ctx->mres = n;
+      return 1;
+    }
+  }
+  if (STRICT_ALIGNMENT && ((size_t)in | (size_t)out) % sizeof(size_t) != 0) {
+    for (i = 0; i < len; ++i) {
+      uint8_t c;
+      if (n == 0) {
+        (*block)(ctx->Yi.c, ctx->EKi.c, key);
+        ++ctr;
+        if (is_endian.little) {
+          PUTU32(ctx->Yi.c + 12, ctr);
+        } else {
+          ctx->Yi.d[3] = ctr;
+        }
+      }
+      c = in[i];
+      out[i] = c ^ ctx->EKi.c[n];
+      ctx->Xi.c[n] ^= c;
+      n = (n + 1) % 16;
+      if (n == 0) {
+        GCM_MUL(ctx, Xi);
+      }
+    }
+
+    ctx->mres = n;
+    return 1;
+  }
+#if defined(GHASH) && defined(GHASH_CHUNK)
+  while (len >= GHASH_CHUNK) {
+    size_t j = GHASH_CHUNK;
+
+    GHASH(ctx, in, GHASH_CHUNK);
+    while (j) {
+      size_t *out_t = (size_t *)out;
+      const size_t *in_t = (const size_t *)in;
+
+      (*block)(ctx->Yi.c, ctx->EKi.c, key);
+      ++ctr;
+      if (is_endian.little) {
+        PUTU32(ctx->Yi.c + 12, ctr);
+      } else {
+        ctx->Yi.d[3] = ctr;
+      }
+      for (i = 0; i < 16 / sizeof(size_t); ++i) {
+        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+      }
+      out += 16;
+      in += 16;
+      j -= 16;
+    }
+    len -= GHASH_CHUNK;
+  }
+  if ((i = (len & (size_t) - 16))) {
+    GHASH(ctx, in, i);
+    while (len >= 16) {
+      size_t *out_t = (size_t *)out;
+      const size_t *in_t = (const size_t *)in;
+
+      (*block)(ctx->Yi.c, ctx->EKi.c, key);
+      ++ctr;
+      if (is_endian.little) {
+        PUTU32(ctx->Yi.c + 12, ctr);
+      } else {
+        ctx->Yi.d[3] = ctr;
+      }
+      for (i = 0; i < 16 / sizeof(size_t); ++i) {
+        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
+      }
+      out += 16;
+      in += 16;
+      len -= 16;
+    }
+  }
+#else
+  while (len >= 16) {
+    size_t *out_t = (size_t *)out;
+    const size_t *in_t = (const size_t *)in;
+
+    (*block)(ctx->Yi.c, ctx->EKi.c, key);
+    ++ctr;
+    if (is_endian.little) {
+      PUTU32(ctx->Yi.c + 12, ctr);
+    } else {
+      ctx->Yi.d[3] = ctr;
+    }
+    for (i = 0; i < 16 / sizeof(size_t); ++i) {
+      size_t c = in_t[i];
+      out_t[i] = c ^ ctx->EKi.t[i];
+      ctx->Xi.t[i] ^= c;
+    }
+    GCM_MUL(ctx, Xi);
+    out += 16;
+    in += 16;
+    len -= 16;
+  }
+#endif
+  if (len) {
+    (*block)(ctx->Yi.c, ctx->EKi.c, key);
+    ++ctr;
+    if (is_endian.little) {
+      PUTU32(ctx->Yi.c + 12, ctr);
+    } else {
+      ctx->Yi.d[3] = ctr;
+    }
+    while (len--) {
+      uint8_t c = in[n];
+      ctx->Xi.c[n] ^= c;
+      out[n] = c ^ ctx->EKi.c[n];
+      ++n;
+    }
+  }
+
+  ctx->mres = n;
+  return 1;
+}
+
+int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const uint8_t *in,
+                                uint8_t *out, size_t len, ctr128_f stream) {
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+  unsigned int n, ctr;
+  size_t i;
+  uint64_t mlen = ctx->len.u[1];
+  void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
+#ifdef GHASH
+  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                      size_t len) = ctx->ghash;
+#endif
+#endif
+
+  mlen += len;
+  if (mlen > ((OPENSSL_U64(1) << 36) - 32) ||
+      (sizeof(len) == 8 && mlen < len)) {
+    return 0;
+  }
+  ctx->len.u[1] = mlen;
+
+  if (ctx->ares) {
+    /* First call to encrypt finalizes GHASH(AAD) */
+    GCM_MUL(ctx, Xi);
+    ctx->ares = 0;
+  }
+
+  if (is_endian.little) {
+    ctr = GETU32(ctx->Yi.c + 12);
+  } else {
+    ctr = ctx->Yi.d[3];
+  }
+
+  n = ctx->mres;
+  if (n) {
+    while (n && len) {
+      ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
+      --len;
+      n = (n + 1) % 16;
+    }
+    if (n == 0) {
+      GCM_MUL(ctx, Xi);
+    } else {
+      ctx->mres = n;
+      return 1;
+    }
+  }
+#if defined(GHASH)
+  while (len >= GHASH_CHUNK) {
+    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+    ctr += GHASH_CHUNK / 16;
+    if (is_endian.little) {
+      PUTU32(ctx->Yi.c + 12, ctr);
+    } else {
+      ctx->Yi.d[3] = ctr;
+    }
+    GHASH(ctx, out, GHASH_CHUNK);
+    out += GHASH_CHUNK;
+    in += GHASH_CHUNK;
+    len -= GHASH_CHUNK;
+  }
+#endif
+  if ((i = (len & (size_t) - 16))) {
+    size_t j = i / 16;
+
+    (*stream)(in, out, j, key, ctx->Yi.c);
+    ctr += (unsigned int)j;
+    if (is_endian.little) {
+      PUTU32(ctx->Yi.c + 12, ctr);
+    } else {
+      ctx->Yi.d[3] = ctr;
+    }
+    in += i;
+    len -= i;
+#if defined(GHASH)
+    GHASH(ctx, out, i);
+    out += i;
+#else
+    while (j--) {
+      for (i = 0; i < 16; ++i) {
+        ctx->Xi.c[i] ^= out[i];
+      }
+      GCM_MUL(ctx, Xi);
+      out += 16;
+    }
+#endif
+  }
+  if (len) {
+    (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
+    ++ctr;
+    if (is_endian.little) {
+      PUTU32(ctx->Yi.c + 12, ctr);
+    } else {
+      ctx->Yi.d[3] = ctr;
+    }
+    while (len--) {
+      ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
+      ++n;
+    }
+  }
+
+  ctx->mres = n;
+  return 1;
+}
+
+int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const uint8_t *in,
+                                uint8_t *out, size_t len,
+                                ctr128_f stream) {
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+  unsigned int n, ctr;
+  size_t i;
+  uint64_t mlen = ctx->len.u[1];
+  void *key = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
+#ifdef GHASH
+  void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                      size_t len) = ctx->ghash;
+#endif
+#endif
+
+  mlen += len;
+  if (mlen > ((OPENSSL_U64(1) << 36) - 32) ||
+      (sizeof(len) == 8 && mlen < len)) {
+    return 0;
+  }
+  ctx->len.u[1] = mlen;
+
+  if (ctx->ares) {
+    /* First call to decrypt finalizes GHASH(AAD) */
+    GCM_MUL(ctx, Xi);
+    ctx->ares = 0;
+  }
+
+  if (is_endian.little) {
+    ctr = GETU32(ctx->Yi.c + 12);
+  } else {
+    ctr = ctx->Yi.d[3];
+  }
+
+  n = ctx->mres;
+  if (n) {
+    while (n && len) {
+      uint8_t c = *(in++);
+      *(out++) = c ^ ctx->EKi.c[n];
+      ctx->Xi.c[n] ^= c;
+      --len;
+      n = (n + 1) % 16;
+    }
+    if (n == 0) {
+      GCM_MUL(ctx, Xi);
+    } else {
+      ctx->mres = n;
+      return 1;
+    }
+  }
+#if defined(GHASH)
+  while (len >= GHASH_CHUNK) {
+    GHASH(ctx, in, GHASH_CHUNK);
+    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
+    ctr += GHASH_CHUNK / 16;
+    if (is_endian.little)
+      PUTU32(ctx->Yi.c + 12, ctr);
+    else
+      ctx->Yi.d[3] = ctr;
+    out += GHASH_CHUNK;
+    in += GHASH_CHUNK;
+    len -= GHASH_CHUNK;
+  }
+#endif
+  if ((i = (len & (size_t) - 16))) {
+    size_t j = i / 16;
+
+#if defined(GHASH)
+    GHASH(ctx, in, i);
+#else
+    while (j--) {
+      size_t k;
+      for (k = 0; k < 16; ++k)
+        ctx->Xi.c[k] ^= in[k];
+      GCM_MUL(ctx, Xi);
+      in += 16;
+    }
+    j = i / 16;
+    in -= i;
+#endif
+    (*stream)(in, out, j, key, ctx->Yi.c);
+    ctr += (unsigned int)j;
+    if (is_endian.little)
+      PUTU32(ctx->Yi.c + 12, ctr);
+    else
+      ctx->Yi.d[3] = ctr;
+    out += i;
+    in += i;
+    len -= i;
+  }
+  if (len) {
+    (*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
+    ++ctr;
+    if (is_endian.little)
+      PUTU32(ctx->Yi.c + 12, ctr);
+    else
+      ctx->Yi.d[3] = ctr;
+    while (len--) {
+      uint8_t c = in[n];
+      ctx->Xi.c[n] ^= c;
+      out[n] = c ^ ctx->EKi.c[n];
+      ++n;
+    }
+  }
+
+  ctx->mres = n;
+  return 1;
+}
+
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) {
+  const union {
+    long one;
+    char little;
+  } is_endian = {1};
+  uint64_t alen = ctx->len.u[0] << 3;
+  uint64_t clen = ctx->len.u[1] << 3;
+#ifdef GCM_FUNCREF_4BIT
+  void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = ctx->gmult;
+#endif
+
+  if (ctx->mres || ctx->ares) {
+    GCM_MUL(ctx, Xi);
+  }
+
+  if (is_endian.little) {
+#ifdef BSWAP8
+    alen = BSWAP8(alen);
+    clen = BSWAP8(clen);
+#else
+    uint8_t *p = ctx->len.c;
+
+    ctx->len.u[0] = alen;
+    ctx->len.u[1] = clen;
+
+    alen = (uint64_t)GETU32(p) << 32 | GETU32(p + 4);
+    clen = (uint64_t)GETU32(p + 8) << 32 | GETU32(p + 12);
+#endif
+  }
+
+  ctx->Xi.u[0] ^= alen;
+  ctx->Xi.u[1] ^= clen;
+  GCM_MUL(ctx, Xi);
+
+  ctx->Xi.u[0] ^= ctx->EK0.u[0];
+  ctx->Xi.u[1] ^= ctx->EK0.u[1];
+
+  if (tag && len <= sizeof(ctx->Xi)) {
+    return CRYPTO_memcmp(ctx->Xi.c, tag, len) == 0;
+  } else {
+    return 0;
+  }
+}
+
+void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) {
+  CRYPTO_gcm128_finish(ctx, NULL, 0);
+  memcpy(tag, ctx->Xi.c, len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
+}
+
+void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) {
+  if (ctx) {
+    OPENSSL_cleanse(ctx, sizeof(*ctx));
+    OPENSSL_free(ctx);
+  }
+}
+
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+int crypto_gcm_clmul_enabled(void) {
+#ifdef GHASH_ASM
+  return OPENSSL_ia32cap_P[0] & (1 << 24) &&  /* check FXSR bit */
+    OPENSSL_ia32cap_P[1] & (1 << 1);  /* check PCLMULQDQ bit */
+#else
+  return 0;
+#endif
+}
+#endif
diff --git a/src/crypto/modes/gcm_test.c b/src/crypto/modes/gcm_test.c
new file mode 100644
index 0000000..29f9d95
--- /dev/null
+++ b/src/crypto/modes/gcm_test.c
@@ -0,0 +1,435 @@
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <openssl/aes.h>
+#include <openssl/crypto.h>
+#include <openssl/mem.h>
+#include <openssl/modes.h>
+
+#include "internal.h"
+
+
+struct test_case {
+  const char *key;
+  const char *plaintext;
+  const char *additional_data;
+  const char *nonce;
+  const char *ciphertext;
+  const char *tag;
+};
+
+static const struct test_case test_cases[] = {
+  {
+    "00000000000000000000000000000000",
+    NULL,
+    NULL,
+    "000000000000000000000000",
+    NULL,
+    "58e2fccefa7e3061367f1d57a4e7455a",
+  },
+  {
+    "00000000000000000000000000000000",
+    "00000000000000000000000000000000",
+    NULL,
+    "000000000000000000000000",
+    "0388dace60b6a392f328c2b971b2fe78",
+    "ab6e47d42cec13bdf53a67b21257bddf",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255",
+    NULL,
+    "cafebabefacedbaddecaf888",
+    "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091473f5985",
+    "4d5c2af327cd64a62cf35abd2ba6fab4",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "cafebabefacedbaddecaf888",
+    "42831ec2217774244b7221b784d0d49ce3aa212f2c02a4e035c17e2329aca12e21d514b25466931c7d8f6a5aac84aa051ba30b396a0aac973d58e091",
+    "5bc94fbc3221a5db94fae95ae7121a47",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "cafebabefacedbad",
+    "61353b4c2806934a777ff51fa22a4755699b2a714fcdc6f83766e5f97b6c742373806900e49f24b22b097544d4896b424989b5e1ebac0f07c23f4598",
+    "3612d2e79e3b0785561be14aaca2fccb",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b",
+    "8ce24998625615b603a033aca13fb894be9112a5c3a211a8ba262a3cca7e2ca701e4a9a4fba43c90ccdcb281d48c7c6fd62875d2aca417034c34aee5",
+    "619cc5aefffe0bfa462af43c1699d050",
+  },
+  {
+    "000000000000000000000000000000000000000000000000",
+    NULL,
+    NULL,
+    "000000000000000000000000",
+    NULL,
+    "cd33b28ac773f74ba00ed1f312572435",
+  },
+  {
+    "000000000000000000000000000000000000000000000000",
+    "00000000000000000000000000000000",
+    NULL,
+    "000000000000000000000000",
+    "98e7247c07f0fe411c267e4384b0f600",
+    "2ff58d80033927ab8ef4d4587514f0fb",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255",
+    NULL,
+    "cafebabefacedbaddecaf888",
+    "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710acade256",
+    "9924a7c8587336bfb118024db8674a14",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "cafebabefacedbaddecaf888",
+    "3980ca0b3c00e841eb06fac4872a2757859e1ceaa6efd984628593b40ca1e19c7d773d00c144c525ac619d18c84a3f4718e2448b2fe324d9ccda2710",
+    "2519498e80f1478f37ba55bd6d27618c",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "cafebabefacedbad",
+    "0f10f599ae14a154ed24b36e25324db8c566632ef2bbb34f8347280fc4507057fddc29df9a471f75c66541d4d4dad1c9e93a19a58e8b473fa0f062f7",
+    "65dcc57fcf623a24094fcca40d3533f8",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "cafebabefacedbad",
+    "0f10f599ae14a154ed24b36e25324db8c566632ef2bbb34f8347280fc4507057fddc29df9a471f75c66541d4d4dad1c9e93a19a58e8b473fa0f062f7",
+    "65dcc57fcf623a24094fcca40d3533f8",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b",
+    "d27e88681ce3243c4830165a8fdcf9ff1de9a1d8e6b447ef6ef7b79828666e4581e79012af34ddd9e2f037589b292db3e67c036745fa22e7e9b7373b",
+    "dcf566ff291c25bbb8568fc3d376a6d9",
+  },
+  {
+    "0000000000000000000000000000000000000000000000000000000000000000",
+    NULL,
+    NULL,
+    "000000000000000000000000",
+    NULL,
+    "530f8afbc74536b9a963b4f1c4cb738b",
+  },
+  {
+    "0000000000000000000000000000000000000000000000000000000000000000",
+    "00000000000000000000000000000000",
+    NULL,
+    "000000000000000000000000",
+    "cea7403d4d606b6e074ec5d3baf39d18",
+    "d0d1c8a799996bf0265b98b5d48ab919",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255",
+    NULL,
+    "cafebabefacedbaddecaf888",
+    "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad",
+    "b094dac5d93471bdec1a502270e3cc6c",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "cafebabefacedbaddecaf888",
+    "522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662",
+    "76fc6ece0f4e1768cddf8853bb2d551b",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "cafebabefacedbad",
+    "c3762df1ca787d32ae47c13bf19844cbaf1ae14d0b976afac52ff7d79bba9de0feb582d33934a4f0954cc2363bc73f7862ac430e64abe499f47c9b1f",
+    "3a337dbf46a792c45e454913fe2ea8f2",
+  },
+  {
+    "feffe9928665731c6d6a8f9467308308feffe9928665731c6d6a8f9467308308",
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b39",
+    "feedfacedeadbeeffeedfacedeadbeefabaddad2",
+    "9313225df88406e555909c5aff5269aa6a7a9538534f7da1e4c303d2a318a728c3c0c95156809539fcf0e2429a6b525416aedbf5a0de6a57a637b39b",
+    "5a8def2f0c9e53f1f75d7853659e2a20eeb2b22aafde6419a058ab4f6f746bf40fc0c3b780f244452da3ebf1c5d82cdea2418997200ef82e44ae7e3f",
+    "a44a8266ee1c8eb0c8b5d4cf5ae9f19a",
+  },
+  {
+    "00000000000000000000000000000000",
+    NULL,
+    "d9313225f88406e5a55909c5aff5269a86a7a9531534f7da2e4c303d8a318a721c3c0c95956809532fcf0e2449a6b525b16aedf5aa0de657ba637b391aafd255522dc1f099567d07f47f37a32a84427d643a8cdcbfe5c0c97598a2bd2555d1aa8cb08e48590dbb3da7b08b1056828838c5f61e6393ba7a0abcc9f662898015ad",
+    "000000000000000000000000",
+    NULL,
+    "5fea793a2d6f974d37e68e0cb8ff9492",
+  },
+  {
+    "00000000000000000000000000000000",
+    "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+    NULL,
+    /* This nonce results in 0xfff in counter LSB. */
+    "ffffffff000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+    "56b3373ca9ef6e4a2b64fe1e9a17b61425f10d47a75a5fce13efc6bc784af24f4141bdd48cf7c770887afd573cca5418a9aeffcd7c5ceddfc6a78397b9a85b499da558257267caab2ad0b23ca476a53cb17fb41c4b8b475cb4f3f7165094c229c9e8c4dc0a2a5ff1903e501511221376a1cdb8364c5061a20cae74bc4acd76ceb0abc9fd3217ef9f8c90be402ddf6d8697f4f880dff15bfb7a6b28241ec8fe183c2d59e3f9dfff653c7126f0acb9e64211f42bae12af462b1070bef1ab5e3606872ca10dee15b3249b1a1b958f23134c4bccb7d03200bce420a2f8eb66dcf3644d1423c1b5699003c13ecef4bf38a3b60eedc34033bac1902783dc6d89e2e774188a439c7ebcc0672dbda4ddcfb2794613b0be41315ef778708a70ee7d75165c",
+    "8b307f6b33286d0ab026a9ed3fe1e85f",
+  },
+};
+
+static int from_hex(uint8_t *out, char in) {
+  if (in >= '0' && in <= '9') {
+    *out = in - '0';
+    return 1;
+  }
+  if (in >= 'a' && in <= 'f') {
+    *out = in - 'a' + 10;
+    return 1;
+  }
+  if (in >= 'A' && in <= 'F') {
+    *out = in - 'A' + 10;
+    return 1;
+  }
+
+  return 0;
+}
+
+static int decode_hex(uint8_t **out, size_t *out_len, const char *in,
+                      unsigned test_num, const char *description) {
+  uint8_t *buf = NULL;
+  size_t i;
+
+  if (in == NULL) {
+    *out = NULL;
+    *out_len = 0;
+    return 1;
+  }
+
+  size_t len = strlen(in);
+  if (len & 1) {
+    fprintf(stderr, "%u: Odd-length %s input.\n", test_num, description);
+    goto err;
+  }
+
+  buf = OPENSSL_malloc(len / 2);
+  if (buf == NULL) {
+    fprintf(stderr, "%u: malloc failure.\n", test_num);
+    goto err;
+  }
+
+  for (i = 0; i < len; i += 2) {
+    uint8_t v, v2;
+    if (!from_hex(&v, in[i]) ||
+        !from_hex(&v2, in[i+1])) {
+      fprintf(stderr, "%u: invalid hex digit in %s around offset %u.\n",
+              test_num, description, (unsigned)i);
+      goto err;
+    }
+    buf[i/2] = (v << 4) | v2;
+  }
+
+  *out = buf;
+  *out_len = len/2;
+  return 1;
+
+err:
+  if (buf) {
+    OPENSSL_free(buf);
+  }
+  return 0;
+}
+
+void hexdump(const char *msg, const void *in, size_t len) {
+  const uint8_t *data = in;
+  size_t i;
+
+  fprintf(stderr, "%s: ", msg);
+  for (i = 0; i < len; i++) {
+    fprintf(stderr, "%02x", data[i]);
+  }
+  fprintf(stderr, "\n");
+}
+
+static int run_test_case(unsigned test_num, const struct test_case *test) {
+  size_t key_len, plaintext_len, additional_data_len, nonce_len, ciphertext_len,
+      tag_len;
+  uint8_t *key = NULL, *plaintext = NULL, *additional_data = NULL,
+          *nonce = NULL, *ciphertext = NULL, *tag = NULL, *out = NULL;
+  int ret = 0;
+  AES_KEY aes_key;
+  GCM128_CONTEXT ctx;
+
+  if (!decode_hex(&key, &key_len, test->key, test_num, "key") ||
+      !decode_hex(&plaintext, &plaintext_len, test->plaintext, test_num,
+                  "plaintext") ||
+      !decode_hex(&additional_data, &additional_data_len, test->additional_data,
+                  test_num, "additional_data") ||
+      !decode_hex(&nonce, &nonce_len, test->nonce, test_num, "nonce") ||
+      !decode_hex(&ciphertext, &ciphertext_len, test->ciphertext, test_num,
+                  "ciphertext") ||
+      !decode_hex(&tag, &tag_len, test->tag, test_num, "tag")) {
+    goto out;
+  }
+
+  if (plaintext_len != ciphertext_len) {
+    fprintf(stderr, "%u: plaintext and ciphertext have differing lengths.\n",
+            test_num);
+    goto out;
+  }
+
+  if (key_len != 16 && key_len != 24 && key_len != 32) {
+    fprintf(stderr, "%u: bad key length.\n", test_num);
+    goto out;
+  }
+
+  if (tag_len != 16) {
+    fprintf(stderr, "%u: bad tag length.\n", test_num);
+    goto out;
+  }
+
+  out = OPENSSL_malloc(plaintext_len);
+  if (AES_set_encrypt_key(key, key_len*8, &aes_key)) {
+    fprintf(stderr, "%u: AES_set_encrypt_key failed.\n", test_num);
+    goto out;
+  }
+
+  CRYPTO_gcm128_init(&ctx, &aes_key, (block128_f) AES_encrypt);
+  CRYPTO_gcm128_setiv(&ctx, nonce, nonce_len);
+  memset(out, 0, plaintext_len);
+  if (additional_data) {
+    CRYPTO_gcm128_aad(&ctx, additional_data, additional_data_len);
+  }
+  if (plaintext) {
+    CRYPTO_gcm128_encrypt(&ctx, plaintext, out, plaintext_len);
+  }
+  if (!CRYPTO_gcm128_finish(&ctx, tag, tag_len) ||
+      (ciphertext && memcmp(out, ciphertext, plaintext_len) != 0)) {
+    fprintf(stderr, "%u: encrypt failed.\n", test_num);
+    hexdump("got ", out, plaintext_len);
+    hexdump("want", ciphertext, plaintext_len);
+    goto out;
+  }
+
+  CRYPTO_gcm128_setiv(&ctx, nonce, nonce_len);
+  memset(out, 0, plaintext_len);
+  if (additional_data) {
+    CRYPTO_gcm128_aad(&ctx, additional_data, additional_data_len);
+  }
+  if (ciphertext) {
+    CRYPTO_gcm128_decrypt(&ctx, ciphertext, out, plaintext_len);
+  }
+  if (!CRYPTO_gcm128_finish(&ctx, tag, tag_len)) {
+    fprintf(stderr, "%u: decrypt failed.\n", test_num);
+    goto out;
+  }
+  if (plaintext && memcmp(out, plaintext, plaintext_len)) {
+    fprintf(stderr, "%u: plaintext doesn't match.\n", test_num);
+    goto out;
+  }
+
+  ret = 1;
+
+out:
+  if (key) {
+    OPENSSL_free(key);
+  }
+  if (plaintext) {
+    OPENSSL_free(plaintext);
+  }
+  if (additional_data) {
+    OPENSSL_free(additional_data);
+  }
+  if (nonce) {
+    OPENSSL_free(nonce);
+  }
+  if (ciphertext) {
+    OPENSSL_free(ciphertext);
+  }
+  if (tag) {
+    OPENSSL_free(tag);
+  }
+  if (out) {
+    OPENSSL_free(out);
+  }
+  return ret;
+}
+
+int main(void) {
+  int ret = 0;
+  unsigned i;
+
+  CRYPTO_library_init();
+
+  for (i = 0; i < sizeof(test_cases) / sizeof(struct test_case); i++) {
+    if (!run_test_case(i, &test_cases[i])) {
+      ret = 1;
+    }
+  }
+
+  if (ret == 0) {
+    printf("PASS\n");
+  }
+
+  return ret;
+}
diff --git a/src/crypto/modes/internal.h b/src/crypto/modes/internal.h
new file mode 100644
index 0000000..9662e0d
--- /dev/null
+++ b/src/crypto/modes/internal.h
@@ -0,0 +1,199 @@
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+
+#ifndef OPENSSL_HEADER_MODES_INTERNAL_H
+#define OPENSSL_HEADER_MODES_INTERNAL_H
+
+#include <openssl/base.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+#define asm __asm__
+
+#define STRICT_ALIGNMENT 1
+#if defined(OPENSSL_X86_64) || defined(OPENSSL_X86) || defined(OPENSSL_AARCH64)
+#undef STRICT_ALIGNMENT
+#define STRICT_ALIGNMENT 0
+#endif
+
+#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM)
+#if defined(__GNUC__) && __GNUC__ >= 2
+#if defined(OPENSSL_X86_64)
+#define BSWAP8(x)                 \
+  ({                              \
+    uint64_t ret = (x);           \
+    asm("bswapq %0" : "+r"(ret)); \
+    ret;                          \
+  })
+#define BSWAP4(x)                 \
+  ({                              \
+    uint32_t ret = (x);           \
+    asm("bswapl %0" : "+r"(ret)); \
+    ret;                          \
+  })
+#elif defined(OPENSSL_X86)
+#define BSWAP8(x)                                     \
+  ({                                                  \
+    uint32_t lo = (uint64_t)(x) >> 32, hi = (x);      \
+    asm("bswapl %0; bswapl %1" : "+r"(hi), "+r"(lo)); \
+    (uint64_t) hi << 32 | lo;                         \
+  })
+#define BSWAP4(x)                 \
+  ({                              \
+    uint32_t ret = (x);           \
+    asm("bswapl %0" : "+r"(ret)); \
+    ret;                          \
+  })
+#elif defined(OPENSSL_AARCH64)
+#define BSWAP8(x)                          \
+  ({                                       \
+    uint64_t ret;                          \
+    asm("rev %0,%1" : "=r"(ret) : "r"(x)); \
+    ret;                                   \
+  })
+#define BSWAP4(x)                            \
+  ({                                         \
+    uint32_t ret;                            \
+    asm("rev %w0,%w1" : "=r"(ret) : "r"(x)); \
+    ret;                                     \
+  })
+#elif defined(OPENSSL_ARM) && !defined(STRICT_ALIGNMENT)
+#define BSWAP8(x)                                     \
+  ({                                                  \
+    uint32_t lo = (uint64_t)(x) >> 32, hi = (x);      \
+    asm("rev %0,%0; rev %1,%1" : "+r"(hi), "+r"(lo)); \
+    (uint64_t) hi << 32 | lo;                         \
+  })
+#define BSWAP4(x)                                      \
+  ({                                                   \
+    uint32_t ret;                                      \
+    asm("rev %0,%1" : "=r"(ret) : "r"((uint32_t)(x))); \
+    ret;                                               \
+  })
+#endif
+#elif defined(_MSC_VER)
+#if _MSC_VER >= 1300
+#pragma intrinsic(_byteswap_uint64, _byteswap_ulong)
+#define BSWAP8(x) _byteswap_uint64((uint64_t)(x))
+#define BSWAP4(x) _byteswap_ulong((uint32_t)(x))
+#elif defined(OPENSSL_X86)
+__inline uint32_t _bswap4(uint32_t val) {
+  _asm mov eax, val
+  _asm bswap eax
+}
+#define BSWAP4(x) _bswap4(x)
+#endif
+#endif
+#endif
+
+#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
+#define GETU32(p) BSWAP4(*(const uint32_t *)(p))
+#define PUTU32(p, v) *(uint32_t *)(p) = BSWAP4(v)
+#else
+#define GETU32(p) \
+  ((uint32_t)(p)[0] << 24 | (uint32_t)(p)[1] << 16 | (uint32_t)(p)[2] << 8 | (uint32_t)(p)[3])
+#define PUTU32(p, v)                                   \
+  ((p)[0] = (uint8_t)((v) >> 24), (p)[1] = (uint8_t)((v) >> 16), \
+   (p)[2] = (uint8_t)((v) >> 8), (p)[3] = (uint8_t)(v))
+#endif
+
+
+/* GCM definitions */
+typedef struct { uint64_t hi,lo; } u128;
+
+struct gcm128_context {
+  /* Following 6 names follow names in GCM specification */
+  union {
+    uint64_t u[2];
+    uint32_t d[4];
+    uint8_t c[16];
+    size_t t[16 / sizeof(size_t)];
+  } Yi, EKi, EK0, len, Xi, H;
+
+  /* Relative position of Xi, H and pre-computed Htable is used in some
+   * assembler modules, i.e. don't change the order! */
+  u128 Htable[16];
+  void (*gmult)(uint64_t Xi[2], const u128 Htable[16]);
+  void (*ghash)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
+                size_t len);
+
+  unsigned int mres, ares;
+  block128_f block;
+  void *key;
+};
+
+struct xts128_context {
+  void *key1, *key2;
+  block128_f block1, block2;
+};
+
+struct ccm128_context {
+  union {
+    uint64_t u[2];
+    uint8_t c[16];
+  } nonce, cmac;
+  uint64_t blocks;
+  block128_f block;
+  void *key;
+};
+
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+/* crypto_gcm_clmul_enabled returns one if the CLMUL implementation of GCM is
+ * used. */
+int crypto_gcm_clmul_enabled(void);
+#endif
+
+
+#if defined(__cplusplus)
+} /* extern C */
+#endif
+
+#endif /* OPENSSL_HEADER_MODES_INTERNAL_H */
diff --git a/src/crypto/modes/ofb.c b/src/crypto/modes/ofb.c
new file mode 100644
index 0000000..5836a9f
--- /dev/null
+++ b/src/crypto/modes/ofb.c
@@ -0,0 +1,107 @@
+/* ====================================================================
+ * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+
+#include <openssl/modes.h>
+
+#include <assert.h>
+
+#include "internal.h"
+
+
+void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
+                           const void *key, uint8_t ivec[16], int *num,
+                           block128_f block) {
+  unsigned int n;
+
+  assert(in && out && key && ivec && num);
+  assert((16 % sizeof(size_t)) == 0);
+
+  n = *num;
+
+  while (n && len) {
+    *(out++) = *(in++) ^ ivec[n];
+    --len;
+    n = (n + 1) % 16;
+  }
+
+#if STRICT_ALIGNMENT
+  if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
+    size_t l = 0;
+    while (l < len) {
+      if (n == 0) {
+        (*block)(ivec, ivec, key);
+      }
+      out[l] = in[l] ^ ivec[n];
+      ++l;
+      n = (n + 1) % 16;
+    }
+
+    *num = n;
+    return;
+  }
+#endif
+
+  while (len >= 16) {
+    (*block)(ivec, ivec, key);
+    for (; n < 16; n += sizeof(size_t)) {
+      *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(ivec + n);
+    }
+    len -= 16;
+    out += 16;
+    in += 16;
+    n = 0;
+  }
+  if (len) {
+    (*block)(ivec, ivec, key);
+    while (len--) {
+      out[n] = in[n] ^ ivec[n];
+      ++n;
+    }
+  }
+  *num = n;
+}