author     Nikos Mavrogiannopoulos <nmav@gnutls.org>   2013-11-26 23:19:01 +0100
committer  Nikos Mavrogiannopoulos <nmav@gnutls.org>   2013-11-26 23:20:56 +0100
commit     c61d244ee65b40cbf8e8a8dbf83f92ada6eb430a (patch)
tree       fcd06f998a52f354924a3fb4840d0b08a8e5c71f /devel
parent     11c6e66f570a6d6ed3baabe3c7924ef29fe47ad0 (diff)
download   gnutls-c61d244ee65b40cbf8e8a8dbf83f92ada6eb430a.tar.gz
updated perlasm files
Diffstat (limited to 'devel')
-rw-r--r--  devel/perlasm/aesni-x86.pl           10
-rw-r--r--  devel/perlasm/aesni-x86_64.pl      1735
-rw-r--r--  devel/perlasm/cbc.pl                  2
-rw-r--r--  devel/perlasm/e_padlock-x86.pl        6
-rw-r--r--  devel/perlasm/e_padlock-x86_64.pl     9
-rw-r--r--  devel/perlasm/ghash-x86.pl          173
-rw-r--r--  devel/perlasm/ghash-x86_64.pl      1131
-rwxr-xr-x  devel/perlasm/ppc-xlate.pl           27
-rwxr-xr-x  devel/perlasm/x86_64-xlate.pl        71
-rw-r--r--  devel/perlasm/x86asm.pl              28
-rw-r--r--  devel/perlasm/x86gas.pl              21
-rw-r--r--  devel/perlasm/x86masm.pl              8
-rw-r--r--  devel/perlasm/x86nasm.pl              4
13 files changed, 2285 insertions, 940 deletions
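
The bulk of this update is the rewritten aesni-x86_64.pl, whose new CTR32 path (per the in-file comments further down) switches to a full 8x unroll with modulo-scheduled counter calculations and a zero-round key xor folded into the pre-built counter blocks kept in the 0x00..0x70(%rsp) slots. As orientation only, here is a minimal Perl sketch of that counter handling; the subroutine name and argument layout are illustrative and not taken from the patch:

use strict; use warnings;

# Build $nblocks counter blocks from a 16-byte IV whose last 4 bytes hold a
# 32-bit big-endian counter, folding the round-0 key xor in up front so the
# AES rounds proper can start at round 1.
sub ctr32_blocks {
    my ($ivec, $key0, $nblocks) = @_;              # $ivec and $key0 are 16-byte strings
    my $ctr = unpack("N", substr($ivec, 12, 4));   # pull the 32-bit counter
    my @blocks;
    for my $i (0 .. $nblocks - 1) {
        my $blk = substr($ivec, 0, 12) . pack("N", ($ctr + $i) & 0xffffffff);
        push @blocks, $blk ^ $key0;                # zero-round key xor
    }
    return @blocks;
}
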
diff --git a/devel/perlasm/aesni-x86.pl b/devel/perlasm/aesni-x86.pl
index 3dc345b585..14ff2602ed 100644
--- a/devel/perlasm/aesni-x86.pl
+++ b/devel/perlasm/aesni-x86.pl
@@ -54,8 +54,8 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
-if ($PREFIX eq "aesni") { $movekey=*movups; }
-else { $movekey=*movups; }
+if ($PREFIX eq "aesni") { $movekey=\&movups; }
+else { $movekey=\&movups; }
$len="eax";
$rounds="ecx";
@@ -1816,7 +1816,7 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x10,$out),$inout1);
&lea ($inp,&DWP(0x60,$inp));
&movups (&QWP(0x20,$out),$inout2);
- &mov ($rounds,$rounds_) # restore $rounds
+ &mov ($rounds,$rounds_); # restore $rounds
&movups (&QWP(0x30,$out),$inout3);
&mov ($key,$key_); # restore $key
&movups (&QWP(0x40,$out),$inout4);
@@ -2015,7 +2015,7 @@ if ($PREFIX eq "aesni") {
&set_label("12rounds",16);
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
&mov ($rounds,11);
- &$movekey (&QWP(-16,$key),"xmm0") # round 0
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
&call (&label("key_192a_cold"));
&aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
@@ -2152,7 +2152,7 @@ if ($PREFIX eq "aesni") {
&mov ($key,&wparam(2));
&call ("_aesni_set_encrypt_key");
&mov ($key,&wparam(2));
- &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key
+ &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
&test ("eax","eax");
&jnz (&label("dec_key_ret"));
&lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
diff --git a/devel/perlasm/aesni-x86_64.pl b/devel/perlasm/aesni-x86_64.pl
index 499f3b3f42..4a10fe6bd2 100644
--- a/devel/perlasm/aesni-x86_64.pl
+++ b/devel/perlasm/aesni-x86_64.pl
@@ -129,8 +129,8 @@
#
# Further data for other parallelizable modes:
#
-# CBC decrypt 1.16 0.93 0.93
-# CTR 1.14 0.91 n/a
+# CBC decrypt 1.16 0.93 0.74
+# CTR 1.14 0.91 0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
@@ -153,10 +153,17 @@
# April 2011
#
-# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
-# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
+######################################################################
+# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
+# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
+# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
+# instruction latency is 9 cycles and that they can be issued every
+# cycle.
+
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-x86_64.pl:-)
@@ -172,7 +179,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
@@ -1003,220 +1011,378 @@ ___
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
-# does not update *ivec! (see engine/eng_aesni.c for details)
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Keywords are full unroll and modulo-schedule counter calculations
+# with zero-round key xor.
{
-my $reserved = $win64?0:-0x28;
-my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
-my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
-my $bswap_mask="%xmm15";
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+my ($key0,$ctr)=("${key_}d","${ivp}d");
+my $frame_size = 0x80 + ($win64?160:0);
$code.=<<___;
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,\@function,5
.align 16
aesni_ctr32_encrypt_blocks:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- lea -0xc8(%rsp),%rsp
- movaps %xmm6,0x20(%rsp)
- movaps %xmm7,0x30(%rsp)
- movaps %xmm8,0x40(%rsp)
- movaps %xmm9,0x50(%rsp)
- movaps %xmm10,0x60(%rsp)
- movaps %xmm11,0x70(%rsp)
- movaps %xmm12,0x80(%rsp)
- movaps %xmm13,0x90(%rsp)
- movaps %xmm14,0xa0(%rsp)
- movaps %xmm15,0xb0(%rsp)
+ movaps %xmm6,-0xa8(%rax)
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lctr32_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
+
cmp \$1,$len
je .Lctr32_one_shortcut
- movdqu ($ivp),$ivec
- movdqa .Lbswap_mask(%rip),$bswap_mask
- xor $rounds,$rounds
- pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
- pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ movdqa $inout0,0x70(%rsp)
mov 240($key),$rounds # key->rounds
- bswap $rnds_
- pxor $iv0,$iv0 # vector of 3 32-bit counters
- pxor $iv1,$iv1 # vector of 3 32-bit counters
- pinsrd \$0,$rnds_,$iv0
- lea 3($rnds_),$key_
- pinsrd \$0,$key_,$iv1
- inc $rnds_
- pinsrd \$1,$rnds_,$iv0
- inc $key_
- pinsrd \$1,$key_,$iv1
- inc $rnds_
- pinsrd \$2,$rnds_,$iv0
- inc $key_
- pinsrd \$2,$key_,$iv1
- movdqa $iv0,$reserved(%rsp)
- pshufb $bswap_mask,$iv0
- movdqa $iv1,`$reserved+0x10`(%rsp)
- pshufb $bswap_mask,$iv1
-
- pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
- pshufd \$`2<<6`,$iv0,$inout1
- pshufd \$`1<<6`,$iv0,$inout2
- cmp \$6,$len
+
+ lea 1($ctr),%r9
+ lea 2($ctr),%r10
+ bswap %r9d
+ bswap %r10d
+ xor $key0,%r9d
+ xor $key0,%r10d
+ pinsrd \$3,%r9d,$inout1
+ lea 3($ctr),%r9
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%r10d,$inout2
+ bswap %r9d
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%r9d
+ bswap %r10d
+ pinsrd \$3,%r9d,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ xor $key0,%r9d
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len
jb .Lctr32_tail
- shr \$1,$rounds
- mov $key,$key_ # backup $key
- mov $rounds,$rnds_ # backup $rounds
- sub \$6,$len
- jmp .Lctr32_loop6
-
-.align 16
-.Lctr32_loop6:
- pshufd \$`3<<6`,$iv1,$inout3
- por $ivec,$inout0 # merge counter-less ivec
- $movkey ($key_),$rndkey0
- pshufd \$`2<<6`,$iv1,$inout4
- por $ivec,$inout1
- $movkey 16($key_),$rndkey1
- pshufd \$`1<<6`,$iv1,$inout5
- por $ivec,$inout2
- por $ivec,$inout3
- xorps $rndkey0,$inout0
- por $ivec,$inout4
- por $ivec,$inout5
-
- # inline _aesni_encrypt6 and interleave last rounds
- # with own code...
- pxor $rndkey0,$inout1
+ lea 0x80($key),$key # size optimization
+ sub \$8,$len
+ jmp .Lctr32_loop8
+
+.align 32
+.Lctr32_loop8:
+ add \$8,$ctr
+ movdqa 0x60(%rsp),$inout6
aesenc $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
+ mov $ctr,%r9d
+ movdqa 0x70(%rsp),$inout7
aesenc $rndkey1,$inout1
- movdqa .Lincrement32(%rip),$iv1
- pxor $rndkey0,$inout3
+ bswap %r9d
+ $movkey 0x20-0x80($key),$rndkey0
aesenc $rndkey1,$inout2
- movdqa $reserved(%rsp),$iv0
- pxor $rndkey0,$inout4
+ xor $key0,%r9d
aesenc $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
+ mov %r9d,0x00+12(%rsp)
+ lea 1($ctr),%r9
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
- jmp .Lctr32_enc_loop6_enter
-.align 16
-.Lctr32_enc_loop6:
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+ aesenc $rndkeyx,$inout0
+ aesenc $rndkeyx,$inout1
+ bswap %r9d
+ aesenc $rndkeyx,$inout2
+ xor $key0,%r9d
+ aesenc $rndkeyx,$inout3
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
+ lea $i($ctr),%r9
+ aesenc $rndkeyx,$inout4
+ aesenc $rndkeyx,$inout5
+ aesenc $rndkeyx,$inout6
+ aesenc $rndkeyx,$inout7
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ bswap %r9d
+ aesenc $rndkey0,$inout2
+ xor $key0,%r9d
+ aesenc $rndkey0,$inout3
+ mov %r9d,0x70+12(%rsp)
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ movdqu 0x00($inp),$in0
+ aesenc $rndkey0,$inout7
+ $movkey 0xa0-0x80($key),$rndkey0
+
+ cmp \$11,$rounds
+ jb .Lctr32_enc_done
+
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
-.Lctr32_enc_loop6_enter:
- $movkey 16($key),$rndkey1
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xb0-0x80($key),$rndkey1
+
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
- lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
- $movkey ($key),$rndkey0
- jnz .Lctr32_enc_loop6
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xc0-0x80($key),$rndkey0
+ je .Lctr32_enc_done
aesenc $rndkey1,$inout0
- paddd $iv1,$iv0 # increment counter vector
aesenc $rndkey1,$inout1
- paddd `$reserved+0x10`(%rsp),$iv1
aesenc $rndkey1,$inout2
- movdqa $iv0,$reserved(%rsp) # save counter vector
aesenc $rndkey1,$inout3
- movdqa $iv1,`$reserved+0x10`(%rsp)
aesenc $rndkey1,$inout4
- pshufb $bswap_mask,$iv0 # byte swap
aesenc $rndkey1,$inout5
- pshufb $bswap_mask,$iv1
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xd0-0x80($key),$rndkey1
- aesenclast $rndkey0,$inout0
- movups ($inp),$in0 # load input
- aesenclast $rndkey0,$inout1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xe0-0x80($key),$rndkey0
+
+.Lctr32_enc_done:
+ movdqu 0x10($inp),$in1
+ pxor $rndkey0,$in0
+ movdqu 0x20($inp),$in2
+ pxor $rndkey0,$in1
+ movdqu 0x30($inp),$in3
+ pxor $rndkey0,$in2
+ movdqu 0x40($inp),$in4
+ pxor $rndkey0,$in3
+ movdqu 0x50($inp),$in5
+ pxor $rndkey0,$in4
+ aesenc $rndkey1,$inout0
+ pxor $rndkey0,$in5
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ movdqu 0x60($inp),$rndkey1
+
+ aesenclast $in0,$inout0
+ pxor $rndkey0,$rndkey1
+ movdqu 0x70($inp),$in0
+ lea 0x80($inp),$inp
+ aesenclast $in1,$inout1
+ pxor $rndkey0,$in0
+ movdqa 0x00(%rsp),$in1 # load next counter block
+ aesenclast $in2,$inout2
+ movdqa 0x10(%rsp),$in2
+ aesenclast $in3,$inout3
+ movdqa 0x20(%rsp),$in3
+ aesenclast $in4,$inout4
+ movdqa 0x30(%rsp),$in4
+ aesenclast $in5,$inout5
+ movdqa 0x40(%rsp),$in5
+ aesenclast $rndkey1,$inout6
+ movdqa 0x50(%rsp),$rndkey0
+ aesenclast $in0,$inout7
+ $movkey 0x10-0x80($key),$rndkey1
+
+ movups $inout0,($out) # store output
+ movdqa $in1,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in2,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in3,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in4,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in5,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey0,$inout5
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+
+ sub \$8,$len
+ jnc .Lctr32_loop8
+
+ add \$8,$len
+ jz .Lctr32_done
+ lea -0x80($key),$key
+
+.Lctr32_tail:
+ lea 16($key),$key
+ cmp \$4,$len
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
+
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ aesenc $rndkey1,$inout1
+ shr \$1,$rounds
+ aesenc $rndkey1,$inout2
+ dec $rounds
+ aesenc $rndkey1,$inout3
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout4
movups 0x10($inp),$in1
- aesenclast $rndkey0,$inout2
+ aesenc $rndkey1,$inout5
movups 0x20($inp),$in2
- aesenclast $rndkey0,$inout3
+ aesenc $rndkey1,$inout6
+ $movkey 16($key),$rndkey1
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out)
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+ jb .Lctr32_done
+
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+ je .Lctr32_done
+
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ dec $rounds
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ movups ($inp),$in0
+ aesenclast $rndkey1,$inout1
+ movups 0x10($inp),$in1
+ aesenclast $rndkey1,$inout2
+ movups 0x20($inp),$in2
+ aesenclast $rndkey1,$inout3
movups 0x30($inp),$in3
- aesenclast $rndkey0,$inout4
- movups 0x40($inp),$rndkey1
- aesenclast $rndkey0,$inout5
- movups 0x50($inp),$rndkey0
- lea 0x60($inp),$inp
-
- xorps $inout0,$in0 # xor
- pshufd \$`3<<6`,$iv0,$inout0
- xorps $inout1,$in1
- pshufd \$`2<<6`,$iv0,$inout1
- movups $in0,($out) # store output
- xorps $inout2,$in2
- pshufd \$`1<<6`,$iv0,$inout2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- xorps $inout4,$rndkey1
- movups $in3,0x30($out)
- xorps $inout5,$rndkey0
- movups $rndkey1,0x40($out)
- movups $rndkey0,0x50($out)
- lea 0x60($out),$out
- mov $rnds_,$rounds
- sub \$6,$len
- jnc .Lctr32_loop6
- add \$6,$len
- jz .Lctr32_done
- mov $key_,$key # restore $key
- lea 1($rounds,$rounds),$rounds # restore original value
+ xorps $in0,$inout0
+ movups $inout0,($out)
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ dec $rounds
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
-.Lctr32_tail:
- por $ivec,$inout0
movups ($inp),$in0
+ xorps $in0,$inout0
+ movups $inout0,($out)
cmp \$2,$len
- jb .Lctr32_one
+ jb .Lctr32_done
- por $ivec,$inout1
movups 0x10($inp),$in1
- je .Lctr32_two
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ je .Lctr32_done
- pshufd \$`3<<6`,$iv1,$inout3
- por $ivec,$inout2
movups 0x20($inp),$in2
- cmp \$4,$len
- jb .Lctr32_three
-
- pshufd \$`2<<6`,$iv1,$inout4
- por $ivec,$inout3
- movups 0x30($inp),$in3
- je .Lctr32_four
-
- por $ivec,$inout4
- xorps $inout5,$inout5
-
- call _aesni_encrypt6
-
- movups 0x40($inp),$rndkey1
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- xorps $inout4,$rndkey1
- movups $in3,0x30($out)
- movups $rndkey1,0x40($out)
+ xorps $in2,$inout2
+ movups $inout2,0x20($out)
jmp .Lctr32_done
.align 16
@@ -1224,64 +1390,32 @@ $code.=<<___;
movups ($ivp),$inout0
movups ($inp),$in0
mov 240($key),$rounds # key->rounds
-.Lctr32_one:
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
- xorps $inout0,$in0
- movups $in0,($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_two:
- xorps $inout2,$inout2
- call _aesni_encrypt3
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- movups $in1,0x10($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_three:
- call _aesni_encrypt3
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- movups $in2,0x20($out)
+ xorps $in0,$inout0
+ movups $inout0,($out)
jmp .Lctr32_done
.align 16
-.Lctr32_four:
- call _aesni_encrypt4
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- movups $in3,0x30($out)
-
.Lctr32_done:
___
$code.=<<___ if ($win64);
- movaps 0x20(%rsp),%xmm6
- movaps 0x30(%rsp),%xmm7
- movaps 0x40(%rsp),%xmm8
- movaps 0x50(%rsp),%xmm9
- movaps 0x60(%rsp),%xmm10
- movaps 0x70(%rsp),%xmm11
- movaps 0x80(%rsp),%xmm12
- movaps 0x90(%rsp),%xmm13
- movaps 0xa0(%rsp),%xmm14
- movaps 0xb0(%rsp),%xmm15
- lea 0xc8(%rsp),%rsp
-.Lctr32_ret:
+ movaps -0xa0(%rbp),%xmm6
+ movaps -0x90(%rbp),%xmm7
+ movaps -0x80(%rbp),%xmm8
+ movaps -0x70(%rbp),%xmm9
+ movaps -0x60(%rbp),%xmm10
+ movaps -0x50(%rbp),%xmm11
+ movaps -0x40(%rbp),%xmm12
+ movaps -0x30(%rbp),%xmm13
+ movaps -0x20(%rbp),%xmm14
+ movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lctr32_epilogue:
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
@@ -1296,29 +1430,33 @@ ___
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
-my $frame_size = 0x68 + ($win64?160:0);
+my $frame_size = 0x70 + ($win64?160:0);
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
- lea -$frame_size(%rsp),%rsp
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,0x60(%rsp)
- movaps %xmm7,0x70(%rsp)
- movaps %xmm8,0x80(%rsp)
- movaps %xmm9,0x90(%rsp)
- movaps %xmm10,0xa0(%rsp)
- movaps %xmm11,0xb0(%rsp)
- movaps %xmm12,0xc0(%rsp)
- movaps %xmm13,0xd0(%rsp)
- movaps %xmm14,0xe0(%rsp)
- movaps %xmm15,0xf0(%rsp)
+ movaps %xmm6,-0xa8(%rax)
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lxts_enc_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240(%r8),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
@@ -1326,213 +1464,251 @@ ___
# generate the tweak
&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
$code.=<<___;
+ $movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+ mov $rounds,$rnds_
+
movdqa .Lxts_magic(%rip),$twmask
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pshufd \$0x5f,@tweak[5],$twres
+ pxor $rndkey0,$rndkey1
___
+ # alternative tweak calculation algorithm is based on suggestions
+ # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+ # and should help in the future...
for ($i=0;$i<4;$i++) {
$code.=<<___;
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- pand $twmask,$twres # isolate carry and residue
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- pxor $twres,@tweak[5]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
___
}
$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
sub \$16*6,$len
jc .Lxts_enc_short
shr \$1,$rounds
- sub \$1,$rounds
+ sub \$3,$rounds
+ $movkey 16($key_),$rndkey1
mov $rounds,$rnds_
+ lea .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.align 16
+.align 32
.Lxts_enc_grandloop:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
- pand $twmask,$twres # isolate carry and residue
+ movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
- pxor $twres,@tweak[5]
-
+ pxor @tweak[0],$inout0
movdqu `16*2`($inp),$inout2
- pxor @tweak[0],$inout0 # input^=tweak
- movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
- movdqu `16*4`($inp),$inout4
+ aesenc $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
+ aesenc $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
- $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
- pxor @tweak[5],$inout5
+ aesenc $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
- # inline _aesni_encrypt6 and interleave first and last rounds
- # with own code...
- $movkey 16($key_),$rndkey1
- pxor $rndkey0,$inout0
- pxor $rndkey0,$inout1
- movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
- aesenc $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
- movdqa @tweak[1],`16*1`(%rsp)
- aesenc $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqa @tweak[2],`16*2`(%rsp)
- aesenc $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqa @tweak[3],`16*3`(%rsp)
- aesenc $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
- movdqa @tweak[4],`16*4`(%rsp)
+ pxor $twres,@tweak[0]
aesenc $rndkey1,$inout4
- movdqa @tweak[5],`16*5`(%rsp)
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesenc $rndkey1,$inout5
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp
- jmp .Lxts_enc_loop6_enter
+ $movkey 48($key_),$rndkey1
-.align 16
+ aesenc $rndkey0,$inout0
+ pxor $twres,@tweak[2]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey0,$inout1
+ pxor $twres,@tweak[3]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey0,$inout2
+ pxor $twres,@tweak[4]
+ aesenc $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey0,$inout4
+ movdqa $twmask,`16*5`(%rsp)
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ lea 64($key_),$key
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_enc_loop6
+.align 32
.Lxts_enc_loop6:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
-.Lxts_enc_loop6_enter:
$movkey 16($key),$rndkey1
+ lea 32($key),$key
+
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
- lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey ($key),$rndkey0
+ dec $rounds
jnz .Lxts_enc_loop6
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa (%r8),$twmask
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesenc $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
aesenc $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
aesenc $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
+ pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
+ movaps @tweak[0],@tweak[1] # copy round[0]
aesenc $rndkey1,$inout5
$movkey 16($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[0]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesenc $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[0]
+ psrad \$31,$twtmp
aesenc $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesenc $rndkey0,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey0,$inout3
+ pxor $twtmp,@tweak[5]
aesenc $rndkey0,$inout4
+ movaps @tweak[1],@tweak[2]
aesenc $rndkey0,$inout5
$movkey 32($key),$rndkey0
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[1]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesenc $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
aesenc $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesenc $rndkey1,$inout2
- pxor $twres,@tweak[5]
+ movdqa @tweak[3],`16*3`(%rsp)
aesenc $rndkey1,$inout3
+ pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
aesenc $rndkey1,$inout5
+ $movkey 48($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[2]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- aesenclast $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
- aesenclast $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- aesenclast $rndkey0,$inout2
- pxor $twres,@tweak[5]
- aesenclast $rndkey0,$inout3
- aesenclast $rndkey0,$inout4
- aesenclast $rndkey0,$inout5
-
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[3]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- xorps `16*0`(%rsp),$inout0 # output^=tweak
- pand $twmask,$twres # isolate carry and residue
- xorps `16*1`(%rsp),$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesenc $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesenc $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pxor $twtmp,@tweak[5]
+ aesenc $rndkey0,$inout4
+ movaps @tweak[3],@tweak[4]
+ aesenc $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ aesenclast `16*0`(%rsp),$inout0
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ aesenclast `16*1`(%rsp),$inout1
+ aesenclast `16*2`(%rsp),$inout2
pxor $twres,@tweak[5]
+ aesenclast `16*3`(%rsp),$inout3
+ aesenclast `16*4`(%rsp),$inout4
+ aesenclast `16*5`(%rsp),$inout5
+ mov $rnds_,$rounds # restore $rounds
- xorps `16*2`(%rsp),$inout2
- movups $inout0,`16*0`($out) # write output
- xorps `16*3`(%rsp),$inout3
- movups $inout1,`16*1`($out)
- xorps `16*4`(%rsp),$inout4
- movups $inout2,`16*2`($out)
- xorps `16*5`(%rsp),$inout5
- movups $inout3,`16*3`($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
lea `16*6`($out),$out
+ movups $inout0,`-16*6`($out) # write output
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
sub \$16*6,$len
jnc .Lxts_enc_grandloop
- lea 3($rounds,$rounds),$rounds # restore original value
+ lea 7($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_enc_short:
+ pxor $rndkey0,@tweak[0]
add \$16*6,$len
jz .Lxts_enc_done
+ pxor $rndkey0,@tweak[1]
cmp \$0x20,$len
jb .Lxts_enc_one
+ pxor $rndkey0,@tweak[2]
je .Lxts_enc_two
+ pxor $rndkey0,@tweak[3]
cmp \$0x40,$len
jb .Lxts_enc_three
+ pxor $rndkey0,@tweak[4]
je .Lxts_enc_four
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movdqu ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movdqu 16*1($inp),$inout1
- pxor $twres,@tweak[5]
-
+ movdqu ($inp),$inout0
+ movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
@@ -1627,15 +1803,15 @@ $code.=<<___;
call _aesni_encrypt4
- xorps @tweak[0],$inout0
- movdqa @tweak[5],@tweak[0]
- xorps @tweak[1],$inout1
- xorps @tweak[2],$inout2
- movups $inout0,($out)
- xorps @tweak[3],$inout3
- movups $inout1,16*1($out)
- movups $inout2,16*2($out)
- movups $inout3,16*3($out)
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out)
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_enc_done
@@ -1670,19 +1846,20 @@ $code.=<<___;
.Lxts_enc_ret:
___
$code.=<<___ if ($win64);
- movaps 0x60(%rsp),%xmm6
- movaps 0x70(%rsp),%xmm7
- movaps 0x80(%rsp),%xmm8
- movaps 0x90(%rsp),%xmm9
- movaps 0xa0(%rsp),%xmm10
- movaps 0xb0(%rsp),%xmm11
- movaps 0xc0(%rsp),%xmm12
- movaps 0xd0(%rsp),%xmm13
- movaps 0xe0(%rsp),%xmm14
- movaps 0xf0(%rsp),%xmm15
+ movaps -0xa0(%rbp),%xmm6
+ movaps -0x90(%rbp),%xmm7
+ movaps -0x80(%rbp),%xmm8
+ movaps -0x70(%rbp),%xmm9
+ movaps -0x60(%rbp),%xmm10
+ movaps -0x50(%rbp),%xmm11
+ movaps -0x40(%rbp),%xmm12
+ movaps -0x30(%rbp),%xmm13
+ movaps -0x20(%rbp),%xmm14
+ movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
- lea $frame_size(%rsp),%rsp
+ lea (%rbp),%rsp
+ pop %rbp
.Lxts_enc_epilogue:
ret
.size aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -1693,22 +1870,26 @@ $code.=<<___;
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_xts_decrypt:
- lea -$frame_size(%rsp),%rsp
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,0x60(%rsp)
- movaps %xmm7,0x70(%rsp)
- movaps %xmm8,0x80(%rsp)
- movaps %xmm9,0x90(%rsp)
- movaps %xmm10,0xa0(%rsp)
- movaps %xmm11,0xb0(%rsp)
- movaps %xmm12,0xc0(%rsp)
- movaps %xmm13,0xd0(%rsp)
- movaps %xmm14,0xe0(%rsp)
- movaps %xmm15,0xf0(%rsp)
+ movaps %xmm6,-0xa8(%rax)
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lxts_dec_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
@@ -1722,213 +1903,248 @@ $code.=<<___;
shl \$4,%rax
sub %rax,$len
+ $movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+ mov $rounds,$rnds_
+
movdqa .Lxts_magic(%rip),$twmask
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pshufd \$0x5f,@tweak[5],$twres
+ pxor $rndkey0,$rndkey1
___
for ($i=0;$i<4;$i++) {
$code.=<<___;
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- pand $twmask,$twres # isolate carry and residue
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- pxor $twres,@tweak[5]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
___
}
$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
sub \$16*6,$len
jc .Lxts_dec_short
shr \$1,$rounds
- sub \$1,$rounds
+ sub \$3,$rounds
+ $movkey 16($key_),$rndkey1
mov $rounds,$rnds_
+ lea .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.align 16
+.align 32
.Lxts_dec_grandloop:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
- pand $twmask,$twres # isolate carry and residue
+ movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
- pxor $twres,@tweak[5]
-
+ pxor @tweak[0],$inout0
movdqu `16*2`($inp),$inout2
- pxor @tweak[0],$inout0 # input^=tweak
- movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
- movdqu `16*4`($inp),$inout4
+ aesdec $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
+ aesdec $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
- $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
- pxor @tweak[5],$inout5
+ aesdec $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
- # inline _aesni_decrypt6 and interleave first and last rounds
- # with own code...
- $movkey 16($key_),$rndkey1
- pxor $rndkey0,$inout0
- pxor $rndkey0,$inout1
- movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
- aesdec $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
- movdqa @tweak[1],`16*1`(%rsp)
- aesdec $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqa @tweak[2],`16*2`(%rsp)
- aesdec $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqa @tweak[3],`16*3`(%rsp)
- aesdec $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
- movdqa @tweak[4],`16*4`(%rsp)
+ pxor $twres,@tweak[0]
aesdec $rndkey1,$inout4
- movdqa @tweak[5],`16*5`(%rsp)
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesdec $rndkey1,$inout5
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp
- jmp .Lxts_dec_loop6_enter
+ $movkey 48($key_),$rndkey1
-.align 16
+ aesdec $rndkey0,$inout0
+ pxor $twres,@tweak[2]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey0,$inout1
+ pxor $twres,@tweak[3]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey0,$inout2
+ pxor $twres,@tweak[4]
+ aesdec $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey0,$inout4
+ movdqa $twmask,`16*5`(%rsp)
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ lea 64($key_),$key
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_dec_loop6
+.align 32
.Lxts_dec_loop6:
aesdec $rndkey1,$inout0
aesdec $rndkey1,$inout1
- dec $rounds
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
-.Lxts_dec_loop6_enter:
$movkey 16($key),$rndkey1
+ lea 32($key),$key
+
aesdec $rndkey0,$inout0
aesdec $rndkey0,$inout1
- lea 32($key),$key
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
aesdec $rndkey0,$inout4
aesdec $rndkey0,$inout5
$movkey ($key),$rndkey0
+ dec $rounds
jnz .Lxts_dec_loop6
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa (%r8),$twmask
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesdec $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
aesdec $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
aesdec $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
+ pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
+ movaps @tweak[0],@tweak[1] # copy round[0]
aesdec $rndkey1,$inout5
$movkey 16($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[0]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesdec $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[0]
+ psrad \$31,$twtmp
aesdec $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesdec $rndkey0,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey0,$inout3
+ pxor $twtmp,@tweak[5]
aesdec $rndkey0,$inout4
+ movaps @tweak[1],@tweak[2]
aesdec $rndkey0,$inout5
$movkey 32($key),$rndkey0
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[1]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesdec $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
aesdec $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesdec $rndkey1,$inout2
- pxor $twres,@tweak[5]
+ movdqa @tweak[3],`16*3`(%rsp)
aesdec $rndkey1,$inout3
+ pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
aesdec $rndkey1,$inout5
+ $movkey 48($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[2]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- aesdeclast $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
- aesdeclast $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- aesdeclast $rndkey0,$inout2
- pxor $twres,@tweak[5]
- aesdeclast $rndkey0,$inout3
- aesdeclast $rndkey0,$inout4
- aesdeclast $rndkey0,$inout5
-
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[3]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- xorps `16*0`(%rsp),$inout0 # output^=tweak
- pand $twmask,$twres # isolate carry and residue
- xorps `16*1`(%rsp),$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesdec $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesdec $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pxor $twtmp,@tweak[5]
+ aesdec $rndkey0,$inout4
+ movaps @tweak[3],@tweak[4]
+ aesdec $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ aesdeclast `16*0`(%rsp),$inout0
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ aesdeclast `16*1`(%rsp),$inout1
+ aesdeclast `16*2`(%rsp),$inout2
pxor $twres,@tweak[5]
+ aesdeclast `16*3`(%rsp),$inout3
+ aesdeclast `16*4`(%rsp),$inout4
+ aesdeclast `16*5`(%rsp),$inout5
+ mov $rnds_,$rounds # restore $rounds
- xorps `16*2`(%rsp),$inout2
- movups $inout0,`16*0`($out) # write output
- xorps `16*3`(%rsp),$inout3
- movups $inout1,`16*1`($out)
- xorps `16*4`(%rsp),$inout4
- movups $inout2,`16*2`($out)
- xorps `16*5`(%rsp),$inout5
- movups $inout3,`16*3`($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
lea `16*6`($out),$out
+ movups $inout0,`-16*6`($out) # write output
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
sub \$16*6,$len
jnc .Lxts_dec_grandloop
- lea 3($rounds,$rounds),$rounds # restore original value
+ lea 7($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_dec_short:
+ pxor $rndkey0,@tweak[0]
+ pxor $rndkey0,@tweak[1]
add \$16*6,$len
jz .Lxts_dec_done
+ pxor $rndkey0,@tweak[2]
cmp \$0x20,$len
jb .Lxts_dec_one
+ pxor $rndkey0,@tweak[3]
je .Lxts_dec_two
+ pxor $rndkey0,@tweak[4]
cmp \$0x40,$len
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movdqu ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movdqu 16*1($inp),$inout1
- pxor $twres,@tweak[5]
-
+ movdqu ($inp),$inout0
+ movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
@@ -2013,7 +2229,7 @@ $code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
- movdqa @tweak[5],@tweak[1]
+ movdqa @tweak[4],@tweak[1]
xorps @tweak[2],$inout2
movups $inout0,($out)
movups $inout1,16*1($out)
@@ -2023,14 +2239,8 @@ $code.=<<___;
.align 16
.Lxts_dec_four:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movups ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movups 16*1($inp),$inout1
- pxor $twres,@tweak[5]
-
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
@@ -2041,16 +2251,16 @@ $code.=<<___;
call _aesni_decrypt4
- xorps @tweak[0],$inout0
+ pxor @tweak[0],$inout0
movdqa @tweak[4],@tweak[0]
- xorps @tweak[1],$inout1
+ pxor @tweak[1],$inout1
movdqa @tweak[5],@tweak[1]
- xorps @tweak[2],$inout2
- movups $inout0,($out)
- xorps @tweak[3],$inout3
- movups $inout1,16*1($out)
- movups $inout2,16*2($out)
- movups $inout3,16*3($out)
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out)
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_dec_done
@@ -2096,19 +2306,20 @@ $code.=<<___;
.Lxts_dec_ret:
___
$code.=<<___ if ($win64);
- movaps 0x60(%rsp),%xmm6
- movaps 0x70(%rsp),%xmm7
- movaps 0x80(%rsp),%xmm8
- movaps 0x90(%rsp),%xmm9
- movaps 0xa0(%rsp),%xmm10
- movaps 0xb0(%rsp),%xmm11
- movaps 0xc0(%rsp),%xmm12
- movaps 0xd0(%rsp),%xmm13
- movaps 0xe0(%rsp),%xmm14
- movaps 0xf0(%rsp),%xmm15
+ movaps -0xa0(%rbp),%xmm6
+ movaps -0x90(%rbp),%xmm7
+ movaps -0x80(%rbp),%xmm8
+ movaps -0x70(%rbp),%xmm9
+ movaps -0x60(%rbp),%xmm10
+ movaps -0x50(%rbp),%xmm11
+ movaps -0x40(%rbp),%xmm12
+ movaps -0x30(%rbp),%xmm13
+ movaps -0x20(%rbp),%xmm14
+ movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
- lea $frame_size(%rsp),%rsp
+ lea (%rbp),%rsp
+ pop %rbp
.Lxts_dec_epilogue:
ret
.size aesni_xts_decrypt,.-aesni_xts_decrypt
@@ -2120,7 +2331,10 @@ ___
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
{
-my $reserved = $win64?0x40:-0x18; # used in decrypt
+my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
+my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
+my $inp_=$key_;
+
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
.type ${PREFIX}_cbc_encrypt,\@function,6
@@ -2176,276 +2390,340 @@ $code.=<<___;
#--------------------------- CBC DECRYPT ------------------------------#
.align 16
.Lcbc_decrypt:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
movups ($ivp),$iv
mov $rnds_,$rounds
- cmp \$0x70,$len
+ cmp \$0x50,$len
jbe .Lcbc_dec_tail
- shr \$1,$rnds_
+
+ $movkey ($key),$rndkey0
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_six_or_seven
+
sub \$0x70,$len
- mov $rnds_,$rounds
- movaps $iv,$reserved(%rsp)
+ lea 0x70($key),$key # size optimization
jmp .Lcbc_dec_loop8_enter
.align 16
.Lcbc_dec_loop8:
- movaps $rndkey0,$reserved(%rsp) # save IV
movups $inout7,($out)
lea 0x10($out),$out
.Lcbc_dec_loop8_enter:
- $movkey ($key),$rndkey0
- movups ($inp),$inout0 # load input
- movups 0x10($inp),$inout1
- $movkey 16($key),$rndkey1
+ movdqu 0x60($inp),$inout6
+ pxor $rndkey0,$inout0
+ movdqu 0x70($inp),$inout7
+ pxor $rndkey0,$inout1
+ $movkey 0x10-0x70($key),$rndkey1
+ pxor $rndkey0,$inout2
+ xor $inp_,$inp_
+ cmp \$0x70,$len # is there at least 0x60 bytes ahead?
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
- lea 32($key),$key
- movdqu 0x20($inp),$inout2
- xorps $rndkey0,$inout0
- movdqu 0x30($inp),$inout3
- xorps $rndkey0,$inout1
- movdqu 0x40($inp),$inout4
aesdec $rndkey1,$inout0
- pxor $rndkey0,$inout2
- movdqu 0x50($inp),$inout5
+ pxor $rndkey0,$inout7
+ $movkey 0x20-0x70($key),$rndkey0
aesdec $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqu 0x60($inp),$inout6
aesdec $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqu 0x70($inp),$inout7
aesdec $rndkey1,$inout3
- pxor $rndkey0,$inout5
- dec $rounds
aesdec $rndkey1,$inout4
- pxor $rndkey0,$inout6
aesdec $rndkey1,$inout5
- pxor $rndkey0,$inout7
- $movkey ($key),$rndkey0
+ setnc ${inp_}b
aesdec $rndkey1,$inout6
+ shl \$7,$inp_
aesdec $rndkey1,$inout7
- $movkey 16($key),$rndkey1
-
- call .Ldec_loop8_enter
+ add $inp,$inp_
+ $movkey 0x30-0x70($key),$rndkey1
+___
+for($i=1;$i<12;$i++) {
+my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
+$code.=<<___;
+ aesdec $rndkeyx,$inout0
+ aesdec $rndkeyx,$inout1
+ aesdec $rndkeyx,$inout2
+ aesdec $rndkeyx,$inout3
+ aesdec $rndkeyx,$inout4
+ aesdec $rndkeyx,$inout5
+ aesdec $rndkeyx,$inout6
+ aesdec $rndkeyx,$inout7
+ $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
+___
+$code.=<<___ if ($i==7);
+ cmp \$11,$rounds
+ jb .Lcbc_dec_done
+___
+$code.=<<___ if ($i==9);
+ je .Lcbc_dec_done
+___
+}
+$code.=<<___;
+.Lcbc_dec_done:
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$iv
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$in0
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$in1
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$in2
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0,$in3
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$in4
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ movdqu 0x50($inp),$rndkey1
+
+ aesdeclast $iv,$inout0
+ movdqu 0x60($inp),$iv # borrow $iv
+ pxor $rndkey0,$rndkey1
+ aesdeclast $in0,$inout1
+ pxor $rndkey0,$iv
+ movdqu 0x70($inp),$rndkey0 # next IV
+ lea 0x80($inp),$inp
+ aesdeclast $in1,$inout2
+ movdqu 0x00($inp_),$in0
+ aesdeclast $in2,$inout3
+ movdqu 0x10($inp_),$in1
+ aesdeclast $in3,$inout4
+ movdqu 0x20($inp_),$in2
+ aesdeclast $in4,$inout5
+ movdqu 0x30($inp_),$in3
+ aesdeclast $rndkey1,$inout6
+ movdqu 0x40($inp_),$in4
+ aesdeclast $iv,$inout7
+ movdqa $rndkey0,$iv # return $iv
+ movdqu 0x50($inp_),$rndkey1
+ $movkey -0x70($key),$rndkey0
+
+ movups $inout0,($out) # store output
+ movdqa $in0,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in1,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in2,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in3,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in4,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey1,$inout5
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
- movups ($inp),$rndkey1 # re-load input
- movups 0x10($inp),$rndkey0
- xorps $reserved(%rsp),$inout0 # ^= IV
- xorps $rndkey1,$inout1
- movups 0x20($inp),$rndkey1
- xorps $rndkey0,$inout2
- movups 0x30($inp),$rndkey0
- xorps $rndkey1,$inout3
- movups 0x40($inp),$rndkey1
- xorps $rndkey0,$inout4
- movups 0x50($inp),$rndkey0
- xorps $rndkey1,$inout5
- movups 0x60($inp),$rndkey1
- xorps $rndkey0,$inout6
- movups 0x70($inp),$rndkey0 # IV
- xorps $rndkey1,$inout7
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,0x40($out)
- mov $key_,$key # restore $key
- movups $inout5,0x50($out)
- lea 0x80($inp),$inp
- movups $inout6,0x60($out)
- lea 0x70($out),$out
sub \$0x80,$len
ja .Lcbc_dec_loop8
movaps $inout7,$inout0
- movaps $rndkey0,$iv
+ lea -0x70($key),$key
add \$0x70,$len
jle .Lcbc_dec_tail_collected
- movups $inout0,($out)
- lea 1($rnds_,$rnds_),$rounds
+ movups $inout7,($out)
lea 0x10($out),$out
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ movaps $in0,$inout0
+.Lcbc_dec_six_or_seven:
+ cmp \$0x60,$len
+ ja .Lcbc_dec_seven
+
+ movaps $inout5,$inout6
+ call _aesni_decrypt6
+ pxor $iv,$inout0 # ^= IV
+ movaps $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ lea 0x50($out),$out
+ movdqa $inout5,$inout0
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 0x60($inp),$inout6
+ xorps $inout7,$inout7
+ call _aesni_decrypt8
+ movups 0x50($inp),$inout7
+ pxor $iv,$inout0 # ^= IV
+ movups 0x60($inp),$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout7,$inout6
+ movdqu $inout5,0x50($out)
+ lea 0x60($out),$out
+ movdqa $inout6,$inout0
+ jmp .Lcbc_dec_tail_collected
+
.Lcbc_dec_tail:
movups ($inp),$inout0
- movaps $inout0,$in0
- cmp \$0x10,$len
+ sub \$0x10,$len
jbe .Lcbc_dec_one
movups 0x10($inp),$inout1
- movaps $inout1,$in1
- cmp \$0x20,$len
+ movaps $inout0,$in0
+ sub \$0x10,$len
jbe .Lcbc_dec_two
movups 0x20($inp),$inout2
- movaps $inout2,$in2
- cmp \$0x30,$len
+ movaps $inout1,$in1
+ sub \$0x10,$len
jbe .Lcbc_dec_three
movups 0x30($inp),$inout3
- cmp \$0x40,$len
+ movaps $inout2,$in2
+ sub \$0x10,$len
jbe .Lcbc_dec_four
movups 0x40($inp),$inout4
- cmp \$0x50,$len
- jbe .Lcbc_dec_five
-
- movups 0x50($inp),$inout5
- cmp \$0x60,$len
- jbe .Lcbc_dec_six
-
- movups 0x60($inp),$inout6
- movaps $iv,$reserved(%rsp) # save IV
- call _aesni_decrypt8
- movups ($inp),$rndkey1
- movups 0x10($inp),$rndkey0
- xorps $reserved(%rsp),$inout0 # ^= IV
- xorps $rndkey1,$inout1
- movups 0x20($inp),$rndkey1
- xorps $rndkey0,$inout2
- movups 0x30($inp),$rndkey0
- xorps $rndkey1,$inout3
- movups 0x40($inp),$rndkey1
- xorps $rndkey0,$inout4
- movups 0x50($inp),$rndkey0
- xorps $rndkey1,$inout5
- movups 0x60($inp),$iv # IV
- xorps $rndkey0,$inout6
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- movups $inout4,0x40($out)
- movups $inout5,0x50($out)
- lea 0x60($out),$out
- movaps $inout6,$inout0
- sub \$0x70,$len
+ movaps $inout3,$in3
+ movaps $inout4,$in4
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ pxor $iv,$inout0
+ movaps $in4,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ lea 0x40($out),$out
+ movdqa $inout4,$inout0
+ sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_one:
+ movaps $inout0,$in0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps $iv,$inout0
movaps $in0,$iv
- sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
+ movaps $inout1,$in1
xorps $inout2,$inout2
call _aesni_decrypt3
- xorps $iv,$inout0
- xorps $in0,$inout1
- movups $inout0,($out)
+ pxor $iv,$inout0
movaps $in1,$iv
- movaps $inout1,$inout0
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ movdqa $inout1,$inout0
lea 0x10($out),$out
- sub \$0x20,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
+ movaps $inout2,$in2
call _aesni_decrypt3
- xorps $iv,$inout0
- xorps $in0,$inout1
- movups $inout0,($out)
- xorps $in1,$inout2
- movups $inout1,0x10($out)
+ pxor $iv,$inout0
movaps $in2,$iv
- movaps $inout2,$inout0
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ movdqa $inout2,$inout0
lea 0x20($out),$out
- sub \$0x30,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
+ movaps $inout3,$in3
call _aesni_decrypt4
- xorps $iv,$inout0
- movups 0x30($inp),$iv
- xorps $in0,$inout1
- movups $inout0,($out)
- xorps $in1,$inout2
- movups $inout1,0x10($out)
- xorps $in2,$inout3
- movups $inout2,0x20($out)
- movaps $inout3,$inout0
+ pxor $iv,$inout0
+ movaps $in3,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ movdqa $inout3,$inout0
lea 0x30($out),$out
- sub \$0x40,$len
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_five:
- xorps $inout5,$inout5
- call _aesni_decrypt6
- movups 0x10($inp),$rndkey1
- movups 0x20($inp),$rndkey0
- xorps $iv,$inout0
- xorps $in0,$inout1
- xorps $rndkey1,$inout2
- movups 0x30($inp),$rndkey1
- xorps $rndkey0,$inout3
- movups 0x40($inp),$iv
- xorps $rndkey1,$inout4
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- lea 0x40($out),$out
- movaps $inout4,$inout0
- sub \$0x50,$len
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_six:
- call _aesni_decrypt6
- movups 0x10($inp),$rndkey1
- movups 0x20($inp),$rndkey0
- xorps $iv,$inout0
- xorps $in0,$inout1
- xorps $rndkey1,$inout2
- movups 0x30($inp),$rndkey1
- xorps $rndkey0,$inout3
- movups 0x40($inp),$rndkey0
- xorps $rndkey1,$inout4
- movups 0x50($inp),$iv
- xorps $rndkey0,$inout5
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- movups $inout4,0x40($out)
- lea 0x50($out),$out
- movaps $inout5,$inout0
- sub \$0x60,$len
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_tail_collected:
- and \$15,$len
movups $iv,($ivp)
+ and \$15,$len
jnz .Lcbc_dec_tail_partial
movups $inout0,($out)
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
- movaps $inout0,$reserved(%rsp)
+ movaps $inout0,(%rsp)
mov \$16,%rcx
mov $out,%rdi
sub $len,%rcx
- lea $reserved(%rsp),%rsi
+ lea (%rsp),%rsi
.long 0x9066A4F3 # rep movsb
.Lcbc_dec_ret:
___
$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- lea 0x58(%rsp),%rsp
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
___
$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
.Lcbc_ret:
ret
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
@@ -2712,6 +2990,8 @@ $code.=<<___;
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
@@ -2789,45 +3069,9 @@ ccm64_se_handler:
jmp .Lcommon_seh_tail
.size ccm64_se_handler,.-ccm64_se_handler
-.type ctr32_se_handler,\@abi-omnipotent
+.type ctr_xts_se_handler,\@abi-omnipotent
.align 16
-ctr32_se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- lea .Lctr32_body(%rip),%r10
- cmp %r10,%rbx # context->Rip<"prologue" label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- lea .Lctr32_ret(%rip),%r10
- cmp %r10,%rbx
- jae .Lcommon_seh_tail
-
- lea 0x20(%rax),%rsi # %xmm save area
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
- .long 0xa548f3fc # cld; rep movsq
- lea 0xc8(%rax),%rax # adjust stack pointer
-
- jmp .Lcommon_seh_tail
-.size ctr32_se_handler,.-ctr32_se_handler
-
-.type xts_se_handler,\@abi-omnipotent
-.align 16
-xts_se_handler:
+ctr_xts_se_handler:
push %rsi
push %rdi
push %rbx
@@ -2857,14 +3101,14 @@ xts_se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- lea 0x60(%rax),%rsi # %xmm save area
+ mov 160($context),%rax # pull context->Rbp
+ lea -0xa0(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x68+160(%rax),%rax # adjust stack pointer
- jmp .Lcommon_seh_tail
-.size xts_se_handler,.-xts_se_handler
+ jmp .Lcommon_rbp_tail
+.size ctr_xts_se_handler,.-ctr_xts_se_handler
___
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
@@ -2896,11 +3140,16 @@ cbc_se_handler:
cmp %r10,%rbx # context->Rip>="epilogue" label
jae .Lcommon_seh_tail
- lea 0(%rax),%rsi # top of stack
+ lea 16(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
- mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x58(%rax),%rax # adjust stack pointer
+
+.Lcommon_rbp_tail:
+ mov 160($context),%rax # pull context->Rbp
+ mov (%rax),%rbp # restore saved %rbp
+ lea 8(%rax),%rax # adjust stack pointer
+ mov %rbp,160($context) # restore context->Rbp
jmp .Lcommon_seh_tail
.Lrestore_cbc_rax:
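The .Lcommon_rbp_tail path shared by ctr_xts_se_handler and cbc_se_handler only makes sense against the new %rbp-framed prologue/epilogue used by the CTR, XTS and CBC paths. A rough sketch of that frame, with offsets read off these hunks (an illustration, not a specification):

    [%rbp]                     caller's %rbp; the unwinder restores it and resumes with RSP = %rbp+8
    [%rbp-0xa0 .. %rbp-0x01]   %xmm6..%xmm15 save area, 10 x 16 bytes (Win64 builds only)

The handler therefore pulls context->Rbp, copies the ten saved %xmm registers starting at %rbp-0xa0, reloads the caller's %rbp from (%rbp) and bumps the reported stack pointer by 8, mirroring the "lea (%rbp),%rsp; pop %rbp" epilogue seen earlier.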
@@ -3003,14 +3252,15 @@ $code.=<<___ if ($PREFIX eq "aesni");
.rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
.LSEH_info_ctr32:
.byte 9,0,0,0
- .rva ctr32_se_handler
+ .rva ctr_xts_se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
.LSEH_info_xts_enc:
.byte 9,0,0,0
- .rva xts_se_handler
+ .rva ctr_xts_se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
.byte 9,0,0,0
- .rva xts_se_handler
+ .rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
$code.=<<___;
@@ -3057,6 +3307,19 @@ sub aesni {
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
}
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
return $line;
}
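For illustration, a stand-alone sketch of what the new (%rsp)-operand branch produces; it assumes @opcode has already been seeded with the mandatory 0x66 prefix earlier in sub aesni (that line lies outside this hunk), and uses a hypothetical input instruction:

    #!/usr/bin/env perl
    # encode "aesenc 0x10(%rsp),%xmm2" the way the branch above would
    my ($mnemonic,$off,$reg) = ("aesenc","0x10",2);
    my %opcodelet = ("aesenc"=>0xdc,"aesenclast"=>0xdd,"aesdec"=>0xde,"aesdeclast"=>0xdf);
    my @opcode = (0x66);                           # mandatory prefix, set before this branch runs
    push @opcode,0x44 if ($reg>=8);                # REX.R for %xmm8..%xmm15
    push @opcode,0x0f,0x38,$opcodelet{$mnemonic};  # escape bytes + AES-NI opcode
    push @opcode,0x44|(($reg&7)<<3),0x24;          # ModR/M (disp8+SIB) and SIB with %rsp base
    push @opcode,($off=~/^0/?oct($off):$off)&0xff; # 8-bit displacement
    print ".byte\t",join(',',@opcode),"\n";        # .byte 102,15,56,220,84,36,16

Those bytes are 66 0F 38 DC 54 24 10, i.e. aesenc 16(%rsp),%xmm2, so the module still assembles with toolchains that predate AES-NI.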
diff --git a/devel/perlasm/cbc.pl b/devel/perlasm/cbc.pl
index 6fc2510905..24561e759a 100644
--- a/devel/perlasm/cbc.pl
+++ b/devel/perlasm/cbc.pl
@@ -150,7 +150,7 @@ sub cbc
&set_label("PIC_point");
&blindpop("edx");
&lea("ecx",&DWP(&label("cbc_enc_jmp_table")."-".&label("PIC_point"),"edx"));
- &mov($count,&DWP(0,"ecx",$count,4))
+ &mov($count,&DWP(0,"ecx",$count,4));
&add($count,"edx");
&xor("ecx","ecx");
&xor("edx","edx");
diff --git a/devel/perlasm/e_padlock-x86.pl b/devel/perlasm/e_padlock-x86.pl
index 71ecad3bbd..4148468c41 100644
--- a/devel/perlasm/e_padlock-x86.pl
+++ b/devel/perlasm/e_padlock-x86.pl
@@ -400,9 +400,9 @@ my ($mode,$opcode) = @_;
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
-#&generate_mode("cfb",0xe0);
-#&generate_mode("ofb",0xe8);
-#&generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode,
+&generate_mode("cfb",0xe0);
+&generate_mode("ofb",0xe8);
+&generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode,
# because hardware CTR was introduced later
# and even has errata on certain C7 stepping.
# own implementation *always* works, though
diff --git a/devel/perlasm/e_padlock-x86_64.pl b/devel/perlasm/e_padlock-x86_64.pl
index 4d71d06f02..f8ba1e909f 100644
--- a/devel/perlasm/e_padlock-x86_64.pl
+++ b/devel/perlasm/e_padlock-x86_64.pl
@@ -23,7 +23,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$code=".text\n";
@@ -547,9 +548,9 @@ ___
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
-#&generate_mode("cfb",0xe0);
-#&generate_mode("ofb",0xe8);
-#&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
+&generate_mode("cfb",0xe0);
+&generate_mode("ofb",0xe8);
+&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
$code.=<<___;
.asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
diff --git a/devel/perlasm/ghash-x86.pl b/devel/perlasm/ghash-x86.pl
index 2a1819cb51..e6b9663c13 100644
--- a/devel/perlasm/ghash-x86.pl
+++ b/devel/perlasm/ghash-x86.pl
@@ -26,6 +26,8 @@
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
+# Atom 105/105 16.8 53
+# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
@@ -113,6 +115,16 @@
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
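The "aggregate Karatsuba post-processing" referred to above is the usual carry-less Karatsuba split. Writing X = Xhi*2^64 ^ Xlo and H = Hhi*2^64 ^ Hlo, with all arithmetic in GF(2)[x], a 128x128-bit product costs three pclmulqdq operations:

    X*H = Xhi*Hhi*2^128
        ^ [ (Xhi^Xlo)*(Hhi^Hlo) ^ Xhi*Hhi ^ Xlo*Hlo ]*2^64
        ^ Xlo*Hlo

Pre-computing Hhi^Hlo (the "salt" that gcm_init_clmul now stores alongside H and H^2) removes a pshufd/pxor pair from the inner loop, and with two blocks aggregated the post-processing of the middle term (the extra xors and the 64-bit realignment) is done once per iteration rather than once per multiplication, which is what the reworked mod_loop below does.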
@@ -635,7 +647,7 @@ sub mmx_loop() {
{ my @lo = ("mm0","mm1","mm2");
my @hi = ("mm3","mm4","mm5");
my @tmp = ("mm6","mm7");
- my $off1=0,$off2=0,$i;
+ my ($off1,$off2,$i) = (0,0,);
&add ($Htbl,128); # optimize for size
&lea ("edi",&DWP(16+128,"esp"));
@@ -822,17 +834,18 @@ $len="ebx";
&static_label("bswap");
sub clmul64x64_T2 { # minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
- &pshufd ($T2,$Hkey,0b01001110);
+ &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); #
- &pxor ($T2,$Hkey);
+ &pxor ($T2,$Hkey) if (!defined($HK));
+ $HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
- &pclmulqdq ($T1,$T2,0x00); #######
+ &pclmulqdq ($T1,$HK,0x00); #######
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); #
@@ -879,31 +892,32 @@ if (1) { # Algorithm 9 with <<1 twist.
# below. Algorithm 9 was therefore chosen for
# further optimization...
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
- &movdqa ($T1,$Xi) #
+ &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pxor ($Xhi,$T2); #
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
+ &psrlq ($Xi,1);
+ &pxor ($Xhi,$T2); #
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
+ &pxor ($Xi,$Xhi) #
}
&function_begin_B("gcm_init_clmul");
@@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_;
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
+ &pshufd ($T1,$Hkey,0b01001110);
+ &pshufd ($T2,$Xi,0b01001110);
+ &pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+ &palignr ($T2,$T1,8); # low part is H.lo^H.hi
+ &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("gcm_init_clmul");
@@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_;
&movdqa ($T3,&QWP(0,$const));
&movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
+ &movups ($T2,&QWP(32,$Htbl));
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
&reduction_alg9 ($Xhi,$Xi);
&pshufb ($Xi,$T3);
@@ -994,79 +1015,107 @@ my ($Xhi,$Xi) = @_;
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
+ &movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &pshufd ($T1,$Xn,0b01001110); # H*Ii+1
+ &movdqa ($Xhn,$Xn);
+ &pxor ($T1,$Xn); #
+
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &pclmulqdq ($T1,$T3,0x00); #######
&lea ($inp,&DWP(32,$inp)); # i+=2
&sub ($len,0x20);
&jbe (&label("even_tail"));
+ &jmp (&label("mod_loop"));
-&set_label("mod_loop");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("mod_loop",32);
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movdqa ($T3,&QWP(0,$const));
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &xorps ($Xhi,$Xhn);
+ &movdqu ($Xhn,&QWP(0,$inp)); # Ii
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pxor ($T1,$Xhi); #
- &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
- &movdqa ($Xhn,$Xn);
- &pxor ($Xhi,$T1); # "Ii+Xi", consume early
+ &pxor ($T2,$T1); #
+ &pshufb ($Xhn,$T3);
- &movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
+ &pshufb ($Xn,$T3);
+ &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
+
+ &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+ &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
+ &movups ($T3,&QWP(32,$Htbl));
&pclmulqdq ($Xn,$Hkey,0x00); #######
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pshufd ($T1,$T3,0b01001110);
- &pxor ($Xhi,$T2); #
- &pxor ($T1,$T3);
- &pshufd ($T3,$Hkey,0b01001110);
- &pxor ($T3,$Hkey); #
-
- &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
+ &pshufd ($T1,$Xhn,0b01001110);
&movdqa ($T2,$Xi); # 2nd phase
+ &psrlq ($Xi,1);
+ &pxor ($T1,$Xhn);
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &pxor ($Xhi,$T2); #
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
-
+ &pxor ($Xi,$Xhi) #
&pclmulqdq ($T1,$T3,0x00); #######
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
- &xorps ($T1,$Xn); #
- &xorps ($T1,$Xhn); #
-
- &movdqa ($T3,$T1); #
- &psrldq ($T1,8);
- &pslldq ($T3,8); #
- &pxor ($Xhn,$T1);
- &pxor ($Xn,$T3); #
- &movdqa ($T3,&QWP(0,$const));
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
&set_label("even_tail");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movdqa ($T3,&QWP(0,$const));
+
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &xorps ($Xhi,$Xhn);
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &pxor ($T1,$Xhi); #
+
+ &pxor ($T2,$T1); #
+
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
diff --git a/devel/perlasm/ghash-x86_64.pl b/devel/perlasm/ghash-x86_64.pl
index a5ae180882..7904248070 100644
--- a/devel/perlasm/ghash-x86_64.pl
+++ b/devel/perlasm/ghash-x86_64.pl
@@ -22,6 +22,8 @@
# P4 28.6 14.0 +100%
# Opteron 19.3 7.7 +150%
# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
#
# (*) comparison is not completely fair, because C results are
# for vanilla "256B" implementation, while assembler results
@@ -39,6 +41,41 @@
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter: ghash-x86.pl discusses why it makes less sense to
+# increase the aggregate factor there. Then why increase it here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes less sense to
+# aggregate more multiplications than it takes to perform the remaining
+# non-multiplication operations. 2x is a near-optimal coefficient for
+# contemporary Intel CPUs (hence the modest improvement coefficient),
+# but not for Bulldozer. The latter is because logical SIMD operations
+# are twice as slow as on Intel, so the critical path is longer. A CPU
+# with a higher pclmulqdq issue rate would also benefit from a higher
+# aggregate factor...
+#
+# Westmere 1.76(+14%)
+# Sandy Bridge 1.79(+9%)
+# Ivy Bridge 1.79(+8%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Bulldozer 1.52(+25%)
+
+# March 2013
+#
+# ... the 8x aggregate factor AVX code path uses the reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to the above-mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it runs at 0.41 cycles per byte on a Haswell processor.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
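The aggregation discussed above is the standard rewrite of the GHASH recurrence: with a 4x factor, four input blocks are folded with four powers of H per iteration, and the 8x AVX path introduced further down does the same with H^8..H. In the notation used elsewhere in this file (with + denoting xor):

    Xi+4 = [ H^4*(Ii+Xi) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3 ] mod P,   P(x) = x^128+x^7+x^2+x+1

Only one reduction mod P is performed per iteration, while the multiplications are mutually independent, which is what allows the pclmulqdq latency to be hidden as long as the issue rate permits.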
@@ -50,7 +87,25 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$do4xaggr=1;
# common register layout
$nlo="%rax";
@@ -351,19 +406,27 @@ ___
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
sub clmul64x64_T2 { # minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
-$code.=<<___ if (!defined($modulo));
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey,$T2
pxor $Xi,$T1 #
pxor $Hkey,$T2
___
+} else {
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1 #
+___
+}
$code.=<<___;
pclmulqdq \$0x00,$Hkey,$Xi #######
pclmulqdq \$0x11,$Hkey,$Xhi #######
- pclmulqdq \$0x00,$T2,$T1 #######
+ pclmulqdq \$0x00,$HK,$T1 #######
pxor $Xi,$T1 #
pxor $Xhi,$T1 #
@@ -375,42 +438,53 @@ $code.=<<___;
___
}
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
$code.=<<___;
# 1st phase
- movdqa $Xi,$T1 #
+ movdqa $Xi,$T2 #
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
# 2nd phase
movdqa $Xi,$T2
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
+ pxor $Xhi,$Xi #
___
}
{ my ($Htbl,$Xip)=@_4args;
+ my $HK="%xmm6";
$code.=<<___;
.globl gcm_init_clmul
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
movdqu ($Xip),$Hkey
pshufd \$0b01001110,$Hkey,$Hkey # dword swap
@@ -429,13 +503,47 @@ gcm_init_clmul:
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
# calculate H^2
+ pshufd \$0b01001110,$Hkey,$HK
movdqa $Hkey,$Xi
+ pxor $Hkey,$HK
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufd \$0b01001110,$Hkey,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $Hkey,$T1 # Karatsuba pre-processing
+ movdqu $Hkey,0x00($Htbl) # save H
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x10($Htbl) # save H^2
+ palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
+ movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
+___
+if ($do4xaggr) {
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ movdqa $Xi,$T3
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
- movdqu $Hkey,($Htbl) # save H
- movdqu $Xi,16($Htbl) # save H^2
+ pshufd \$0b01001110,$T3,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $T3,$T1 # Karatsuba pre-processing
+ movdqu $T3,0x30($Htbl) # save H^3
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x40($Htbl) # save H^4
+ palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
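For reference, the table gcm_init_clmul now leaves behind, as read off the movdqu offsets above (the last three slots are only written when $do4xaggr is set):

    Htbl+0x00   H
    Htbl+0x10   H^2
    Htbl+0x20   Karatsuba "salt": low half H.lo^H.hi, high half H^2.lo^H^2.hi
    Htbl+0x30   H^3
    Htbl+0x40   H^4
    Htbl+0x50   Karatsuba "salt" for H^3 (low half) and H^4 (high half)

gcm_gmult_clmul and gcm_ghash_clmul below read these slots back (0x20 and 0x50 as the pre-folded Karatsuba halves); the AVX path uses its own, larger table built by gcm_init_avx.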
@@ -448,13 +556,38 @@ $code.=<<___;
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$T2
pshufb $T3,$Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+ # experimental alternative. the special thing about it is that there is
+ # no dependency between the two multiplications...
+ mov \$`0xE1<<1`,%eax
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0x07,%r11d
+ movq %rax,$T1
+ movq %r10,$T2
+ movq %r11,$T3 # borrow $T3
+ pand $Xi,$T3
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
+ movq %rax,$T3
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pxor $Xi,$T2
+ pslldq \$15,$T2
+ paddd $T2,$T2 # <<(64+56+1)
+ pxor $T2,$Xi
+ pclmulqdq \$0x01,$T3,$Xi
+ movdqa .Lbswap_mask(%rip),$T3 # reload $T3
+ psrldq \$1,$T1
+ pxor $T1,$Xhi
+ pslldq \$7,$Xi
+ pxor $Xhi,$Xi
+___
$code.=<<___;
pshufb $T3,$Xi
movdqu $Xi,($Xip)
@@ -464,129 +597,318 @@ ___
}
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my $Xn="%xmm6";
- my $Xhn="%xmm7";
- my $Hkey2="%xmm8";
- my $T1n="%xmm9";
- my $T2n="%xmm10";
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
$code.=<<___;
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,\@abi-omnipotent
-.align 16
+.align 32
gcm_ghash_clmul:
+.L_ghash_clmul:
___
$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
- .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
- .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
- .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$HK
pshufb $T3,$Xi
sub \$0x10,$len
jz .Lodd_tail
- movdqu 16($Htbl),$Hkey2
+ movdqu 0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+ cmp \$0x30,$len
+ jb .Lskip4x
+
+ sub \$0x30,$len
+ movdqu 0x30($Htbl),$Hkey3
+ movdqu 0x40($Htbl),$Hkey4
+
+ #######
+ # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+ #
+ movdqu 0x30($inp),$Xln
+ movdqu 0x20($inp),$Xl
+ pshufb $T3,$Xln
+ pshufb $T3,$Xl
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
+
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey2,$Xl
+ pclmulqdq \$0x11,$Hkey2,$Xh
+ xorps $Xl,$Xln
+ pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ xorps $Xm,$Xmn
+
+ movdqu 0x10($inp),$Xl
+ movdqu 0($inp),$T1
+ pshufb $T3,$Xl
+ pshufb $T3,$T1
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T1,$Xi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ movdqu 0x30($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ movdqu 0x20($inp),$Xln
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xhn,$Xhi
+ pxor $Xl,$Xm
+ pshufb $T3,$Xln
+ movups 0x20($Htbl),$HK
+ pclmulqdq \$0x00,$Hkey,$Xl
+ xorps $Xmn,$T1
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pxor $Xln,$Xmn
+ pxor $Xhi,$T1 #
+ movdqa $T1,$T2 #
+ pslldq \$8,$T1
+ pclmulqdq \$0x11,$Hkey,$Xh
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ movdqa .L7_mask(%rip),$T1
+ pxor $T2,$Xhi #
+ movq %rax,$T2
+
+ pand $Xi,$T1 # 1st phase
+ pshufb $T1,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
+ pxor $Xi,$T2 #
+ psllq \$57,$T2 #
+ movdqa $T2,$T1 #
+ pslldq \$8,$T2
+ pclmulqdq \$0x00,$Hkey2,$Xln
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ movdqu 0($inp),$T1
+
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhn
+ xorps $Xl,$Xln
+ movdqu 0x10($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x10,$HK,$Xmn
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ pshufb $T3,$T1
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+
+ movdqa $Xl,$Xh
+ pxor $Xm,$Xmn
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ pxor $T2,$Xi #
+ pxor $T1,$Xhi
+ psrlq \$1,$Xi #
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pxor $Xhi,$Xi #
+
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jnc .Lmod4_loop
+
+.Ltail4x:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xhn,$Xhi
+ pxor $Xi,$Xhi # aggregated Karatsuba post-processing
+ pxor $Xmn,$T1
+
+ pxor $Xhi,$T1 #
+ pxor $Xi,$Xhi
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+ &reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+ add \$0x40,$len
+ jz .Ldone
+ movdqu 0x20($Htbl),$HK
+ sub \$0x10,$len
+ jz .Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
movdqu ($inp),$T1 # Ii
- movdqu 16($inp),$Xn # Ii+1
+ movdqu 16($inp),$Xln # Ii+1
pshufb $T3,$T1
- pshufb $T3,$Xn
+ pshufb $T3,$Xln
pxor $T1,$Xi # Ii+Xi
-___
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
-$code.=<<___;
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
+
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$T1
+ pxor $Xln,$T1
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$T1
lea 32($inp),$inp # i+=2
sub \$0x20,$len
jbe .Leven_tail
+ jmp .Lmod_loop
+.align 32
.Lmod_loop:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- movdqu ($inp),$T1 # Ii
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T2 #
+ pxor $Xi,$T2 #
- movdqu 16($inp),$Xn # Ii+1
- pshufb $T3,$T1
- pshufb $T3,$Xn
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$T2
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+ movdqu ($inp),$Xhn # Ii
+ pshufb $T3,$Xhn
+ movdqu 16($inp),$Xln # Ii+1
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pxor $Xhi,$T1
+ pxor $Xhn,$Xhi # "Ii+Xi", consume early
+ pxor $T1,$T2
+ pshufb $T3,$Xln
+ movdqa $T2,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
- movdqa $Xn,$Xhn #
- pshufd \$0b01001110,$Xn,$T1n
- pshufd \$0b01001110,$Hkey,$T2n
- pxor $Xn,$T1n #
- pxor $Hkey,$T2n
- pxor $T1,$Xhi # "Ii+Xi", consume early
+ movdqa $Xln,$Xhn #
- movdqa $Xi,$T1 # 1st phase
+ movdqa $Xi,$T2 # 1st phase
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pclmulqdq \$0x00,$Hkey,$Xln #######
+ pxor $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
- pclmulqdq \$0x00,$Hkey,$Xn #######
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ pshufd \$0b01001110,$Xhn,$T1
+ pxor $Xhn,$T1 #
pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
-
- pclmulqdq \$0x00,$T2n,$T1n #######
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
-
- pxor $Xn,$T1n #
- pxor $Xhn,$T1n #
- movdqa $T1n,$T2n #
- psrldq \$8,$T1n
- pslldq \$8,$T2n #
- pxor $T1n,$Xhn
- pxor $T2n,$Xn #
+ pclmulqdq \$0x00,$HK,$T1 #######
+ pxor $Xhi,$Xi #
lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T2 #
+ pxor $Xi,$T2 #
+
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$T2
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
+ pxor $Xi,$T1
+ pxor $Xhi,$T1
+ pxor $T1,$T2
+ movdqa $T2,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
@@ -598,7 +920,7 @@ $code.=<<___;
pshufb $T3,$T1
pxor $T1,$Xi # Ii+Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
@@ -611,21 +933,607 @@ $code.=<<___ if ($win64);
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
- add \$0x58,%rsp
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
ret
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
+
+$code.=<<___;
+.globl gcm_init_avx
+.type gcm_init_avx,\@abi-omnipotent
+.align 32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Hkey
+ vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ vpsrlq \$63,$Hkey,$T1
+ vpsllq \$1,$Hkey,$Hkey
+ vpxor $T3,$T3,$T3 #
+ vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
+ vpslldq \$8,$T1,$T1
+ vpor $T1,$Hkey,$Hkey # H<<=1
+
+ # magic reduction
+ vpand .L0x1c2_polynomial(%rip),$T3,$T3
+ vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ vpunpckhqdq $Hkey,$Hkey,$HK
+ vmovdqa $Hkey,$Xi
+ vpxor $Hkey,$HK,$HK
+ mov \$4,%r10 # up to H^8
+ jmp .Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpunpckhqdq $Hkey,$Hkey,$T2
+ vpxor $Xi,$T1,$T1 #
+ vpxor $Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpxor $Xi,$T1,$T1 #
+___
+}
+$code.=<<___;
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
+ vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
+ vpclmulqdq \$0x00,$HK,$T1,$T1 #######
+ vpxor $Xi,$Xhi,$T2 #
+ vpxor $T2,$T1,$T1 #
+
+ vpslldq \$8,$T1,$T2 #
+ vpsrldq \$8,$T1,$T1
+ vpxor $T2,$Xi,$Xi #
+ vpxor $T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ vpsllq \$57,$Xi,$T1 # 1st phase
+ vpsllq \$62,$Xi,$T2
+ vpxor $T1,$T2,$T2 #
+ vpsllq \$63,$Xi,$T1
+ vpxor $T1,$T2,$T2 #
+ vpslldq \$8,$T2,$T1 #
+ vpsrldq \$8,$T2,$T2
+ vpxor $T1,$Xi,$Xi #
+ vpxor $T2,$Xhi,$Xhi
+
+ vpsrlq \$1,$Xi,$T2 # 2nd phase
+ vpxor $Xi,$Xhi,$Xhi
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$5,$T2,$T2
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$1,$Xi,$Xi #
+ vpxor $Xhi,$Xi,$Xi #
+___
+}
+
+$code.=<<___;
+.align 32
+.Linit_loop_avx:
+ vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+ vmovdqa $Xi,$T3
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+ vpshufd \$0b01001110,$T3,$T1
+ vpshufd \$0b01001110,$Xi,$T2
+ vpxor $T3,$T1,$T1 # Karatsuba pre-processing
+ vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
+ vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
+ vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
+ lea 0x30($Htbl),$Htbl
+ sub \$1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
+ vmovdqu $T3,-0x10($Htbl)
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,\@abi-omnipotent
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,\@abi-omnipotent
+.align 32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+ $Zlo,$Zhi,$Zmi,
+ $Hkey,$HK,$T1,$T2,
+ $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Xi # load $Xi
+ lea .L0x1c2_polynomial(%rip),%r10
+ lea 0x40($Htbl),$Htbl # size optimization
+ vmovdqu .Lbswap_mask(%rip),$bswap
+ vpshufb $bswap,$Xi,$Xi
+ cmp \$0x80,$len
+ jb .Lshort_avx
+ sub \$0x80,$len
+
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpshufb $bswap,$Ii,$Ii
+ vmovdqu 0x20-0x40($Htbl),$HK
+
+ vpunpckhqdq $Ii,$Ii,$T2
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Ii,$T2,$T2
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpxor $Ii,$T2,$T2
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpxor $Xmi,$Zmi,$Zmi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x10,$HK,$T2,$Xmi
+
+ lea 0x80($inp),$inp
+ cmp \$0x80,$len
+ jb .Ltail_avx
+
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+ sub \$0x80,$len
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpxor $Ij,$T1,$T1
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Tred
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Zlo,$Xi,$Xi # collect result
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vxorps $Zhi,$Xo,$Xo
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Zmi,$Tred,$Tred
+ vxorps $Ij,$T1,$T1
+
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Xo,$Tred,$Tred
+ vpslldq \$8,$Tred,$T2
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vpsrldq \$8,$Tred,$Tred
+ vpxor $T2, $Xi, $Xi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ii
+ vxorps $Tred,$Xo, $Xo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vxorps $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+ vxorps $Tred,$Xi,$Xi
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vxorps $Xo,$Tred,$Tred
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Tred,$Ij,$Ij
+ vpclmulqdq \$0x10,$HK, $T2,$Xmi
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jnc .Loop8x_avx
+
+ add \$0x80,$len
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -0x10($inp,$len),$Ii # very last word
+ lea ($inp,$len),$inp
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpshufb $bswap,$Ii,$Ij
+
+ vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
+ vmovdqa $Xhi,$Zhi # $Zhi and
+ vmovdqa $Xmi,$Zmi # $Zmi
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x20($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x30($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x40($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x50($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x60($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x70($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovq 0xb8-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+.Ltail_no_xor_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+
+ vmovdqu (%r10),$Tred
+
+ vpxor $Xlo,$Zlo,$Xi
+ vpxor $Xhi,$Zhi,$Xo
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
+ vpxor $Xo, $Zmi,$Zmi
+ vpslldq \$8, $Zmi,$T2
+ vpsrldq \$8, $Zmi,$Zmi
+ vpxor $T2, $Xi, $Xi
+ vpxor $Zmi,$Xo, $Xo
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $Xo,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ cmp \$0,$len
+ jne .Lshort_avx
+
+ vpshufb $bswap,$Xi,$Xi
+ vmovdqu $Xi,($Xip)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+ .long 7,0,7,0
+.L7_mask_poly:
+ .long 7,0,`0xE1<<1`,0
.align 64
.type .Lrem_4bit,\@object
.Lrem_4bit:
@@ -773,10 +1681,24 @@ se_handler:
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit
+ .rva .LSEH_begin_gcm_init_clmul
+ .rva .LSEH_end_gcm_init_clmul
+ .rva .LSEH_info_gcm_init_clmul
+
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_gcm_init_avx
+ .rva .LSEH_end_gcm_init_avx
+ .rva .LSEH_info_gcm_init_clmul
+ .rva .LSEH_begin_gcm_ghash_avx
+ .rva .LSEH_end_gcm_ghash_avx
+ .rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_gcm_gmult_4bit:
@@ -787,14 +1709,23 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+.LSEH_info_gcm_init_clmul:
+ .byte 0x01,0x08,0x03,0x00
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
.LSEH_info_gcm_ghash_clmul:
- .byte 0x01,0x1f,0x0b,0x00
- .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
- .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
+ .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
+ .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
+ .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
+ .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
+ .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
___
}
diff --git a/devel/perlasm/ppc-xlate.pl b/devel/perlasm/ppc-xlate.pl
index a3edd982b6..c075d5fce0 100755
--- a/devel/perlasm/ppc-xlate.pl
+++ b/devel/perlasm/ppc-xlate.pl
@@ -37,7 +37,6 @@ my $globl = sub {
$ret .= ".align 3\n";
$ret .= "$name:\n";
$ret .= ".quad .$name,.TOC.\@tocbase,0\n";
- $ret .= ".size $name,24\n";
$ret .= ".previous\n";
$name = ".$name";
@@ -62,9 +61,12 @@ my $machine = sub {
".machine $arch";
};
my $size = sub {
- if ($flavour =~ /linux.*32/)
+ if ($flavour =~ /linux/)
{ shift;
- ".size " . join(",",@_);
+ my $name = shift; $name =~ s|^[\.\_]||;
+ my $ret = ".size $name,.-".($flavour=~/64/?".":"").$name;
+ $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64/);
+ $ret;
}
else
{ ""; }
@@ -77,6 +79,25 @@ my $asciz = sub {
else
{ ""; }
};
+my $quad = sub {
+ shift;
+ my @ret;
+ my ($hi,$lo);
+ for (@_) {
+ if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io)
+ { $hi=$1?"0x$1":"0"; $lo="0x$2"; }
+ elsif (/^([0-9]+)$/o)
+ { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl
+ else
+ { $hi=undef; $lo=$_; }
+
+ if (defined($hi))
+ { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); }
+ else
+ { push(@ret,".quad $lo"); }
+ }
+ join("\n",@ret);
+};
################################################################
# simplified mnemonics not handled by at least one assembler
diff --git a/devel/perlasm/x86_64-xlate.pl b/devel/perlasm/x86_64-xlate.pl
index 1f4ce0a84e..bd165b152b 100755
--- a/devel/perlasm/x86_64-xlate.pl
+++ b/devel/perlasm/x86_64-xlate.pl
@@ -62,12 +62,8 @@ my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-{ my ($stddev,$stdino,@junk)=stat(STDOUT);
- my ($outdev,$outino,@junk)=stat($output);
-
- open STDOUT,">$output" || die "can't open $output: $!"
- if ($stddev!=$outdev || $stdino!=$outino);
-}
+open STDOUT,">$output" || die "can't open $output: $!"
+ if (defined($output));
my $gas=1; $gas=0 if ($output =~ /\.asm$/);
my $elf=1; $elf=0 if (!$gas);
@@ -254,8 +250,13 @@ my %globals;
# in $self->{label}, new gas requires sign extension...
use integer;
$self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
- $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
- $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
+ $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg;
+ $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg;
+
+ if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
+ $self->{base} =~ /(rbp|r13)/) {
+ $self->{base} = $self->{index}; $self->{index} = $1;
+ }
if ($gas) {
$self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64");
@@ -270,13 +271,14 @@ my %globals;
}
} else {
%szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
- q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" );
+ q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR",
+ y=>"" );
$self->{label} =~ s/\./\$/g;
$self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
$self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
- $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq");
- $sz="l" if (opcode->mnemonic() eq "movd");
+ $sz="q" if ($self->{asterisk} || opcode->mnemonic() =~ /^v?movq$/);
+ $sz="l" if (opcode->mnemonic() =~ /^v?movd$/);
if (defined($self->{index})) {
sprintf "%s[%s%s*%d%s]",$szmap{$sz},
@@ -416,7 +418,7 @@ my %globals;
}
sub out {
my $self = shift;
- if ($nasm && opcode->mnemonic()=~m/^j/) {
+ if ($nasm && opcode->mnemonic()=~m/^j(?![re]cxz)/) {
"NEAR ".$self->{value};
} else {
$self->{value};
@@ -569,7 +571,8 @@ my %globals;
$v.=" READONLY";
$v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref);
} elsif ($line=~/\.CRT\$/i) {
- $v.=" READONLY ALIGN(8)";
+ $v.=" READONLY ";
+ $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD";
}
}
$current_segment = $line;
@@ -775,6 +778,45 @@ my $rdrand = sub {
}
};
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @opcode,$rxb;
+}
+
+my $vprotd = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x8f);
+ rxb(\@opcode,$3,$2,-1,0x08);
+ push @opcode,0x78,0xc2;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $vprotq = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x8f);
+ rxb(\@opcode,$3,$2,-1,0x08);
+ push @opcode,0x78,0xc3;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
if ($nasm) {
print <<___;
default rel
@@ -840,6 +882,7 @@ while($line=<>) {
my $arg = $_->out();
# $insn.=$sz compensates for movq, pinsrw, ...
if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+ if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; }
if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; }
}
@args = reverse(@args);
@@ -1049,7 +1092,7 @@ close STDOUT;
# .rva .LSEH_end_function
# .rva function_unwind_info
#
-# Reference to functon_unwind_info from .xdata segment is the anchor.
+# Reference to function_unwind_info from .xdata segment is the anchor.
# In case you wonder why references are 32-bit .rvas and not 64-bit
# .quads. References put into these two segments are required to be
# *relative* to the base address of the current binary module, a.k.a.
diff --git a/devel/perlasm/x86asm.pl b/devel/perlasm/x86asm.pl
index eb543db2f6..17abf92297 100644
--- a/devel/perlasm/x86asm.pl
+++ b/devel/perlasm/x86asm.pl
@@ -131,6 +131,32 @@ sub ::rdrand
{ &::generic("rdrand",@_); }
}
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @opcode,$rxb;
+}
+
+sub ::vprotd
+{ my $args=join(',',@_);
+ if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/)
+ { my @opcode=(0x8f);
+ rxb(\@opcode,$1,$2,-1,0x08);
+ push @opcode,0x78,0xc2;
+ push @opcode,0xc0|($2&7)|(($1&7)<<3); # ModR/M
+ my $c=$3;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ &::data_byte(@opcode);
+ }
+ else
+ { &::generic("vprotd",@_); }
+}
+
# label management
$lbdecor="L"; # local label decoration, set by package
$label="000";
@@ -257,4 +283,6 @@ EOF
&file($filename);
}
+sub ::hidden {}
+
1;
diff --git a/devel/perlasm/x86gas.pl b/devel/perlasm/x86gas.pl
index 4af871889a..5c2498118f 100644
--- a/devel/perlasm/x86gas.pl
+++ b/devel/perlasm/x86gas.pl
@@ -45,10 +45,8 @@ sub ::generic
undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
if ($#_==0) { &::emit($opcode); }
- elsif ($opcode =~ m/^j/o && $#_==1) { &::emit($opcode,@arg); }
- elsif ($opcode eq "call" && $#_==1) { &::emit($opcode,@arg); }
- elsif ($opcode eq "clflush" && $#_==1){ &::emit($opcode,@arg); }
- elsif ($opcode =~ m/^set/&& $#_==1) { &::emit($opcode,@arg); }
+ elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o)
+ { &::emit($opcode,@arg); }
else { &::emit($opcode.$suffix,@arg);}
1;
@@ -72,6 +70,8 @@ sub ::DWP
{ my($addr,$reg1,$reg2,$idx)=@_;
my $ret="";
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
$addr =~ s/^\s+//;
# prepend global references with optional underscore
$addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige;
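The new first line gives ::DWP a two-register-argument shorthand: when no scale was passed and the second register slot is numeric, that number becomes the scale and the lone register becomes the index (1*$reg2 evaluates to 0 for a register name, so ordinary base+index calls are untouched). A hypothetical

    &::DWP(64,"ebx",4)

now renders as 64(,%ebx,4) instead of requiring the longer &::DWP(64,"","ebx",4) form; the same tweak is applied to get_mem in the masm and nasm back-ends further down.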
@@ -159,7 +159,7 @@ sub ::file_end
}
}
if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
- my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8";
+ my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,16";
if ($::macosx) { push (@out,"$tmp,2\n"); }
elsif ($::elf) { push (@out,"$tmp,4\n"); }
else { push (@out,"$tmp\n"); }
@@ -172,10 +172,9 @@ sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); }
sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); }
sub ::align
-{ my $val=$_[0],$p2,$i;
+{ my $val=$_[0];
if ($::aout)
- { for ($p2=0;$val!=0;$val>>=1) { $p2++; }
- $val=$p2-1;
+ { $val=int(log($val)/log(2));
$val.=",0x90";
}
push(@out,".align\t$val\n");
@@ -184,7 +183,9 @@ sub ::align
sub ::picmeup
{ my($dst,$sym,$base,$reflabel)=@_;
- if (($::pic && ($::elf || $::aout)) || $::macosx)
+ if (defined($base) && $sym eq "OPENSSL_ia32cap_P" && !$::macosx)
+ { &::lea($dst,&::DWP("$sym-$reflabel",$base)); }
+ elsif (($::pic && ($::elf || $::aout)) || $::macosx)
{ if (!defined($base))
{ &::call(&::label("PIC_me_up"));
&::set_label("PIC_me_up");
@@ -252,4 +253,6 @@ ___
sub ::dataseg
{ push(@out,".data\n"); }
+*::hidden = sub { push(@out,".hidden\t$nmdecor$_[0]\n"); } if ($::elf);
+
1;
diff --git a/devel/perlasm/x86masm.pl b/devel/perlasm/x86masm.pl
index ee446de5c1..1741342c3a 100644
--- a/devel/perlasm/x86masm.pl
+++ b/devel/perlasm/x86masm.pl
@@ -16,7 +16,9 @@ sub ::generic
# fix hexadecimal constants
for (@arg) { s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/oi; }
- if ($opcode !~ /movq/)
+ if ($opcode =~ /lea/ && @arg[1] =~ s/.*PTR\s+(\(.*\))$/OFFSET $1/) # no []
+ { $opcode="mov"; }
+ elsif ($opcode !~ /movq/)
{ # fix xmm references
$arg[0] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[1]=~/\bxmm[0-7]\b/i);
$arg[1] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[0]=~/\bxmm[0-7]\b/i);
@@ -37,6 +39,8 @@ sub get_mem
{ my($size,$addr,$reg1,$reg2,$idx)=@_;
my($post,$ret);
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
$ret .= "$size PTR " if ($size ne "");
$addr =~ s/^\s+//;
@@ -131,7 +135,7 @@ ___
if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
.bss SEGMENT 'BSS'
-COMM ${nmdecor}OPENSSL_ia32cap_P:QWORD
+COMM ${nmdecor}OPENSSL_ia32cap_P:DWORD:4
.bss ENDS
___
# comment out OPENSSL_ia32cap_P declarations
diff --git a/devel/perlasm/x86nasm.pl b/devel/perlasm/x86nasm.pl
index ca2511c9eb..5d92f6092a 100644
--- a/devel/perlasm/x86nasm.pl
+++ b/devel/perlasm/x86nasm.pl
@@ -36,6 +36,8 @@ sub get_mem
{ my($size,$addr,$reg1,$reg2,$idx)=@_;
my($post,$ret);
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
if ($size ne "")
{ $ret .= "$size";
$ret .= " PTR" if ($::mwerks);
@@ -117,7 +119,7 @@ sub ::file_end
{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
${drdecor}segment .bss
-${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 8
+${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16
___
# comment out OPENSSL_ia32cap_P declarations
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;