author     Nikos Mavrogiannopoulos <nmav@gnutls.org>   2013-11-26 23:19:01 +0100
committer  Nikos Mavrogiannopoulos <nmav@gnutls.org>   2013-11-26 23:20:56 +0100
commit     c61d244ee65b40cbf8e8a8dbf83f92ada6eb430a (patch)
tree       fcd06f998a52f354924a3fb4840d0b08a8e5c71f /devel
parent     11c6e66f570a6d6ed3baabe3c7924ef29fe47ad0 (diff)
download   gnutls-c61d244ee65b40cbf8e8a8dbf83f92ada6eb430a.tar.gz
updated perlasm files
Diffstat (limited to 'devel')
-rw-r--r--  devel/perlasm/aesni-x86.pl           10
-rw-r--r--  devel/perlasm/aesni-x86_64.pl      1735
-rw-r--r--  devel/perlasm/cbc.pl                  2
-rw-r--r--  devel/perlasm/e_padlock-x86.pl        6
-rw-r--r--  devel/perlasm/e_padlock-x86_64.pl     9
-rw-r--r--  devel/perlasm/ghash-x86.pl          173
-rw-r--r--  devel/perlasm/ghash-x86_64.pl      1131
-rwxr-xr-x  devel/perlasm/ppc-xlate.pl           27
-rwxr-xr-x  devel/perlasm/x86_64-xlate.pl        71
-rw-r--r--  devel/perlasm/x86asm.pl              28
-rw-r--r--  devel/perlasm/x86gas.pl              21
-rw-r--r--  devel/perlasm/x86masm.pl              8
-rw-r--r--  devel/perlasm/x86nasm.pl              4
13 files changed, 2285 insertions, 940 deletions
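
The bulk of this update is the rewritten aesni-x86_64.pl, whose new CTR32 path (per the in-file comments further down) switches to a full 8x unroll with modulo-scheduled counter calculations and a zero-round key xor folded into the pre-built counter blocks kept in the 0x00..0x70(%rsp) slots. As orientation only, here is a minimal Perl sketch of that counter handling; the subroutine name and argument layout are illustrative and not taken from the patch:

use strict; use warnings;

# Build $nblocks counter blocks from a 16-byte IV whose last 4 bytes hold a
# 32-bit big-endian counter, folding the round-0 key xor in up front so the
# AES rounds proper can start at round 1.
sub ctr32_blocks {
    my ($ivec, $key0, $nblocks) = @_;              # $ivec and $key0 are 16-byte strings
    my $ctr = unpack("N", substr($ivec, 12, 4));   # pull the 32-bit counter
    my @blocks;
    for my $i (0 .. $nblocks - 1) {
        my $blk = substr($ivec, 0, 12) . pack("N", ($ctr + $i) & 0xffffffff);
        push @blocks, $blk ^ $key0;                # zero-round key xor
    }
    return @blocks;
}
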
diff --git a/devel/perlasm/aesni-x86.pl b/devel/perlasm/aesni-x86.pl
index 3dc345b585..14ff2602ed 100644
--- a/devel/perlasm/aesni-x86.pl
+++ b/devel/perlasm/aesni-x86.pl
@@ -54,8 +54,8 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
-if ($PREFIX eq "aesni") { $movekey=*movups; }
-else { $movekey=*movups; }
+if ($PREFIX eq "aesni") { $movekey=\&movups; }
+else { $movekey=\&movups; }
$len="eax";
$rounds="ecx";
@@ -1816,7 +1816,7 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x10,$out),$inout1);
&lea ($inp,&DWP(0x60,$inp));
&movups (&QWP(0x20,$out),$inout2);
- &mov ($rounds,$rounds_) # restore $rounds
+ &mov ($rounds,$rounds_); # restore $rounds
&movups (&QWP(0x30,$out),$inout3);
&mov ($key,$key_); # restore $key
&movups (&QWP(0x40,$out),$inout4);
@@ -2015,7 +2015,7 @@ if ($PREFIX eq "aesni") {
&set_label("12rounds",16);
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
&mov ($rounds,11);
- &$movekey (&QWP(-16,$key),"xmm0") # round 0
+ &$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
&call (&label("key_192a_cold"));
&aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
@@ -2152,7 +2152,7 @@ if ($PREFIX eq "aesni") {
&mov ($key,&wparam(2));
&call ("_aesni_set_encrypt_key");
&mov ($key,&wparam(2));
- &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key
+ &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
&test ("eax","eax");
&jnz (&label("dec_key_ret"));
&lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
diff --git a/devel/perlasm/aesni-x86_64.pl b/devel/perlasm/aesni-x86_64.pl
index 499f3b3f42..4a10fe6bd2 100644
--- a/devel/perlasm/aesni-x86_64.pl
+++ b/devel/perlasm/aesni-x86_64.pl
@@ -129,8 +129,8 @@
#
# Further data for other parallelizable modes:
#
-# CBC decrypt 1.16 0.93 0.93
-# CTR 1.14 0.91 n/a
+# CBC decrypt 1.16 0.93 0.74
+# CTR 1.14 0.91 0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
@@ -153,10 +153,17 @@
# April 2011
#
-# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
-# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
+######################################################################
+# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
+# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
+# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
+# instruction latency is 9 cycles and that they can be issued every
+# cycle.
+
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-x86_64.pl:-)
@@ -172,7 +179,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
@@ -1003,220 +1011,378 @@ ___
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
-# does not update *ivec! (see engine/eng_aesni.c for details)
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Keywords are full unroll and modulo-schedule counter calculations
+# with zero-round key xor.
{
-my $reserved = $win64?0:-0x28;
-my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
-my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
-my $bswap_mask="%xmm15";
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+my ($key0,$ctr)=("${key_}d","${ivp}d");
+my $frame_size = 0x80 + ($win64?160:0);
$code.=<<___;
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,\@function,5
.align 16
aesni_ctr32_encrypt_blocks:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- lea -0xc8(%rsp),%rsp
- movaps %xmm6,0x20(%rsp)
- movaps %xmm7,0x30(%rsp)
- movaps %xmm8,0x40(%rsp)
- movaps %xmm9,0x50(%rsp)
- movaps %xmm10,0x60(%rsp)
- movaps %xmm11,0x70(%rsp)
- movaps %xmm12,0x80(%rsp)
- movaps %xmm13,0x90(%rsp)
- movaps %xmm14,0xa0(%rsp)
- movaps %xmm15,0xb0(%rsp)
+ movaps %xmm6,-0xa8(%rax)
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lctr32_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
+
cmp \$1,$len
je .Lctr32_one_shortcut
- movdqu ($ivp),$ivec
- movdqa .Lbswap_mask(%rip),$bswap_mask
- xor $rounds,$rounds
- pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
- pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ movdqa $inout0,0x70(%rsp)
mov 240($key),$rounds # key->rounds
- bswap $rnds_
- pxor $iv0,$iv0 # vector of 3 32-bit counters
- pxor $iv1,$iv1 # vector of 3 32-bit counters
- pinsrd \$0,$rnds_,$iv0
- lea 3($rnds_),$key_
- pinsrd \$0,$key_,$iv1
- inc $rnds_
- pinsrd \$1,$rnds_,$iv0
- inc $key_
- pinsrd \$1,$key_,$iv1
- inc $rnds_
- pinsrd \$2,$rnds_,$iv0
- inc $key_
- pinsrd \$2,$key_,$iv1
- movdqa $iv0,$reserved(%rsp)
- pshufb $bswap_mask,$iv0
- movdqa $iv1,`$reserved+0x10`(%rsp)
- pshufb $bswap_mask,$iv1
-
- pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
- pshufd \$`2<<6`,$iv0,$inout1
- pshufd \$`1<<6`,$iv0,$inout2
- cmp \$6,$len
+
+ lea 1($ctr),%r9
+ lea 2($ctr),%r10
+ bswap %r9d
+ bswap %r10d
+ xor $key0,%r9d
+ xor $key0,%r10d
+ pinsrd \$3,%r9d,$inout1
+ lea 3($ctr),%r9
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%r10d,$inout2
+ bswap %r9d
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%r9d
+ bswap %r10d
+ pinsrd \$3,%r9d,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ xor $key0,%r9d
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len
jb .Lctr32_tail
- shr \$1,$rounds
- mov $key,$key_ # backup $key
- mov $rounds,$rnds_ # backup $rounds
- sub \$6,$len
- jmp .Lctr32_loop6
-
-.align 16
-.Lctr32_loop6:
- pshufd \$`3<<6`,$iv1,$inout3
- por $ivec,$inout0 # merge counter-less ivec
- $movkey ($key_),$rndkey0
- pshufd \$`2<<6`,$iv1,$inout4
- por $ivec,$inout1
- $movkey 16($key_),$rndkey1
- pshufd \$`1<<6`,$iv1,$inout5
- por $ivec,$inout2
- por $ivec,$inout3
- xorps $rndkey0,$inout0
- por $ivec,$inout4
- por $ivec,$inout5
-
- # inline _aesni_encrypt6 and interleave last rounds
- # with own code...
- pxor $rndkey0,$inout1
+ lea 0x80($key),$key # size optimization
+ sub \$8,$len
+ jmp .Lctr32_loop8
+
+.align 32
+.Lctr32_loop8:
+ add \$8,$ctr
+ movdqa 0x60(%rsp),$inout6
aesenc $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
+ mov $ctr,%r9d
+ movdqa 0x70(%rsp),$inout7
aesenc $rndkey1,$inout1
- movdqa .Lincrement32(%rip),$iv1
- pxor $rndkey0,$inout3
+ bswap %r9d
+ $movkey 0x20-0x80($key),$rndkey0
aesenc $rndkey1,$inout2
- movdqa $reserved(%rsp),$iv0
- pxor $rndkey0,$inout4
+ xor $key0,%r9d
aesenc $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
+ mov %r9d,0x00+12(%rsp)
+ lea 1($ctr),%r9
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
- jmp .Lctr32_enc_loop6_enter
-.align 16
-.Lctr32_enc_loop6:
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+ aesenc $rndkeyx,$inout0
+ aesenc $rndkeyx,$inout1
+ bswap %r9d
+ aesenc $rndkeyx,$inout2
+ xor $key0,%r9d
+ aesenc $rndkeyx,$inout3
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
+ lea $i($ctr),%r9
+ aesenc $rndkeyx,$inout4
+ aesenc $rndkeyx,$inout5
+ aesenc $rndkeyx,$inout6
+ aesenc $rndkeyx,$inout7
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ bswap %r9d
+ aesenc $rndkey0,$inout2
+ xor $key0,%r9d
+ aesenc $rndkey0,$inout3
+ mov %r9d,0x70+12(%rsp)
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ movdqu 0x00($inp),$in0
+ aesenc $rndkey0,$inout7
+ $movkey 0xa0-0x80($key),$rndkey0
+
+ cmp \$11,$rounds
+ jb .Lctr32_enc_done
+
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
-.Lctr32_enc_loop6_enter:
- $movkey 16($key),$rndkey1
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xb0-0x80($key),$rndkey1
+
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
- lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
- $movkey ($key),$rndkey0
- jnz .Lctr32_enc_loop6
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xc0-0x80($key),$rndkey0
+ je .Lctr32_enc_done
aesenc $rndkey1,$inout0
- paddd $iv1,$iv0 # increment counter vector
aesenc $rndkey1,$inout1
- paddd `$reserved+0x10`(%rsp),$iv1
aesenc $rndkey1,$inout2
- movdqa $iv0,$reserved(%rsp) # save counter vector
aesenc $rndkey1,$inout3
- movdqa $iv1,`$reserved+0x10`(%rsp)
aesenc $rndkey1,$inout4
- pshufb $bswap_mask,$iv0 # byte swap
aesenc $rndkey1,$inout5
- pshufb $bswap_mask,$iv1
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xd0-0x80($key),$rndkey1
- aesenclast $rndkey0,$inout0
- movups ($inp),$in0 # load input
- aesenclast $rndkey0,$inout1
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xe0-0x80($key),$rndkey0
+
+.Lctr32_enc_done:
+ movdqu 0x10($inp),$in1
+ pxor $rndkey0,$in0
+ movdqu 0x20($inp),$in2
+ pxor $rndkey0,$in1
+ movdqu 0x30($inp),$in3
+ pxor $rndkey0,$in2
+ movdqu 0x40($inp),$in4
+ pxor $rndkey0,$in3
+ movdqu 0x50($inp),$in5
+ pxor $rndkey0,$in4
+ aesenc $rndkey1,$inout0
+ pxor $rndkey0,$in5
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ movdqu 0x60($inp),$rndkey1
+
+ aesenclast $in0,$inout0
+ pxor $rndkey0,$rndkey1
+ movdqu 0x70($inp),$in0
+ lea 0x80($inp),$inp
+ aesenclast $in1,$inout1
+ pxor $rndkey0,$in0
+ movdqa 0x00(%rsp),$in1 # load next counter block
+ aesenclast $in2,$inout2
+ movdqa 0x10(%rsp),$in2
+ aesenclast $in3,$inout3
+ movdqa 0x20(%rsp),$in3
+ aesenclast $in4,$inout4
+ movdqa 0x30(%rsp),$in4
+ aesenclast $in5,$inout5
+ movdqa 0x40(%rsp),$in5
+ aesenclast $rndkey1,$inout6
+ movdqa 0x50(%rsp),$rndkey0
+ aesenclast $in0,$inout7
+ $movkey 0x10-0x80($key),$rndkey1
+
+ movups $inout0,($out) # store output
+ movdqa $in1,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in2,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in3,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in4,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in5,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey0,$inout5
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out
+
+ sub \$8,$len
+ jnc .Lctr32_loop8
+
+ add \$8,$len
+ jz .Lctr32_done
+ lea -0x80($key),$key
+
+.Lctr32_tail:
+ lea 16($key),$key
+ cmp \$4,$len
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
+
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ aesenc $rndkey1,$inout1
+ shr \$1,$rounds
+ aesenc $rndkey1,$inout2
+ dec $rounds
+ aesenc $rndkey1,$inout3
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout4
movups 0x10($inp),$in1
- aesenclast $rndkey0,$inout2
+ aesenc $rndkey1,$inout5
movups 0x20($inp),$in2
- aesenclast $rndkey0,$inout3
+ aesenc $rndkey1,$inout6
+ $movkey 16($key),$rndkey1
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out)
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+ jb .Lctr32_done
+
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+ je .Lctr32_done
+
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ dec $rounds
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ movups ($inp),$in0
+ aesenclast $rndkey1,$inout1
+ movups 0x10($inp),$in1
+ aesenclast $rndkey1,$inout2
+ movups 0x20($inp),$in2
+ aesenclast $rndkey1,$inout3
movups 0x30($inp),$in3
- aesenclast $rndkey0,$inout4
- movups 0x40($inp),$rndkey1
- aesenclast $rndkey0,$inout5
- movups 0x50($inp),$rndkey0
- lea 0x60($inp),$inp
-
- xorps $inout0,$in0 # xor
- pshufd \$`3<<6`,$iv0,$inout0
- xorps $inout1,$in1
- pshufd \$`2<<6`,$iv0,$inout1
- movups $in0,($out) # store output
- xorps $inout2,$in2
- pshufd \$`1<<6`,$iv0,$inout2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- xorps $inout4,$rndkey1
- movups $in3,0x30($out)
- xorps $inout5,$rndkey0
- movups $rndkey1,0x40($out)
- movups $rndkey0,0x50($out)
- lea 0x60($out),$out
- mov $rnds_,$rounds
- sub \$6,$len
- jnc .Lctr32_loop6
- add \$6,$len
- jz .Lctr32_done
- mov $key_,$key # restore $key
- lea 1($rounds,$rounds),$rounds # restore original value
+ xorps $in0,$inout0
+ movups $inout0,($out)
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ dec $rounds
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
-.Lctr32_tail:
- por $ivec,$inout0
movups ($inp),$in0
+ xorps $in0,$inout0
+ movups $inout0,($out)
cmp \$2,$len
- jb .Lctr32_one
+ jb .Lctr32_done
- por $ivec,$inout1
movups 0x10($inp),$in1
- je .Lctr32_two
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ je .Lctr32_done
- pshufd \$`3<<6`,$iv1,$inout3
- por $ivec,$inout2
movups 0x20($inp),$in2
- cmp \$4,$len
- jb .Lctr32_three
-
- pshufd \$`2<<6`,$iv1,$inout4
- por $ivec,$inout3
- movups 0x30($inp),$in3
- je .Lctr32_four
-
- por $ivec,$inout4
- xorps $inout5,$inout5
-
- call _aesni_encrypt6
-
- movups 0x40($inp),$rndkey1
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- xorps $inout4,$rndkey1
- movups $in3,0x30($out)
- movups $rndkey1,0x40($out)
+ xorps $in2,$inout2
+ movups $inout2,0x20($out)
jmp .Lctr32_done
.align 16
@@ -1224,64 +1390,32 @@ $code.=<<___;
movups ($ivp),$inout0
movups ($inp),$in0
mov 240($key),$rounds # key->rounds
-.Lctr32_one:
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
- xorps $inout0,$in0
- movups $in0,($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_two:
- xorps $inout2,$inout2
- call _aesni_encrypt3
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- movups $in1,0x10($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_three:
- call _aesni_encrypt3
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- movups $in2,0x20($out)
+ xorps $in0,$inout0
+ movups $inout0,($out)
jmp .Lctr32_done
.align 16
-.Lctr32_four:
- call _aesni_encrypt4
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- movups $in3,0x30($out)
-
.Lctr32_done:
___
$code.=<<___ if ($win64);
- movaps 0x20(%rsp),%xmm6
- movaps 0x30(%rsp),%xmm7
- movaps 0x40(%rsp),%xmm8
- movaps 0x50(%rsp),%xmm9
- movaps 0x60(%rsp),%xmm10
- movaps 0x70(%rsp),%xmm11
- movaps 0x80(%rsp),%xmm12
- movaps 0x90(%rsp),%xmm13
- movaps 0xa0(%rsp),%xmm14
- movaps 0xb0(%rsp),%xmm15
- lea 0xc8(%rsp),%rsp
-.Lctr32_ret:
+ movaps -0xa0(%rbp),%xmm6
+ movaps -0x90(%rbp),%xmm7
+ movaps -0x80(%rbp),%xmm8
+ movaps -0x70(%rbp),%xmm9
+ movaps -0x60(%rbp),%xmm10
+ movaps -0x50(%rbp),%xmm11
+ movaps -0x40(%rbp),%xmm12
+ movaps -0x30(%rbp),%xmm13
+ movaps -0x20(%rbp),%xmm14
+ movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lctr32_epilogue:
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
@@ -1296,29 +1430,33 @@ ___
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
-my $frame_size = 0x68 + ($win64?160:0);
+my $frame_size = 0x70 + ($win64?160:0);
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
- lea -$frame_size(%rsp),%rsp
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,0x60(%rsp)
- movaps %xmm7,0x70(%rsp)
- movaps %xmm8,0x80(%rsp)
- movaps %xmm9,0x90(%rsp)
- movaps %xmm10,0xa0(%rsp)
- movaps %xmm11,0xb0(%rsp)
- movaps %xmm12,0xc0(%rsp)
- movaps %xmm13,0xd0(%rsp)
- movaps %xmm14,0xe0(%rsp)
- movaps %xmm15,0xf0(%rsp)
+ movaps %xmm6,-0xa8(%rax)
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lxts_enc_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240(%r8),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
@@ -1326,213 +1464,251 @@ ___
# generate the tweak
&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
$code.=<<___;
+ $movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+ mov $rounds,$rnds_
+
movdqa .Lxts_magic(%rip),$twmask
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pshufd \$0x5f,@tweak[5],$twres
+ pxor $rndkey0,$rndkey1
___
+ # alternative tweak calculation algorithm is based on suggestions
+ # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+ # and should help in the future...
for ($i=0;$i<4;$i++) {
$code.=<<___;
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- pand $twmask,$twres # isolate carry and residue
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- pxor $twres,@tweak[5]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
___
}
$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
sub \$16*6,$len
jc .Lxts_enc_short
shr \$1,$rounds
- sub \$1,$rounds
+ sub \$3,$rounds
+ $movkey 16($key_),$rndkey1
mov $rounds,$rnds_
+ lea .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.align 16
+.align 32
.Lxts_enc_grandloop:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
- pand $twmask,$twres # isolate carry and residue
+ movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
- pxor $twres,@tweak[5]
-
+ pxor @tweak[0],$inout0
movdqu `16*2`($inp),$inout2
- pxor @tweak[0],$inout0 # input^=tweak
- movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
- movdqu `16*4`($inp),$inout4
+ aesenc $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
+ aesenc $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
- $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
- pxor @tweak[5],$inout5
+ aesenc $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
- # inline _aesni_encrypt6 and interleave first and last rounds
- # with own code...
- $movkey 16($key_),$rndkey1
- pxor $rndkey0,$inout0
- pxor $rndkey0,$inout1
- movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
- aesenc $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
- movdqa @tweak[1],`16*1`(%rsp)
- aesenc $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqa @tweak[2],`16*2`(%rsp)
- aesenc $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqa @tweak[3],`16*3`(%rsp)
- aesenc $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
- movdqa @tweak[4],`16*4`(%rsp)
+ pxor $twres,@tweak[0]
aesenc $rndkey1,$inout4
- movdqa @tweak[5],`16*5`(%rsp)
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesenc $rndkey1,$inout5
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp
- jmp .Lxts_enc_loop6_enter
+ $movkey 48($key_),$rndkey1
-.align 16
+ aesenc $rndkey0,$inout0
+ pxor $twres,@tweak[2]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey0,$inout1
+ pxor $twres,@tweak[3]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey0,$inout2
+ pxor $twres,@tweak[4]
+ aesenc $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey0,$inout4
+ movdqa $twmask,`16*5`(%rsp)
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ lea 64($key_),$key
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_enc_loop6
+.align 32
.Lxts_enc_loop6:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
-.Lxts_enc_loop6_enter:
$movkey 16($key),$rndkey1
+ lea 32($key),$key
+
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
- lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey ($key),$rndkey0
+ dec $rounds
jnz .Lxts_enc_loop6
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa (%r8),$twmask
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesenc $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
aesenc $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
aesenc $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
+ pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
+ movaps @tweak[0],@tweak[1] # copy round[0]
aesenc $rndkey1,$inout5
$movkey 16($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[0]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesenc $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[0]
+ psrad \$31,$twtmp
aesenc $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesenc $rndkey0,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey0,$inout3
+ pxor $twtmp,@tweak[5]
aesenc $rndkey0,$inout4
+ movaps @tweak[1],@tweak[2]
aesenc $rndkey0,$inout5
$movkey 32($key),$rndkey0
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[1]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesenc $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
aesenc $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesenc $rndkey1,$inout2
- pxor $twres,@tweak[5]
+ movdqa @tweak[3],`16*3`(%rsp)
aesenc $rndkey1,$inout3
+ pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
aesenc $rndkey1,$inout5
+ $movkey 48($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[2]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- aesenclast $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
- aesenclast $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- aesenclast $rndkey0,$inout2
- pxor $twres,@tweak[5]
- aesenclast $rndkey0,$inout3
- aesenclast $rndkey0,$inout4
- aesenclast $rndkey0,$inout5
-
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[3]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- xorps `16*0`(%rsp),$inout0 # output^=tweak
- pand $twmask,$twres # isolate carry and residue
- xorps `16*1`(%rsp),$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesenc $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesenc $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pxor $twtmp,@tweak[5]
+ aesenc $rndkey0,$inout4
+ movaps @tweak[3],@tweak[4]
+ aesenc $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ aesenclast `16*0`(%rsp),$inout0
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ aesenclast `16*1`(%rsp),$inout1
+ aesenclast `16*2`(%rsp),$inout2
pxor $twres,@tweak[5]
+ aesenclast `16*3`(%rsp),$inout3
+ aesenclast `16*4`(%rsp),$inout4
+ aesenclast `16*5`(%rsp),$inout5
+ mov $rnds_,$rounds # restore $rounds
- xorps `16*2`(%rsp),$inout2
- movups $inout0,`16*0`($out) # write output
- xorps `16*3`(%rsp),$inout3
- movups $inout1,`16*1`($out)
- xorps `16*4`(%rsp),$inout4
- movups $inout2,`16*2`($out)
- xorps `16*5`(%rsp),$inout5
- movups $inout3,`16*3`($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
lea `16*6`($out),$out
+ movups $inout0,`-16*6`($out) # write output
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
sub \$16*6,$len
jnc .Lxts_enc_grandloop
- lea 3($rounds,$rounds),$rounds # restore original value
+ lea 7($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_enc_short:
+ pxor $rndkey0,@tweak[0]
add \$16*6,$len
jz .Lxts_enc_done
+ pxor $rndkey0,@tweak[1]
cmp \$0x20,$len
jb .Lxts_enc_one
+ pxor $rndkey0,@tweak[2]
je .Lxts_enc_two
+ pxor $rndkey0,@tweak[3]
cmp \$0x40,$len
jb .Lxts_enc_three
+ pxor $rndkey0,@tweak[4]
je .Lxts_enc_four
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movdqu ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movdqu 16*1($inp),$inout1
- pxor $twres,@tweak[5]
-
+ movdqu ($inp),$inout0
+ movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
@@ -1627,15 +1803,15 @@ $code.=<<___;
call _aesni_encrypt4
- xorps @tweak[0],$inout0
- movdqa @tweak[5],@tweak[0]
- xorps @tweak[1],$inout1
- xorps @tweak[2],$inout2
- movups $inout0,($out)
- xorps @tweak[3],$inout3
- movups $inout1,16*1($out)
- movups $inout2,16*2($out)
- movups $inout3,16*3($out)
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out)
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_enc_done
@@ -1670,19 +1846,20 @@ $code.=<<___;
.Lxts_enc_ret:
___
$code.=<<___ if ($win64);
- movaps 0x60(%rsp),%xmm6
- movaps 0x70(%rsp),%xmm7
- movaps 0x80(%rsp),%xmm8
- movaps 0x90(%rsp),%xmm9
- movaps 0xa0(%rsp),%xmm10
- movaps 0xb0(%rsp),%xmm11
- movaps 0xc0(%rsp),%xmm12
- movaps 0xd0(%rsp),%xmm13
- movaps 0xe0(%rsp),%xmm14
- movaps 0xf0(%rsp),%xmm15
+ movaps -0xa0(%rbp),%xmm6
+ movaps -0x90(%rbp),%xmm7
+ movaps -0x80(%rbp),%xmm8
+ movaps -0x70(%rbp),%xmm9
+ movaps -0x60(%rbp),%xmm10
+ movaps -0x50(%rbp),%xmm11
+ movaps -0x40(%rbp),%xmm12
+ movaps -0x30(%rbp),%xmm13
+ movaps -0x20(%rbp),%xmm14
+ movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
- lea $frame_size(%rsp),%rsp
+ lea (%rbp),%rsp
+ pop %rbp
.Lxts_enc_epilogue:
ret
.size aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -1693,22 +1870,26 @@ $code.=<<___;
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_xts_decrypt:
- lea -$frame_size(%rsp),%rsp
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,0x60(%rsp)
- movaps %xmm7,0x70(%rsp)
- movaps %xmm8,0x80(%rsp)
- movaps %xmm9,0x90(%rsp)
- movaps %xmm10,0xa0(%rsp)
- movaps %xmm11,0xb0(%rsp)
- movaps %xmm12,0xc0(%rsp)
- movaps %xmm13,0xd0(%rsp)
- movaps %xmm14,0xe0(%rsp)
- movaps %xmm15,0xf0(%rsp)
+ movaps %xmm6,-0xa8(%rax)
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
.Lxts_dec_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
@@ -1722,213 +1903,248 @@ $code.=<<___;
shl \$4,%rax
sub %rax,$len
+ $movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+ mov $rounds,$rnds_
+
movdqa .Lxts_magic(%rip),$twmask
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pshufd \$0x5f,@tweak[5],$twres
+ pxor $rndkey0,$rndkey1
___
for ($i=0;$i<4;$i++) {
$code.=<<___;
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- pand $twmask,$twres # isolate carry and residue
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- pxor $twres,@tweak[5]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
___
}
$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
sub \$16*6,$len
jc .Lxts_dec_short
shr \$1,$rounds
- sub \$1,$rounds
+ sub \$3,$rounds
+ $movkey 16($key_),$rndkey1
mov $rounds,$rnds_
+ lea .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.align 16
+.align 32
.Lxts_dec_grandloop:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
- pand $twmask,$twres # isolate carry and residue
+ movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
- pxor $twres,@tweak[5]
-
+ pxor @tweak[0],$inout0
movdqu `16*2`($inp),$inout2
- pxor @tweak[0],$inout0 # input^=tweak
- movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
- movdqu `16*4`($inp),$inout4
+ aesdec $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
+ aesdec $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
- $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
- pxor @tweak[5],$inout5
+ aesdec $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
- # inline _aesni_decrypt6 and interleave first and last rounds
- # with own code...
- $movkey 16($key_),$rndkey1
- pxor $rndkey0,$inout0
- pxor $rndkey0,$inout1
- movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
- aesdec $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
- movdqa @tweak[1],`16*1`(%rsp)
- aesdec $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqa @tweak[2],`16*2`(%rsp)
- aesdec $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqa @tweak[3],`16*3`(%rsp)
- aesdec $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
- movdqa @tweak[4],`16*4`(%rsp)
+ pxor $twres,@tweak[0]
aesdec $rndkey1,$inout4
- movdqa @tweak[5],`16*5`(%rsp)
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesdec $rndkey1,$inout5
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp
- jmp .Lxts_dec_loop6_enter
+ $movkey 48($key_),$rndkey1
-.align 16
+ aesdec $rndkey0,$inout0
+ pxor $twres,@tweak[2]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey0,$inout1
+ pxor $twres,@tweak[3]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey0,$inout2
+ pxor $twres,@tweak[4]
+ aesdec $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey0,$inout4
+ movdqa $twmask,`16*5`(%rsp)
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ lea 64($key_),$key
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_dec_loop6
+.align 32
.Lxts_dec_loop6:
aesdec $rndkey1,$inout0
aesdec $rndkey1,$inout1
- dec $rounds
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
-.Lxts_dec_loop6_enter:
$movkey 16($key),$rndkey1
+ lea 32($key),$key
+
aesdec $rndkey0,$inout0
aesdec $rndkey0,$inout1
- lea 32($key),$key
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
aesdec $rndkey0,$inout4
aesdec $rndkey0,$inout5
$movkey ($key),$rndkey0
+ dec $rounds
jnz .Lxts_dec_loop6
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa (%r8),$twmask
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesdec $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
aesdec $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
aesdec $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
+ pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
+ movaps @tweak[0],@tweak[1] # copy round[0]
aesdec $rndkey1,$inout5
$movkey 16($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[0]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesdec $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[0]
+ psrad \$31,$twtmp
aesdec $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesdec $rndkey0,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey0,$inout3
+ pxor $twtmp,@tweak[5]
aesdec $rndkey0,$inout4
+ movaps @tweak[1],@tweak[2]
aesdec $rndkey0,$inout5
$movkey 32($key),$rndkey0
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[1]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesdec $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
aesdec $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesdec $rndkey1,$inout2
- pxor $twres,@tweak[5]
+ movdqa @tweak[3],`16*3`(%rsp)
aesdec $rndkey1,$inout3
+ pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
aesdec $rndkey1,$inout5
+ $movkey 48($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[2]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- aesdeclast $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
- aesdeclast $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- aesdeclast $rndkey0,$inout2
- pxor $twres,@tweak[5]
- aesdeclast $rndkey0,$inout3
- aesdeclast $rndkey0,$inout4
- aesdeclast $rndkey0,$inout5
-
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[3]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- xorps `16*0`(%rsp),$inout0 # output^=tweak
- pand $twmask,$twres # isolate carry and residue
- xorps `16*1`(%rsp),$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesdec $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesdec $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pxor $twtmp,@tweak[5]
+ aesdec $rndkey0,$inout4
+ movaps @tweak[3],@tweak[4]
+ aesdec $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ aesdeclast `16*0`(%rsp),$inout0
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ aesdeclast `16*1`(%rsp),$inout1
+ aesdeclast `16*2`(%rsp),$inout2
pxor $twres,@tweak[5]
+ aesdeclast `16*3`(%rsp),$inout3
+ aesdeclast `16*4`(%rsp),$inout4
+ aesdeclast `16*5`(%rsp),$inout5
+ mov $rnds_,$rounds # restore $rounds
- xorps `16*2`(%rsp),$inout2
- movups $inout0,`16*0`($out) # write output
- xorps `16*3`(%rsp),$inout3
- movups $inout1,`16*1`($out)
- xorps `16*4`(%rsp),$inout4
- movups $inout2,`16*2`($out)
- xorps `16*5`(%rsp),$inout5
- movups $inout3,`16*3`($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
lea `16*6`($out),$out
+ movups $inout0,`-16*6`($out) # write output
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
sub \$16*6,$len
jnc .Lxts_dec_grandloop
- lea 3($rounds,$rounds),$rounds # restore original value
+ lea 7($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_dec_short:
+ pxor $rndkey0,@tweak[0]
+ pxor $rndkey0,@tweak[1]
add \$16*6,$len
jz .Lxts_dec_done
+ pxor $rndkey0,@tweak[2]
cmp \$0x20,$len
jb .Lxts_dec_one
+ pxor $rndkey0,@tweak[3]
je .Lxts_dec_two
+ pxor $rndkey0,@tweak[4]
cmp \$0x40,$len
jb .Lxts_dec_three
je .Lxts_dec_four
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movdqu ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movdqu 16*1($inp),$inout1
- pxor $twres,@tweak[5]
-
+ movdqu ($inp),$inout0
+ movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
@@ -2013,7 +2229,7 @@ $code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
- movdqa @tweak[5],@tweak[1]
+ movdqa @tweak[4],@tweak[1]
xorps @tweak[2],$inout2
movups $inout0,($out)
movups $inout1,16*1($out)
@@ -2023,14 +2239,8 @@ $code.=<<___;
.align 16
.Lxts_dec_four:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movups ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movups 16*1($inp),$inout1
- pxor $twres,@tweak[5]
-
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
@@ -2041,16 +2251,16 @@ $code.=<<___;
call _aesni_decrypt4
- xorps @tweak[0],$inout0
+ pxor @tweak[0],$inout0
movdqa @tweak[4],@tweak[0]
- xorps @tweak[1],$inout1
+ pxor @tweak[1],$inout1
movdqa @tweak[5],@tweak[1]
- xorps @tweak[2],$inout2
- movups $inout0,($out)
- xorps @tweak[3],$inout3
- movups $inout1,16*1($out)
- movups $inout2,16*2($out)
- movups $inout3,16*3($out)
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out)
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_dec_done
@@ -2096,19 +2306,20 @@ $code.=<<___;
.Lxts_dec_ret:
___
$code.=<<___ if ($win64);
- movaps 0x60(%rsp),%xmm6
- movaps 0x70(%rsp),%xmm7
- movaps 0x80(%rsp),%xmm8
- movaps 0x90(%rsp),%xmm9
- movaps 0xa0(%rsp),%xmm10
- movaps 0xb0(%rsp),%xmm11
- movaps 0xc0(%rsp),%xmm12
- movaps 0xd0(%rsp),%xmm13
- movaps 0xe0(%rsp),%xmm14
- movaps 0xf0(%rsp),%xmm15
+ movaps -0xa0(%rbp),%xmm6
+ movaps -0x90(%rbp),%xmm7
+ movaps -0x80(%rbp),%xmm8
+ movaps -0x70(%rbp),%xmm9
+ movaps -0x60(%rbp),%xmm10
+ movaps -0x50(%rbp),%xmm11
+ movaps -0x40(%rbp),%xmm12
+ movaps -0x30(%rbp),%xmm13
+ movaps -0x20(%rbp),%xmm14
+ movaps -0x10(%rbp),%xmm15
___
$code.=<<___;
- lea $frame_size(%rsp),%rsp
+ lea (%rbp),%rsp
+ pop %rbp
.Lxts_dec_epilogue:
ret
.size aesni_xts_decrypt,.-aesni_xts_decrypt
@@ -2120,7 +2331,10 @@ ___
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
{
-my $reserved = $win64?0x40:-0x18; # used in decrypt
+my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
+my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
+my $inp_=$key_;
+
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
.type ${PREFIX}_cbc_encrypt,\@function,6
@@ -2176,276 +2390,340 @@ $code.=<<___;
#--------------------------- CBC DECRYPT ------------------------------#
.align 16
.Lcbc_decrypt:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
$code.=<<___;
+ lea -8(%rax),%rbp
movups ($ivp),$iv
mov $rnds_,$rounds
- cmp \$0x70,$len
+ cmp \$0x50,$len
jbe .Lcbc_dec_tail
- shr \$1,$rnds_
+
+ $movkey ($key),$rndkey0
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_six_or_seven
+
sub \$0x70,$len
- mov $rnds_,$rounds
- movaps $iv,$reserved(%rsp)
+ lea 0x70($key),$key # size optimization
jmp .Lcbc_dec_loop8_enter
.align 16
.Lcbc_dec_loop8:
- movaps $rndkey0,$reserved(%rsp) # save IV
movups $inout7,($out)
lea 0x10($out),$out
.Lcbc_dec_loop8_enter:
- $movkey ($key),$rndkey0
- movups ($inp),$inout0 # load input
- movups 0x10($inp),$inout1
- $movkey 16($key),$rndkey1
+ movdqu 0x60($inp),$inout6
+ pxor $rndkey0,$inout0
+ movdqu 0x70($inp),$inout7
+ pxor $rndkey0,$inout1
+ $movkey 0x10-0x70($key),$rndkey1
+ pxor $rndkey0,$inout2
+ xor $inp_,$inp_
+ cmp \$0x70,$len # is there at least 0x60 bytes ahead?
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
- lea 32($key),$key
- movdqu 0x20($inp),$inout2
- xorps $rndkey0,$inout0
- movdqu 0x30($inp),$inout3
- xorps $rndkey0,$inout1
- movdqu 0x40($inp),$inout4
aesdec $rndkey1,$inout0
- pxor $rndkey0,$inout2
- movdqu 0x50($inp),$inout5
+ pxor $rndkey0,$inout7
+ $movkey 0x20-0x70($key),$rndkey0
aesdec $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqu 0x60($inp),$inout6
aesdec $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqu 0x70($inp),$inout7
aesdec $rndkey1,$inout3
- pxor $rndkey0,$inout5
- dec $rounds
aesdec $rndkey1,$inout4
- pxor $rndkey0,$inout6
aesdec $rndkey1,$inout5
- pxor $rndkey0,$inout7
- $movkey ($key),$rndkey0
+ setnc ${inp_}b
aesdec $rndkey1,$inout6
+ shl \$7,$inp_
aesdec $rndkey1,$inout7
- $movkey 16($key),$rndkey1
-
- call .Ldec_loop8_enter
+ add $inp,$inp_
+ $movkey 0x30-0x70($key),$rndkey1
+___
+for($i=1;$i<12;$i++) {
+my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
+$code.=<<___;
+ aesdec $rndkeyx,$inout0
+ aesdec $rndkeyx,$inout1
+ aesdec $rndkeyx,$inout2
+ aesdec $rndkeyx,$inout3
+ aesdec $rndkeyx,$inout4
+ aesdec $rndkeyx,$inout5
+ aesdec $rndkeyx,$inout6
+ aesdec $rndkeyx,$inout7
+ $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
+___
+$code.=<<___ if ($i==7);
+ cmp \$11,$rounds
+ jb .Lcbc_dec_done
+___
+$code.=<<___ if ($i==9);
+ je .Lcbc_dec_done
+___
+}
+$code.=<<___;
+.Lcbc_dec_done:
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$iv
+ aesdec $rndkey1,$inout1
+ pxor $rndkey0,$in0
+ aesdec $rndkey1,$inout2
+ pxor $rndkey0,$in1
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,$in2
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0,$in3
+ aesdec $rndkey1,$inout5
+ pxor $rndkey0,$in4
+ aesdec $rndkey1,$inout6
+ aesdec $rndkey1,$inout7
+ movdqu 0x50($inp),$rndkey1
+
+ aesdeclast $iv,$inout0
+ movdqu 0x60($inp),$iv # borrow $iv
+ pxor $rndkey0,$rndkey1
+ aesdeclast $in0,$inout1
+ pxor $rndkey0,$iv
+ movdqu 0x70($inp),$rndkey0 # next IV
+ lea 0x80($inp),$inp
+ aesdeclast $in1,$inout2
+ movdqu 0x00($inp_),$in0
+ aesdeclast $in2,$inout3
+ movdqu 0x10($inp_),$in1
+ aesdeclast $in3,$inout4
+ movdqu 0x20($inp_),$in2
+ aesdeclast $in4,$inout5
+ movdqu 0x30($inp_),$in3
+ aesdeclast $rndkey1,$inout6
+ movdqu 0x40($inp_),$in4
+ aesdeclast $iv,$inout7
+ movdqa $rndkey0,$iv # return $iv
+ movdqu 0x50($inp_),$rndkey1
+ $movkey -0x70($key),$rndkey0
+
+ movups $inout0,($out) # store output
+ movdqa $in0,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in1,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in2,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in3,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in4,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey1,$inout5
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
- movups ($inp),$rndkey1 # re-load input
- movups 0x10($inp),$rndkey0
- xorps $reserved(%rsp),$inout0 # ^= IV
- xorps $rndkey1,$inout1
- movups 0x20($inp),$rndkey1
- xorps $rndkey0,$inout2
- movups 0x30($inp),$rndkey0
- xorps $rndkey1,$inout3
- movups 0x40($inp),$rndkey1
- xorps $rndkey0,$inout4
- movups 0x50($inp),$rndkey0
- xorps $rndkey1,$inout5
- movups 0x60($inp),$rndkey1
- xorps $rndkey0,$inout6
- movups 0x70($inp),$rndkey0 # IV
- xorps $rndkey1,$inout7
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,0x40($out)
- mov $key_,$key # restore $key
- movups $inout5,0x50($out)
- lea 0x80($inp),$inp
- movups $inout6,0x60($out)
- lea 0x70($out),$out
sub \$0x80,$len
ja .Lcbc_dec_loop8
movaps $inout7,$inout0
- movaps $rndkey0,$iv
+ lea -0x70($key),$key
add \$0x70,$len
jle .Lcbc_dec_tail_collected
- movups $inout0,($out)
- lea 1($rnds_,$rnds_),$rounds
+ movups $inout7,($out)
lea 0x10($out),$out
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ movaps $in0,$inout0
+.Lcbc_dec_six_or_seven:
+ cmp \$0x60,$len
+ ja .Lcbc_dec_seven
+
+ movaps $inout5,$inout6
+ call _aesni_decrypt6
+ pxor $iv,$inout0 # ^= IV
+ movaps $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ lea 0x50($out),$out
+ movdqa $inout5,$inout0
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 0x60($inp),$inout6
+ xorps $inout7,$inout7
+ call _aesni_decrypt8
+ movups 0x50($inp),$inout7
+ pxor $iv,$inout0 # ^= IV
+ movups 0x60($inp),$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout7,$inout6
+ movdqu $inout5,0x50($out)
+ lea 0x60($out),$out
+ movdqa $inout6,$inout0
+ jmp .Lcbc_dec_tail_collected
+
.Lcbc_dec_tail:
movups ($inp),$inout0
- movaps $inout0,$in0
- cmp \$0x10,$len
+ sub \$0x10,$len
jbe .Lcbc_dec_one
movups 0x10($inp),$inout1
- movaps $inout1,$in1
- cmp \$0x20,$len
+ movaps $inout0,$in0
+ sub \$0x10,$len
jbe .Lcbc_dec_two
movups 0x20($inp),$inout2
- movaps $inout2,$in2
- cmp \$0x30,$len
+ movaps $inout1,$in1
+ sub \$0x10,$len
jbe .Lcbc_dec_three
movups 0x30($inp),$inout3
- cmp \$0x40,$len
+ movaps $inout2,$in2
+ sub \$0x10,$len
jbe .Lcbc_dec_four
movups 0x40($inp),$inout4
- cmp \$0x50,$len
- jbe .Lcbc_dec_five
-
- movups 0x50($inp),$inout5
- cmp \$0x60,$len
- jbe .Lcbc_dec_six
-
- movups 0x60($inp),$inout6
- movaps $iv,$reserved(%rsp) # save IV
- call _aesni_decrypt8
- movups ($inp),$rndkey1
- movups 0x10($inp),$rndkey0
- xorps $reserved(%rsp),$inout0 # ^= IV
- xorps $rndkey1,$inout1
- movups 0x20($inp),$rndkey1
- xorps $rndkey0,$inout2
- movups 0x30($inp),$rndkey0
- xorps $rndkey1,$inout3
- movups 0x40($inp),$rndkey1
- xorps $rndkey0,$inout4
- movups 0x50($inp),$rndkey0
- xorps $rndkey1,$inout5
- movups 0x60($inp),$iv # IV
- xorps $rndkey0,$inout6
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- movups $inout4,0x40($out)
- movups $inout5,0x50($out)
- lea 0x60($out),$out
- movaps $inout6,$inout0
- sub \$0x70,$len
+ movaps $inout3,$in3
+ movaps $inout4,$in4
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ pxor $iv,$inout0
+ movaps $in4,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ lea 0x40($out),$out
+ movdqa $inout4,$inout0
+ sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_one:
+ movaps $inout0,$in0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps $iv,$inout0
movaps $in0,$iv
- sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
+ movaps $inout1,$in1
xorps $inout2,$inout2
call _aesni_decrypt3
- xorps $iv,$inout0
- xorps $in0,$inout1
- movups $inout0,($out)
+ pxor $iv,$inout0
movaps $in1,$iv
- movaps $inout1,$inout0
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ movdqa $inout1,$inout0
lea 0x10($out),$out
- sub \$0x20,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
+ movaps $inout2,$in2
call _aesni_decrypt3
- xorps $iv,$inout0
- xorps $in0,$inout1
- movups $inout0,($out)
- xorps $in1,$inout2
- movups $inout1,0x10($out)
+ pxor $iv,$inout0
movaps $in2,$iv
- movaps $inout2,$inout0
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ movdqa $inout2,$inout0
lea 0x20($out),$out
- sub \$0x30,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
+ movaps $inout3,$in3
call _aesni_decrypt4
- xorps $iv,$inout0
- movups 0x30($inp),$iv
- xorps $in0,$inout1
- movups $inout0,($out)
- xorps $in1,$inout2
- movups $inout1,0x10($out)
- xorps $in2,$inout3
- movups $inout2,0x20($out)
- movaps $inout3,$inout0
+ pxor $iv,$inout0
+ movaps $in3,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ movdqa $inout3,$inout0
lea 0x30($out),$out
- sub \$0x40,$len
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_five:
- xorps $inout5,$inout5
- call _aesni_decrypt6
- movups 0x10($inp),$rndkey1
- movups 0x20($inp),$rndkey0
- xorps $iv,$inout0
- xorps $in0,$inout1
- xorps $rndkey1,$inout2
- movups 0x30($inp),$rndkey1
- xorps $rndkey0,$inout3
- movups 0x40($inp),$iv
- xorps $rndkey1,$inout4
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- lea 0x40($out),$out
- movaps $inout4,$inout0
- sub \$0x50,$len
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_six:
- call _aesni_decrypt6
- movups 0x10($inp),$rndkey1
- movups 0x20($inp),$rndkey0
- xorps $iv,$inout0
- xorps $in0,$inout1
- xorps $rndkey1,$inout2
- movups 0x30($inp),$rndkey1
- xorps $rndkey0,$inout3
- movups 0x40($inp),$rndkey0
- xorps $rndkey1,$inout4
- movups 0x50($inp),$iv
- xorps $rndkey0,$inout5
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- movups $inout4,0x40($out)
- lea 0x50($out),$out
- movaps $inout5,$inout0
- sub \$0x60,$len
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_tail_collected:
- and \$15,$len
movups $iv,($ivp)
+ and \$15,$len
jnz .Lcbc_dec_tail_partial
movups $inout0,($out)
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
- movaps $inout0,$reserved(%rsp)
+ movaps $inout0,(%rsp)
mov \$16,%rcx
mov $out,%rdi
sub $len,%rcx
- lea $reserved(%rsp),%rsi
+ lea (%rsp),%rsi
.long 0x9066A4F3 # rep movsb
.Lcbc_dec_ret:
___
$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- lea 0x58(%rsp),%rsp
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
___
$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
.Lcbc_ret:
ret
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
@@ -2712,6 +2990,8 @@ $code.=<<___;
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
@@ -2789,45 +3069,9 @@ ccm64_se_handler:
jmp .Lcommon_seh_tail
.size ccm64_se_handler,.-ccm64_se_handler
-.type ctr32_se_handler,\@abi-omnipotent
+.type ctr_xts_se_handler,\@abi-omnipotent
.align 16
-ctr32_se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- lea .Lctr32_body(%rip),%r10
- cmp %r10,%rbx # context->Rip<"prologue" label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- lea .Lctr32_ret(%rip),%r10
- cmp %r10,%rbx
- jae .Lcommon_seh_tail
-
- lea 0x20(%rax),%rsi # %xmm save area
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
- .long 0xa548f3fc # cld; rep movsq
- lea 0xc8(%rax),%rax # adjust stack pointer
-
- jmp .Lcommon_seh_tail
-.size ctr32_se_handler,.-ctr32_se_handler
-
-.type xts_se_handler,\@abi-omnipotent
-.align 16
-xts_se_handler:
+ctr_xts_se_handler:
push %rsi
push %rdi
push %rbx
@@ -2857,14 +3101,14 @@ xts_se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- lea 0x60(%rax),%rsi # %xmm save area
+ mov 160($context),%rax # pull context->Rbp
+ lea -0xa0(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x68+160(%rax),%rax # adjust stack pointer
- jmp .Lcommon_seh_tail
-.size xts_se_handler,.-xts_se_handler
+ jmp .Lcommon_rbp_tail
+.size ctr_xts_se_handler,.-ctr_xts_se_handler
___
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
@@ -2896,11 +3140,16 @@ cbc_se_handler:
cmp %r10,%rbx # context->Rip>="epilogue" label
jae .Lcommon_seh_tail
- lea 0(%rax),%rsi # top of stack
+ lea 16(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
- mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x58(%rax),%rax # adjust stack pointer
+
+.Lcommon_rbp_tail:
+ mov 160($context),%rax # pull context->Rbp
+ mov (%rax),%rbp # restore saved %rbp
+ lea 8(%rax),%rax # adjust stack pointer
+ mov %rbp,160($context) # restore context->Rbp
jmp .Lcommon_seh_tail
.Lrestore_cbc_rax:
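The .Lcommon_rbp_tail path shared by ctr_xts_se_handler and cbc_se_handler only makes sense against the new %rbp-framed prologue/epilogue used by the CTR, XTS and CBC paths. A rough sketch of that frame, with offsets read off these hunks (an illustration, not a specification):

    [%rbp]                     caller's %rbp; the unwinder restores it and resumes with RSP = %rbp+8
    [%rbp-0xa0 .. %rbp-0x01]   %xmm6..%xmm15 save area, 10 x 16 bytes (Win64 builds only)

The handler therefore pulls context->Rbp, copies the ten saved %xmm registers starting at %rbp-0xa0, reloads the caller's %rbp from (%rbp) and bumps the reported stack pointer by 8, mirroring the "lea (%rbp),%rsp; pop %rbp" epilogue seen earlier.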
@@ -3003,14 +3252,15 @@ $code.=<<___ if ($PREFIX eq "aesni");
.rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
.LSEH_info_ctr32:
.byte 9,0,0,0
- .rva ctr32_se_handler
+ .rva ctr_xts_se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
.LSEH_info_xts_enc:
.byte 9,0,0,0
- .rva xts_se_handler
+ .rva ctr_xts_se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
.byte 9,0,0,0
- .rva xts_se_handler
+ .rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
$code.=<<___;
@@ -3057,6 +3307,19 @@ sub aesni {
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
}
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
return $line;
}
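For illustration, a stand-alone sketch of what the new (%rsp)-operand branch produces; it assumes @opcode has already been seeded with the mandatory 0x66 prefix earlier in sub aesni (that line lies outside this hunk), and uses a hypothetical input instruction:

    #!/usr/bin/env perl
    # encode "aesenc 0x10(%rsp),%xmm2" the way the branch above would
    my ($mnemonic,$off,$reg) = ("aesenc","0x10",2);
    my %opcodelet = ("aesenc"=>0xdc,"aesenclast"=>0xdd,"aesdec"=>0xde,"aesdeclast"=>0xdf);
    my @opcode = (0x66);                           # mandatory prefix, set before this branch runs
    push @opcode,0x44 if ($reg>=8);                # REX.R for %xmm8..%xmm15
    push @opcode,0x0f,0x38,$opcodelet{$mnemonic};  # escape bytes + AES-NI opcode
    push @opcode,0x44|(($reg&7)<<3),0x24;          # ModR/M (disp8+SIB) and SIB with %rsp base
    push @opcode,($off=~/^0/?oct($off):$off)&0xff; # 8-bit displacement
    print ".byte\t",join(',',@opcode),"\n";        # .byte 102,15,56,220,84,36,16

Those bytes are 66 0F 38 DC 54 24 10, i.e. aesenc 16(%rsp),%xmm2, so the module still assembles with toolchains that predate AES-NI.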
diff --git a/devel/perlasm/cbc.pl b/devel/perlasm/cbc.pl
index 6fc2510905..24561e759a 100644
--- a/devel/perlasm/cbc.pl
+++ b/devel/perlasm/cbc.pl
@@ -150,7 +150,7 @@ sub cbc
&set_label("PIC_point");
&blindpop("edx");
&lea("ecx",&DWP(&label("cbc_enc_jmp_table")."-".&label("PIC_point"),"edx"));
- &mov($count,&DWP(0,"ecx",$count,4))
+ &mov($count,&DWP(0,"ecx",$count,4));
&add($count,"edx");
&xor("ecx","ecx");
&xor("edx","edx");
diff --git a/devel/perlasm/e_padlock-x86.pl b/devel/perlasm/e_padlock-x86.pl
index 71ecad3bbd..4148468c41 100644
--- a/devel/perlasm/e_padlock-x86.pl
+++ b/devel/perlasm/e_padlock-x86.pl
@@ -400,9 +400,9 @@ my ($mode,$opcode) = @_;
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
-#&generate_mode("cfb",0xe0);
-#&generate_mode("ofb",0xe8);
-#&generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode,
+&generate_mode("cfb",0xe0);
+&generate_mode("ofb",0xe8);
+&generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode,
# because hardware CTR was introduced later
# and even has errata on certain C7 stepping.
# own implementation *always* works, though
diff --git a/devel/perlasm/e_padlock-x86_64.pl b/devel/perlasm/e_padlock-x86_64.pl
index 4d71d06f02..f8ba1e909f 100644
--- a/devel/perlasm/e_padlock-x86_64.pl
+++ b/devel/perlasm/e_padlock-x86_64.pl
@@ -23,7 +23,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$code=".text\n";
@@ -547,9 +548,9 @@ ___
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
-#&generate_mode("cfb",0xe0);
-#&generate_mode("ofb",0xe8);
-#&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
+&generate_mode("cfb",0xe0);
+&generate_mode("ofb",0xe8);
+&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
$code.=<<___;
.asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
diff --git a/devel/perlasm/ghash-x86.pl b/devel/perlasm/ghash-x86.pl
index 2a1819cb51..e6b9663c13 100644
--- a/devel/perlasm/ghash-x86.pl
+++ b/devel/perlasm/ghash-x86.pl
@@ -26,6 +26,8 @@
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
+# Atom 105/105 16.8 53
+# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
@@ -113,6 +115,16 @@
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
+# February 2013
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9. Resulting performance is 1.96 cycles per byte on
+# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
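The "aggregate Karatsuba post-processing" referred to above is the usual carry-less Karatsuba split. Writing X = Xhi*2^64 ^ Xlo and H = Hhi*2^64 ^ Hlo, with all arithmetic in GF(2)[x], a 128x128-bit product costs three pclmulqdq operations:

    X*H = Xhi*Hhi*2^128
        ^ [ (Xhi^Xlo)*(Hhi^Hlo) ^ Xhi*Hhi ^ Xlo*Hlo ]*2^64
        ^ Xlo*Hlo

Pre-computing Hhi^Hlo (the "salt" that gcm_init_clmul now stores alongside H and H^2) removes a pshufd/pxor pair from the inner loop, and with two blocks aggregated the post-processing of the middle term (the extra xors and the 64-bit realignment) is done once per iteration rather than once per multiplication, which is what the reworked mod_loop below does.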
@@ -635,7 +647,7 @@ sub mmx_loop() {
{ my @lo = ("mm0","mm1","mm2");
my @hi = ("mm3","mm4","mm5");
my @tmp = ("mm6","mm7");
- my $off1=0,$off2=0,$i;
+ my ($off1,$off2,$i) = (0,0,);
&add ($Htbl,128); # optimize for size
&lea ("edi",&DWP(16+128,"esp"));
@@ -822,17 +834,18 @@ $len="ebx";
&static_label("bswap");
sub clmul64x64_T2 { # minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
- &pshufd ($T2,$Hkey,0b01001110);
+ &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); #
- &pxor ($T2,$Hkey);
+ &pxor ($T2,$Hkey) if (!defined($HK));
+ $HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
- &pclmulqdq ($T1,$T2,0x00); #######
+ &pclmulqdq ($T1,$HK,0x00); #######
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); #
@@ -879,31 +892,32 @@ if (1) { # Algorithm 9 with <<1 twist.
# below. Algorithm 9 was therefore chosen for
# further optimization...
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
- &movdqa ($T1,$Xi) #
+ &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pxor ($Xhi,$T2); #
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
+ &psrlq ($Xi,1);
+ &pxor ($Xhi,$T2); #
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
+ &pxor ($Xi,$Xhi) #
}
&function_begin_B("gcm_init_clmul");
@@ -937,8 +951,14 @@ my ($Xhi,$Xi) = @_;
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
+ &pshufd ($T1,$Hkey,0b01001110);
+ &pshufd ($T2,$Xi,0b01001110);
+ &pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
+ &pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
+ &palignr ($T2,$T1,8); # low part is H.lo^H.hi
+ &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("gcm_init_clmul");
@@ -956,8 +976,9 @@ my ($Xhi,$Xi) = @_;
&movdqa ($T3,&QWP(0,$const));
&movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
+ &movups ($T2,&QWP(32,$Htbl));
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
&reduction_alg9 ($Xhi,$Xi);
&pshufb ($Xi,$T3);
@@ -994,79 +1015,107 @@ my ($Xhi,$Xi) = @_;
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
+ &movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
+ &pshufd ($T1,$Xn,0b01001110); # H*Ii+1
+ &movdqa ($Xhn,$Xn);
+ &pxor ($T1,$Xn); #
+
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &pclmulqdq ($T1,$T3,0x00); #######
&lea ($inp,&DWP(32,$inp)); # i+=2
&sub ($len,0x20);
&jbe (&label("even_tail"));
+ &jmp (&label("mod_loop"));
-&set_label("mod_loop");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
- &movdqu ($T1,&QWP(0,$inp)); # Ii
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
+&set_label("mod_loop",32);
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movdqa ($T3,&QWP(0,$const));
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
- &pshufb ($T1,$T3);
- &pshufb ($Xn,$T3);
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &xorps ($Xhi,$Xhn);
+ &movdqu ($Xhn,&QWP(0,$inp)); # Ii
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
+ &pxor ($T1,$Xhi); #
- &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
- &movdqa ($Xhn,$Xn);
- &pxor ($Xhi,$T1); # "Ii+Xi", consume early
+ &pxor ($T2,$T1); #
+ &pshufb ($Xhn,$T3);
- &movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
+ &pshufb ($Xn,$T3);
+ &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
+
+ &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+ &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
+ &movdqa ($T1,$Xi);
+ &psllq ($Xi,5);
+ &pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &psllq ($Xi,5); #
- &pxor ($Xi,$T1); #
+ &movups ($T3,&QWP(32,$Htbl));
&pclmulqdq ($Xn,$Hkey,0x00); #######
&psllq ($Xi,57); #
- &movdqa ($T2,$Xi); #
+ &movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T2,8); #
- &pxor ($Xi,$T1);
- &pshufd ($T1,$T3,0b01001110);
- &pxor ($Xhi,$T2); #
- &pxor ($T1,$T3);
- &pshufd ($T3,$Hkey,0b01001110);
- &pxor ($T3,$Hkey); #
-
- &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &psrldq ($T1,8); #
+ &pxor ($Xi,$T2);
+ &pxor ($Xhi,$T1); #
+ &pshufd ($T1,$Xhn,0b01001110);
&movdqa ($T2,$Xi); # 2nd phase
+ &psrlq ($Xi,1);
+ &pxor ($T1,$Xhn);
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &pxor ($Xhi,$T2); #
+ &pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
- &pxor ($T2,$Xhi);
- &psrlq ($Xi,1); #
- &pxor ($Xi,$T2); #
-
+ &pxor ($Xi,$Xhi) #
&pclmulqdq ($T1,$T3,0x00); #######
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
- &xorps ($T1,$Xn); #
- &xorps ($T1,$Xhn); #
-
- &movdqa ($T3,$T1); #
- &psrldq ($T1,8);
- &pslldq ($T3,8); #
- &pxor ($Xhn,$T1);
- &pxor ($Xn,$T3); #
- &movdqa ($T3,&QWP(0,$const));
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
&set_label("even_tail");
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
+ &movdqa ($Xhi,$Xi);
+ &pxor ($T2,$Xi); #
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
- &pxor ($Xhi,$Xhn);
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
+ &pclmulqdq ($T2,$T3,0x10); #######
+ &movdqa ($T3,&QWP(0,$const));
+
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &xorps ($Xhi,$Xhn);
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
+ &pxor ($T1,$Xhi); #
+
+ &pxor ($T2,$T1); #
+
+ &movdqa ($T1,$T2); #
+ &psrldq ($T2,8);
+ &pslldq ($T1,8); #
+ &pxor ($Xhi,$T2);
+ &pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
diff --git a/devel/perlasm/ghash-x86_64.pl b/devel/perlasm/ghash-x86_64.pl
index a5ae180882..7904248070 100644
--- a/devel/perlasm/ghash-x86_64.pl
+++ b/devel/perlasm/ghash-x86_64.pl
@@ -22,6 +22,8 @@
# P4 28.6 14.0 +100%
# Opteron 19.3 7.7 +150%
# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
#
# (*) comparison is not completely fair, because C results are
# for vanilla "256B" implementation, while assembler results
@@ -39,6 +41,41 @@
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter: ghash-x86.pl discusses why it makes less sense to
+# increase the aggregate factor there. Then why increase it here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes less sense to
+# aggregate more multiplications than it takes to perform the remaining
+# non-multiplication operations. 2x is a near-optimal coefficient for
+# contemporary Intel CPUs (hence the modest improvement coefficient),
+# but not for Bulldozer. The latter is because logical SIMD operations
+# are twice as slow as on Intel, so the critical path is longer. A CPU
+# with a higher pclmulqdq issue rate would also benefit from a higher
+# aggregate factor...
+#
+# Westmere 1.76(+14%)
+# Sandy Bridge 1.79(+9%)
+# Ivy Bridge 1.79(+8%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Bulldozer 1.52(+25%)
+
+# March 2013
+#
+# ... the 8x aggregate factor AVX code path uses the reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to the above-mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it runs at 0.41 cycles per byte on a Haswell processor.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
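The aggregation discussed above is the standard rewrite of the GHASH recurrence: with a 4x factor, four input blocks are folded with four powers of H per iteration, and the 8x AVX path introduced further down does the same with H^8..H. In the notation used elsewhere in this file (with + denoting xor):

    Xi+4 = [ H^4*(Ii+Xi) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3 ] mod P,   P(x) = x^128+x^7+x^2+x+1

Only one reduction mod P is performed per iteration, while the multiplications are mutually independent, which is what allows the pclmulqdq latency to be hidden as long as the issue rate permits.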
@@ -50,7 +87,25 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$do4xaggr=1;
# common register layout
$nlo="%rax";
@@ -351,19 +406,27 @@ ___
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
sub clmul64x64_T2 { # minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
-$code.=<<___ if (!defined($modulo));
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey,$T2
pxor $Xi,$T1 #
pxor $Hkey,$T2
___
+} else {
+$code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1 #
+___
+}
$code.=<<___;
pclmulqdq \$0x00,$Hkey,$Xi #######
pclmulqdq \$0x11,$Hkey,$Xhi #######
- pclmulqdq \$0x00,$T2,$T1 #######
+ pclmulqdq \$0x00,$HK,$T1 #######
pxor $Xi,$T1 #
pxor $Xhi,$T1 #
@@ -375,42 +438,53 @@ $code.=<<___;
___
}
-sub reduction_alg9 { # 17/13 times faster than Intel version
+sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
$code.=<<___;
# 1st phase
- movdqa $Xi,$T1 #
+ movdqa $Xi,$T2 #
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
# 2nd phase
movdqa $Xi,$T2
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
+ pxor $Xhi,$Xi #
___
}
{ my ($Htbl,$Xip)=@_4args;
+ my $HK="%xmm6";
$code.=<<___;
.globl gcm_init_clmul
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
+.L_init_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
movdqu ($Xip),$Hkey
pshufd \$0b01001110,$Hkey,$Hkey # dword swap
@@ -429,13 +503,47 @@ gcm_init_clmul:
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
# calculate H^2
+ pshufd \$0b01001110,$Hkey,$HK
movdqa $Hkey,$Xi
+ pxor $Hkey,$HK
+___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ pshufd \$0b01001110,$Hkey,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $Hkey,$T1 # Karatsuba pre-processing
+ movdqu $Hkey,0x00($Htbl) # save H
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x10($Htbl) # save H^2
+ palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
+ movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
+___
+if ($do4xaggr) {
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
+ &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+ movdqa $Xi,$T3
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
- movdqu $Hkey,($Htbl) # save H
- movdqu $Xi,16($Htbl) # save H^2
+ pshufd \$0b01001110,$T3,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $T3,$T1 # Karatsuba pre-processing
+ movdqu $T3,0x30($Htbl) # save H^3
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x40($Htbl) # save H^4
+ palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
+___
+}
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_clmul:
+___
+$code.=<<___;
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
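For reference, the table gcm_init_clmul now leaves behind, as read off the movdqu offsets above (the last three slots are only written when $do4xaggr is set):

    Htbl+0x00   H
    Htbl+0x10   H^2
    Htbl+0x20   Karatsuba "salt": low half H.lo^H.hi, high half H^2.lo^H^2.hi
    Htbl+0x30   H^3
    Htbl+0x40   H^4
    Htbl+0x50   Karatsuba "salt" for H^3 (low half) and H^4 (high half)

gcm_gmult_clmul and gcm_ghash_clmul below read these slots back (0x20 and 0x50 as the pre-folded Karatsuba halves); the AVX path uses its own, larger table built by gcm_init_avx.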
@@ -448,13 +556,38 @@ $code.=<<___;
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
+.L_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$T2
pshufb $T3,$Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
- &reduction_alg9 ($Xhi,$Xi);
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
+$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+ # experimental alternative. the special thing about it is that there is
+ # no dependency between the two multiplications...
+ mov \$`0xE1<<1`,%eax
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0x07,%r11d
+ movq %rax,$T1
+ movq %r10,$T2
+ movq %r11,$T3 # borrow $T3
+ pand $Xi,$T3
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
+ movq %rax,$T3
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pxor $Xi,$T2
+ pslldq \$15,$T2
+ paddd $T2,$T2 # <<(64+56+1)
+ pxor $T2,$Xi
+ pclmulqdq \$0x01,$T3,$Xi
+ movdqa .Lbswap_mask(%rip),$T3 # reload $T3
+ psrldq \$1,$T1
+ pxor $T1,$Xhi
+ pslldq \$7,$Xi
+ pxor $Xhi,$Xi
+___
$code.=<<___;
pshufb $T3,$Xi
movdqu $Xi,($Xip)
@@ -464,129 +597,318 @@ ___
}
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my $Xn="%xmm6";
- my $Xhn="%xmm7";
- my $Hkey2="%xmm8";
- my $T1n="%xmm9";
- my $T2n="%xmm10";
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
$code.=<<___;
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,\@abi-omnipotent
-.align 16
+.align 32
gcm_ghash_clmul:
+.L_ghash_clmul:
___
$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
# I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
- .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
- .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
- .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$HK
pshufb $T3,$Xi
sub \$0x10,$len
jz .Lodd_tail
- movdqu 16($Htbl),$Hkey2
+ movdqu 0x10($Htbl),$Hkey2
+___
+if ($do4xaggr) {
+my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+$code.=<<___;
+ cmp \$0x30,$len
+ jb .Lskip4x
+
+ sub \$0x30,$len
+ movdqu 0x30($Htbl),$Hkey3
+ movdqu 0x40($Htbl),$Hkey4
+
+ #######
+ # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+ #
+ movdqu 0x30($inp),$Xln
+ movdqu 0x20($inp),$Xl
+ pshufb $T3,$Xln
+ pshufb $T3,$Xl
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
+
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey2,$Xl
+ pclmulqdq \$0x11,$Hkey2,$Xh
+ xorps $Xl,$Xln
+ pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ xorps $Xm,$Xmn
+
+ movdqu 0x10($inp),$Xl
+ movdqu 0($inp),$T1
+ pshufb $T3,$Xl
+ pshufb $T3,$T1
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T1,$Xi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ movdqu 0x30($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ movdqu 0x20($inp),$Xln
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xhn,$Xhi
+ pxor $Xl,$Xm
+ pshufb $T3,$Xln
+ movups 0x20($Htbl),$HK
+ pclmulqdq \$0x00,$Hkey,$Xl
+ xorps $Xmn,$T1
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pxor $Xln,$Xmn
+ pxor $Xhi,$T1 #
+ movdqa $T1,$T2 #
+ pslldq \$8,$T1
+ pclmulqdq \$0x11,$Hkey,$Xh
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ movdqa .L7_mask(%rip),$T1
+ pxor $T2,$Xhi #
+ movq %rax,$T2
+
+ pand $Xi,$T1 # 1st phase
+ pshufb $T1,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
+ pxor $Xi,$T2 #
+ psllq \$57,$T2 #
+ movdqa $T2,$T1 #
+ pslldq \$8,$T2
+ pclmulqdq \$0x00,$Hkey2,$Xln
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ movdqu 0($inp),$T1
+
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhn
+ xorps $Xl,$Xln
+ movdqu 0x10($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x10,$HK,$Xmn
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ pshufb $T3,$T1
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+
+ movdqa $Xl,$Xh
+ pxor $Xm,$Xmn
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ pxor $T2,$Xi #
+ pxor $T1,$Xhi
+ psrlq \$1,$Xi #
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pxor $Xhi,$Xi #
+
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jnc .Lmod4_loop
+
+.Ltail4x:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xhn,$Xhi
+ pxor $Xi,$Xhi # aggregated Karatsuba post-processing
+ pxor $Xmn,$T1
+
+ pxor $Xhi,$T1 #
+ pxor $Xi,$Xhi
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+___
+ &reduction_alg9($Xhi,$Xi);
+$code.=<<___;
+ add \$0x40,$len
+ jz .Ldone
+ movdqu 0x20($Htbl),$HK
+ sub \$0x10,$len
+ jz .Lodd_tail
+.Lskip4x:
+___
+}
+$code.=<<___;
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
movdqu ($inp),$T1 # Ii
- movdqu 16($inp),$Xn # Ii+1
+ movdqu 16($inp),$Xln # Ii+1
pshufb $T3,$T1
- pshufb $T3,$Xn
+ pshufb $T3,$Xln
pxor $T1,$Xi # Ii+Xi
-___
- &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
-$code.=<<___;
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
+
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$T1
+ pxor $Xln,$T1
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$T1
lea 32($inp),$inp # i+=2
sub \$0x20,$len
jbe .Leven_tail
+ jmp .Lmod_loop
+.align 32
.Lmod_loop:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- movdqu ($inp),$T1 # Ii
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T2 #
+ pxor $Xi,$T2 #
- movdqu 16($inp),$Xn # Ii+1
- pshufb $T3,$T1
- pshufb $T3,$Xn
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$T2
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ pxor $Xhn,$Xhi
+ movdqu ($inp),$Xhn # Ii
+ pshufb $T3,$Xhn
+ movdqu 16($inp),$Xln # Ii+1
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pxor $Xhi,$T1
+ pxor $Xhn,$Xhi # "Ii+Xi", consume early
+ pxor $T1,$T2
+ pshufb $T3,$Xln
+ movdqa $T2,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
- movdqa $Xn,$Xhn #
- pshufd \$0b01001110,$Xn,$T1n
- pshufd \$0b01001110,$Hkey,$T2n
- pxor $Xn,$T1n #
- pxor $Hkey,$T2n
- pxor $T1,$Xhi # "Ii+Xi", consume early
+ movdqa $Xln,$Xhn #
- movdqa $Xi,$T1 # 1st phase
+ movdqa $Xi,$T2 # 1st phase
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pclmulqdq \$0x00,$Hkey,$Xln #######
+ pxor $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
- psllq \$5,$Xi #
- pxor $T1,$Xi #
- pclmulqdq \$0x00,$Hkey,$Xn #######
psllq \$57,$Xi #
- movdqa $Xi,$T2 #
+ movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- pxor $T2,$Xhi #
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ pshufd \$0b01001110,$Xhn,$T1
+ pxor $Xhn,$T1 #
pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
- pxor $T2,$Xi #
- pxor $Xhi,$T2
- psrlq \$1,$Xi #
- pxor $T2,$Xi #
-
- pclmulqdq \$0x00,$T2n,$T1n #######
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey2,$T2
- pxor $Xi,$T1 #
- pxor $Hkey2,$T2
-
- pxor $Xn,$T1n #
- pxor $Xhn,$T1n #
- movdqa $T1n,$T2n #
- psrldq \$8,$T1n
- pslldq \$8,$T2n #
- pxor $T1n,$Xhn
- pxor $T2n,$Xn #
+ pclmulqdq \$0x00,$HK,$T1 #######
+ pxor $Xhi,$Xi #
lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
-___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
-$code.=<<___;
- pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T2 #
+ pxor $Xi,$T2 #
+
+ pclmulqdq \$0x00,$Hkey2,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhi
+ pclmulqdq \$0x10,$HK,$T2
+
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
+ pxor $Xi,$T1
+ pxor $Xhi,$T1
+ pxor $T1,$T2
+ movdqa $T2,$T1 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
@@ -598,7 +920,7 @@ $code.=<<___;
pshufb $T3,$T1
pxor $T1,$Xi # Ii+Xi
___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
@@ -611,21 +933,607 @@ $code.=<<___ if ($win64);
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
- add \$0x58,%rsp
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
ret
-.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
+
+$code.=<<___;
+.globl gcm_init_avx
+.type gcm_init_avx,\@abi-omnipotent
+.align 32
+gcm_init_avx:
+___
+if ($avx) {
+my ($Htbl,$Xip)=@_4args;
+my $HK="%xmm6";
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_init_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Hkey
+ vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ vpsrlq \$63,$Hkey,$T1
+ vpsllq \$1,$Hkey,$Hkey
+ vpxor $T3,$T3,$T3 #
+ vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
+ vpslldq \$8,$T1,$T1
+ vpor $T1,$Hkey,$Hkey # H<<=1
+
+ # magic reduction
+ vpand .L0x1c2_polynomial(%rip),$T3,$T3
+ vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ vpunpckhqdq $Hkey,$Hkey,$HK
+ vmovdqa $Hkey,$Xi
+ vpxor $Hkey,$HK,$HK
+ mov \$4,%r10 # up to H^8
+ jmp .Linit_start_avx
+___
+
+sub clmul64x64_avx {
+my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+if (!defined($HK)) { $HK = $T2;
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpunpckhqdq $Hkey,$Hkey,$T2
+ vpxor $Xi,$T1,$T1 #
+ vpxor $Hkey,$T2,$T2
+___
+} else {
+$code.=<<___;
+ vpunpckhqdq $Xi,$Xi,$T1
+ vpxor $Xi,$T1,$T1 #
+___
+}
+$code.=<<___;
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
+ vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
+ vpclmulqdq \$0x00,$HK,$T1,$T1 #######
+ vpxor $Xi,$Xhi,$T2 #
+ vpxor $T2,$T1,$T1 #
+
+ vpslldq \$8,$T1,$T2 #
+ vpsrldq \$8,$T1,$T1
+ vpxor $T2,$Xi,$Xi #
+ vpxor $T1,$Xhi,$Xhi
+___
+}
+
+sub reduction_avx {
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+ vpsllq \$57,$Xi,$T1 # 1st phase
+ vpsllq \$62,$Xi,$T2
+ vpxor $T1,$T2,$T2 #
+ vpsllq \$63,$Xi,$T1
+ vpxor $T1,$T2,$T2 #
+ vpslldq \$8,$T2,$T1 #
+ vpsrldq \$8,$T2,$T2
+ vpxor $T1,$Xi,$Xi #
+ vpxor $T2,$Xhi,$Xhi
+
+ vpsrlq \$1,$Xi,$T2 # 2nd phase
+ vpxor $Xi,$Xhi,$Xhi
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$5,$T2,$T2
+ vpxor $T2,$Xi,$Xi #
+ vpsrlq \$1,$Xi,$Xi #
+ vpxor $Xhi,$Xi,$Xi #
+___
+}
+
+$code.=<<___;
+.align 32
+.Linit_loop_avx:
+ vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+.Linit_start_avx:
+ vmovdqa $Xi,$T3
+___
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
+ &reduction_avx ($Xhi,$Xi);
+$code.=<<___;
+ vpshufd \$0b01001110,$T3,$T1
+ vpshufd \$0b01001110,$Xi,$T2
+ vpxor $T3,$T1,$T1 # Karatsuba pre-processing
+ vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
+ vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
+ vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
+ lea 0x30($Htbl),$Htbl
+ sub \$1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
+ vmovdqu $T3,-0x10($Htbl)
+
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+.LSEH_end_gcm_init_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_init_avx,.-gcm_init_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_init_clmul
+.size gcm_init_avx,.-gcm_init_avx
+___
+}
+
+$code.=<<___;
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,\@abi-omnipotent
+.align 32
+gcm_gmult_avx:
+ jmp .L_gmult_clmul
+.size gcm_gmult_avx,.-gcm_gmult_avx
+___
+
+$code.=<<___;
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,\@abi-omnipotent
+.align 32
+gcm_ghash_avx:
+___
+if ($avx) {
+my ($Xip,$Htbl,$inp,$len)=@_4args;
+my ($Xlo,$Xhi,$Xmi,
+ $Zlo,$Zhi,$Zmi,
+ $Hkey,$HK,$T1,$T2,
+ $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
+
+$code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+.LSEH_begin_gcm_ghash_avx:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+ vzeroupper
+
+ vmovdqu ($Xip),$Xi # load $Xi
+ lea .L0x1c2_polynomial(%rip),%r10
+ lea 0x40($Htbl),$Htbl # size optimization
+ vmovdqu .Lbswap_mask(%rip),$bswap
+ vpshufb $bswap,$Xi,$Xi
+ cmp \$0x80,$len
+ jb .Lshort_avx
+ sub \$0x80,$len
+
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpshufb $bswap,$Ii,$Ii
+ vmovdqu 0x20-0x40($Htbl),$HK
+
+ vpunpckhqdq $Ii,$Ii,$T2
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Ii,$T2,$T2
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpxor $Ii,$T2,$T2
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpxor $Xmi,$Zmi,$Zmi
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
+ vpxor $Ij,$T1,$T1
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Zhi,$Xhi,$Xhi
+ vpshufb $bswap,$Ii,$Ii
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpxor $Zmi,$Xmi,$Xmi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x10,$HK,$T2,$Xmi
+
+ lea 0x80($inp),$inp
+ cmp \$0x80,$len
+ jb .Ltail_avx
+
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+ sub \$0x80,$len
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vmovdqu 0x70($inp),$Ii # I[7]
+ vpxor $Xlo,$Zlo,$Zlo
+ vpxor $Ij,$T1,$T1
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Tred
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+
+ vmovdqu 0x60($inp),$Ij # I[6]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpxor $Zlo,$Xi,$Xi # collect result
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vxorps $Zhi,$Xo,$Xo
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Zmi,$Tred,$Tred
+ vxorps $Ij,$T1,$T1
+
+ vmovdqu 0x50($inp),$Ii # I[5]
+ vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpxor $Xo,$Tred,$Tred
+ vpslldq \$8,$Tred,$T2
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vpsrldq \$8,$Tred,$Tred
+ vpxor $T2, $Xi, $Xi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ii
+ vxorps $Tred,$Xo, $Xo
+ vpxor $Xhi,$Zhi,$Zhi
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x40($inp),$Ij # I[4]
+ vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vxorps $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+
+ vmovdqu 0x30($inp),$Ii # I[3]
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu 0x20($inp),$Ij # I[2]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpxor $Zlo,$Xlo,$Xlo
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Zhi,$Xhi,$Xhi
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
+ vpxor $Ij,$T1,$T1
+ vpxor $Zmi,$Xmi,$Xmi
+ vxorps $Tred,$Xi,$Xi
+
+ vmovdqu 0x10($inp),$Ii # I[1]
+ vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
+ vpshufb $bswap,$Ii,$Ii
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
+ vxorps $Xo,$Tred,$Tred
+ vpunpckhqdq $Ii,$Ii,$T2
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
+ vmovdqu 0xb0-0x40($Htbl),$HK
+ vpxor $Ii,$T2,$T2
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vmovdqu ($inp),$Ij # I[0]
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
+ vpshufb $bswap,$Ij,$Ij
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
+ vpxor $Tred,$Ij,$Ij
+ vpclmulqdq \$0x10,$HK, $T2,$Xmi
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+
+ lea 0x80($inp),$inp
+ sub \$0x80,$len
+ jnc .Loop8x_avx
+
+ add \$0x80,$len
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -0x10($inp,$len),$Ii # very last word
+ lea ($inp,$len),$inp
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
+ vmovdqu 0x20-0x40($Htbl),$HK
+ vpshufb $bswap,$Ii,$Ij
+
+ vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
+ vmovdqa $Xhi,$Zhi # $Zhi and
+ vmovdqa $Xmi,$Zmi # $Zmi
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x20($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x30($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x50-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x40($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x50($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovdqu 0x80-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x60($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vpsrldq \$8,$HK,$HK
+ sub \$0x10,$len
+ jz .Ltail_avx
+
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vmovdqu -0x70($inp),$Ii
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
+ vpshufb $bswap,$Ii,$Ij
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+ vmovq 0xb8-0x40($Htbl),$HK
+ sub \$0x10,$len
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+.Ltail_no_xor_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+
+ vmovdqu (%r10),$Tred
+
+ vpxor $Xlo,$Zlo,$Xi
+ vpxor $Xhi,$Zhi,$Xo
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
+ vpxor $Xo, $Zmi,$Zmi
+ vpslldq \$8, $Zmi,$T2
+ vpsrldq \$8, $Zmi,$Zmi
+ vpxor $T2, $Xi, $Xi
+ vpxor $Zmi,$Xo, $Xo
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $Xo,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ cmp \$0,$len
+ jne .Lshort_avx
+
+ vpshufb $bswap,$Xi,$Xi
+ vmovdqu $Xi,($Xip)
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+.LSEH_end_gcm_ghash_avx:
+___
+$code.=<<___;
+ ret
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+} else {
+$code.=<<___;
+ jmp .L_ghash_clmul
+.size gcm_ghash_avx,.-gcm_ghash_avx
+___
+}
+
$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+ .long 7,0,7,0
+.L7_mask_poly:
+ .long 7,0,`0xE1<<1`,0
.align 64
.type .Lrem_4bit,\@object
.Lrem_4bit:
@@ -773,10 +1681,24 @@ se_handler:
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit
+ .rva .LSEH_begin_gcm_init_clmul
+ .rva .LSEH_end_gcm_init_clmul
+ .rva .LSEH_info_gcm_init_clmul
+
.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_gcm_init_avx
+ .rva .LSEH_end_gcm_init_avx
+ .rva .LSEH_info_gcm_init_clmul
+ .rva .LSEH_begin_gcm_ghash_avx
+ .rva .LSEH_end_gcm_ghash_avx
+ .rva .LSEH_info_gcm_ghash_clmul
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_gcm_gmult_4bit:
@@ -787,14 +1709,23 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+.LSEH_info_gcm_init_clmul:
+ .byte 0x01,0x08,0x03,0x00
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
.LSEH_info_gcm_ghash_clmul:
- .byte 0x01,0x1f,0x0b,0x00
- .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
- .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
- .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
- .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
- .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
+ .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
+ .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
+ .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
+ .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
+ .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
___
}
diff --git a/devel/perlasm/ppc-xlate.pl b/devel/perlasm/ppc-xlate.pl
index a3edd982b6..c075d5fce0 100755
--- a/devel/perlasm/ppc-xlate.pl
+++ b/devel/perlasm/ppc-xlate.pl
@@ -37,7 +37,6 @@ my $globl = sub {
$ret .= ".align 3\n";
$ret .= "$name:\n";
$ret .= ".quad .$name,.TOC.\@tocbase,0\n";
- $ret .= ".size $name,24\n";
$ret .= ".previous\n";
$name = ".$name";
@@ -62,9 +61,12 @@ my $machine = sub {
".machine $arch";
};
my $size = sub {
- if ($flavour =~ /linux.*32/)
+ if ($flavour =~ /linux/)
{ shift;
- ".size " . join(",",@_);
+ my $name = shift; $name =~ s|^[\.\_]||;
+ my $ret = ".size $name,.-".($flavour=~/64/?".":"").$name;
+ $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64/);
+ $ret;
}
else
{ ""; }
@@ -77,6 +79,25 @@ my $asciz = sub {
else
{ ""; }
};
+my $quad = sub {
+ shift;
+ my @ret;
+ my ($hi,$lo);
+ for (@_) {
+ if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io)
+ { $hi=$1?"0x$1":"0"; $lo="0x$2"; }
+ elsif (/^([0-9]+)$/o)
+ { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl
+ else
+ { $hi=undef; $lo=$_; }
+
+ if (defined($hi))
+ { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); }
+ else
+ { push(@ret,".quad $lo"); }
+ }
+ join("\n",@ret);
+};
################################################################
# simplified mnemonics not handled by at least one assembler
diff --git a/devel/perlasm/x86_64-xlate.pl b/devel/perlasm/x86_64-xlate.pl
index 1f4ce0a84e..bd165b152b 100755
--- a/devel/perlasm/x86_64-xlate.pl
+++ b/devel/perlasm/x86_64-xlate.pl
@@ -62,12 +62,8 @@ my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-{ my ($stddev,$stdino,@junk)=stat(STDOUT);
- my ($outdev,$outino,@junk)=stat($output);
-
- open STDOUT,">$output" || die "can't open $output: $!"
- if ($stddev!=$outdev || $stdino!=$outino);
-}
+open STDOUT,">$output" || die "can't open $output: $!"
+ if (defined($output));
my $gas=1; $gas=0 if ($output =~ /\.asm$/);
my $elf=1; $elf=0 if (!$gas);
@@ -254,8 +250,13 @@ my %globals;
# in $self->{label}, new gas requires sign extension...
use integer;
$self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
- $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
- $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
+ $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg;
+ $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg;
+
+ if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
+ $self->{base} =~ /(rbp|r13)/) {
+ $self->{base} = $self->{index}; $self->{index} = $1;
+ }
if ($gas) {
$self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64");
@@ -270,13 +271,14 @@ my %globals;
}
} else {
%szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
- q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" );
+ q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR",
+ y=>"" );
$self->{label} =~ s/\./\$/g;
$self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
$self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
- $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq");
- $sz="l" if (opcode->mnemonic() eq "movd");
+ $sz="q" if ($self->{asterisk} || opcode->mnemonic() =~ /^v?movq$/);
+ $sz="l" if (opcode->mnemonic() =~ /^v?movd$/);
if (defined($self->{index})) {
sprintf "%s[%s%s*%d%s]",$szmap{$sz},
@@ -416,7 +418,7 @@ my %globals;
}
sub out {
my $self = shift;
- if ($nasm && opcode->mnemonic()=~m/^j/) {
+ if ($nasm && opcode->mnemonic()=~m/^j(?![re]cxz)/) {
"NEAR ".$self->{value};
} else {
$self->{value};
@@ -569,7 +571,8 @@ my %globals;
$v.=" READONLY";
$v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref);
} elsif ($line=~/\.CRT\$/i) {
- $v.=" READONLY ALIGN(8)";
+ $v.=" READONLY ";
+ $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD";
}
}
$current_segment = $line;
@@ -775,6 +778,45 @@ my $rdrand = sub {
}
};
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @opcode,$rxb;
+}
+
+my $vprotd = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x8f);
+ rxb(\@opcode,$3,$2,-1,0x08);
+ push @opcode,0x78,0xc2;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
+my $vprotq = sub {
+ if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x8f);
+ rxb(\@opcode,$3,$2,-1,0x08);
+ push @opcode,0x78,0xc3;
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ @opcode;
+ } else {
+ ();
+ }
+};
+
if ($nasm) {
print <<___;
default rel
@@ -840,6 +882,7 @@ while($line=<>) {
my $arg = $_->out();
# $insn.=$sz compensates for movq, pinsrw, ...
if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+ if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; }
if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; }
}
@args = reverse(@args);
@@ -1049,7 +1092,7 @@ close STDOUT;
# .rva .LSEH_end_function
# .rva function_unwind_info
#
-# Reference to functon_unwind_info from .xdata segment is the anchor.
+# Reference to function_unwind_info from .xdata segment is the anchor.
# In case you wonder why references are 32-bit .rvas and not 64-bit
# .quads. References put into these two segments are required to be
# *relative* to the base address of the current binary module, a.k.a.
diff --git a/devel/perlasm/x86asm.pl b/devel/perlasm/x86asm.pl
index eb543db2f6..17abf92297 100644
--- a/devel/perlasm/x86asm.pl
+++ b/devel/perlasm/x86asm.pl
@@ -131,6 +131,32 @@ sub ::rdrand
{ &::generic("rdrand",@_); }
}
+sub rxb {
+ local *opcode=shift;
+ my ($dst,$src1,$src2,$rxb)=@_;
+
+ $rxb|=0x7<<5;
+ $rxb&=~(0x04<<5) if($dst>=8);
+ $rxb&=~(0x01<<5) if($src1>=8);
+ $rxb&=~(0x02<<5) if($src2>=8);
+ push @opcode,$rxb;
+}
+
+sub ::vprotd
+{ my $args=join(',',@_);
+ if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/)
+ { my @opcode=(0x8f);
+ rxb(\@opcode,$1,$2,-1,0x08);
+ push @opcode,0x78,0xc2;
+ push @opcode,0xc0|($2&7)|(($1&7)<<3); # ModR/M
+ my $c=$3;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ &::data_byte(@opcode);
+ }
+ else
+ { &::generic("vprotd",@_); }
+}
+
# label management
$lbdecor="L"; # local label decoration, set by package
$label="000";
@@ -257,4 +283,6 @@ EOF
&file($filename);
}
+sub ::hidden {}
+
1;
diff --git a/devel/perlasm/x86gas.pl b/devel/perlasm/x86gas.pl
index 4af871889a..5c2498118f 100644
--- a/devel/perlasm/x86gas.pl
+++ b/devel/perlasm/x86gas.pl
@@ -45,10 +45,8 @@ sub ::generic
undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
if ($#_==0) { &::emit($opcode); }
- elsif ($opcode =~ m/^j/o && $#_==1) { &::emit($opcode,@arg); }
- elsif ($opcode eq "call" && $#_==1) { &::emit($opcode,@arg); }
- elsif ($opcode eq "clflush" && $#_==1){ &::emit($opcode,@arg); }
- elsif ($opcode =~ m/^set/&& $#_==1) { &::emit($opcode,@arg); }
+ elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o)
+ { &::emit($opcode,@arg); }
else { &::emit($opcode.$suffix,@arg);}
1;
@@ -72,6 +70,8 @@ sub ::DWP
{ my($addr,$reg1,$reg2,$idx)=@_;
my $ret="";
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
$addr =~ s/^\s+//;
# prepend global references with optional underscore
$addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige;
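The new first line gives ::DWP a two-register-argument shorthand: when no scale was passed and the second register slot is numeric, that number becomes the scale and the lone register becomes the index (1*$reg2 evaluates to 0 for a register name, so ordinary base+index calls are untouched). A hypothetical

    &::DWP(64,"ebx",4)

now renders as 64(,%ebx,4) instead of requiring the longer &::DWP(64,"","ebx",4) form; the same tweak is applied to get_mem in the masm and nasm back-ends further down.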
@@ -159,7 +159,7 @@ sub ::file_end
}
}
if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
- my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8";
+ my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,16";
if ($::macosx) { push (@out,"$tmp,2\n"); }
elsif ($::elf) { push (@out,"$tmp,4\n"); }
else { push (@out,"$tmp\n"); }
@@ -172,10 +172,9 @@ sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); }
sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); }
sub ::align
-{ my $val=$_[0],$p2,$i;
+{ my $val=$_[0];
if ($::aout)
- { for ($p2=0;$val!=0;$val>>=1) { $p2++; }
- $val=$p2-1;
+ { $val=int(log($val)/log(2));
$val.=",0x90";
}
push(@out,".align\t$val\n");
@@ -184,7 +183,9 @@ sub ::align
sub ::picmeup
{ my($dst,$sym,$base,$reflabel)=@_;
- if (($::pic && ($::elf || $::aout)) || $::macosx)
+ if (defined($base) && $sym eq "OPENSSL_ia32cap_P" && !$::macosx)
+ { &::lea($dst,&::DWP("$sym-$reflabel",$base)); }
+ elsif (($::pic && ($::elf || $::aout)) || $::macosx)
{ if (!defined($base))
{ &::call(&::label("PIC_me_up"));
&::set_label("PIC_me_up");
@@ -252,4 +253,6 @@ ___
sub ::dataseg
{ push(@out,".data\n"); }
+*::hidden = sub { push(@out,".hidden\t$nmdecor$_[0]\n"); } if ($::elf);
+
1;
diff --git a/devel/perlasm/x86masm.pl b/devel/perlasm/x86masm.pl
index ee446de5c1..1741342c3a 100644
--- a/devel/perlasm/x86masm.pl
+++ b/devel/perlasm/x86masm.pl
@@ -16,7 +16,9 @@ sub ::generic
# fix hexadecimal constants
for (@arg) { s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/oi; }
- if ($opcode !~ /movq/)
+ if ($opcode =~ /lea/ && @arg[1] =~ s/.*PTR\s+(\(.*\))$/OFFSET $1/) # no []
+ { $opcode="mov"; }
+ elsif ($opcode !~ /movq/)
{ # fix xmm references
$arg[0] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[1]=~/\bxmm[0-7]\b/i);
$arg[1] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[0]=~/\bxmm[0-7]\b/i);
@@ -37,6 +39,8 @@ sub get_mem
{ my($size,$addr,$reg1,$reg2,$idx)=@_;
my($post,$ret);
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
$ret .= "$size PTR " if ($size ne "");
$addr =~ s/^\s+//;
@@ -131,7 +135,7 @@ ___
if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
.bss SEGMENT 'BSS'
-COMM ${nmdecor}OPENSSL_ia32cap_P:QWORD
+COMM ${nmdecor}OPENSSL_ia32cap_P:DWORD:4
.bss ENDS
___
# comment out OPENSSL_ia32cap_P declarations
diff --git a/devel/perlasm/x86nasm.pl b/devel/perlasm/x86nasm.pl
index ca2511c9eb..5d92f6092a 100644
--- a/devel/perlasm/x86nasm.pl
+++ b/devel/perlasm/x86nasm.pl
@@ -36,6 +36,8 @@ sub get_mem
{ my($size,$addr,$reg1,$reg2,$idx)=@_;
my($post,$ret);
+ if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; }
+
if ($size ne "")
{ $ret .= "$size";
$ret .= " PTR" if ($::mwerks);
@@ -117,7 +119,7 @@ sub ::file_end
{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out)
{ my $comm=<<___;
${drdecor}segment .bss
-${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 8
+${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16
___
# comment out OPENSSL_ia32cap_P declarations
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;