diff options
author | Nikos Mavrogiannopoulos <nmav@gnutls.org> | 2012-03-19 22:55:14 +0100 |
---|---|---|
committer | Nikos Mavrogiannopoulos <nmav@gnutls.org> | 2012-03-19 22:58:02 +0100 |
commit | 9567d93c07f87ecb5c8560b7a45125de28710bc1 (patch) | |
tree | 31a779ef6d1e51589dc257599dca05ea6a768c01 /devel | |
parent | abbfc182f738c654ebeaf75cf6893acc0947699b (diff) | |
download | gnutls-9567d93c07f87ecb5c8560b7a45125de28710bc1.tar.gz |
updated openssl code
Diffstat (limited to 'devel')
-rw-r--r-- | devel/perlasm/e_padlock-x86.pl | 104 | ||||
-rw-r--r-- | devel/perlasm/e_padlock-x86_64.pl | 178 | ||||
-rw-r--r-- | devel/perlasm/ghash-x86.pl | 28 |
3 files changed, 218 insertions, 92 deletions
diff --git a/devel/perlasm/e_padlock-x86.pl b/devel/perlasm/e_padlock-x86.pl index 7a52528fed..71ecad3bbd 100644 --- a/devel/perlasm/e_padlock-x86.pl +++ b/devel/perlasm/e_padlock-x86.pl @@ -37,7 +37,7 @@ require "x86asm.pl"; &asm_init($ARGV[0],$0); -%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata +%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 $ctx="edx"; @@ -188,10 +188,6 @@ my ($mode,$opcode) = @_; &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter } else { &xor ("ebx","ebx"); - if ($PADLOCK_MARGIN{$mode}) { - &cmp ($len,$PADLOCK_MARGIN{$mode}); - &jbe (&label("${mode}_short")); - } &test (&DWP(0,$ctx),1<<5); # align bit in control word &jnz (&label("${mode}_aligned")); &test ($out,0x0f); @@ -212,7 +208,27 @@ my ($mode,$opcode) = @_; &neg ("eax"); &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK &lea ("esp",&DWP(0,"eax","ebp")); # alloca + &mov ("eax",$PADLOCK_CHUNK); + &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK + &mov ("eax","ebp"); + &and ("ebp",-16); &and ("esp",-16); + &mov (&DWP(16,"ebp"),"eax"); + if ($PADLOCK_PREFETCH{$mode}) { + &cmp ($len,$chunk); + &ja (&label("${mode}_loop")); + &mov ("eax",$inp); # check if prefetch crosses page + &cmp ("ebp","esp"); + &cmove ("eax",$out); + &add ("eax",$len); + &neg ("eax"); + &and ("eax",0xfff); # distance to page boundary + &cmp ("eax",$PADLOCK_PREFETCH{$mode}); + &mov ("eax",-$PADLOCK_PREFETCH{$mode}); + &cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1 + &and ($chunk,"eax"); + &jz (&label("${mode}_unaligned_tail")); + } &jmp (&label("${mode}_loop")); &set_label("${mode}_loop",16); @@ -276,8 +292,8 @@ my ($mode,$opcode) = @_; &test ($out,0x0f); &jz (&label("${mode}_out_aligned")); &mov ($len,$chunk); - &shr ($len,2); &lea ($inp,&DWP(0,"esp")); + &shr ($len,2); &data_byte(0xf3,0xa5); # rep movsl &sub ($out,$chunk); &set_label("${mode}_out_aligned"); @@ -288,7 +304,30 @@ my ($mode,$opcode) = @_; &add ($inp,$chunk); &sub ($len,$chunk); &mov ($chunk,$PADLOCK_CHUNK); + if (!$PADLOCK_PREFETCH{$mode}) { &jnz (&label("${mode}_loop")); + } else { + &jz (&label("${mode}_break")); + &cmp ($len,$chunk); + &jae (&label("${mode}_loop")); + +&set_label("${mode}_unaligned_tail"); + &xor ("eax","eax"); + &cmp ("esp","ebp"); + &cmove ("eax",$len); + &sub ("esp","eax"); # alloca + &mov ("eax", $out); # save parameters + &mov ($chunk,$len); + &shr ($len,2); + &lea ($out,&DWP(0,"esp")); + &data_byte(0xf3,0xa5); # rep movsl + &mov ($inp,"esp"); + &mov ($out,"eax"); # restore parameters + &mov ($len,$chunk); + &jmp (&label("${mode}_loop")); + +&set_label("${mode}_break",16); + } if ($mode ne "ctr32") { &cmp ("esp","ebp"); &je (&label("${mode}_done")); @@ -302,28 +341,24 @@ my ($mode,$opcode) = @_; &ja (&label("${mode}_bzero")); &set_label("${mode}_done"); + &mov ("ebp",&DWP(16,"ebp")); &lea ("esp",&DWP(24,"ebp")); if ($mode ne "ctr32") { &jmp (&label("${mode}_exit")); -&set_label("${mode}_short",16); - &xor ("eax","eax"); - &lea ("ebp",&DWP(-24,"esp")); - &sub ("eax",$len); - &lea ("esp",&DWP(0,"eax","ebp")); - &and ("esp",-16); - &xor ($chunk,$chunk); -&set_label("${mode}_short_copy"); - &movups ("xmm0",&QWP(0,$inp,$chunk)); - &lea ($chunk,&DWP(16,$chunk)); - &cmp ($len,$chunk); - &movaps (&QWP(-16,"esp",$chunk),"xmm0"); - &ja (&label("${mode}_short_copy")); - &mov ($inp,"esp"); - &mov ($chunk,$len); - &jmp (&label("${mode}_loop")); - &set_label("${mode}_aligned",16); + if ($PADLOCK_PREFETCH{$mode}) { + &lea ("ebp",&DWP(0,$inp,$len)); + &neg ("ebp"); + &and ("ebp",0xfff); # distance to page boundary + &xor ("eax","eax"); + &cmp ("ebp",$PADLOCK_PREFETCH{$mode}); + &mov ("ebp",$PADLOCK_PREFETCH{$mode}-1); + &cmovae ("ebp","eax"); + &and ("ebp",$len); # remainder + &sub ($len,"ebp"); + &jz (&label("${mode}_aligned_tail")); + } &lea ("eax",&DWP(-16,$ctx)); # ivp &lea ("ebx",&DWP(16,$ctx)); # key &shr ($len,4); # len/=AES_BLOCK_SIZE @@ -332,6 +367,29 @@ my ($mode,$opcode) = @_; &movaps ("xmm0",&QWP(0,"eax")); &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv } + if ($PADLOCK_PREFETCH{$mode}) { + &test ("ebp","ebp"); + &jz (&label("${mode}_exit")); + +&set_label("${mode}_aligned_tail"); + &mov ($len,"ebp"); + &lea ("ebp",&DWP(-24,"esp")); + &mov ("esp","ebp"); + &mov ("eax","ebp"); + &sub ("esp",$len); + &and ("ebp",-16); + &and ("esp",-16); + &mov (&DWP(16,"ebp"),"eax"); + &mov ("eax", $out); # save parameters + &mov ($chunk,$len); + &shr ($len,2); + &lea ($out,&DWP(0,"esp")); + &data_byte(0xf3,0xa5); # rep movsl + &mov ($inp,"esp"); + &mov ($out,"eax"); # restore parameters + &mov ($len,$chunk); + &jmp (&label("${mode}_loop")); + } &set_label("${mode}_exit"); } &mov ("eax",1); &lea ("esp",&DWP(4,"esp")); # popf diff --git a/devel/perlasm/e_padlock-x86_64.pl b/devel/perlasm/e_padlock-x86_64.pl index cbffb9d40f..4d71d06f02 100644 --- a/devel/perlasm/e_padlock-x86_64.pl +++ b/devel/perlasm/e_padlock-x86_64.pl @@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output"; $code=".text\n"; -%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata +%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 $ctx="%rdx"; @@ -285,17 +285,6 @@ padlock_${mode}_encrypt: lea 16($ctx),$ctx # control word xor %eax,%eax xor %ebx,%ebx -___ -# Formally speaking correct condtion is $len<=$margin and $inp+$margin -# crosses page boundary [and next page is unreadable]. But $inp can -# be unaligned in which case data can be copied to $out if latter is -# aligned, in which case $out+$margin has to be checked. Covering all -# cases appears more complicated than just copying short input... -$code.=<<___ if ($PADLOCK_MARGIN{$mode}); - cmp \$$PADLOCK_MARGIN{$mode},$len - jbe .L${mode}_short -___ -$code.=<<___; testl \$`1<<5`,($ctx) # align bit in control word jnz .L${mode}_aligned test \$0x0f,$out @@ -315,6 +304,8 @@ $code.=<<___; neg %rax and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK lea (%rax,%rbp),%rsp + mov \$$PADLOCK_CHUNK,%rax + cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK ___ $code.=<<___ if ($mode eq "ctr32"); .L${mode}_reenter: @@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32"); bswap %eax neg %eax and \$`$PADLOCK_CHUNK/16-1`,%eax - jz .L${mode}_loop + mov \$$PADLOCK_CHUNK,$chunk shl \$4,%eax + cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK + cmovbe $len,$chunk +___ +$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); + cmp $chunk,$len + ja .L${mode}_loop + mov $inp,%rax # check if prefetch crosses page + cmp %rsp,%rbp + cmove $out,%rax + add $len,%rax + neg %rax + and \$0xfff,%rax # distance to page boundary + cmp \$$PADLOCK_PREFETCH{$mode},%rax + mov \$-$PADLOCK_PREFETCH{$mode},%rax + cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1 + and %rax,$chunk + jz .L${mode}_unaligned_tail ___ $code.=<<___; jmp .L${mode}_loop @@ -360,12 +368,12 @@ ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter test \$0xffff0000,%eax - jnz .L${mode}_no_corr + jnz .L${mode}_no_carry bswap %eax add \$0x10000,%eax bswap %eax mov %eax,-4($ctx) -.L${mode}_no_corr: +.L${mode}_no_carry: ___ $code.=<<___; mov %r8,$out # restore paramters @@ -373,8 +381,8 @@ $code.=<<___; test \$0x0f,$out jz .L${mode}_out_aligned mov $chunk,$len - shr \$3,$len lea (%rsp),$inp + shr \$3,$len .byte 0xf3,0x48,0xa5 # rep movsq sub $chunk,$out .L${mode}_out_aligned: @@ -384,9 +392,52 @@ $code.=<<___; add $chunk,$inp sub $chunk,$len mov \$$PADLOCK_CHUNK,$chunk +___ + if (!$PADLOCK_PREFETCH{$mode}) { +$code.=<<___; jnz .L${mode}_loop - +___ + } else { +$code.=<<___; + jz .L${mode}_break + cmp $chunk,$len + jae .L${mode}_loop +___ +$code.=<<___ if ($mode eq "ctr32"); + mov $len,$chunk + mov $inp,%rax # check if prefetch crosses page cmp %rsp,%rbp + cmove $out,%rax + add $len,%rax + neg %rax + and \$0xfff,%rax # distance to page boundary + cmp \$$PADLOCK_PREFETCH{$mode},%rax + mov \$-$PADLOCK_PREFETCH{$mode},%rax + cmovae $chunk,%rax + and %rax,$chunk + jnz .L${mode}_loop +___ +$code.=<<___; +.L${mode}_unaligned_tail: + xor %eax,%eax + cmp %rsp,%rbp + cmove $len,%rax + mov $out,%r8 # save parameters + mov $len,$chunk + sub %rax,%rsp # alloca + shr \$3,$len + lea (%rsp),$out + .byte 0xf3,0x48,0xa5 # rep movsq + mov %rsp,$inp + mov %r8, $out # restore parameters + mov $chunk,$len + jmp .L${mode}_loop +.align 16 +.L${mode}_break: +___ + } +$code.=<<___; + cmp %rbp,%rsp je .L${mode}_done pxor %xmm0,%xmm0 @@ -400,70 +451,87 @@ $code.=<<___; .L${mode}_done: lea (%rbp),%rsp jmp .L${mode}_exit -___ -$code.=<<___ if ($PADLOCK_MARGIN{$mode}); -.align 16 -.L${mode}_short: - mov %rsp,%rbp - sub $len,%rsp - xor $chunk,$chunk -.L${mode}_short_copy: - movups ($inp,$chunk),%xmm0 - lea 16($chunk),$chunk - cmp $chunk,$len - movaps %xmm0,-16(%rsp,$chunk) - ja .L${mode}_short_copy - mov %rsp,$inp - mov $len,$chunk - jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"` -___ -$code.=<<___; + .align 16 .L${mode}_aligned: ___ $code.=<<___ if ($mode eq "ctr32"); mov -4($ctx),%eax # pull 32-bit counter - mov \$`16*0x10000`,$chunk bswap %eax - cmp $len,$chunk - cmova $len,$chunk neg %eax and \$0xffff,%eax - jz .L${mode}_aligned_loop + mov \$`16*0x10000`,$chunk shl \$4,%eax + cmovz $chunk,%rax cmp %rax,$len cmova %rax,$chunk # don't let counter cross 2^16 - jmp .L${mode}_aligned_loop -.align 16 + cmovbe $len,$chunk + jbe .L${mode}_aligned_skip + .L${mode}_aligned_loop: - cmp $len,$chunk - cmova $len,$chunk mov $len,%r10 # save parameters mov $chunk,$len mov $chunk,%r11 -___ -$code.=<<___; + lea -16($ctx),%rax # ivp lea 16($ctx),%rbx # key shr \$4,$len # len/=AES_BLOCK_SIZE .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* -___ -$code.=<<___ if ($mode !~ /ecb|ctr/); - movdqa (%rax),%xmm0 - movdqa %xmm0,-16($ctx) # copy [or refresh] iv -___ -$code.=<<___ if ($mode eq "ctr32"); + mov -4($ctx),%eax # pull 32-bit counter bswap %eax add \$0x10000,%eax bswap %eax mov %eax,-4($ctx) - mov %r11,$chunk # restore paramters - mov %r10,$len - sub $chunk,$len + mov %r10,$len # restore paramters + sub %r11,$len mov \$`16*0x10000`,$chunk - jnz .L${mode}_aligned_loop + jz .L${mode}_exit + cmp $chunk,$len + jae .L${mode}_aligned_loop + +.L${mode}_aligned_skip: +___ +$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); + lea ($inp,$len),%rbp + neg %rbp + and \$0xfff,%rbp # distance to page boundary + xor %eax,%eax + cmp \$$PADLOCK_PREFETCH{$mode},%rbp + mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp + cmovae %rax,%rbp + and $len,%rbp # remainder + sub %rbp,$len + jz .L${mode}_aligned_tail +___ +$code.=<<___; + lea -16($ctx),%rax # ivp + lea 16($ctx),%rbx # key + shr \$4,$len # len/=AES_BLOCK_SIZE + .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* +___ +$code.=<<___ if ($mode !~ /ecb|ctr/); + movdqa (%rax),%xmm0 + movdqa %xmm0,-16($ctx) # copy [or refresh] iv +___ +$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); + test %rbp,%rbp # check remainder + jz .L${mode}_exit + +.L${mode}_aligned_tail: + mov $out,%r8 + mov %rbp,$chunk + mov %rbp,$len + lea (%rsp),%rbp + sub $len,%rsp + shr \$3,$len + lea (%rsp),$out + .byte 0xf3,0x48,0xa5 # rep movsq + lea (%r8),$out + lea (%rsp),$inp + mov $chunk,$len + jmp .L${mode}_loop ___ $code.=<<___; .L${mode}_exit: diff --git a/devel/perlasm/ghash-x86.pl b/devel/perlasm/ghash-x86.pl index 1b9adfbc72..2a1819cb51 100644 --- a/devel/perlasm/ghash-x86.pl +++ b/devel/perlasm/ghash-x86.pl @@ -12,14 +12,14 @@ # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that it # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two -# code paths: vanilla x86 and vanilla MMX. Former will be executed on -# 486 and Pentium, latter on all others. MMX GHASH features so called +# code paths: vanilla x86 and vanilla SSE. Former will be executed on +# 486 and Pentium, latter on all others. SSE GHASH features so called # "528B" variant of "4-bit" method utilizing additional 256+16 bytes # of per-key storage [+512 bytes shared table]. Performance results # are for streamed GHASH subroutine and are expressed in cycles per # processed byte, less is better: # -# gcc 2.95.3(*) MMX assembler x86 assembler +# gcc 2.95.3(*) SSE assembler x86 assembler # # Pentium 105/111(**) - 50 # PIII 68 /75 12.2 24 @@ -30,7 +30,7 @@ # (*) gcc 3.4.x was observed to generate few percent slower code, # which is one of reasons why 2.95.3 results were chosen, # another reason is lack of 3.4.x results for older CPUs; -# comparison with MMX results is not completely fair, because C +# comparison with SSE results is not completely fair, because C # results are for vanilla "256B" implementation, while # assembler results are for "528B";-) # (**) second number is result for code compiled with -fPIC flag, @@ -40,8 +40,8 @@ # # To summarize, it's >2-5 times faster than gcc-generated code. To # anchor it to something else SHA1 assembler processes one byte in -# 11-13 cycles on contemporary x86 cores. As for choice of MMX in -# particular, see comment at the end of the file... +# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE +# in particular, see comment at the end of the file... # May 2010 # @@ -331,7 +331,7 @@ if (!$x86only) {{{ &static_label("rem_4bit"); -if (0) {{ # "May" MMX version is kept for reference... +if (!$sse2) {{ # pure-MMX "May" version... $S=12; # shift factor for rem_4bit @@ -1273,13 +1273,6 @@ my ($Xhi,$Xi)=@_; &set_label("bswap",64); &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial -}} # $sse2 - -&set_label("rem_4bit",64); - &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); - &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); - &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); - &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); &set_label("rem_8bit",64); &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); @@ -1313,6 +1306,13 @@ my ($Xhi,$Xi)=@_; &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); +}} # $sse2 + +&set_label("rem_4bit",64); + &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); + &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); + &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); + &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); }}} # !$x86only &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); |