Diffstat (limited to 'devel')
24 files changed, 19 insertions, 22255 deletions
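A note before the hunks: the unusual mode numbers in the headers below carry the whole story of this commit. Assuming standard git tree-entry semantics, a small Perl reader's aid (an editor's illustration, not part of the change):

    # Git tree-entry modes seen in this diff: each generated .pl file
    # turns from a regular blob into a symlink into the new
    # devel/openssl submodule (itself recorded as a gitlink).
    my %git_mode = (
        '100644' => 'regular file (blob)',
        '120000' => 'symbolic link (target stored as blob content)',
        '160000' => 'gitlink, i.e. a submodule commit reference',
    );
    printf "%s => %s\n", $_, $git_mode{$_} for sort keys %git_mode;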
diff --git a/devel/openssl b/devel/openssl new file mode 160000 +Subproject 4a253652eebd8678d2d4494705c0ce498966fc2 diff --git a/devel/perlasm/aes-ssse3-x86.pl b/devel/perlasm/aes-ssse3-x86.pl index bacf42cf0f..2f314a8a7a 100644..120000 --- a/devel/perlasm/aes-ssse3-x86.pl +++ b/devel/perlasm/aes-ssse3-x86.pl @@ -1,902 +1 @@ -#!/usr/bin/env perl - -###################################################################### -## Constant-time SSSE3 AES core implementation. -## version 0.1 -## -## By Mike Hamburg (Stanford University), 2009 -## Public domain. -## -## For details see http://shiftleft.org/papers/vector_aes/ and -## http://crypto.stanford.edu/vpaes/. - -###################################################################### -# September 2011. -# -# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for -# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt -# doesn't handle partial vectors (doesn't have to if called from -# EVP only). "Drop-in" implies that this module doesn't share key -# schedule structure with the original nor does it make assumption -# about its alignment... -# -# Performance summary. aes-586.pl column lists large-block CBC -# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per -# byte processed with 128-bit key, and vpaes-x86.pl column - [also -# large-block CBC] encrypt/decrypt. -# -# aes-586.pl vpaes-x86.pl -# -# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***) -# Nehalem 27.9/40.4/18.1 10.2/11.9 -# Atom 70.7/92.1/60.1 61.1/75.4(***) -# -# (*) "Hyper-threading" in the context refers rather to cache shared -# among multiple cores, than to specifically Intel HTT. As vast -# majority of contemporary cores share cache, slower code path -# is common place. In other words "with-hyper-threading-off" -# results are presented mostly for reference purposes. -# -# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. -# -# (***) Less impressive improvement on Core 2 and Atom is due to slow -# pshufb, yet it's respectable +28%/64% improvement on Core 2 -# and +15% on Atom (as implied, over "hyper-threading-safe" -# code path). 
-# -# <appro@openssl.org> - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); - -$PREFIX="vpaes"; - -my ($round, $base, $magic, $key, $const, $inp, $out)= - ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); - -&static_label("_vpaes_consts"); -&static_label("_vpaes_schedule_low_round"); - -&set_label("_vpaes_consts",64); -$k_inv=-0x30; # inv, inva - &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); - &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); - -$k_s0F=-0x10; # s0F - &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); - -$k_ipt=0x00; # input transform (lo, hi) - &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); - &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); - -$k_sb1=0x20; # sb1u, sb1t - &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); - &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); -$k_sb2=0x40; # sb2u, sb2t - &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); - &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); -$k_sbo=0x60; # sbou, sbot - &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); - &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); - -$k_mc_forward=0x80; # mc_forward - &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); - &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); - &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); - &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); - -$k_mc_backward=0xc0; # mc_backward - &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); - &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); - &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); - &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); - -$k_sr=0x100; # sr - &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); - &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); - &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); - &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); - -$k_rcon=0x140; # rcon - &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); - -$k_s63=0x150; # s63: all equal to 0x63 transformed - &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); - -$k_opt=0x160; # output transform - &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); - &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); - -$k_deskew=0x180; # deskew tables: inverts the sbox's "skew" - &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); - &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); -## -## Decryption stuff -## Key schedule constants -## -$k_dksd=0x1a0; # decryption key schedule: invskew x*D - &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); - &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); -$k_dksb=0x1c0; # decryption key schedule: invskew x*B - &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); - &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); -$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 - &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); - &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); -$k_dks9=0x200; # decryption key schedule: invskew x*9 - &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); - &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); - -## -## Decryption stuff -## Round function constants -## -$k_dipt=0x220; # decryption input transform - &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); - 
&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); - -$k_dsb9=0x240; # decryption sbox output *9*u, *9*t - &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); - &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); -$k_dsbd=0x260; # decryption sbox output *D*u, *D*t - &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); - &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); -$k_dsbb=0x280; # decryption sbox output *B*u, *B*t - &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); - &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); -$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t - &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); - &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); -$k_dsbo=0x2c0; # decryption sbox final output - &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); - &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); -&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); -&align (64); - -&function_begin_B("_vpaes_preheat"); - &add ($const,&DWP(0,"esp")); - &movdqa ("xmm7",&QWP($k_inv,$const)); - &movdqa ("xmm6",&QWP($k_s0F,$const)); - &ret (); -&function_end_B("_vpaes_preheat"); - -## -## _aes_encrypt_core -## -## AES-encrypt %xmm0. -## -## Inputs: -## %xmm0 = input -## %xmm6-%xmm7 as in _vpaes_preheat -## (%edx) = scheduled keys -## -## Output in %xmm0 -## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx -## -## -&function_begin_B("_vpaes_encrypt_core"); - &mov ($magic,16); - &mov ($round,&DWP(240,$key)); - &movdqa ("xmm1","xmm6"); - &movdqa ("xmm2",&QWP($k_ipt,$const)); - &pandn ("xmm1","xmm0"); - &pand ("xmm0","xmm6"); - &movdqu ("xmm5",&QWP(0,$key)); - &pshufb ("xmm2","xmm0"); - &movdqa ("xmm0",&QWP($k_ipt+16,$const)); - &pxor ("xmm2","xmm5"); - &psrld ("xmm1",4); - &add ($key,16); - &pshufb ("xmm0","xmm1"); - &lea ($base,&DWP($k_mc_backward,$const)); - &pxor ("xmm0","xmm2"); - &jmp (&label("enc_entry")); - - -&set_label("enc_loop",16); - # middle of middle round - &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u - &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t - &pshufb ("xmm4","xmm2"); # 4 = sb1u - &pshufb ("xmm0","xmm3"); # 0 = sb1t - &pxor ("xmm4","xmm5"); # 4 = sb1u + k - &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u - &pxor ("xmm0","xmm4"); # 0 = A - &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] - &pshufb ("xmm5","xmm2"); # 4 = sb2u - &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t - &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] - &pshufb ("xmm2","xmm3"); # 2 = sb2t - &movdqa ("xmm3","xmm0"); # 3 = A - &pxor ("xmm2","xmm5"); # 2 = 2A - &pshufb ("xmm0","xmm1"); # 0 = B - &add ($key,16); # next key - &pxor ("xmm0","xmm2"); # 0 = 2A+B - &pshufb ("xmm3","xmm4"); # 3 = D - &add ($magic,16); # next mc - &pxor ("xmm3","xmm0"); # 3 = 2A+B+D - &pshufb ("xmm0","xmm1"); # 0 = 2B+C - &and ($magic,0x30); # ...
mod 4 - &sub ($round,1); # nr-- - &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D - -&set_label("enc_entry"); - # top of round - &movdqa ("xmm1","xmm6"); # 1 : i - &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k - &pandn ("xmm1","xmm0"); # 1 = i<<4 - &psrld ("xmm1",4); # 1 = i - &pand ("xmm0","xmm6"); # 0 = k - &pshufb ("xmm5","xmm0"); # 2 = a/k - &movdqa ("xmm3","xmm7"); # 3 : 1/i - &pxor ("xmm0","xmm1"); # 0 = j - &pshufb ("xmm3","xmm1"); # 3 = 1/i - &movdqa ("xmm4","xmm7"); # 4 : 1/j - &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k - &pshufb ("xmm4","xmm0"); # 4 = 1/j - &movdqa ("xmm2","xmm7"); # 2 : 1/iak - &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k - &pshufb ("xmm2","xmm3"); # 2 = 1/iak - &movdqa ("xmm3","xmm7"); # 3 : 1/jak - &pxor ("xmm2","xmm0"); # 2 = io - &pshufb ("xmm3","xmm4"); # 3 = 1/jak - &movdqu ("xmm5",&QWP(0,$key)); - &pxor ("xmm3","xmm1"); # 3 = jo - &jnz (&label("enc_loop")); - - # middle of last round - &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo - &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 - &pshufb ("xmm4","xmm2"); # 4 = sbou - &pxor ("xmm4","xmm5"); # 4 = sb1u + k - &pshufb ("xmm0","xmm3"); # 0 = sb1t - &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] - &pxor ("xmm0","xmm4"); # 0 = A - &pshufb ("xmm0","xmm1"); - &ret (); -&function_end_B("_vpaes_encrypt_core"); - -## -## Decryption core -## -## Same API as encryption core. -## -&function_begin_B("_vpaes_decrypt_core"); - &lea ($base,&DWP($k_dsbd,$const)); - &mov ($round,&DWP(240,$key)); - &movdqa ("xmm1","xmm6"); - &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); - &pandn ("xmm1","xmm0"); - &mov ($magic,$round); - &psrld ("xmm1",4); - &movdqu ("xmm5",&QWP(0,$key)); - &shl ($magic,4); - &pand ("xmm0","xmm6"); - &pshufb ("xmm2","xmm0"); - &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); - &xor ($magic,0x30); - &pshufb ("xmm0","xmm1"); - &and ($magic,0x30); - &pxor ("xmm2","xmm5"); - &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); - &pxor ("xmm0","xmm2"); - &add ($key,16); - &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); - &jmp (&label("dec_entry")); - -&set_label("dec_loop",16); -## -## Inverse mix columns -## - &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u - &movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t - &pshufb ("xmm4","xmm2"); # 4 = sb9u - &pshufb ("xmm1","xmm3"); # 0 = sb9t - &pxor ("xmm0","xmm4"); - &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu - &pxor ("xmm0","xmm1"); # 0 = ch - &movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt - - &pshufb ("xmm4","xmm2"); # 4 = sbdu - &pshufb ("xmm0","xmm5"); # MC ch - &pshufb ("xmm1","xmm3"); # 0 = sbdt - &pxor ("xmm0","xmm4"); # 4 = ch - &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu - &pxor ("xmm0","xmm1"); # 0 = ch - &movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt - - &pshufb ("xmm4","xmm2"); # 4 = sbbu - &pshufb ("xmm0","xmm5"); # MC ch - &pshufb ("xmm1","xmm3"); # 0 = sbbt - &pxor ("xmm0","xmm4"); # 4 = ch - &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu - &pxor ("xmm0","xmm1"); # 0 = ch - &movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet - - &pshufb ("xmm4","xmm2"); # 4 = sbeu - &pshufb ("xmm0","xmm5"); # MC ch - &pshufb ("xmm1","xmm3"); # 0 = sbet - &pxor ("xmm0","xmm4"); # 4 = ch - &add ($key,16); # next round key - &palignr("xmm5","xmm5",12); - &pxor ("xmm0","xmm1"); # 0 = ch - &sub ($round,1); # nr-- - -&set_label("dec_entry"); - # top of round - &movdqa ("xmm1","xmm6"); # 1 : i - &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k - &pandn ("xmm1","xmm0"); # 1 = i<<4 - &pand ("xmm0","xmm6"); # 0 = k - &psrld ("xmm1",4); # 1 = i - &pshufb ("xmm2","xmm0"); 
# 2 = a/k - &movdqa ("xmm3","xmm7"); # 3 : 1/i - &pxor ("xmm0","xmm1"); # 0 = j - &pshufb ("xmm3","xmm1"); # 3 = 1/i - &movdqa ("xmm4","xmm7"); # 4 : 1/j - &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k - &pshufb ("xmm4","xmm0"); # 4 = 1/j - &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k - &movdqa ("xmm2","xmm7"); # 2 : 1/iak - &pshufb ("xmm2","xmm3"); # 2 = 1/iak - &movdqa ("xmm3","xmm7"); # 3 : 1/jak - &pxor ("xmm2","xmm0"); # 2 = io - &pshufb ("xmm3","xmm4"); # 3 = 1/jak - &movdqu ("xmm0",&QWP(0,$key)); - &pxor ("xmm3","xmm1"); # 3 = jo - &jnz (&label("dec_loop")); - - # middle of last round - &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou - &pshufb ("xmm4","xmm2"); # 4 = sbou - &pxor ("xmm4","xmm0"); # 4 = sb1u + k - &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot - &movdqa ("xmm2",&QWP(0,$magic)); - &pshufb ("xmm0","xmm3"); # 0 = sb1t - &pxor ("xmm0","xmm4"); # 0 = A - &pshufb ("xmm0","xmm2"); - &ret (); -&function_end_B("_vpaes_decrypt_core"); - -######################################################## -## ## -## AES key schedule ## -## ## -######################################################## -&function_begin_B("_vpaes_schedule_core"); - &add ($const,&DWP(0,"esp")); - &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) - &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon - - # input transform - &movdqa ("xmm3","xmm0"); - &lea ($base,&DWP($k_ipt,$const)); - &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 - &call ("_vpaes_schedule_transform"); - &movdqa ("xmm7","xmm0"); - - &test ($out,$out); - &jnz (&label("schedule_am_decrypting")); - - # encrypting, output zeroth round key after transform - &movdqu (&QWP(0,$key),"xmm0"); - &jmp (&label("schedule_go")); - -&set_label("schedule_am_decrypting"); - # decrypting, output zeroth round key after shiftrows - &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); - &pshufb ("xmm3","xmm1"); - &movdqu (&QWP(0,$key),"xmm3"); - &xor ($magic,0x30); - -&set_label("schedule_go"); - &cmp ($round,192); - &ja (&label("schedule_256")); - &je (&label("schedule_192")); - # 128: fall through - -## -## .schedule_128 -## -## 128-bit specific part of key schedule. -## -## This schedule is really simple, because all its parts -## are accomplished by the subroutines. -## -&set_label("schedule_128"); - &mov ($round,10); - -&set_label("loop_schedule_128"); - &call ("_vpaes_schedule_round"); - &dec ($round); - &jz (&label("schedule_mangle_last")); - &call ("_vpaes_schedule_mangle"); # write output - &jmp (&label("loop_schedule_128")); - -## -## .aes_schedule_192 -## -## 192-bit specific part of key schedule. -## -## The main body of this schedule is the same as the 128-bit -## schedule, but with more smearing. The long, high side is -## stored in %xmm7 as before, and the short, low side is in -## the high bits of %xmm6. -## -## This schedule is somewhat nastier, however, because each -## round produces 192 bits of key material, or 1.5 round keys. -## Therefore, on each cycle we do 2 rounds and produce 3 round -## keys. 
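A back-of-envelope check of that bookkeeping (an editor's illustration, not part of the diff): AES-192 has 12 rounds and hence 13 round keys; the zeroth key is written before the loop, and each 2-round cycle emits 3 more, which is why the loop counter below starts at 4.

    my $nr     = 12;              # AES-192 round count
    my $keys   = $nr + 1;         # 13 round keys, whitening key included
    my $cycles = ($keys - 1) / 3; # 3 round keys per 2-round cycle
    print "cycles: $cycles\n";    # prints 4, matching &mov($round,4) below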
-## -&set_label("schedule_192",16); - &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) - &call ("_vpaes_schedule_transform"); # input transform - &movdqa ("xmm6","xmm0"); # save short part - &pxor ("xmm4","xmm4"); # clear 4 - &movhlps("xmm6","xmm4"); # clobber low side with zeros - &mov ($round,4); - -&set_label("loop_schedule_192"); - &call ("_vpaes_schedule_round"); - &palignr("xmm0","xmm6",8); - &call ("_vpaes_schedule_mangle"); # save key n - &call ("_vpaes_schedule_192_smear"); - &call ("_vpaes_schedule_mangle"); # save key n+1 - &call ("_vpaes_schedule_round"); - &dec ($round); - &jz (&label("schedule_mangle_last")); - &call ("_vpaes_schedule_mangle"); # save key n+2 - &call ("_vpaes_schedule_192_smear"); - &jmp (&label("loop_schedule_192")); - -## -## .aes_schedule_256 -## -## 256-bit specific part of key schedule. -## -## The structure here is very similar to the 128-bit -## schedule, but with an additional "low side" in -## %xmm6. The low side's rounds are the same as the -## high side's, except no rcon and no rotation. -## -&set_label("schedule_256",16); - &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) - &call ("_vpaes_schedule_transform"); # input transform - &mov ($round,7); - -&set_label("loop_schedule_256"); - &call ("_vpaes_schedule_mangle"); # output low result - &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 - - # high round - &call ("_vpaes_schedule_round"); - &dec ($round); - &jz (&label("schedule_mangle_last")); - &call ("_vpaes_schedule_mangle"); - - # low round. swap xmm7 and xmm6 - &pshufd ("xmm0","xmm0",0xFF); - &movdqa (&QWP(20,"esp"),"xmm7"); - &movdqa ("xmm7","xmm6"); - &call ("_vpaes_schedule_low_round"); - &movdqa ("xmm7",&QWP(20,"esp")); - - &jmp (&label("loop_schedule_256")); - -## -## .aes_schedule_mangle_last -## -## Mangler for last round of key schedule -## Mangles %xmm0 -## when encrypting, outputs out(%xmm0) ^ 63 -## when decrypting, outputs unskew(%xmm0) -## -## Always called right before return... jumps to cleanup and exits -## -&set_label("schedule_mangle_last",16); - # schedule last round key from xmm0 - &lea ($base,&DWP($k_deskew,$const)); - &test ($out,$out); - &jnz (&label("schedule_mangle_last_dec")); - - # encrypting - &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); - &pshufb ("xmm0","xmm1"); # output permute - &lea ($base,&DWP($k_opt,$const)); # prepare to output transform - &add ($key,32); - -&set_label("schedule_mangle_last_dec"); - &add ($key,-16); - &pxor ("xmm0",&QWP($k_s63,$const)); - &call ("_vpaes_schedule_transform"); # output transform - &movdqu (&QWP(0,$key),"xmm0"); # save last key - - # cleanup - &pxor ("xmm0","xmm0"); - &pxor ("xmm1","xmm1"); - &pxor ("xmm2","xmm2"); - &pxor ("xmm3","xmm3"); - &pxor ("xmm4","xmm4"); - &pxor ("xmm5","xmm5"); - &pxor ("xmm6","xmm6"); - &pxor ("xmm7","xmm7"); - &ret (); -&function_end_B("_vpaes_schedule_core"); - -## -## .aes_schedule_192_smear -## -## Smear the short, low side in the 192-bit key schedule. 
-## -## Inputs: -## %xmm7: high side, b a x y -## %xmm6: low side, d c 0 0 -## %xmm13: 0 -## -## Outputs: -## %xmm6: b+c+d b+c 0 0 -## %xmm0: b+c+d b+c b a -## -&function_begin_B("_vpaes_schedule_192_smear"); - &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0 - &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a - &pxor ("xmm6","xmm1"); # -> c+d c 0 0 - &pxor ("xmm1","xmm1"); - &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a - &movdqa ("xmm0","xmm6"); - &movhlps("xmm6","xmm1"); # clobber low side with zeros - &ret (); -&function_end_B("_vpaes_schedule_192_smear"); - -## -## .aes_schedule_round -## -## Runs one main round of the key schedule on %xmm0, %xmm7 -## -## Specifically, runs subbytes on the high dword of %xmm0 -## then rotates it by one byte and xors into the low dword of -## %xmm7. -## -## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -## next rcon. -## -## Smears the dwords of %xmm7 by xoring the low into the -## second low, result into third, result into highest. -## -## Returns results in %xmm7 = %xmm0. -## Clobbers %xmm1-%xmm5. -## -&function_begin_B("_vpaes_schedule_round"); - # extract rcon from xmm8 - &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 - &pxor ("xmm1","xmm1"); - &palignr("xmm1","xmm2",15); - &palignr("xmm2","xmm2",15); - &pxor ("xmm7","xmm1"); - - # rotate - &pshufd ("xmm0","xmm0",0xFF); - &palignr("xmm0","xmm0",1); - - # fall through... - &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 - - # low round: same as high round, but no rotation and no rcon. -&set_label("_vpaes_schedule_low_round"); - # smear xmm7 - &movdqa ("xmm1","xmm7"); - &pslldq ("xmm7",4); - &pxor ("xmm7","xmm1"); - &movdqa ("xmm1","xmm7"); - &pslldq ("xmm7",8); - &pxor ("xmm7","xmm1"); - &pxor ("xmm7",&QWP($k_s63,$const)); - - # subbyte - &movdqa ("xmm4",&QWP($k_s0F,$const)); - &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j - &movdqa ("xmm1","xmm4"); - &pandn ("xmm1","xmm0"); - &psrld ("xmm1",4); # 1 = i - &pand ("xmm0","xmm4"); # 0 = k - &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k - &pshufb ("xmm2","xmm0"); # 2 = a/k - &pxor ("xmm0","xmm1"); # 0 = j - &movdqa ("xmm3","xmm5"); # 3 : 1/i - &pshufb ("xmm3","xmm1"); # 3 = 1/i - &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k - &movdqa ("xmm4","xmm5"); # 4 : 1/j - &pshufb ("xmm4","xmm0"); # 4 = 1/j - &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k - &movdqa ("xmm2","xmm5"); # 2 : 1/iak - &pshufb ("xmm2","xmm3"); # 2 = 1/iak - &pxor ("xmm2","xmm0"); # 2 = io - &movdqa ("xmm3","xmm5"); # 3 : 1/jak - &pshufb ("xmm3","xmm4"); # 3 = 1/jak - &pxor ("xmm3","xmm1"); # 3 = jo - &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou - &pshufb ("xmm4","xmm2"); # 4 = sbou - &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot - &pshufb ("xmm0","xmm3"); # 0 = sb1t - &pxor ("xmm0","xmm4"); # 0 = sbox output - - # add in smeared stuff - &pxor ("xmm0","xmm7"); - &movdqa ("xmm7","xmm0"); - &ret (); -&function_end_B("_vpaes_schedule_round"); - -## -## .aes_schedule_transform -## -## Linear-transform %xmm0 according to tables at (%ebx) -## -## Output in %xmm0 -## Clobbers %xmm1, %xmm2 -## -&function_begin_B("_vpaes_schedule_transform"); - &movdqa ("xmm2",&QWP($k_s0F,$const)); - &movdqa ("xmm1","xmm2"); - &pandn ("xmm1","xmm0"); - &psrld ("xmm1",4); - &pand ("xmm0","xmm2"); - &movdqa ("xmm2",&QWP(0,$base)); - &pshufb ("xmm2","xmm0"); - &movdqa ("xmm0",&QWP(16,$base)); - &pshufb ("xmm0","xmm1"); - &pxor ("xmm0","xmm2"); - &ret (); -&function_end_B("_vpaes_schedule_transform"); - -## -## .aes_schedule_mangle -## -## Mangle xmm0 from (basis-transformed) standard version -## to our version. 
-## -## On encrypt, -## xor with 0x63 -## multiply by circulant 0,1,1,1 -## apply shiftrows transform -## -## On decrypt, -## xor with 0x63 -## multiply by "inverse mixcolumns" circulant E,B,D,9 -## deskew -## apply shiftrows transform -## -## -## Writes out to (%edx), and increments or decrements it -## Keeps track of round number mod 4 in %ecx -## Preserves xmm0 -## Clobbers xmm1-xmm5 -## -&function_begin_B("_vpaes_schedule_mangle"); - &movdqa ("xmm4","xmm0"); # save xmm0 for later - &movdqa ("xmm5",&QWP($k_mc_forward,$const)); - &test ($out,$out); - &jnz (&label("schedule_mangle_dec")); - - # encrypting - &add ($key,16); - &pxor ("xmm4",&QWP($k_s63,$const)); - &pshufb ("xmm4","xmm5"); - &movdqa ("xmm3","xmm4"); - &pshufb ("xmm4","xmm5"); - &pxor ("xmm3","xmm4"); - &pshufb ("xmm4","xmm5"); - &pxor ("xmm3","xmm4"); - - &jmp (&label("schedule_mangle_both")); - -&set_label("schedule_mangle_dec",16); - # inverse mix columns - &movdqa ("xmm2",&QWP($k_s0F,$const)); - &lea ($inp,&DWP($k_dksd,$const)); - &movdqa ("xmm1","xmm2"); - &pandn ("xmm1","xmm4"); - &psrld ("xmm1",4); # 1 = hi - &pand ("xmm4","xmm2"); # 4 = lo - - &movdqa ("xmm2",&QWP(0,$inp)); - &pshufb ("xmm2","xmm4"); - &movdqa ("xmm3",&QWP(0x10,$inp)); - &pshufb ("xmm3","xmm1"); - &pxor ("xmm3","xmm2"); - &pshufb ("xmm3","xmm5"); - - &movdqa ("xmm2",&QWP(0x20,$inp)); - &pshufb ("xmm2","xmm4"); - &pxor ("xmm2","xmm3"); - &movdqa ("xmm3",&QWP(0x30,$inp)); - &pshufb ("xmm3","xmm1"); - &pxor ("xmm3","xmm2"); - &pshufb ("xmm3","xmm5"); - - &movdqa ("xmm2",&QWP(0x40,$inp)); - &pshufb ("xmm2","xmm4"); - &pxor ("xmm2","xmm3"); - &movdqa ("xmm3",&QWP(0x50,$inp)); - &pshufb ("xmm3","xmm1"); - &pxor ("xmm3","xmm2"); - &pshufb ("xmm3","xmm5"); - - &movdqa ("xmm2",&QWP(0x60,$inp)); - &pshufb ("xmm2","xmm4"); - &pxor ("xmm2","xmm3"); - &movdqa ("xmm3",&QWP(0x70,$inp)); - &pshufb ("xmm3","xmm1"); - &pxor ("xmm3","xmm2"); - - &add ($key,-16); - -&set_label("schedule_mangle_both"); - &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); - &pshufb ("xmm3","xmm1"); - &add ($magic,-16); - &and ($magic,0x30); - &movdqu (&QWP(0,$key),"xmm3"); - &ret (); -&function_end_B("_vpaes_schedule_mangle"); - -# -# Interface to OpenSSL -# -&function_begin("${PREFIX}_set_encrypt_key"); - &mov ($inp,&wparam(0)); # inp - &lea ($base,&DWP(-56,"esp")); - &mov ($round,&wparam(1)); # bits - &and ($base,-16); - &mov ($key,&wparam(2)); # key - &xchg ($base,"esp"); # alloca - &mov (&DWP(48,"esp"),$base); - - &mov ($base,$round); - &shr ($base,5); - &add ($base,5); - &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; - &mov ($magic,0x30); - &mov ($out,0); - - &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); - &call ("_vpaes_schedule_core"); -&set_label("pic_point"); - - &mov ("esp",&DWP(48,"esp")); - &xor ("eax","eax"); -&function_end("${PREFIX}_set_encrypt_key"); - -&function_begin("${PREFIX}_set_decrypt_key"); - &mov ($inp,&wparam(0)); # inp - &lea ($base,&DWP(-56,"esp")); - &mov ($round,&wparam(1)); # bits - &and ($base,-16); - &mov ($key,&wparam(2)); # key - &xchg ($base,"esp"); # alloca - &mov (&DWP(48,"esp"),$base); - - &mov ($base,$round); - &shr ($base,5); - &add ($base,5); - &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; - &shl ($base,4); - &lea ($key,&DWP(16,$key,$base)); - - &mov ($out,1); - &mov ($magic,$round); - &shr ($magic,1); - &and ($magic,32); - &xor ($magic,32); # nbits==192?0:32; - - &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); - &call ("_vpaes_schedule_core"); -&set_label("pic_point"); - - 
&mov ("esp",&DWP(48,"esp")); - &xor ("eax","eax"); -&function_end("${PREFIX}_set_decrypt_key"); - -&function_begin("${PREFIX}_encrypt"); - &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); - &call ("_vpaes_preheat"); -&set_label("pic_point"); - &mov ($inp,&wparam(0)); # inp - &lea ($base,&DWP(-56,"esp")); - &mov ($out,&wparam(1)); # out - &and ($base,-16); - &mov ($key,&wparam(2)); # key - &xchg ($base,"esp"); # alloca - &mov (&DWP(48,"esp"),$base); - - &movdqu ("xmm0",&QWP(0,$inp)); - &call ("_vpaes_encrypt_core"); - &movdqu (&QWP(0,$out),"xmm0"); - - &mov ("esp",&DWP(48,"esp")); -&function_end("${PREFIX}_encrypt"); - -&function_begin("${PREFIX}_decrypt"); - &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); - &call ("_vpaes_preheat"); -&set_label("pic_point"); - &mov ($inp,&wparam(0)); # inp - &lea ($base,&DWP(-56,"esp")); - &mov ($out,&wparam(1)); # out - &and ($base,-16); - &mov ($key,&wparam(2)); # key - &xchg ($base,"esp"); # alloca - &mov (&DWP(48,"esp"),$base); - - &movdqu ("xmm0",&QWP(0,$inp)); - &call ("_vpaes_decrypt_core"); - &movdqu (&QWP(0,$out),"xmm0"); - - &mov ("esp",&DWP(48,"esp")); -&function_end("${PREFIX}_decrypt"); - -&function_begin("${PREFIX}_cbc_encrypt"); - &mov ($inp,&wparam(0)); # inp - &mov ($out,&wparam(1)); # out - &mov ($round,&wparam(2)); # len - &mov ($key,&wparam(3)); # key - &sub ($round,16); - &jc (&label("cbc_abort")); - &lea ($base,&DWP(-56,"esp")); - &mov ($const,&wparam(4)); # ivp - &and ($base,-16); - &mov ($magic,&wparam(5)); # enc - &xchg ($base,"esp"); # alloca - &movdqu ("xmm1",&QWP(0,$const)); # load IV - &sub ($out,$inp); - &mov (&DWP(48,"esp"),$base); - - &mov (&DWP(0,"esp"),$out); # save out - &mov (&DWP(4,"esp"),$key) # save key - &mov (&DWP(8,"esp"),$const); # save ivp - &mov ($out,$round); # $out works as $len - - &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); - &call ("_vpaes_preheat"); -&set_label("pic_point"); - &cmp ($magic,0); - &je (&label("cbc_dec_loop")); - &jmp (&label("cbc_enc_loop")); - -&set_label("cbc_enc_loop",16); - &movdqu ("xmm0",&QWP(0,$inp)); # load input - &pxor ("xmm0","xmm1"); # inp^=iv - &call ("_vpaes_encrypt_core"); - &mov ($base,&DWP(0,"esp")); # restore out - &mov ($key,&DWP(4,"esp")); # restore key - &movdqa ("xmm1","xmm0"); - &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output - &lea ($inp,&DWP(16,$inp)); - &sub ($out,16); - &jnc (&label("cbc_enc_loop")); - &jmp (&label("cbc_done")); - -&set_label("cbc_dec_loop",16); - &movdqu ("xmm0",&QWP(0,$inp)); # load input - &movdqa (&QWP(16,"esp"),"xmm1"); # save IV - &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV - &call ("_vpaes_decrypt_core"); - &mov ($base,&DWP(0,"esp")); # restore out - &mov ($key,&DWP(4,"esp")); # restore key - &pxor ("xmm0",&QWP(16,"esp")); # out^=iv - &movdqa ("xmm1",&QWP(32,"esp")); # load next IV - &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output - &lea ($inp,&DWP(16,$inp)); - &sub ($out,16); - &jnc (&label("cbc_dec_loop")); - -&set_label("cbc_done"); - &mov ($base,&DWP(8,"esp")); # restore ivp - &mov ("esp",&DWP(48,"esp")); - &movdqu (&QWP(0,$base),"xmm1"); # write IV -&set_label("cbc_abort"); -&function_end("${PREFIX}_cbc_encrypt"); - -&asm_finish(); +../openssl/./crypto/aes/asm/vpaes-x86.pl
\ No newline at end of file diff --git a/devel/perlasm/aes-ssse3-x86_64.pl b/devel/perlasm/aes-ssse3-x86_64.pl index 212394bc59..4e6c4fa12e 100644..120000 --- a/devel/perlasm/aes-ssse3-x86_64.pl +++ b/devel/perlasm/aes-ssse3-x86_64.pl @@ -1,1206 +1 @@ -#!/usr/bin/env perl - -###################################################################### -## Constant-time SSSE3 AES core implementation. -## version 0.1 -## -## By Mike Hamburg (Stanford University), 2009 -## Public domain. -## -## For details see http://shiftleft.org/papers/vector_aes/ and -## http://crypto.stanford.edu/vpaes/. - -###################################################################### -# September 2011. -# -# Interface to OpenSSL as "almost" drop-in replacement for -# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt -# doesn't handle partial vectors (doesn't have to if called from -# EVP only). "Drop-in" implies that this module doesn't share key -# schedule structure with the original nor does it make assumption -# about its alignment... -# -# Performance summary. aes-x86_64.pl column lists large-block CBC -# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per -# byte processed with 128-bit key, and vpaes-x86_64.pl column - -# [also large-block CBC] encrypt/decrypt. -# -# aes-x86_64.pl vpaes-x86_64.pl -# -# Core 2(**) 29.6/41.1/14.3 21.9/25.2(***) -# Nehalem 29.6/40.3/14.6 10.0/11.8 -# Atom 57.3/74.2/32.1 60.9/77.2(***) -# -# (*) "Hyper-threading" in the context refers rather to cache shared -# among multiple cores, than to specifically Intel HTT. As vast -# majority of contemporary cores share cache, slower code path -# is common place. In other words "with-hyper-threading-off" -# results are presented mostly for reference purposes. -# -# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. -# -# (***) Less impressive improvement on Core 2 and Atom is due to slow -# pshufb, yet it's respectable +36%/62% improvement on Core 2 -# (as implied, over "hyper-threading-safe" code path). -# -# <appro@openssl.org> - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -$PREFIX="vpaes"; - -$code.=<<___; -.text - -## -## _aes_encrypt_core -## -## AES-encrypt %xmm0. 
-## -## Inputs: -## %xmm0 = input -## %xmm9-%xmm15 as in _vpaes_preheat -## (%rdx) = scheduled keys -## -## Output in %xmm0 -## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax -## Preserves %xmm6 - %xmm8 so you get some local vectors -## -## -.type _vpaes_encrypt_core,\@abi-omnipotent -.align 16 -_vpaes_encrypt_core: - mov %rdx, %r9 - mov \$16, %r11 - mov 240(%rdx),%eax - movdqa %xmm9, %xmm1 - movdqa .Lk_ipt(%rip), %xmm2 # iptlo - pandn %xmm0, %xmm1 - movdqu (%r9), %xmm5 # round0 key - psrld \$4, %xmm1 - pand %xmm9, %xmm0 - pshufb %xmm0, %xmm2 - movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi - pshufb %xmm1, %xmm0 - pxor %xmm5, %xmm2 - add \$16, %r9 - pxor %xmm2, %xmm0 - lea .Lk_mc_backward(%rip),%r10 - jmp .Lenc_entry - -.align 16 -.Lenc_loop: - # middle of middle round - movdqa %xmm13, %xmm4 # 4 : sb1u - movdqa %xmm12, %xmm0 # 0 : sb1t - pshufb %xmm2, %xmm4 # 4 = sb1u - pshufb %xmm3, %xmm0 # 0 = sb1t - pxor %xmm5, %xmm4 # 4 = sb1u + k - movdqa %xmm15, %xmm5 # 4 : sb2u - pxor %xmm4, %xmm0 # 0 = A - movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] - pshufb %xmm2, %xmm5 # 4 = sb2u - movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] - movdqa %xmm14, %xmm2 # 2 : sb2t - pshufb %xmm3, %xmm2 # 2 = sb2t - movdqa %xmm0, %xmm3 # 3 = A - pxor %xmm5, %xmm2 # 2 = 2A - pshufb %xmm1, %xmm0 # 0 = B - add \$16, %r9 # next key - pxor %xmm2, %xmm0 # 0 = 2A+B - pshufb %xmm4, %xmm3 # 3 = D - add \$16, %r11 # next mc - pxor %xmm0, %xmm3 # 3 = 2A+B+D - pshufb %xmm1, %xmm0 # 0 = 2B+C - and \$0x30, %r11 # ... mod 4 - sub \$1,%rax # nr-- - pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D - -.Lenc_entry: - # top of round - movdqa %xmm9, %xmm1 # 1 : i - movdqa %xmm11, %xmm5 # 2 : a/k - pandn %xmm0, %xmm1 # 1 = i<<4 - psrld \$4, %xmm1 # 1 = i - pand %xmm9, %xmm0 # 0 = k - pshufb %xmm0, %xmm5 # 2 = a/k - movdqa %xmm10, %xmm3 # 3 : 1/i - pxor %xmm1, %xmm0 # 0 = j - pshufb %xmm1, %xmm3 # 3 = 1/i - movdqa %xmm10, %xmm4 # 4 : 1/j - pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k - pshufb %xmm0, %xmm4 # 4 = 1/j - movdqa %xmm10, %xmm2 # 2 : 1/iak - pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k - pshufb %xmm3, %xmm2 # 2 = 1/iak - movdqa %xmm10, %xmm3 # 3 : 1/jak - pxor %xmm0, %xmm2 # 2 = io - pshufb %xmm4, %xmm3 # 3 = 1/jak - movdqu (%r9), %xmm5 - pxor %xmm1, %xmm3 # 3 = jo - jnz .Lenc_loop - - # middle of last round - movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo - movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 - pshufb %xmm2, %xmm4 # 4 = sbou - pxor %xmm5, %xmm4 # 4 = sb1u + k - pshufb %xmm3, %xmm0 # 0 = sb1t - movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] - pxor %xmm4, %xmm0 # 0 = A - pshufb %xmm1, %xmm0 - ret -.size _vpaes_encrypt_core,.-_vpaes_encrypt_core - -## -## Decryption core -## -## Same API as encryption core. 
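Both cores rest on the same constant-time primitive, visible in the listings above: pand/psrld split every byte into low and high nibbles, two pshufb lookups into 16-entry tables follow, and pxor combines them. A minimal scalar model (editor's sketch; the table names are made up, not the real constants):

    # out[b] = T_lo[b & 0x0F] ^ T_hi[b >> 4], which pshufb evaluates for
    # 16 bytes at once; the tables live in registers, so there is no
    # secret-dependent memory access.
    sub nibble_lookup {
        my ($t_lo, $t_hi, @bytes) = @_;   # two 16-entry tables of bytes
        return map { $t_lo->[$_ & 0x0F] ^ $t_hi->[$_ >> 4] } @bytes;
    }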
-## -.type _vpaes_decrypt_core,\@abi-omnipotent -.align 16 -_vpaes_decrypt_core: - mov %rdx, %r9 # load key - mov 240(%rdx),%eax - movdqa %xmm9, %xmm1 - movdqa .Lk_dipt(%rip), %xmm2 # iptlo - pandn %xmm0, %xmm1 - mov %rax, %r11 - psrld \$4, %xmm1 - movdqu (%r9), %xmm5 # round0 key - shl \$4, %r11 - pand %xmm9, %xmm0 - pshufb %xmm0, %xmm2 - movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi - xor \$0x30, %r11 - lea .Lk_dsbd(%rip),%r10 - pshufb %xmm1, %xmm0 - and \$0x30, %r11 - pxor %xmm5, %xmm2 - movdqa .Lk_mc_forward+48(%rip), %xmm5 - pxor %xmm2, %xmm0 - add \$16, %r9 - add %r10, %r11 - jmp .Ldec_entry - -.align 16 -.Ldec_loop: -## -## Inverse mix columns -## - movdqa -0x20(%r10),%xmm4 # 4 : sb9u - movdqa -0x10(%r10),%xmm1 # 0 : sb9t - pshufb %xmm2, %xmm4 # 4 = sb9u - pshufb %xmm3, %xmm1 # 0 = sb9t - pxor %xmm4, %xmm0 - movdqa 0x00(%r10),%xmm4 # 4 : sbdu - pxor %xmm1, %xmm0 # 0 = ch - movdqa 0x10(%r10),%xmm1 # 0 : sbdt - - pshufb %xmm2, %xmm4 # 4 = sbdu - pshufb %xmm5, %xmm0 # MC ch - pshufb %xmm3, %xmm1 # 0 = sbdt - pxor %xmm4, %xmm0 # 4 = ch - movdqa 0x20(%r10),%xmm4 # 4 : sbbu - pxor %xmm1, %xmm0 # 0 = ch - movdqa 0x30(%r10),%xmm1 # 0 : sbbt - - pshufb %xmm2, %xmm4 # 4 = sbbu - pshufb %xmm5, %xmm0 # MC ch - pshufb %xmm3, %xmm1 # 0 = sbbt - pxor %xmm4, %xmm0 # 4 = ch - movdqa 0x40(%r10),%xmm4 # 4 : sbeu - pxor %xmm1, %xmm0 # 0 = ch - movdqa 0x50(%r10),%xmm1 # 0 : sbet - - pshufb %xmm2, %xmm4 # 4 = sbeu - pshufb %xmm5, %xmm0 # MC ch - pshufb %xmm3, %xmm1 # 0 = sbet - pxor %xmm4, %xmm0 # 4 = ch - add \$16, %r9 # next round key - palignr \$12, %xmm5, %xmm5 - pxor %xmm1, %xmm0 # 0 = ch - sub \$1,%rax # nr-- - -.Ldec_entry: - # top of round - movdqa %xmm9, %xmm1 # 1 : i - pandn %xmm0, %xmm1 # 1 = i<<4 - movdqa %xmm11, %xmm2 # 2 : a/k - psrld \$4, %xmm1 # 1 = i - pand %xmm9, %xmm0 # 0 = k - pshufb %xmm0, %xmm2 # 2 = a/k - movdqa %xmm10, %xmm3 # 3 : 1/i - pxor %xmm1, %xmm0 # 0 = j - pshufb %xmm1, %xmm3 # 3 = 1/i - movdqa %xmm10, %xmm4 # 4 : 1/j - pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k - pshufb %xmm0, %xmm4 # 4 = 1/j - pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k - movdqa %xmm10, %xmm2 # 2 : 1/iak - pshufb %xmm3, %xmm2 # 2 = 1/iak - movdqa %xmm10, %xmm3 # 3 : 1/jak - pxor %xmm0, %xmm2 # 2 = io - pshufb %xmm4, %xmm3 # 3 = 1/jak - movdqu (%r9), %xmm0 - pxor %xmm1, %xmm3 # 3 = jo - jnz .Ldec_loop - - # middle of last round - movdqa 0x60(%r10), %xmm4 # 3 : sbou - pshufb %xmm2, %xmm4 # 4 = sbou - pxor %xmm0, %xmm4 # 4 = sb1u + k - movdqa 0x70(%r10), %xmm0 # 0 : sbot - movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 - pshufb %xmm3, %xmm0 # 0 = sb1t - pxor %xmm4, %xmm0 # 0 = A - pshufb %xmm2, %xmm0 - ret -.size _vpaes_decrypt_core,.-_vpaes_decrypt_core - -######################################################## -## ## -## AES key schedule ## -## ## -######################################################## -.type _vpaes_schedule_core,\@abi-omnipotent -.align 16 -_vpaes_schedule_core: - # rdi = key - # rsi = size in bits - # rdx = buffer - # rcx = direction. 
0=encrypt, 1=decrypt - - call _vpaes_preheat # load the tables - movdqa .Lk_rcon(%rip), %xmm8 # load rcon - movdqu (%rdi), %xmm0 # load key (unaligned) - - # input transform - movdqa %xmm0, %xmm3 - lea .Lk_ipt(%rip), %r11 - call _vpaes_schedule_transform - movdqa %xmm0, %xmm7 - - lea .Lk_sr(%rip),%r10 - test %rcx, %rcx - jnz .Lschedule_am_decrypting - - # encrypting, output zeroth round key after transform - movdqu %xmm0, (%rdx) - jmp .Lschedule_go - -.Lschedule_am_decrypting: - # decrypting, output zeroth round key after shiftrows - movdqa (%r8,%r10),%xmm1 - pshufb %xmm1, %xmm3 - movdqu %xmm3, (%rdx) - xor \$0x30, %r8 - -.Lschedule_go: - cmp \$192, %esi - ja .Lschedule_256 - je .Lschedule_192 - # 128: fall through - -## -## .schedule_128 -## -## 128-bit specific part of key schedule. -## -## This schedule is really simple, because all its parts -## are accomplished by the subroutines. -## -.Lschedule_128: - mov \$10, %esi - -.Loop_schedule_128: - call _vpaes_schedule_round - dec %rsi - jz .Lschedule_mangle_last - call _vpaes_schedule_mangle # write output - jmp .Loop_schedule_128 - -## -## .aes_schedule_192 -## -## 192-bit specific part of key schedule. -## -## The main body of this schedule is the same as the 128-bit -## schedule, but with more smearing. The long, high side is -## stored in %xmm7 as before, and the short, low side is in -## the high bits of %xmm6. -## -## This schedule is somewhat nastier, however, because each -## round produces 192 bits of key material, or 1.5 round keys. -## Therefore, on each cycle we do 2 rounds and produce 3 round -## keys. -## -.align 16 -.Lschedule_192: - movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) - call _vpaes_schedule_transform # input transform - movdqa %xmm0, %xmm6 # save short part - pxor %xmm4, %xmm4 # clear 4 - movhlps %xmm4, %xmm6 # clobber low side with zeros - mov \$4, %esi - -.Loop_schedule_192: - call _vpaes_schedule_round - palignr \$8,%xmm6,%xmm0 - call _vpaes_schedule_mangle # save key n - call _vpaes_schedule_192_smear - call _vpaes_schedule_mangle # save key n+1 - call _vpaes_schedule_round - dec %rsi - jz .Lschedule_mangle_last - call _vpaes_schedule_mangle # save key n+2 - call _vpaes_schedule_192_smear - jmp .Loop_schedule_192 - -## -## .aes_schedule_256 -## -## 256-bit specific part of key schedule. -## -## The structure here is very similar to the 128-bit -## schedule, but with an additional "low side" in -## %xmm6. The low side's rounds are the same as the -## high side's, except no rcon and no rotation. -## -.align 16 -.Lschedule_256: - movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) - call _vpaes_schedule_transform # input transform - mov \$7, %esi - -.Loop_schedule_256: - call _vpaes_schedule_mangle # output low result - movdqa %xmm0, %xmm6 # save cur_lo in xmm6 - - # high round - call _vpaes_schedule_round - dec %rsi - jz .Lschedule_mangle_last - call _vpaes_schedule_mangle - - # low round. swap xmm7 and xmm6 - pshufd \$0xFF, %xmm0, %xmm0 - movdqa %xmm7, %xmm5 - movdqa %xmm6, %xmm7 - call _vpaes_schedule_low_round - movdqa %xmm5, %xmm7 - - jmp .Loop_schedule_256 - - -## -## .aes_schedule_mangle_last -## -## Mangler for last round of key schedule -## Mangles %xmm0 -## when encrypting, outputs out(%xmm0) ^ 63 -## when decrypting, outputs unskew(%xmm0) -## -## Always called right before return... 
jumps to cleanup and exits -## -.align 16 -.Lschedule_mangle_last: - # schedule last round key from xmm0 - lea .Lk_deskew(%rip),%r11 # prepare to deskew - test %rcx, %rcx - jnz .Lschedule_mangle_last_dec - - # encrypting - movdqa (%r8,%r10),%xmm1 - pshufb %xmm1, %xmm0 # output permute - lea .Lk_opt(%rip), %r11 # prepare to output transform - add \$32, %rdx - -.Lschedule_mangle_last_dec: - add \$-16, %rdx - pxor .Lk_s63(%rip), %xmm0 - call _vpaes_schedule_transform # output transform - movdqu %xmm0, (%rdx) # save last key - - # cleanup - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - pxor %xmm4, %xmm4 - pxor %xmm5, %xmm5 - pxor %xmm6, %xmm6 - pxor %xmm7, %xmm7 - ret -.size _vpaes_schedule_core,.-_vpaes_schedule_core - -## -## .aes_schedule_192_smear -## -## Smear the short, low side in the 192-bit key schedule. -## -## Inputs: -## %xmm7: high side, b a x y -## %xmm6: low side, d c 0 0 -## %xmm13: 0 -## -## Outputs: -## %xmm6: b+c+d b+c 0 0 -## %xmm0: b+c+d b+c b a -## -.type _vpaes_schedule_192_smear,\@abi-omnipotent -.align 16 -_vpaes_schedule_192_smear: - pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 - pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a - pxor %xmm1, %xmm6 # -> c+d c 0 0 - pxor %xmm1, %xmm1 - pxor %xmm0, %xmm6 # -> b+c+d b+c b a - movdqa %xmm6, %xmm0 - movhlps %xmm1, %xmm6 # clobber low side with zeros - ret -.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear - -## -## .aes_schedule_round -## -## Runs one main round of the key schedule on %xmm0, %xmm7 -## -## Specifically, runs subbytes on the high dword of %xmm0 -## then rotates it by one byte and xors into the low dword of -## %xmm7. -## -## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -## next rcon. -## -## Smears the dwords of %xmm7 by xoring the low into the -## second low, result into third, result into highest. -## -## Returns results in %xmm7 = %xmm0. -## Clobbers %xmm1-%xmm4, %r11. -## -.type _vpaes_schedule_round,\@abi-omnipotent -.align 16 -_vpaes_schedule_round: - # extract rcon from xmm8 - pxor %xmm1, %xmm1 - palignr \$15, %xmm8, %xmm1 - palignr \$15, %xmm8, %xmm8 - pxor %xmm1, %xmm7 - - # rotate - pshufd \$0xFF, %xmm0, %xmm0 - palignr \$1, %xmm0, %xmm0 - - # fall through... - - # low round: same as high round, but no rotation and no rcon. 
-_vpaes_schedule_low_round: - # smear xmm7 - movdqa %xmm7, %xmm1 - pslldq \$4, %xmm7 - pxor %xmm1, %xmm7 - movdqa %xmm7, %xmm1 - pslldq \$8, %xmm7 - pxor %xmm1, %xmm7 - pxor .Lk_s63(%rip), %xmm7 - - # subbytes - movdqa %xmm9, %xmm1 - pandn %xmm0, %xmm1 - psrld \$4, %xmm1 # 1 = i - pand %xmm9, %xmm0 # 0 = k - movdqa %xmm11, %xmm2 # 2 : a/k - pshufb %xmm0, %xmm2 # 2 = a/k - pxor %xmm1, %xmm0 # 0 = j - movdqa %xmm10, %xmm3 # 3 : 1/i - pshufb %xmm1, %xmm3 # 3 = 1/i - pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k - movdqa %xmm10, %xmm4 # 4 : 1/j - pshufb %xmm0, %xmm4 # 4 = 1/j - pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k - movdqa %xmm10, %xmm2 # 2 : 1/iak - pshufb %xmm3, %xmm2 # 2 = 1/iak - pxor %xmm0, %xmm2 # 2 = io - movdqa %xmm10, %xmm3 # 3 : 1/jak - pshufb %xmm4, %xmm3 # 3 = 1/jak - pxor %xmm1, %xmm3 # 3 = jo - movdqa %xmm13, %xmm4 # 4 : sbou - pshufb %xmm2, %xmm4 # 4 = sbou - movdqa %xmm12, %xmm0 # 0 : sbot - pshufb %xmm3, %xmm0 # 0 = sb1t - pxor %xmm4, %xmm0 # 0 = sbox output - - # add in smeared stuff - pxor %xmm7, %xmm0 - movdqa %xmm0, %xmm7 - ret -.size _vpaes_schedule_round,.-_vpaes_schedule_round - -## -## .aes_schedule_transform -## -## Linear-transform %xmm0 according to tables at (%r11) -## -## Requires that %xmm9 = 0x0F0F... as in preheat -## Output in %xmm0 -## Clobbers %xmm1, %xmm2 -## -.type _vpaes_schedule_transform,\@abi-omnipotent -.align 16 -_vpaes_schedule_transform: - movdqa %xmm9, %xmm1 - pandn %xmm0, %xmm1 - psrld \$4, %xmm1 - pand %xmm9, %xmm0 - movdqa (%r11), %xmm2 # lo - pshufb %xmm0, %xmm2 - movdqa 16(%r11), %xmm0 # hi - pshufb %xmm1, %xmm0 - pxor %xmm2, %xmm0 - ret -.size _vpaes_schedule_transform,.-_vpaes_schedule_transform - -## -## .aes_schedule_mangle -## -## Mangle xmm0 from (basis-transformed) standard version -## to our version. 
-## -## On encrypt, -## xor with 0x63 -## multiply by circulant 0,1,1,1 -## apply shiftrows transform -## -## On decrypt, -## xor with 0x63 -## multiply by "inverse mixcolumns" circulant E,B,D,9 -## deskew -## apply shiftrows transform -## -## -## Writes out to (%rdx), and increments or decrements it -## Keeps track of round number mod 4 in %r8 -## Preserves xmm0 -## Clobbers xmm1-xmm5 -## -.type _vpaes_schedule_mangle,\@abi-omnipotent -.align 16 -_vpaes_schedule_mangle: - movdqa %xmm0, %xmm4 # save xmm0 for later - movdqa .Lk_mc_forward(%rip),%xmm5 - test %rcx, %rcx - jnz .Lschedule_mangle_dec - - # encrypting - add \$16, %rdx - pxor .Lk_s63(%rip),%xmm4 - pshufb %xmm5, %xmm4 - movdqa %xmm4, %xmm3 - pshufb %xmm5, %xmm4 - pxor %xmm4, %xmm3 - pshufb %xmm5, %xmm4 - pxor %xmm4, %xmm3 - - jmp .Lschedule_mangle_both -.align 16 -.Lschedule_mangle_dec: - # inverse mix columns - lea .Lk_dksd(%rip),%r11 - movdqa %xmm9, %xmm1 - pandn %xmm4, %xmm1 - psrld \$4, %xmm1 # 1 = hi - pand %xmm9, %xmm4 # 4 = lo - - movdqa 0x00(%r11), %xmm2 - pshufb %xmm4, %xmm2 - movdqa 0x10(%r11), %xmm3 - pshufb %xmm1, %xmm3 - pxor %xmm2, %xmm3 - pshufb %xmm5, %xmm3 - - movdqa 0x20(%r11), %xmm2 - pshufb %xmm4, %xmm2 - pxor %xmm3, %xmm2 - movdqa 0x30(%r11), %xmm3 - pshufb %xmm1, %xmm3 - pxor %xmm2, %xmm3 - pshufb %xmm5, %xmm3 - - movdqa 0x40(%r11), %xmm2 - pshufb %xmm4, %xmm2 - pxor %xmm3, %xmm2 - movdqa 0x50(%r11), %xmm3 - pshufb %xmm1, %xmm3 - pxor %xmm2, %xmm3 - pshufb %xmm5, %xmm3 - - movdqa 0x60(%r11), %xmm2 - pshufb %xmm4, %xmm2 - pxor %xmm3, %xmm2 - movdqa 0x70(%r11), %xmm3 - pshufb %xmm1, %xmm3 - pxor %xmm2, %xmm3 - - add \$-16, %rdx - -.Lschedule_mangle_both: - movdqa (%r8,%r10),%xmm1 - pshufb %xmm1,%xmm3 - add \$-16, %r8 - and \$0x30, %r8 - movdqu %xmm3, (%rdx) - ret -.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle - -# -# Interface to OpenSSL -# -.globl ${PREFIX}_set_encrypt_key -.type ${PREFIX}_set_encrypt_key,\@function,3 -.align 16 -${PREFIX}_set_encrypt_key: -___ -$code.=<<___ if ($win64); - lea -0xb8(%rsp),%rsp - movaps %xmm6,0x10(%rsp) - movaps %xmm7,0x20(%rsp) - movaps %xmm8,0x30(%rsp) - movaps %xmm9,0x40(%rsp) - movaps %xmm10,0x50(%rsp) - movaps %xmm11,0x60(%rsp) - movaps %xmm12,0x70(%rsp) - movaps %xmm13,0x80(%rsp) - movaps %xmm14,0x90(%rsp) - movaps %xmm15,0xa0(%rsp) -.Lenc_key_body: -___ -$code.=<<___; - mov %esi,%eax - shr \$5,%eax - add \$5,%eax - mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - - mov \$0,%ecx - mov \$0x30,%r8d - call _vpaes_schedule_core -___ -$code.=<<___ if ($win64); - movaps 0x10(%rsp),%xmm6 - movaps 0x20(%rsp),%xmm7 - movaps 0x30(%rsp),%xmm8 - movaps 0x40(%rsp),%xmm9 - movaps 0x50(%rsp),%xmm10 - movaps 0x60(%rsp),%xmm11 - movaps 0x70(%rsp),%xmm12 - movaps 0x80(%rsp),%xmm13 - movaps 0x90(%rsp),%xmm14 - movaps 0xa0(%rsp),%xmm15 - lea 0xb8(%rsp),%rsp -.Lenc_key_epilogue: -___ -$code.=<<___; - xor %eax,%eax - ret -.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key - -.globl ${PREFIX}_set_decrypt_key -.type ${PREFIX}_set_decrypt_key,\@function,3 -.align 16 -${PREFIX}_set_decrypt_key: -___ -$code.=<<___ if ($win64); - lea -0xb8(%rsp),%rsp - movaps %xmm6,0x10(%rsp) - movaps %xmm7,0x20(%rsp) - movaps %xmm8,0x30(%rsp) - movaps %xmm9,0x40(%rsp) - movaps %xmm10,0x50(%rsp) - movaps %xmm11,0x60(%rsp) - movaps %xmm12,0x70(%rsp) - movaps %xmm13,0x80(%rsp) - movaps %xmm14,0x90(%rsp) - movaps %xmm15,0xa0(%rsp) -.Ldec_key_body: -___ -$code.=<<___; - mov %esi,%eax - shr \$5,%eax - add \$5,%eax - mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - shl \$4,%eax - lea 
16(%rdx,%rax),%rdx - - mov \$1,%ecx - mov %esi,%r8d - shr \$1,%r8d - and \$32,%r8d - xor \$32,%r8d # nbits==192?0:32 - call _vpaes_schedule_core -___ -$code.=<<___ if ($win64); - movaps 0x10(%rsp),%xmm6 - movaps 0x20(%rsp),%xmm7 - movaps 0x30(%rsp),%xmm8 - movaps 0x40(%rsp),%xmm9 - movaps 0x50(%rsp),%xmm10 - movaps 0x60(%rsp),%xmm11 - movaps 0x70(%rsp),%xmm12 - movaps 0x80(%rsp),%xmm13 - movaps 0x90(%rsp),%xmm14 - movaps 0xa0(%rsp),%xmm15 - lea 0xb8(%rsp),%rsp -.Ldec_key_epilogue: -___ -$code.=<<___; - xor %eax,%eax - ret -.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key - -.globl ${PREFIX}_encrypt -.type ${PREFIX}_encrypt,\@function,3 -.align 16 -${PREFIX}_encrypt: -___ -$code.=<<___ if ($win64); - lea -0xb8(%rsp),%rsp - movaps %xmm6,0x10(%rsp) - movaps %xmm7,0x20(%rsp) - movaps %xmm8,0x30(%rsp) - movaps %xmm9,0x40(%rsp) - movaps %xmm10,0x50(%rsp) - movaps %xmm11,0x60(%rsp) - movaps %xmm12,0x70(%rsp) - movaps %xmm13,0x80(%rsp) - movaps %xmm14,0x90(%rsp) - movaps %xmm15,0xa0(%rsp) -.Lenc_body: -___ -$code.=<<___; - movdqu (%rdi),%xmm0 - call _vpaes_preheat - call _vpaes_encrypt_core - movdqu %xmm0,(%rsi) -___ -$code.=<<___ if ($win64); - movaps 0x10(%rsp),%xmm6 - movaps 0x20(%rsp),%xmm7 - movaps 0x30(%rsp),%xmm8 - movaps 0x40(%rsp),%xmm9 - movaps 0x50(%rsp),%xmm10 - movaps 0x60(%rsp),%xmm11 - movaps 0x70(%rsp),%xmm12 - movaps 0x80(%rsp),%xmm13 - movaps 0x90(%rsp),%xmm14 - movaps 0xa0(%rsp),%xmm15 - lea 0xb8(%rsp),%rsp -.Lenc_epilogue: -___ -$code.=<<___; - ret -.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt - -.globl ${PREFIX}_decrypt -.type ${PREFIX}_decrypt,\@function,3 -.align 16 -${PREFIX}_decrypt: -___ -$code.=<<___ if ($win64); - lea -0xb8(%rsp),%rsp - movaps %xmm6,0x10(%rsp) - movaps %xmm7,0x20(%rsp) - movaps %xmm8,0x30(%rsp) - movaps %xmm9,0x40(%rsp) - movaps %xmm10,0x50(%rsp) - movaps %xmm11,0x60(%rsp) - movaps %xmm12,0x70(%rsp) - movaps %xmm13,0x80(%rsp) - movaps %xmm14,0x90(%rsp) - movaps %xmm15,0xa0(%rsp) -.Ldec_body: -___ -$code.=<<___; - movdqu (%rdi),%xmm0 - call _vpaes_preheat - call _vpaes_decrypt_core - movdqu %xmm0,(%rsi) -___ -$code.=<<___ if ($win64); - movaps 0x10(%rsp),%xmm6 - movaps 0x20(%rsp),%xmm7 - movaps 0x30(%rsp),%xmm8 - movaps 0x40(%rsp),%xmm9 - movaps 0x50(%rsp),%xmm10 - movaps 0x60(%rsp),%xmm11 - movaps 0x70(%rsp),%xmm12 - movaps 0x80(%rsp),%xmm13 - movaps 0x90(%rsp),%xmm14 - movaps 0xa0(%rsp),%xmm15 - lea 0xb8(%rsp),%rsp -.Ldec_epilogue: -___ -$code.=<<___; - ret -.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt -___ -{ -my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); -# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, -# size_t length, const AES_KEY *key, -# unsigned char *ivp,const int enc); -$code.=<<___; -.globl ${PREFIX}_cbc_encrypt -.type ${PREFIX}_cbc_encrypt,\@function,6 -.align 16 -${PREFIX}_cbc_encrypt: - xchg $key,$len -___ -($len,$key)=($key,$len); -$code.=<<___; - sub \$16,$len - jc .Lcbc_abort -___ -$code.=<<___ if ($win64); - lea -0xb8(%rsp),%rsp - movaps %xmm6,0x10(%rsp) - movaps %xmm7,0x20(%rsp) - movaps %xmm8,0x30(%rsp) - movaps %xmm9,0x40(%rsp) - movaps %xmm10,0x50(%rsp) - movaps %xmm11,0x60(%rsp) - movaps %xmm12,0x70(%rsp) - movaps %xmm13,0x80(%rsp) - movaps %xmm14,0x90(%rsp) - movaps %xmm15,0xa0(%rsp) -.Lcbc_body: -___ -$code.=<<___; - movdqu ($ivp),%xmm6 # load IV - sub $inp,$out - call _vpaes_preheat - cmp \$0,${enc}d - je .Lcbc_dec_loop - jmp .Lcbc_enc_loop -.align 16 -.Lcbc_enc_loop: - movdqu ($inp),%xmm0 - pxor %xmm6,%xmm0 - call _vpaes_encrypt_core - movdqa %xmm0,%xmm6 - movdqu 
%xmm0,($out,$inp) - lea 16($inp),$inp - sub \$16,$len - jnc .Lcbc_enc_loop - jmp .Lcbc_done -.align 16 -.Lcbc_dec_loop: - movdqu ($inp),%xmm0 - movdqa %xmm0,%xmm7 - call _vpaes_decrypt_core - pxor %xmm6,%xmm0 - movdqa %xmm7,%xmm6 - movdqu %xmm0,($out,$inp) - lea 16($inp),$inp - sub \$16,$len - jnc .Lcbc_dec_loop -.Lcbc_done: - movdqu %xmm6,($ivp) # save IV -___ -$code.=<<___ if ($win64); - movaps 0x10(%rsp),%xmm6 - movaps 0x20(%rsp),%xmm7 - movaps 0x30(%rsp),%xmm8 - movaps 0x40(%rsp),%xmm9 - movaps 0x50(%rsp),%xmm10 - movaps 0x60(%rsp),%xmm11 - movaps 0x70(%rsp),%xmm12 - movaps 0x80(%rsp),%xmm13 - movaps 0x90(%rsp),%xmm14 - movaps 0xa0(%rsp),%xmm15 - lea 0xb8(%rsp),%rsp -.Lcbc_epilogue: -___ -$code.=<<___; -.Lcbc_abort: - ret -.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt -___ -} -$code.=<<___; -## -## _aes_preheat -## -## Fills register %r10 -> .aes_consts (so you can -fPIC) -## and %xmm9-%xmm15 as specified below. -## -.type _vpaes_preheat,\@abi-omnipotent -.align 16 -_vpaes_preheat: - lea .Lk_s0F(%rip), %r10 - movdqa -0x20(%r10), %xmm10 # .Lk_inv - movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 - movdqa 0x00(%r10), %xmm9 # .Lk_s0F - movdqa 0x30(%r10), %xmm13 # .Lk_sb1 - movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 - movdqa 0x50(%r10), %xmm15 # .Lk_sb2 - movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 - ret -.size _vpaes_preheat,.-_vpaes_preheat -######################################################## -## ## -## Constants ## -## ## -######################################################## -.type _vpaes_consts,\@object -.align 64 -_vpaes_consts: -.Lk_inv: # inv, inva - .quad 0x0E05060F0D080180, 0x040703090A0B0C02 - .quad 0x01040A060F0B0780, 0x030D0E0C02050809 - -.Lk_s0F: # s0F - .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F - -.Lk_ipt: # input transform (lo, hi) - .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 - .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 - -.Lk_sb1: # sb1u, sb1t - .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 - .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -.Lk_sb2: # sb2u, sb2t - .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD - .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -.Lk_sbo: # sbou, sbot - .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 - .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA - -.Lk_mc_forward: # mc_forward - .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 - .quad 0x080B0A0904070605, 0x000302010C0F0E0D - .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 - .quad 0x000302010C0F0E0D, 0x080B0A0904070605 - -.Lk_mc_backward:# mc_backward - .quad 0x0605040702010003, 0x0E0D0C0F0A09080B - .quad 0x020100030E0D0C0F, 0x0A09080B06050407 - .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 - .quad 0x0A09080B06050407, 0x020100030E0D0C0F - -.Lk_sr: # sr - .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 - .quad 0x030E09040F0A0500, 0x0B06010C07020D08 - .quad 0x0F060D040B020900, 0x070E050C030A0108 - .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -.Lk_rcon: # rcon - .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -.Lk_s63: # s63: all equal to 0x63 transformed - .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B - -.Lk_opt: # output transform - .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 - .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 - -.Lk_deskew: # deskew tables: inverts the sbox's "skew" - .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A - .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 - -## -## Decryption stuff -## Key schedule constants -## -.Lk_dksd: # decryption key schedule: invskew x*D - .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 - .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -.Lk_dksb: # decryption key schedule: 
invskew x*B - .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 - .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -.Lk_dkse: # decryption key schedule: invskew x*E + 0x63 - .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 - .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -.Lk_dks9: # decryption key schedule: invskew x*9 - .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC - .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - -## -## Decryption stuff -## Round function constants -## -.Lk_dipt: # decryption input transform - .quad 0x0F505B040B545F00, 0x154A411E114E451A - .quad 0x86E383E660056500, 0x12771772F491F194 - -.Lk_dsb9: # decryption sbox output *9*u, *9*t - .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 - .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -.Lk_dsbd: # decryption sbox output *D*u, *D*t - .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 - .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -.Lk_dsbb: # decryption sbox output *B*u, *B*t - .quad 0xD022649296B44200, 0x602646F6B0F2D404 - .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -.Lk_dsbe: # decryption sbox output *E*u, *E*t - .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 - .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 -.Lk_dsbo: # decryption sbox final output - .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D - .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" -.align 64 -.size _vpaes_consts,.-_vpaes_consts -___ - -if ($win64) { -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lin_prologue - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lin_prologue - - lea 16(%rax),%rsi # %xmm save area - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) - .long 0xa548f3fc # cld; rep movsq - lea 0xb8(%rax),%rax # adjust stack pointer - -.Lin_prologue: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$`1232/8`,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop 
%rsi - ret -.size se_handler,.-se_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_${PREFIX}_set_encrypt_key - .rva .LSEH_end_${PREFIX}_set_encrypt_key - .rva .LSEH_info_${PREFIX}_set_encrypt_key - - .rva .LSEH_begin_${PREFIX}_set_decrypt_key - .rva .LSEH_end_${PREFIX}_set_decrypt_key - .rva .LSEH_info_${PREFIX}_set_decrypt_key - - .rva .LSEH_begin_${PREFIX}_encrypt - .rva .LSEH_end_${PREFIX}_encrypt - .rva .LSEH_info_${PREFIX}_encrypt - - .rva .LSEH_begin_${PREFIX}_decrypt - .rva .LSEH_end_${PREFIX}_decrypt - .rva .LSEH_info_${PREFIX}_decrypt - - .rva .LSEH_begin_${PREFIX}_cbc_encrypt - .rva .LSEH_end_${PREFIX}_cbc_encrypt - .rva .LSEH_info_${PREFIX}_cbc_encrypt - -.section .xdata -.align 8 -.LSEH_info_${PREFIX}_set_encrypt_key: - .byte 9,0,0,0 - .rva se_handler - .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] -.LSEH_info_${PREFIX}_set_decrypt_key: - .byte 9,0,0,0 - .rva se_handler - .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] -.LSEH_info_${PREFIX}_encrypt: - .byte 9,0,0,0 - .rva se_handler - .rva .Lenc_body,.Lenc_epilogue # HandlerData[] -.LSEH_info_${PREFIX}_decrypt: - .byte 9,0,0,0 - .rva se_handler - .rva .Ldec_body,.Ldec_epilogue # HandlerData[] -.LSEH_info_${PREFIX}_cbc_encrypt: - .byte 9,0,0,0 - .rva se_handler - .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] -___ -} - -$code =~ s/\`([^\`]*)\`/eval($1)/gem; - -print $code; - -close STDOUT; +../openssl/./crypto/aes/asm/vpaes-x86_64.pl
\ No newline at end of file diff --git a/devel/perlasm/aesni-x86.pl b/devel/perlasm/aesni-x86.pl index 14ff2602ed..770bf0a7c7 100644..120000 --- a/devel/perlasm/aesni-x86.pl +++ b/devel/perlasm/aesni-x86.pl @@ -1,2189 +1 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements support for the Intel AES-NI extension. In -# the OpenSSL context it's used with the Intel engine, but it can also -# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see -# below for details]. -# -# Performance. -# -# To start with, see the corresponding paragraph in aesni-x86_64.pl... -# Instead of filling a table similar to the one found there, I've -# chosen to summarize *comparison* results for raw ECB, CTR and CBC -# benchmarks. The simplified table below represents 32-bit performance -# relative to the 64-bit one at every given point. Ratios vary for -# different encryption modes, hence the interval values. - -# 16-byte 64-byte 256-byte 1-KB 8-KB -# 53-67% 67-84% 91-94% 95-98% 97-99.5% -# -# Lower ratios for smaller block sizes are perfectly understandable, -# because function call overhead is higher in 32-bit mode. Largest -# 8-KB block performance is virtually the same: 32-bit code is less -# than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. - -# January 2011 -# -# See aesni-x86_64.pl for details. Unlike the x86_64 version, this -# module interleaves at most 6 aes[enc|dec] instructions, because -# there are not enough registers for 8x interleave [which should be -# optimal for Sandy Bridge]. Actually, the performance results for the -# 6x interleave factor presented in aesni-x86_64.pl (except for CTR) -# are for this module. - -# April 2011 -# -# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing -# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. 
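- -# For intuition: cycles-per-byte converts to raw throughput as -# clock_rate/cpb. A minimal helper for sanity-checking such figures -# (an illustrative sketch only, not referenced anywhere in this -# module; the 3.16 GHz clock in the example is an assumed Westmere -# frequency): - -sub cpb_to_mbps { my($cpb,$ghz)=@_; return $ghz*1000/$cpb; } # cycles/byte -> MB/s - -# e.g. cpb_to_mbps(1.50,3.16) gives ~2107 MB/s for XTS-128 on Westmere.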
- -$PREFIX="aesni"; # if $PREFIX is set to "AES", the script - # generates drop-in replacement for - # crypto/aes/asm/aes-586.pl:-) -$inline=1; # inline _aesni_[en|de]crypt - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],$0); - -if ($PREFIX eq "aesni") { $movekey=\&movups; } -else { $movekey=\&movups; } - -$len="eax"; -$rounds="ecx"; -$key="edx"; -$inp="esi"; -$out="edi"; -$rounds_="ebx"; # backup copy for $rounds -$key_="ebp"; # backup copy for $key - -$rndkey0="xmm0"; -$rndkey1="xmm1"; -$inout0="xmm2"; -$inout1="xmm3"; -$inout2="xmm4"; -$inout3="xmm5"; $in1="xmm5"; -$inout4="xmm6"; $in0="xmm6"; -$inout5="xmm7"; $ivec="xmm7"; - -# AESNI extension -sub aeskeygenassist -{ my($dst,$src,$imm)=@_; - if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) - { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } -} -sub aescommon -{ my($opcodelet,$dst,$src)=@_; - if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) - { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} -} -sub aesimc { aescommon(0xdb,@_); } -sub aesenc { aescommon(0xdc,@_); } -sub aesenclast { aescommon(0xdd,@_); } -sub aesdec { aescommon(0xde,@_); } -sub aesdeclast { aescommon(0xdf,@_); } - -# Inline version of internal aesni_[en|de]crypt1 -{ my $sn; -sub aesni_inline_generate1 -{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); - $sn++; - - &$movekey ($rndkey0,&QWP(0,$key)); - &$movekey ($rndkey1,&QWP(16,$key)); - &xorps ($ivec,$rndkey0) if (defined($ivec)); - &lea ($key,&DWP(32,$key)); - &xorps ($inout,$ivec) if (defined($ivec)); - &xorps ($inout,$rndkey0) if (!defined($ivec)); - &set_label("${p}1_loop_$sn"); - eval"&aes${p} ($inout,$rndkey1)"; - &dec ($rounds); - &$movekey ($rndkey1,&QWP(0,$key)); - &lea ($key,&DWP(16,$key)); - &jnz (&label("${p}1_loop_$sn")); - eval"&aes${p}last ($inout,$rndkey1)"; -}} - -sub aesni_generate1 # fully unrolled loop -{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); - - &function_begin_B("_aesni_${p}rypt1"); - &movups ($rndkey0,&QWP(0,$key)); - &$movekey ($rndkey1,&QWP(0x10,$key)); - &xorps ($inout,$rndkey0); - &$movekey ($rndkey0,&QWP(0x20,$key)); - &lea ($key,&DWP(0x30,$key)); - &cmp ($rounds,11); - &jb (&label("${p}128")); - &lea ($key,&DWP(0x20,$key)); - &je (&label("${p}192")); - &lea ($key,&DWP(0x20,$key)); - eval"&aes${p} ($inout,$rndkey1)"; - &$movekey ($rndkey1,&QWP(-0x40,$key)); - eval"&aes${p} ($inout,$rndkey0)"; - &$movekey ($rndkey0,&QWP(-0x30,$key)); - &set_label("${p}192"); - eval"&aes${p} ($inout,$rndkey1)"; - &$movekey ($rndkey1,&QWP(-0x20,$key)); - eval"&aes${p} ($inout,$rndkey0)"; - &$movekey ($rndkey0,&QWP(-0x10,$key)); - &set_label("${p}128"); - eval"&aes${p} ($inout,$rndkey1)"; - &$movekey ($rndkey1,&QWP(0,$key)); - eval"&aes${p} ($inout,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0x10,$key)); - eval"&aes${p} ($inout,$rndkey1)"; - &$movekey ($rndkey1,&QWP(0x20,$key)); - eval"&aes${p} ($inout,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0x30,$key)); - eval"&aes${p} ($inout,$rndkey1)"; - &$movekey ($rndkey1,&QWP(0x40,$key)); - eval"&aes${p} ($inout,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0x50,$key)); - eval"&aes${p} ($inout,$rndkey1)"; - &$movekey ($rndkey1,&QWP(0x60,$key)); - eval"&aes${p} ($inout,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0x70,$key)); - eval"&aes${p} ($inout,$rndkey1)"; - eval"&aes${p}last ($inout,$rndkey0)"; - &ret(); - &function_end_B("_aesni_${p}rypt1"); -} - -# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); -&aesni_generate1("enc") if (!$inline); 
-&function_begin_B("${PREFIX}_encrypt"); - &mov ("eax",&wparam(0)); - &mov ($key,&wparam(2)); - &movups ($inout0,&QWP(0,"eax")); - &mov ($rounds,&DWP(240,$key)); - &mov ("eax",&wparam(1)); - if ($inline) - { &aesni_inline_generate1("enc"); } - else - { &call ("_aesni_encrypt1"); } - &movups (&QWP(0,"eax"),$inout0); - &ret (); -&function_end_B("${PREFIX}_encrypt"); - -# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); -&aesni_generate1("dec") if(!$inline); -&function_begin_B("${PREFIX}_decrypt"); - &mov ("eax",&wparam(0)); - &mov ($key,&wparam(2)); - &movups ($inout0,&QWP(0,"eax")); - &mov ($rounds,&DWP(240,$key)); - &mov ("eax",&wparam(1)); - if ($inline) - { &aesni_inline_generate1("dec"); } - else - { &call ("_aesni_decrypt1"); } - &movups (&QWP(0,"eax"),$inout0); - &ret (); -&function_end_B("${PREFIX}_decrypt"); - -# _aesni_[en|de]cryptN are private interfaces, N denotes the interleave -# factor. Why were 3x subroutines originally used in loops? Even though -# aes[enc|dec] latency was originally 6, it could be scheduled only -# every *2nd* cycle. Thus 3x interleave was the one providing optimal -# utilization, i.e. the point where the subroutine's throughput is -# virtually the same as that of the non-interleaved subroutine [for -# numbers of input blocks up to 3]. This is why it makes no sense to -# implement a 2x subroutine. aes[enc|dec] latency in the next processor -# generation is 8, but the instructions can be scheduled every cycle. -# The optimal interleave for the new processor is therefore 8x, but it's -# infeasible to accommodate that in the XMM registers addressable in -# 32-bit mode, and therefore 6x is used instead... - -sub aesni_generate3 -{ my $p=shift; - - &function_begin_B("_aesni_${p}rypt3"); - &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); - &$movekey ($rndkey1,&QWP(16,$key)); - &lea ($key,&DWP(32,$key)); - &xorps ($inout0,$rndkey0); - &pxor ($inout1,$rndkey0); - &pxor ($inout2,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); - - &set_label("${p}3_loop"); - eval"&aes${p} ($inout0,$rndkey1)"; - eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); - eval"&aes${p} ($inout2,$rndkey1)"; - &$movekey ($rndkey1,&QWP(16,$key)); - eval"&aes${p} ($inout0,$rndkey0)"; - eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); - eval"&aes${p} ($inout2,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); - &jnz (&label("${p}3_loop")); - eval"&aes${p} ($inout0,$rndkey1)"; - eval"&aes${p} ($inout1,$rndkey1)"; - eval"&aes${p} ($inout2,$rndkey1)"; - eval"&aes${p}last ($inout0,$rndkey0)"; - eval"&aes${p}last ($inout1,$rndkey0)"; - eval"&aes${p}last ($inout2,$rndkey0)"; - &ret(); - &function_end_B("_aesni_${p}rypt3"); -} - -# 4x interleave is implemented to improve small block performance, -# most notably [and naturally] 4-block performance by ~30%. One can -# argue that one should have implemented 5x as well, but the improvement -# would be <20%, so it's not worth it... A toy sketch of the interleave -# rule follows. 
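- -# The interleave numbers above reduce to a simple rule: blocks kept in -# flight = ceil(latency/issue interval). A throwaway sketch of that -# rule (an illustrative aside, not used by this module; the latency and -# issue figures are the ones quoted in the comments above): - -sub optimal_interleave -{ my ($latency,$issue)=@_; - return int(($latency+$issue-1)/$issue); # blocks needed to keep the unit busy -} - -# optimal_interleave(6,2) is 3 [pre-Westmere] and optimal_interleave(8,1) -# is 8 [Sandy Bridge], capped at 6 here by the XMM registers addressable -# in 32-bit mode; aesni_generate4 below covers the small-block tail.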
-sub aesni_generate4 -{ my $p=shift; - - &function_begin_B("_aesni_${p}rypt4"); - &$movekey ($rndkey0,&QWP(0,$key)); - &$movekey ($rndkey1,&QWP(16,$key)); - &shr ($rounds,1); - &lea ($key,&DWP(32,$key)); - &xorps ($inout0,$rndkey0); - &pxor ($inout1,$rndkey0); - &pxor ($inout2,$rndkey0); - &pxor ($inout3,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); - - &set_label("${p}4_loop"); - eval"&aes${p} ($inout0,$rndkey1)"; - eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); - eval"&aes${p} ($inout2,$rndkey1)"; - eval"&aes${p} ($inout3,$rndkey1)"; - &$movekey ($rndkey1,&QWP(16,$key)); - eval"&aes${p} ($inout0,$rndkey0)"; - eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); - eval"&aes${p} ($inout2,$rndkey0)"; - eval"&aes${p} ($inout3,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); - &jnz (&label("${p}4_loop")); - - eval"&aes${p} ($inout0,$rndkey1)"; - eval"&aes${p} ($inout1,$rndkey1)"; - eval"&aes${p} ($inout2,$rndkey1)"; - eval"&aes${p} ($inout3,$rndkey1)"; - eval"&aes${p}last ($inout0,$rndkey0)"; - eval"&aes${p}last ($inout1,$rndkey0)"; - eval"&aes${p}last ($inout2,$rndkey0)"; - eval"&aes${p}last ($inout3,$rndkey0)"; - &ret(); - &function_end_B("_aesni_${p}rypt4"); -} - -sub aesni_generate6 -{ my $p=shift; - - &function_begin_B("_aesni_${p}rypt6"); - &static_label("_aesni_${p}rypt6_enter"); - &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); - &$movekey ($rndkey1,&QWP(16,$key)); - &lea ($key,&DWP(32,$key)); - &xorps ($inout0,$rndkey0); - &pxor ($inout1,$rndkey0); # pxor does better here - eval"&aes${p} ($inout0,$rndkey1)"; - &pxor ($inout2,$rndkey0); - eval"&aes${p} ($inout1,$rndkey1)"; - &pxor ($inout3,$rndkey0); - &dec ($rounds); - eval"&aes${p} ($inout2,$rndkey1)"; - &pxor ($inout4,$rndkey0); - eval"&aes${p} ($inout3,$rndkey1)"; - &pxor ($inout5,$rndkey0); - eval"&aes${p} ($inout4,$rndkey1)"; - &$movekey ($rndkey0,&QWP(0,$key)); - eval"&aes${p} ($inout5,$rndkey1)"; - &jmp (&label("_aesni_${p}rypt6_enter")); - - &set_label("${p}6_loop",16); - eval"&aes${p} ($inout0,$rndkey1)"; - eval"&aes${p} ($inout1,$rndkey1)"; - &dec ($rounds); - eval"&aes${p} ($inout2,$rndkey1)"; - eval"&aes${p} ($inout3,$rndkey1)"; - eval"&aes${p} ($inout4,$rndkey1)"; - eval"&aes${p} ($inout5,$rndkey1)"; - &set_label("_aesni_${p}rypt6_enter",16); - &$movekey ($rndkey1,&QWP(16,$key)); - eval"&aes${p} ($inout0,$rndkey0)"; - eval"&aes${p} ($inout1,$rndkey0)"; - &lea ($key,&DWP(32,$key)); - eval"&aes${p} ($inout2,$rndkey0)"; - eval"&aes${p} ($inout3,$rndkey0)"; - eval"&aes${p} ($inout4,$rndkey0)"; - eval"&aes${p} ($inout5,$rndkey0)"; - &$movekey ($rndkey0,&QWP(0,$key)); - &jnz (&label("${p}6_loop")); - - eval"&aes${p} ($inout0,$rndkey1)"; - eval"&aes${p} ($inout1,$rndkey1)"; - eval"&aes${p} ($inout2,$rndkey1)"; - eval"&aes${p} ($inout3,$rndkey1)"; - eval"&aes${p} ($inout4,$rndkey1)"; - eval"&aes${p} ($inout5,$rndkey1)"; - eval"&aes${p}last ($inout0,$rndkey0)"; - eval"&aes${p}last ($inout1,$rndkey0)"; - eval"&aes${p}last ($inout2,$rndkey0)"; - eval"&aes${p}last ($inout3,$rndkey0)"; - eval"&aes${p}last ($inout4,$rndkey0)"; - eval"&aes${p}last ($inout5,$rndkey0)"; - &ret(); - &function_end_B("_aesni_${p}rypt6"); -} -&aesni_generate3("enc") if ($PREFIX eq "aesni"); -&aesni_generate3("dec"); -&aesni_generate4("enc") if ($PREFIX eq "aesni"); -&aesni_generate4("dec"); -&aesni_generate6("enc") if ($PREFIX eq "aesni"); -&aesni_generate6("dec"); - -if ($PREFIX eq "aesni") { -###################################################################### -# void aesni_ecb_encrypt (const void *in, void *out, -# size_t 
length, const AES_KEY *key, -# int enc); -&function_begin("aesni_ecb_encrypt"); - &mov ($inp,&wparam(0)); - &mov ($out,&wparam(1)); - &mov ($len,&wparam(2)); - &mov ($key,&wparam(3)); - &mov ($rounds_,&wparam(4)); - &and ($len,-16); - &jz (&label("ecb_ret")); - &mov ($rounds,&DWP(240,$key)); - &test ($rounds_,$rounds_); - &jz (&label("ecb_decrypt")); - - &mov ($key_,$key); # backup $key - &mov ($rounds_,$rounds); # backup $rounds - &cmp ($len,0x60); - &jb (&label("ecb_enc_tail")); - - &movdqu ($inout0,&QWP(0,$inp)); - &movdqu ($inout1,&QWP(0x10,$inp)); - &movdqu ($inout2,&QWP(0x20,$inp)); - &movdqu ($inout3,&QWP(0x30,$inp)); - &movdqu ($inout4,&QWP(0x40,$inp)); - &movdqu ($inout5,&QWP(0x50,$inp)); - &lea ($inp,&DWP(0x60,$inp)); - &sub ($len,0x60); - &jmp (&label("ecb_enc_loop6_enter")); - -&set_label("ecb_enc_loop6",16); - &movups (&QWP(0,$out),$inout0); - &movdqu ($inout0,&QWP(0,$inp)); - &movups (&QWP(0x10,$out),$inout1); - &movdqu ($inout1,&QWP(0x10,$inp)); - &movups (&QWP(0x20,$out),$inout2); - &movdqu ($inout2,&QWP(0x20,$inp)); - &movups (&QWP(0x30,$out),$inout3); - &movdqu ($inout3,&QWP(0x30,$inp)); - &movups (&QWP(0x40,$out),$inout4); - &movdqu ($inout4,&QWP(0x40,$inp)); - &movups (&QWP(0x50,$out),$inout5); - &lea ($out,&DWP(0x60,$out)); - &movdqu ($inout5,&QWP(0x50,$inp)); - &lea ($inp,&DWP(0x60,$inp)); -&set_label("ecb_enc_loop6_enter"); - - &call ("_aesni_encrypt6"); - - &mov ($key,$key_); # restore $key - &mov ($rounds,$rounds_); # restore $rounds - &sub ($len,0x60); - &jnc (&label("ecb_enc_loop6")); - - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - &movups (&QWP(0x40,$out),$inout4); - &movups (&QWP(0x50,$out),$inout5); - &lea ($out,&DWP(0x60,$out)); - &add ($len,0x60); - &jz (&label("ecb_ret")); - -&set_label("ecb_enc_tail"); - &movups ($inout0,&QWP(0,$inp)); - &cmp ($len,0x20); - &jb (&label("ecb_enc_one")); - &movups ($inout1,&QWP(0x10,$inp)); - &je (&label("ecb_enc_two")); - &movups ($inout2,&QWP(0x20,$inp)); - &cmp ($len,0x40); - &jb (&label("ecb_enc_three")); - &movups ($inout3,&QWP(0x30,$inp)); - &je (&label("ecb_enc_four")); - &movups ($inout4,&QWP(0x40,$inp)); - &xorps ($inout5,$inout5); - &call ("_aesni_encrypt6"); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - &movups (&QWP(0x40,$out),$inout4); - &jmp (&label("ecb_ret")); - -&set_label("ecb_enc_one",16); - if ($inline) - { &aesni_inline_generate1("enc"); } - else - { &call ("_aesni_encrypt1"); } - &movups (&QWP(0,$out),$inout0); - &jmp (&label("ecb_ret")); - -&set_label("ecb_enc_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_encrypt3"); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &jmp (&label("ecb_ret")); - -&set_label("ecb_enc_three",16); - &call ("_aesni_encrypt3"); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &jmp (&label("ecb_ret")); - -&set_label("ecb_enc_four",16); - &call ("_aesni_encrypt4"); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - &jmp (&label("ecb_ret")); -###################################################################### -&set_label("ecb_decrypt",16); - &mov ($key_,$key); # backup $key - &mov ($rounds_,$rounds); # backup $rounds - &cmp ($len,0x60); - &jb (&label("ecb_dec_tail")); - - &movdqu 
($inout0,&QWP(0,$inp)); - &movdqu ($inout1,&QWP(0x10,$inp)); - &movdqu ($inout2,&QWP(0x20,$inp)); - &movdqu ($inout3,&QWP(0x30,$inp)); - &movdqu ($inout4,&QWP(0x40,$inp)); - &movdqu ($inout5,&QWP(0x50,$inp)); - &lea ($inp,&DWP(0x60,$inp)); - &sub ($len,0x60); - &jmp (&label("ecb_dec_loop6_enter")); - -&set_label("ecb_dec_loop6",16); - &movups (&QWP(0,$out),$inout0); - &movdqu ($inout0,&QWP(0,$inp)); - &movups (&QWP(0x10,$out),$inout1); - &movdqu ($inout1,&QWP(0x10,$inp)); - &movups (&QWP(0x20,$out),$inout2); - &movdqu ($inout2,&QWP(0x20,$inp)); - &movups (&QWP(0x30,$out),$inout3); - &movdqu ($inout3,&QWP(0x30,$inp)); - &movups (&QWP(0x40,$out),$inout4); - &movdqu ($inout4,&QWP(0x40,$inp)); - &movups (&QWP(0x50,$out),$inout5); - &lea ($out,&DWP(0x60,$out)); - &movdqu ($inout5,&QWP(0x50,$inp)); - &lea ($inp,&DWP(0x60,$inp)); -&set_label("ecb_dec_loop6_enter"); - - &call ("_aesni_decrypt6"); - - &mov ($key,$key_); # restore $key - &mov ($rounds,$rounds_); # restore $rounds - &sub ($len,0x60); - &jnc (&label("ecb_dec_loop6")); - - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - &movups (&QWP(0x40,$out),$inout4); - &movups (&QWP(0x50,$out),$inout5); - &lea ($out,&DWP(0x60,$out)); - &add ($len,0x60); - &jz (&label("ecb_ret")); - -&set_label("ecb_dec_tail"); - &movups ($inout0,&QWP(0,$inp)); - &cmp ($len,0x20); - &jb (&label("ecb_dec_one")); - &movups ($inout1,&QWP(0x10,$inp)); - &je (&label("ecb_dec_two")); - &movups ($inout2,&QWP(0x20,$inp)); - &cmp ($len,0x40); - &jb (&label("ecb_dec_three")); - &movups ($inout3,&QWP(0x30,$inp)); - &je (&label("ecb_dec_four")); - &movups ($inout4,&QWP(0x40,$inp)); - &xorps ($inout5,$inout5); - &call ("_aesni_decrypt6"); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - &movups (&QWP(0x40,$out),$inout4); - &jmp (&label("ecb_ret")); - -&set_label("ecb_dec_one",16); - if ($inline) - { &aesni_inline_generate1("dec"); } - else - { &call ("_aesni_decrypt1"); } - &movups (&QWP(0,$out),$inout0); - &jmp (&label("ecb_ret")); - -&set_label("ecb_dec_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_decrypt3"); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &jmp (&label("ecb_ret")); - -&set_label("ecb_dec_three",16); - &call ("_aesni_decrypt3"); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &jmp (&label("ecb_ret")); - -&set_label("ecb_dec_four",16); - &call ("_aesni_decrypt4"); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - -&set_label("ecb_ret"); -&function_end("aesni_ecb_encrypt"); - -###################################################################### -# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, -# size_t blocks, const AES_KEY *key, -# const char *ivec,char *cmac); -# -# Handles only complete blocks, operates on 64-bit counter and -# does not update *ivec! 
Nor does it finalize CMAC value -# (see engine/eng_aesni.c for details) -# -{ my $cmac=$inout1; -&function_begin("aesni_ccm64_encrypt_blocks"); - &mov ($inp,&wparam(0)); - &mov ($out,&wparam(1)); - &mov ($len,&wparam(2)); - &mov ($key,&wparam(3)); - &mov ($rounds_,&wparam(4)); - &mov ($rounds,&wparam(5)); - &mov ($key_,"esp"); - &sub ("esp",60); - &and ("esp",-16); # align stack - &mov (&DWP(48,"esp"),$key_); - - &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec - &movdqu ($cmac,&QWP(0,$rounds)); # load cmac - &mov ($rounds,&DWP(240,$key)); - - # compose byte-swap control mask for pshufb on stack - &mov (&DWP(0,"esp"),0x0c0d0e0f); - &mov (&DWP(4,"esp"),0x08090a0b); - &mov (&DWP(8,"esp"),0x04050607); - &mov (&DWP(12,"esp"),0x00010203); - - # compose counter increment vector on stack - &mov ($rounds_,1); - &xor ($key_,$key_); - &mov (&DWP(16,"esp"),$rounds_); - &mov (&DWP(20,"esp"),$key_); - &mov (&DWP(24,"esp"),$key_); - &mov (&DWP(28,"esp"),$key_); - - &shr ($rounds,1); - &lea ($key_,&DWP(0,$key)); - &movdqa ($inout3,&QWP(0,"esp")); - &movdqa ($inout0,$ivec); - &mov ($rounds_,$rounds); - &pshufb ($ivec,$inout3); - -&set_label("ccm64_enc_outer"); - &$movekey ($rndkey0,&QWP(0,$key_)); - &mov ($rounds,$rounds_); - &movups ($in0,&QWP(0,$inp)); - - &xorps ($inout0,$rndkey0); - &$movekey ($rndkey1,&QWP(16,$key_)); - &xorps ($rndkey0,$in0); - &lea ($key,&DWP(32,$key_)); - &xorps ($cmac,$rndkey0); # cmac^=inp - &$movekey ($rndkey0,&QWP(0,$key)); - -&set_label("ccm64_enc2_loop"); - &aesenc ($inout0,$rndkey1); - &dec ($rounds); - &aesenc ($cmac,$rndkey1); - &$movekey ($rndkey1,&QWP(16,$key)); - &aesenc ($inout0,$rndkey0); - &lea ($key,&DWP(32,$key)); - &aesenc ($cmac,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); - &jnz (&label("ccm64_enc2_loop")); - &aesenc ($inout0,$rndkey1); - &aesenc ($cmac,$rndkey1); - &paddq ($ivec,&QWP(16,"esp")); - &aesenclast ($inout0,$rndkey0); - &aesenclast ($cmac,$rndkey0); - - &dec ($len); - &lea ($inp,&DWP(16,$inp)); - &xorps ($in0,$inout0); # inp^=E(ivec) - &movdqa ($inout0,$ivec); - &movups (&QWP(0,$out),$in0); # save output - &lea ($out,&DWP(16,$out)); - &pshufb ($inout0,$inout3); - &jnz (&label("ccm64_enc_outer")); - - &mov ("esp",&DWP(48,"esp")); - &mov ($out,&wparam(5)); - &movups (&QWP(0,$out),$cmac); -&function_end("aesni_ccm64_encrypt_blocks"); - -&function_begin("aesni_ccm64_decrypt_blocks"); - &mov ($inp,&wparam(0)); - &mov ($out,&wparam(1)); - &mov ($len,&wparam(2)); - &mov ($key,&wparam(3)); - &mov ($rounds_,&wparam(4)); - &mov ($rounds,&wparam(5)); - &mov ($key_,"esp"); - &sub ("esp",60); - &and ("esp",-16); # align stack - &mov (&DWP(48,"esp"),$key_); - - &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec - &movdqu ($cmac,&QWP(0,$rounds)); # load cmac - &mov ($rounds,&DWP(240,$key)); - - # compose byte-swap control mask for pshufb on stack - &mov (&DWP(0,"esp"),0x0c0d0e0f); - &mov (&DWP(4,"esp"),0x08090a0b); - &mov (&DWP(8,"esp"),0x04050607); - &mov (&DWP(12,"esp"),0x00010203); - - # compose counter increment vector on stack - &mov ($rounds_,1); - &xor ($key_,$key_); - &mov (&DWP(16,"esp"),$rounds_); - &mov (&DWP(20,"esp"),$key_); - &mov (&DWP(24,"esp"),$key_); - &mov (&DWP(28,"esp"),$key_); - - &movdqa ($inout3,&QWP(0,"esp")); # bswap mask - &movdqa ($inout0,$ivec); - - &mov ($key_,$key); - &mov ($rounds_,$rounds); - - &pshufb ($ivec,$inout3); - if ($inline) - { &aesni_inline_generate1("enc"); } - else - { &call ("_aesni_encrypt1"); } - &movups ($in0,&QWP(0,$inp)); # load inp - &paddq ($ivec,&QWP(16,"esp")); - &lea ($inp,&QWP(16,$inp)); - &jmp 
(&label("ccm64_dec_outer")); - -&set_label("ccm64_dec_outer",16); - &xorps ($in0,$inout0); # inp ^= E(ivec) - &movdqa ($inout0,$ivec); - &mov ($rounds,$rounds_); - &movups (&QWP(0,$out),$in0); # save output - &lea ($out,&DWP(16,$out)); - &pshufb ($inout0,$inout3); - - &sub ($len,1); - &jz (&label("ccm64_dec_break")); - - &$movekey ($rndkey0,&QWP(0,$key_)); - &shr ($rounds,1); - &$movekey ($rndkey1,&QWP(16,$key_)); - &xorps ($in0,$rndkey0); - &lea ($key,&DWP(32,$key_)); - &xorps ($inout0,$rndkey0); - &xorps ($cmac,$in0); # cmac^=out - &$movekey ($rndkey0,&QWP(0,$key)); - -&set_label("ccm64_dec2_loop"); - &aesenc ($inout0,$rndkey1); - &dec ($rounds); - &aesenc ($cmac,$rndkey1); - &$movekey ($rndkey1,&QWP(16,$key)); - &aesenc ($inout0,$rndkey0); - &lea ($key,&DWP(32,$key)); - &aesenc ($cmac,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); - &jnz (&label("ccm64_dec2_loop")); - &movups ($in0,&QWP(0,$inp)); # load inp - &paddq ($ivec,&QWP(16,"esp")); - &aesenc ($inout0,$rndkey1); - &aesenc ($cmac,$rndkey1); - &lea ($inp,&QWP(16,$inp)); - &aesenclast ($inout0,$rndkey0); - &aesenclast ($cmac,$rndkey0); - &jmp (&label("ccm64_dec_outer")); - -&set_label("ccm64_dec_break",16); - &mov ($key,$key_); - if ($inline) - { &aesni_inline_generate1("enc",$cmac,$in0); } - else - { &call ("_aesni_encrypt1",$cmac); } - - &mov ("esp",&DWP(48,"esp")); - &mov ($out,&wparam(5)); - &movups (&QWP(0,$out),$cmac); -&function_end("aesni_ccm64_decrypt_blocks"); -} - -###################################################################### -# void aesni_ctr32_encrypt_blocks (const void *in, void *out, -# size_t blocks, const AES_KEY *key, -# const char *ivec); -# -# Handles only complete blocks, operates on 32-bit counter and -# does not update *ivec! (see engine/eng_aesni.c for details) -# -# stack layout: -# 0 pshufb mask -# 16 vector addend: 0,6,6,6 -# 32 counter-less ivec -# 48 1st triplet of counter vector -# 64 2nd triplet of counter vector -# 80 saved %esp - -&function_begin("aesni_ctr32_encrypt_blocks"); - &mov ($inp,&wparam(0)); - &mov ($out,&wparam(1)); - &mov ($len,&wparam(2)); - &mov ($key,&wparam(3)); - &mov ($rounds_,&wparam(4)); - &mov ($key_,"esp"); - &sub ("esp",88); - &and ("esp",-16); # align stack - &mov (&DWP(80,"esp"),$key_); - - &cmp ($len,1); - &je (&label("ctr32_one_shortcut")); - - &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec - - # compose byte-swap control mask for pshufb on stack - &mov (&DWP(0,"esp"),0x0c0d0e0f); - &mov (&DWP(4,"esp"),0x08090a0b); - &mov (&DWP(8,"esp"),0x04050607); - &mov (&DWP(12,"esp"),0x00010203); - - # compose counter increment vector on stack - &mov ($rounds,6); - &xor ($key_,$key_); - &mov (&DWP(16,"esp"),$rounds); - &mov (&DWP(20,"esp"),$rounds); - &mov (&DWP(24,"esp"),$rounds); - &mov (&DWP(28,"esp"),$key_); - - &pextrd ($rounds_,$inout5,3); # pull 32-bit counter - &pinsrd ($inout5,$key_,3); # wipe 32-bit counter - - &mov ($rounds,&DWP(240,$key)); # key->rounds - - # compose 2 vectors of 3x32-bit counters - &bswap ($rounds_); - &pxor ($rndkey1,$rndkey1); - &pxor ($rndkey0,$rndkey0); - &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask - &pinsrd ($rndkey1,$rounds_,0); - &lea ($key_,&DWP(3,$rounds_)); - &pinsrd ($rndkey0,$key_,0); - &inc ($rounds_); - &pinsrd ($rndkey1,$rounds_,1); - &inc ($key_); - &pinsrd ($rndkey0,$key_,1); - &inc ($rounds_); - &pinsrd ($rndkey1,$rounds_,2); - &inc ($key_); - &pinsrd ($rndkey0,$key_,2); - &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet - &pshufb ($rndkey1,$inout0); # byte swap - &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd 
triplet - &pshufb ($rndkey0,$inout0); # byte swap - - &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword - &pshufd ($inout1,$rndkey1,2<<6); - &cmp ($len,6); - &jb (&label("ctr32_tail")); - &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec - &shr ($rounds,1); - &mov ($key_,$key); # backup $key - &mov ($rounds_,$rounds); # backup $rounds - &sub ($len,6); - &jmp (&label("ctr32_loop6")); - -&set_label("ctr32_loop6",16); - &pshufd ($inout2,$rndkey1,1<<6); - &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec - &pshufd ($inout3,$rndkey0,3<<6); - &por ($inout0,$rndkey1); # merge counter-less ivec - &pshufd ($inout4,$rndkey0,2<<6); - &por ($inout1,$rndkey1); - &pshufd ($inout5,$rndkey0,1<<6); - &por ($inout2,$rndkey1); - &por ($inout3,$rndkey1); - &por ($inout4,$rndkey1); - &por ($inout5,$rndkey1); - - # inlining _aesni_encrypt6's prologue gives ~4% improvement... - &$movekey ($rndkey0,&QWP(0,$key_)); - &$movekey ($rndkey1,&QWP(16,$key_)); - &lea ($key,&DWP(32,$key_)); - &dec ($rounds); - &pxor ($inout0,$rndkey0); - &pxor ($inout1,$rndkey0); - &aesenc ($inout0,$rndkey1); - &pxor ($inout2,$rndkey0); - &aesenc ($inout1,$rndkey1); - &pxor ($inout3,$rndkey0); - &aesenc ($inout2,$rndkey1); - &pxor ($inout4,$rndkey0); - &aesenc ($inout3,$rndkey1); - &pxor ($inout5,$rndkey0); - &aesenc ($inout4,$rndkey1); - &$movekey ($rndkey0,&QWP(0,$key)); - &aesenc ($inout5,$rndkey1); - - &call (&label("_aesni_encrypt6_enter")); - - &movups ($rndkey1,&QWP(0,$inp)); - &movups ($rndkey0,&QWP(0x10,$inp)); - &xorps ($inout0,$rndkey1); - &movups ($rndkey1,&QWP(0x20,$inp)); - &xorps ($inout1,$rndkey0); - &movups (&QWP(0,$out),$inout0); - &movdqa ($rndkey0,&QWP(16,"esp")); # load increment - &xorps ($inout2,$rndkey1); - &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - - &paddd ($rndkey1,$rndkey0); # 1st triplet increment - &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment - &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask - - &movups ($inout1,&QWP(0x30,$inp)); - &movups ($inout2,&QWP(0x40,$inp)); - &xorps ($inout3,$inout1); - &movups ($inout1,&QWP(0x50,$inp)); - &lea ($inp,&DWP(0x60,$inp)); - &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet - &pshufb ($rndkey1,$inout0); # byte swap - &xorps ($inout4,$inout2); - &movups (&QWP(0x30,$out),$inout3); - &xorps ($inout5,$inout1); - &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet - &pshufb ($rndkey0,$inout0); # byte swap - &movups (&QWP(0x40,$out),$inout4); - &pshufd ($inout0,$rndkey1,3<<6); - &movups (&QWP(0x50,$out),$inout5); - &lea ($out,&DWP(0x60,$out)); - - &mov ($rounds,$rounds_); - &pshufd ($inout1,$rndkey1,2<<6); - &sub ($len,6); - &jnc (&label("ctr32_loop6")); - - &add ($len,6); - &jz (&label("ctr32_ret")); - &mov ($key,$key_); - &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds - &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec - -&set_label("ctr32_tail"); - &por ($inout0,$inout5); - &cmp ($len,2); - &jb (&label("ctr32_one")); - - &pshufd ($inout2,$rndkey1,1<<6); - &por ($inout1,$inout5); - &je (&label("ctr32_two")); - - &pshufd ($inout3,$rndkey0,3<<6); - &por ($inout2,$inout5); - &cmp ($len,4); - &jb (&label("ctr32_three")); - - &pshufd ($inout4,$rndkey0,2<<6); - &por ($inout3,$inout5); - &je (&label("ctr32_four")); - - &por ($inout4,$inout5); - &call ("_aesni_encrypt6"); - &movups ($rndkey1,&QWP(0,$inp)); - &movups ($rndkey0,&QWP(0x10,$inp)); - &xorps ($inout0,$rndkey1); - &movups ($rndkey1,&QWP(0x20,$inp)); - &xorps 
($inout1,$rndkey0); - &movups ($rndkey0,&QWP(0x30,$inp)); - &xorps ($inout2,$rndkey1); - &movups ($rndkey1,&QWP(0x40,$inp)); - &xorps ($inout3,$rndkey0); - &movups (&QWP(0,$out),$inout0); - &xorps ($inout4,$rndkey1); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - &movups (&QWP(0x40,$out),$inout4); - &jmp (&label("ctr32_ret")); - -&set_label("ctr32_one_shortcut",16); - &movups ($inout0,&QWP(0,$rounds_)); # load ivec - &mov ($rounds,&DWP(240,$key)); - -&set_label("ctr32_one"); - if ($inline) - { &aesni_inline_generate1("enc"); } - else - { &call ("_aesni_encrypt1"); } - &movups ($in0,&QWP(0,$inp)); - &xorps ($in0,$inout0); - &movups (&QWP(0,$out),$in0); - &jmp (&label("ctr32_ret")); - -&set_label("ctr32_two",16); - &call ("_aesni_encrypt3"); - &movups ($inout3,&QWP(0,$inp)); - &movups ($inout4,&QWP(0x10,$inp)); - &xorps ($inout0,$inout3); - &xorps ($inout1,$inout4); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &jmp (&label("ctr32_ret")); - -&set_label("ctr32_three",16); - &call ("_aesni_encrypt3"); - &movups ($inout3,&QWP(0,$inp)); - &movups ($inout4,&QWP(0x10,$inp)); - &xorps ($inout0,$inout3); - &movups ($inout5,&QWP(0x20,$inp)); - &xorps ($inout1,$inout4); - &movups (&QWP(0,$out),$inout0); - &xorps ($inout2,$inout5); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &jmp (&label("ctr32_ret")); - -&set_label("ctr32_four",16); - &call ("_aesni_encrypt4"); - &movups ($inout4,&QWP(0,$inp)); - &movups ($inout5,&QWP(0x10,$inp)); - &movups ($rndkey1,&QWP(0x20,$inp)); - &xorps ($inout0,$inout4); - &movups ($rndkey0,&QWP(0x30,$inp)); - &xorps ($inout1,$inout5); - &movups (&QWP(0,$out),$inout0); - &xorps ($inout2,$rndkey1); - &movups (&QWP(0x10,$out),$inout1); - &xorps ($inout3,$rndkey0); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - -&set_label("ctr32_ret"); - &mov ("esp",&DWP(80,"esp")); -&function_end("aesni_ctr32_encrypt_blocks"); - -###################################################################### -# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, -# const AES_KEY *key1, const AES_KEY *key2 -# const unsigned char iv[16]); -# -{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); - -&function_begin("aesni_xts_encrypt"); - &mov ($key,&wparam(4)); # key2 - &mov ($inp,&wparam(5)); # clear-text tweak - - &mov ($rounds,&DWP(240,$key)); # key2->rounds - &movups ($inout0,&QWP(0,$inp)); - if ($inline) - { &aesni_inline_generate1("enc"); } - else - { &call ("_aesni_encrypt1"); } - - &mov ($inp,&wparam(0)); - &mov ($out,&wparam(1)); - &mov ($len,&wparam(2)); - &mov ($key,&wparam(3)); # key1 - - &mov ($key_,"esp"); - &sub ("esp",16*7+8); - &mov ($rounds,&DWP(240,$key)); # key1->rounds - &and ("esp",-16); # align stack - - &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant - &mov (&DWP(16*6+4,"esp"),0); - &mov (&DWP(16*6+8,"esp"),1); - &mov (&DWP(16*6+12,"esp"),0); - &mov (&DWP(16*7+0,"esp"),$len); # save original $len - &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp - - &movdqa ($tweak,$inout0); - &pxor ($twtmp,$twtmp); - &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - - &and ($len,-16); - &mov ($key_,$key); # backup $key - &mov ($rounds_,$rounds); # backup $rounds - &sub ($len,16*6); - &jc (&label("xts_enc_short")); - - &shr ($rounds,1); - &mov ($rounds_,$rounds); - &jmp (&label("xts_enc_loop6")); - -&set_label("xts_enc_loop6",16); - for 
($i=0;$i<4;$i++) { - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &movdqa (&QWP(16*$i,"esp"),$tweak); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd ($twtmp,$tweak); # broadcast upper bits - &pxor ($tweak,$twres); - } - &pshufd ($inout5,$twtmp,0x13); - &movdqa (&QWP(16*$i++,"esp"),$tweak); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &$movekey ($rndkey0,&QWP(0,$key_)); - &pand ($inout5,$twmask); # isolate carry and residue - &movups ($inout0,&QWP(0,$inp)); # load input - &pxor ($inout5,$tweak); - - # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] - &movdqu ($inout1,&QWP(16*1,$inp)); - &xorps ($inout0,$rndkey0); # input^=rndkey[0] - &movdqu ($inout2,&QWP(16*2,$inp)); - &pxor ($inout1,$rndkey0); - &movdqu ($inout3,&QWP(16*3,$inp)); - &pxor ($inout2,$rndkey0); - &movdqu ($inout4,&QWP(16*4,$inp)); - &pxor ($inout3,$rndkey0); - &movdqu ($rndkey1,&QWP(16*5,$inp)); - &pxor ($inout4,$rndkey0); - &lea ($inp,&DWP(16*6,$inp)); - &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak - &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak - &pxor ($inout5,$rndkey1); - - &$movekey ($rndkey1,&QWP(16,$key_)); - &lea ($key,&DWP(32,$key_)); - &pxor ($inout1,&QWP(16*1,"esp")); - &aesenc ($inout0,$rndkey1); - &pxor ($inout2,&QWP(16*2,"esp")); - &aesenc ($inout1,$rndkey1); - &pxor ($inout3,&QWP(16*3,"esp")); - &dec ($rounds); - &aesenc ($inout2,$rndkey1); - &pxor ($inout4,&QWP(16*4,"esp")); - &aesenc ($inout3,$rndkey1); - &pxor ($inout5,$rndkey0); - &aesenc ($inout4,$rndkey1); - &$movekey ($rndkey0,&QWP(0,$key)); - &aesenc ($inout5,$rndkey1); - &call (&label("_aesni_encrypt6_enter")); - - &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak - &pxor ($twtmp,$twtmp); - &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak - &pcmpgtd ($twtmp,$tweak); # broadcast upper bits - &xorps ($inout1,&QWP(16*1,"esp")); - &movups (&QWP(16*0,$out),$inout0); # write output - &xorps ($inout2,&QWP(16*2,"esp")); - &movups (&QWP(16*1,$out),$inout1); - &xorps ($inout3,&QWP(16*3,"esp")); - &movups (&QWP(16*2,$out),$inout2); - &xorps ($inout4,&QWP(16*4,"esp")); - &movups (&QWP(16*3,$out),$inout3); - &xorps ($inout5,$tweak); - &movups (&QWP(16*4,$out),$inout4); - &pshufd ($twres,$twtmp,0x13); - &movups (&QWP(16*5,$out),$inout5); - &lea ($out,&DWP(16*6,$out)); - &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 - - &pxor ($twtmp,$twtmp); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &mov ($rounds,$rounds_); # restore $rounds - &pxor ($tweak,$twres); - - &sub ($len,16*6); - &jnc (&label("xts_enc_loop6")); - - &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds - &mov ($key,$key_); # restore $key - &mov ($rounds_,$rounds); - -&set_label("xts_enc_short"); - &add ($len,16*6); - &jz (&label("xts_enc_done6x")); - - &movdqa ($inout3,$tweak); # put aside previous tweak - &cmp ($len,0x20); - &jb (&label("xts_enc_one")); - - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &pxor ($tweak,$twres); - &je (&label("xts_enc_two")); - - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &movdqa ($inout4,$tweak); # put aside previous tweak - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &pxor 
($tweak,$twres); - &cmp ($len,0x40); - &jb (&label("xts_enc_three")); - - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &movdqa ($inout5,$tweak); # put aside previous tweak - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &pxor ($tweak,$twres); - &movdqa (&QWP(16*0,"esp"),$inout3); - &movdqa (&QWP(16*1,"esp"),$inout4); - &je (&label("xts_enc_four")); - - &movdqa (&QWP(16*2,"esp"),$inout5); - &pshufd ($inout5,$twtmp,0x13); - &movdqa (&QWP(16*3,"esp"),$tweak); - &paddq ($tweak,$tweak); # &psllq($inout0,1); - &pand ($inout5,$twmask); # isolate carry and residue - &pxor ($inout5,$tweak); - - &movdqu ($inout0,&QWP(16*0,$inp)); # load input - &movdqu ($inout1,&QWP(16*1,$inp)); - &movdqu ($inout2,&QWP(16*2,$inp)); - &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak - &movdqu ($inout3,&QWP(16*3,$inp)); - &pxor ($inout1,&QWP(16*1,"esp")); - &movdqu ($inout4,&QWP(16*4,$inp)); - &pxor ($inout2,&QWP(16*2,"esp")); - &lea ($inp,&DWP(16*5,$inp)); - &pxor ($inout3,&QWP(16*3,"esp")); - &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak - &pxor ($inout4,$inout5); - - &call ("_aesni_encrypt6"); - - &movaps ($tweak,&QWP(16*4,"esp")); # last tweak - &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak - &xorps ($inout1,&QWP(16*1,"esp")); - &xorps ($inout2,&QWP(16*2,"esp")); - &movups (&QWP(16*0,$out),$inout0); # write output - &xorps ($inout3,&QWP(16*3,"esp")); - &movups (&QWP(16*1,$out),$inout1); - &xorps ($inout4,$tweak); - &movups (&QWP(16*2,$out),$inout2); - &movups (&QWP(16*3,$out),$inout3); - &movups (&QWP(16*4,$out),$inout4); - &lea ($out,&DWP(16*5,$out)); - &jmp (&label("xts_enc_done")); - -&set_label("xts_enc_one",16); - &movups ($inout0,&QWP(16*0,$inp)); # load input - &lea ($inp,&DWP(16*1,$inp)); - &xorps ($inout0,$inout3); # input^=tweak - if ($inline) - { &aesni_inline_generate1("enc"); } - else - { &call ("_aesni_encrypt1"); } - &xorps ($inout0,$inout3); # output^=tweak - &movups (&QWP(16*0,$out),$inout0); # write output - &lea ($out,&DWP(16*1,$out)); - - &movdqa ($tweak,$inout3); # last tweak - &jmp (&label("xts_enc_done")); - -&set_label("xts_enc_two",16); - &movaps ($inout4,$tweak); # put aside last tweak - - &movups ($inout0,&QWP(16*0,$inp)); # load input - &movups ($inout1,&QWP(16*1,$inp)); - &lea ($inp,&DWP(16*2,$inp)); - &xorps ($inout0,$inout3); # input^=tweak - &xorps ($inout1,$inout4); - &xorps ($inout2,$inout2); - - &call ("_aesni_encrypt3"); - - &xorps ($inout0,$inout3); # output^=tweak - &xorps ($inout1,$inout4); - &movups (&QWP(16*0,$out),$inout0); # write output - &movups (&QWP(16*1,$out),$inout1); - &lea ($out,&DWP(16*2,$out)); - - &movdqa ($tweak,$inout4); # last tweak - &jmp (&label("xts_enc_done")); - -&set_label("xts_enc_three",16); - &movaps ($inout5,$tweak); # put aside last tweak - &movups ($inout0,&QWP(16*0,$inp)); # load input - &movups ($inout1,&QWP(16*1,$inp)); - &movups ($inout2,&QWP(16*2,$inp)); - &lea ($inp,&DWP(16*3,$inp)); - &xorps ($inout0,$inout3); # input^=tweak - &xorps ($inout1,$inout4); - &xorps ($inout2,$inout5); - - &call ("_aesni_encrypt3"); - - &xorps ($inout0,$inout3); # output^=tweak - &xorps ($inout1,$inout4); - &xorps ($inout2,$inout5); - &movups (&QWP(16*0,$out),$inout0); # write output - &movups (&QWP(16*1,$out),$inout1); - &movups (&QWP(16*2,$out),$inout2); - &lea ($out,&DWP(16*3,$out)); - - &movdqa ($tweak,$inout5); # last tweak - &jmp (&label("xts_enc_done")); - -&set_label("xts_enc_four",16); - &movaps ($inout4,$tweak); # put aside 
last tweak - - &movups ($inout0,&QWP(16*0,$inp)); # load input - &movups ($inout1,&QWP(16*1,$inp)); - &movups ($inout2,&QWP(16*2,$inp)); - &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak - &movups ($inout3,&QWP(16*3,$inp)); - &lea ($inp,&DWP(16*4,$inp)); - &xorps ($inout1,&QWP(16*1,"esp")); - &xorps ($inout2,$inout5); - &xorps ($inout3,$inout4); - - &call ("_aesni_encrypt4"); - - &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak - &xorps ($inout1,&QWP(16*1,"esp")); - &xorps ($inout2,$inout5); - &movups (&QWP(16*0,$out),$inout0); # write output - &xorps ($inout3,$inout4); - &movups (&QWP(16*1,$out),$inout1); - &movups (&QWP(16*2,$out),$inout2); - &movups (&QWP(16*3,$out),$inout3); - &lea ($out,&DWP(16*4,$out)); - - &movdqa ($tweak,$inout4); # last tweak - &jmp (&label("xts_enc_done")); - -&set_label("xts_enc_done6x",16); # $tweak is pre-calculated - &mov ($len,&DWP(16*7+0,"esp")); # restore original $len - &and ($len,15); - &jz (&label("xts_enc_ret")); - &movdqa ($inout3,$tweak); - &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 - &jmp (&label("xts_enc_steal")); - -&set_label("xts_enc_done",16); - &mov ($len,&DWP(16*7+0,"esp")); # restore original $len - &pxor ($twtmp,$twtmp); - &and ($len,15); - &jz (&label("xts_enc_ret")); - - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 - &pshufd ($inout3,$twtmp,0x13); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue - &pxor ($inout3,$tweak); - -&set_label("xts_enc_steal"); - &movz ($rounds,&BP(0,$inp)); - &movz ($key,&BP(-16,$out)); - &lea ($inp,&DWP(1,$inp)); - &mov (&BP(-16,$out),&LB($rounds)); - &mov (&BP(0,$out),&LB($key)); - &lea ($out,&DWP(1,$out)); - &sub ($len,1); - &jnz (&label("xts_enc_steal")); - - &sub ($out,&DWP(16*7+0,"esp")); # rewind $out - &mov ($key,$key_); # restore $key - &mov ($rounds,$rounds_); # restore $rounds - - &movups ($inout0,&QWP(-16,$out)); # load input - &xorps ($inout0,$inout3); # input^=tweak - if ($inline) - { &aesni_inline_generate1("enc"); } - else - { &call ("_aesni_encrypt1"); } - &xorps ($inout0,$inout3); # output^=tweak - &movups (&QWP(-16,$out),$inout0); # write output - -&set_label("xts_enc_ret"); - &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp -&function_end("aesni_xts_encrypt"); - -&function_begin("aesni_xts_decrypt"); - &mov ($key,&wparam(4)); # key2 - &mov ($inp,&wparam(5)); # clear-text tweak - - &mov ($rounds,&DWP(240,$key)); # key2->rounds - &movups ($inout0,&QWP(0,$inp)); - if ($inline) - { &aesni_inline_generate1("enc"); } - else - { &call ("_aesni_encrypt1"); } - - &mov ($inp,&wparam(0)); - &mov ($out,&wparam(1)); - &mov ($len,&wparam(2)); - &mov ($key,&wparam(3)); # key1 - - &mov ($key_,"esp"); - &sub ("esp",16*7+8); - &and ("esp",-16); # align stack - - &xor ($rounds_,$rounds_); # if(len%16) len-=16; - &test ($len,15); - &setnz (&LB($rounds_)); - &shl ($rounds_,4); - &sub ($len,$rounds_); - - &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant - &mov (&DWP(16*6+4,"esp"),0); - &mov (&DWP(16*6+8,"esp"),1); - &mov (&DWP(16*6+12,"esp"),0); - &mov (&DWP(16*7+0,"esp"),$len); # save original $len - &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp - - &mov ($rounds,&DWP(240,$key)); # key1->rounds - &mov ($key_,$key); # backup $key - &mov ($rounds_,$rounds); # backup $rounds - - &movdqa ($tweak,$inout0); - &pxor ($twtmp,$twtmp); - &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - - &and ($len,-16); - &sub ($len,16*6); - &jc 
(&label("xts_dec_short")); - - &shr ($rounds,1); - &mov ($rounds_,$rounds); - &jmp (&label("xts_dec_loop6")); - -&set_label("xts_dec_loop6",16); - for ($i=0;$i<4;$i++) { - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &movdqa (&QWP(16*$i,"esp"),$tweak); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd ($twtmp,$tweak); # broadcast upper bits - &pxor ($tweak,$twres); - } - &pshufd ($inout5,$twtmp,0x13); - &movdqa (&QWP(16*$i++,"esp"),$tweak); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &$movekey ($rndkey0,&QWP(0,$key_)); - &pand ($inout5,$twmask); # isolate carry and residue - &movups ($inout0,&QWP(0,$inp)); # load input - &pxor ($inout5,$tweak); - - # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] - &movdqu ($inout1,&QWP(16*1,$inp)); - &xorps ($inout0,$rndkey0); # input^=rndkey[0] - &movdqu ($inout2,&QWP(16*2,$inp)); - &pxor ($inout1,$rndkey0); - &movdqu ($inout3,&QWP(16*3,$inp)); - &pxor ($inout2,$rndkey0); - &movdqu ($inout4,&QWP(16*4,$inp)); - &pxor ($inout3,$rndkey0); - &movdqu ($rndkey1,&QWP(16*5,$inp)); - &pxor ($inout4,$rndkey0); - &lea ($inp,&DWP(16*6,$inp)); - &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak - &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak - &pxor ($inout5,$rndkey1); - - &$movekey ($rndkey1,&QWP(16,$key_)); - &lea ($key,&DWP(32,$key_)); - &pxor ($inout1,&QWP(16*1,"esp")); - &aesdec ($inout0,$rndkey1); - &pxor ($inout2,&QWP(16*2,"esp")); - &aesdec ($inout1,$rndkey1); - &pxor ($inout3,&QWP(16*3,"esp")); - &dec ($rounds); - &aesdec ($inout2,$rndkey1); - &pxor ($inout4,&QWP(16*4,"esp")); - &aesdec ($inout3,$rndkey1); - &pxor ($inout5,$rndkey0); - &aesdec ($inout4,$rndkey1); - &$movekey ($rndkey0,&QWP(0,$key)); - &aesdec ($inout5,$rndkey1); - &call (&label("_aesni_decrypt6_enter")); - - &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak - &pxor ($twtmp,$twtmp); - &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak - &pcmpgtd ($twtmp,$tweak); # broadcast upper bits - &xorps ($inout1,&QWP(16*1,"esp")); - &movups (&QWP(16*0,$out),$inout0); # write output - &xorps ($inout2,&QWP(16*2,"esp")); - &movups (&QWP(16*1,$out),$inout1); - &xorps ($inout3,&QWP(16*3,"esp")); - &movups (&QWP(16*2,$out),$inout2); - &xorps ($inout4,&QWP(16*4,"esp")); - &movups (&QWP(16*3,$out),$inout3); - &xorps ($inout5,$tweak); - &movups (&QWP(16*4,$out),$inout4); - &pshufd ($twres,$twtmp,0x13); - &movups (&QWP(16*5,$out),$inout5); - &lea ($out,&DWP(16*6,$out)); - &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 - - &pxor ($twtmp,$twtmp); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &mov ($rounds,$rounds_); # restore $rounds - &pxor ($tweak,$twres); - - &sub ($len,16*6); - &jnc (&label("xts_dec_loop6")); - - &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds - &mov ($key,$key_); # restore $key - &mov ($rounds_,$rounds); - -&set_label("xts_dec_short"); - &add ($len,16*6); - &jz (&label("xts_dec_done6x")); - - &movdqa ($inout3,$tweak); # put aside previous tweak - &cmp ($len,0x20); - &jb (&label("xts_dec_one")); - - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &pxor ($tweak,$twres); - &je (&label("xts_dec_two")); - - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &movdqa ($inout4,$tweak); # put aside previous tweak - &paddq 
($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &pxor ($tweak,$twres); - &cmp ($len,0x40); - &jb (&label("xts_dec_three")); - - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &movdqa ($inout5,$tweak); # put aside previous tweak - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &pxor ($tweak,$twres); - &movdqa (&QWP(16*0,"esp"),$inout3); - &movdqa (&QWP(16*1,"esp"),$inout4); - &je (&label("xts_dec_four")); - - &movdqa (&QWP(16*2,"esp"),$inout5); - &pshufd ($inout5,$twtmp,0x13); - &movdqa (&QWP(16*3,"esp"),$tweak); - &paddq ($tweak,$tweak); # &psllq($inout0,1); - &pand ($inout5,$twmask); # isolate carry and residue - &pxor ($inout5,$tweak); - - &movdqu ($inout0,&QWP(16*0,$inp)); # load input - &movdqu ($inout1,&QWP(16*1,$inp)); - &movdqu ($inout2,&QWP(16*2,$inp)); - &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak - &movdqu ($inout3,&QWP(16*3,$inp)); - &pxor ($inout1,&QWP(16*1,"esp")); - &movdqu ($inout4,&QWP(16*4,$inp)); - &pxor ($inout2,&QWP(16*2,"esp")); - &lea ($inp,&DWP(16*5,$inp)); - &pxor ($inout3,&QWP(16*3,"esp")); - &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak - &pxor ($inout4,$inout5); - - &call ("_aesni_decrypt6"); - - &movaps ($tweak,&QWP(16*4,"esp")); # last tweak - &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak - &xorps ($inout1,&QWP(16*1,"esp")); - &xorps ($inout2,&QWP(16*2,"esp")); - &movups (&QWP(16*0,$out),$inout0); # write output - &xorps ($inout3,&QWP(16*3,"esp")); - &movups (&QWP(16*1,$out),$inout1); - &xorps ($inout4,$tweak); - &movups (&QWP(16*2,$out),$inout2); - &movups (&QWP(16*3,$out),$inout3); - &movups (&QWP(16*4,$out),$inout4); - &lea ($out,&DWP(16*5,$out)); - &jmp (&label("xts_dec_done")); - -&set_label("xts_dec_one",16); - &movups ($inout0,&QWP(16*0,$inp)); # load input - &lea ($inp,&DWP(16*1,$inp)); - &xorps ($inout0,$inout3); # input^=tweak - if ($inline) - { &aesni_inline_generate1("dec"); } - else - { &call ("_aesni_decrypt1"); } - &xorps ($inout0,$inout3); # output^=tweak - &movups (&QWP(16*0,$out),$inout0); # write output - &lea ($out,&DWP(16*1,$out)); - - &movdqa ($tweak,$inout3); # last tweak - &jmp (&label("xts_dec_done")); - -&set_label("xts_dec_two",16); - &movaps ($inout4,$tweak); # put aside last tweak - - &movups ($inout0,&QWP(16*0,$inp)); # load input - &movups ($inout1,&QWP(16*1,$inp)); - &lea ($inp,&DWP(16*2,$inp)); - &xorps ($inout0,$inout3); # input^=tweak - &xorps ($inout1,$inout4); - - &call ("_aesni_decrypt3"); - - &xorps ($inout0,$inout3); # output^=tweak - &xorps ($inout1,$inout4); - &movups (&QWP(16*0,$out),$inout0); # write output - &movups (&QWP(16*1,$out),$inout1); - &lea ($out,&DWP(16*2,$out)); - - &movdqa ($tweak,$inout4); # last tweak - &jmp (&label("xts_dec_done")); - -&set_label("xts_dec_three",16); - &movaps ($inout5,$tweak); # put aside last tweak - &movups ($inout0,&QWP(16*0,$inp)); # load input - &movups ($inout1,&QWP(16*1,$inp)); - &movups ($inout2,&QWP(16*2,$inp)); - &lea ($inp,&DWP(16*3,$inp)); - &xorps ($inout0,$inout3); # input^=tweak - &xorps ($inout1,$inout4); - &xorps ($inout2,$inout5); - - &call ("_aesni_decrypt3"); - - &xorps ($inout0,$inout3); # output^=tweak - &xorps ($inout1,$inout4); - &xorps ($inout2,$inout5); - &movups (&QWP(16*0,$out),$inout0); # write output - &movups (&QWP(16*1,$out),$inout1); - &movups (&QWP(16*2,$out),$inout2); - &lea ($out,&DWP(16*3,$out)); - - &movdqa ($tweak,$inout5); 
# last tweak - &jmp (&label("xts_dec_done")); - -&set_label("xts_dec_four",16); - &movaps ($inout4,$tweak); # put aside last tweak - - &movups ($inout0,&QWP(16*0,$inp)); # load input - &movups ($inout1,&QWP(16*1,$inp)); - &movups ($inout2,&QWP(16*2,$inp)); - &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak - &movups ($inout3,&QWP(16*3,$inp)); - &lea ($inp,&DWP(16*4,$inp)); - &xorps ($inout1,&QWP(16*1,"esp")); - &xorps ($inout2,$inout5); - &xorps ($inout3,$inout4); - - &call ("_aesni_decrypt4"); - - &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak - &xorps ($inout1,&QWP(16*1,"esp")); - &xorps ($inout2,$inout5); - &movups (&QWP(16*0,$out),$inout0); # write output - &xorps ($inout3,$inout4); - &movups (&QWP(16*1,$out),$inout1); - &movups (&QWP(16*2,$out),$inout2); - &movups (&QWP(16*3,$out),$inout3); - &lea ($out,&DWP(16*4,$out)); - - &movdqa ($tweak,$inout4); # last tweak - &jmp (&label("xts_dec_done")); - -&set_label("xts_dec_done6x",16); # $tweak is pre-calculated - &mov ($len,&DWP(16*7+0,"esp")); # restore original $len - &and ($len,15); - &jz (&label("xts_dec_ret")); - &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 - &jmp (&label("xts_dec_only_one_more")); - -&set_label("xts_dec_done",16); - &mov ($len,&DWP(16*7+0,"esp")); # restore original $len - &pxor ($twtmp,$twtmp); - &and ($len,15); - &jz (&label("xts_dec_ret")); - - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 - &pshufd ($twres,$twtmp,0x13); - &pxor ($twtmp,$twtmp); - &movdqa ($twmask,&QWP(16*6,"esp")); - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($twres,$twmask); # isolate carry and residue - &pcmpgtd($twtmp,$tweak); # broadcast upper bits - &pxor ($tweak,$twres); - -&set_label("xts_dec_only_one_more"); - &pshufd ($inout3,$twtmp,0x13); - &movdqa ($inout4,$tweak); # put aside previous tweak - &paddq ($tweak,$tweak); # &psllq($tweak,1); - &pand ($inout3,$twmask); # isolate carry and residue - &pxor ($inout3,$tweak); - - &mov ($key,$key_); # restore $key - &mov ($rounds,$rounds_); # restore $rounds - - &movups ($inout0,&QWP(0,$inp)); # load input - &xorps ($inout0,$inout3); # input^=tweak - if ($inline) - { &aesni_inline_generate1("dec"); } - else - { &call ("_aesni_decrypt1"); } - &xorps ($inout0,$inout3); # output^=tweak - &movups (&QWP(0,$out),$inout0); # write output - -&set_label("xts_dec_steal"); - &movz ($rounds,&BP(16,$inp)); - &movz ($key,&BP(0,$out)); - &lea ($inp,&DWP(1,$inp)); - &mov (&BP(0,$out),&LB($rounds)); - &mov (&BP(16,$out),&LB($key)); - &lea ($out,&DWP(1,$out)); - &sub ($len,1); - &jnz (&label("xts_dec_steal")); - - &sub ($out,&DWP(16*7+0,"esp")); # rewind $out - &mov ($key,$key_); # restore $key - &mov ($rounds,$rounds_); # restore $rounds - - &movups ($inout0,&QWP(0,$out)); # load input - &xorps ($inout0,$inout4); # input^=tweak - if ($inline) - { &aesni_inline_generate1("dec"); } - else - { &call ("_aesni_decrypt1"); } - &xorps ($inout0,$inout4); # output^=tweak - &movups (&QWP(0,$out),$inout0); # write output - -&set_label("xts_dec_ret"); - &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp -&function_end("aesni_xts_decrypt"); -} -} - -###################################################################### -# void $PREFIX_cbc_encrypt (const void *inp, void *out, -# size_t length, const AES_KEY *key, -# unsigned char *ivp,const int enc); -&function_begin("${PREFIX}_cbc_encrypt"); - &mov ($inp,&wparam(0)); - &mov ($rounds_,"esp"); - &mov ($out,&wparam(1)); - &sub ($rounds_,24); - &mov ($len,&wparam(2)); - &and ($rounds_,-16); - &mov 
($key,&wparam(3)); - &mov ($key_,&wparam(4)); - &test ($len,$len); - &jz (&label("cbc_abort")); - - &cmp (&wparam(5),0); - &xchg ($rounds_,"esp"); # alloca - &movups ($ivec,&QWP(0,$key_)); # load IV - &mov ($rounds,&DWP(240,$key)); - &mov ($key_,$key); # backup $key - &mov (&DWP(16,"esp"),$rounds_); # save original %esp - &mov ($rounds_,$rounds); # backup $rounds - &je (&label("cbc_decrypt")); - - &movaps ($inout0,$ivec); - &cmp ($len,16); - &jb (&label("cbc_enc_tail")); - &sub ($len,16); - &jmp (&label("cbc_enc_loop")); - -&set_label("cbc_enc_loop",16); - &movups ($ivec,&QWP(0,$inp)); # input actually - &lea ($inp,&DWP(16,$inp)); - if ($inline) - { &aesni_inline_generate1("enc",$inout0,$ivec); } - else - { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } - &mov ($rounds,$rounds_); # restore $rounds - &mov ($key,$key_); # restore $key - &movups (&QWP(0,$out),$inout0); # store output - &lea ($out,&DWP(16,$out)); - &sub ($len,16); - &jnc (&label("cbc_enc_loop")); - &add ($len,16); - &jnz (&label("cbc_enc_tail")); - &movaps ($ivec,$inout0); - &jmp (&label("cbc_ret")); - -&set_label("cbc_enc_tail"); - &mov ("ecx",$len); # zaps $rounds - &data_word(0xA4F3F689); # rep movsb - &mov ("ecx",16); # zero tail - &sub ("ecx",$len); - &xor ("eax","eax"); # zaps $len - &data_word(0xAAF3F689); # rep stosb - &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block - &mov ($rounds,$rounds_); # restore $rounds - &mov ($inp,$out); # $inp and $out are the same - &mov ($key,$key_); # restore $key - &jmp (&label("cbc_enc_loop")); -###################################################################### -&set_label("cbc_decrypt",16); - &cmp ($len,0x50); - &jbe (&label("cbc_dec_tail")); - &movaps (&QWP(0,"esp"),$ivec); # save IV - &sub ($len,0x50); - &jmp (&label("cbc_dec_loop6_enter")); - -&set_label("cbc_dec_loop6",16); - &movaps (&QWP(0,"esp"),$rndkey0); # save IV - &movups (&QWP(0,$out),$inout5); - &lea ($out,&DWP(0x10,$out)); -&set_label("cbc_dec_loop6_enter"); - &movdqu ($inout0,&QWP(0,$inp)); - &movdqu ($inout1,&QWP(0x10,$inp)); - &movdqu ($inout2,&QWP(0x20,$inp)); - &movdqu ($inout3,&QWP(0x30,$inp)); - &movdqu ($inout4,&QWP(0x40,$inp)); - &movdqu ($inout5,&QWP(0x50,$inp)); - - &call ("_aesni_decrypt6"); - - &movups ($rndkey1,&QWP(0,$inp)); - &movups ($rndkey0,&QWP(0x10,$inp)); - &xorps ($inout0,&QWP(0,"esp")); # ^=IV - &xorps ($inout1,$rndkey1); - &movups ($rndkey1,&QWP(0x20,$inp)); - &xorps ($inout2,$rndkey0); - &movups ($rndkey0,&QWP(0x30,$inp)); - &xorps ($inout3,$rndkey1); - &movups ($rndkey1,&QWP(0x40,$inp)); - &xorps ($inout4,$rndkey0); - &movups ($rndkey0,&QWP(0x50,$inp)); # IV - &xorps ($inout5,$rndkey1); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &lea ($inp,&DWP(0x60,$inp)); - &movups (&QWP(0x20,$out),$inout2); - &mov ($rounds,$rounds_); # restore $rounds - &movups (&QWP(0x30,$out),$inout3); - &mov ($key,$key_); # restore $key - &movups (&QWP(0x40,$out),$inout4); - &lea ($out,&DWP(0x50,$out)); - &sub ($len,0x60); - &ja (&label("cbc_dec_loop6")); - - &movaps ($inout0,$inout5); - &movaps ($ivec,$rndkey0); - &add ($len,0x50); - &jle (&label("cbc_dec_tail_collected")); - &movups (&QWP(0,$out),$inout0); - &lea ($out,&DWP(0x10,$out)); -&set_label("cbc_dec_tail"); - &movups ($inout0,&QWP(0,$inp)); - &movaps ($in0,$inout0); - &cmp ($len,0x10); - &jbe (&label("cbc_dec_one")); - - &movups ($inout1,&QWP(0x10,$inp)); - &movaps ($in1,$inout1); - &cmp ($len,0x20); - &jbe (&label("cbc_dec_two")); - - &movups ($inout2,&QWP(0x20,$inp)); - &cmp ($len,0x30); - &jbe 
(&label("cbc_dec_three")); - - &movups ($inout3,&QWP(0x30,$inp)); - &cmp ($len,0x40); - &jbe (&label("cbc_dec_four")); - - &movups ($inout4,&QWP(0x40,$inp)); - &movaps (&QWP(0,"esp"),$ivec); # save IV - &movups ($inout0,&QWP(0,$inp)); - &xorps ($inout5,$inout5); - &call ("_aesni_decrypt6"); - &movups ($rndkey1,&QWP(0,$inp)); - &movups ($rndkey0,&QWP(0x10,$inp)); - &xorps ($inout0,&QWP(0,"esp")); # ^= IV - &xorps ($inout1,$rndkey1); - &movups ($rndkey1,&QWP(0x20,$inp)); - &xorps ($inout2,$rndkey0); - &movups ($rndkey0,&QWP(0x30,$inp)); - &xorps ($inout3,$rndkey1); - &movups ($ivec,&QWP(0x40,$inp)); # IV - &xorps ($inout4,$rndkey0); - &movups (&QWP(0,$out),$inout0); - &movups (&QWP(0x10,$out),$inout1); - &movups (&QWP(0x20,$out),$inout2); - &movups (&QWP(0x30,$out),$inout3); - &lea ($out,&DWP(0x40,$out)); - &movaps ($inout0,$inout4); - &sub ($len,0x50); - &jmp (&label("cbc_dec_tail_collected")); - -&set_label("cbc_dec_one",16); - if ($inline) - { &aesni_inline_generate1("dec"); } - else - { &call ("_aesni_decrypt1"); } - &xorps ($inout0,$ivec); - &movaps ($ivec,$in0); - &sub ($len,0x10); - &jmp (&label("cbc_dec_tail_collected")); - -&set_label("cbc_dec_two",16); - &xorps ($inout2,$inout2); - &call ("_aesni_decrypt3"); - &xorps ($inout0,$ivec); - &xorps ($inout1,$in0); - &movups (&QWP(0,$out),$inout0); - &movaps ($inout0,$inout1); - &lea ($out,&DWP(0x10,$out)); - &movaps ($ivec,$in1); - &sub ($len,0x20); - &jmp (&label("cbc_dec_tail_collected")); - -&set_label("cbc_dec_three",16); - &call ("_aesni_decrypt3"); - &xorps ($inout0,$ivec); - &xorps ($inout1,$in0); - &xorps ($inout2,$in1); - &movups (&QWP(0,$out),$inout0); - &movaps ($inout0,$inout2); - &movups (&QWP(0x10,$out),$inout1); - &lea ($out,&DWP(0x20,$out)); - &movups ($ivec,&QWP(0x20,$inp)); - &sub ($len,0x30); - &jmp (&label("cbc_dec_tail_collected")); - -&set_label("cbc_dec_four",16); - &call ("_aesni_decrypt4"); - &movups ($rndkey1,&QWP(0x10,$inp)); - &movups ($rndkey0,&QWP(0x20,$inp)); - &xorps ($inout0,$ivec); - &movups ($ivec,&QWP(0x30,$inp)); - &xorps ($inout1,$in0); - &movups (&QWP(0,$out),$inout0); - &xorps ($inout2,$rndkey1); - &movups (&QWP(0x10,$out),$inout1); - &xorps ($inout3,$rndkey0); - &movups (&QWP(0x20,$out),$inout2); - &lea ($out,&DWP(0x30,$out)); - &movaps ($inout0,$inout3); - &sub ($len,0x40); - -&set_label("cbc_dec_tail_collected"); - &and ($len,15); - &jnz (&label("cbc_dec_tail_partial")); - &movups (&QWP(0,$out),$inout0); - &jmp (&label("cbc_ret")); - -&set_label("cbc_dec_tail_partial",16); - &movaps (&QWP(0,"esp"),$inout0); - &mov ("ecx",16); - &mov ($inp,"esp"); - &sub ("ecx",$len); - &data_word(0xA4F3F689); # rep movsb - -&set_label("cbc_ret"); - &mov ("esp",&DWP(16,"esp")); # pull original %esp - &mov ($key_,&wparam(4)); - &movups (&QWP(0,$key_),$ivec); # output IV -&set_label("cbc_abort"); -&function_end("${PREFIX}_cbc_encrypt"); - -###################################################################### -# Mechanical port from aesni-x86_64.pl. 
-# -# _aesni_set_encrypt_key is a private interface, -# input: -# "eax" const unsigned char *userKey -# $rounds int bits -# $key AES_KEY *key -# output: -# "eax" return code -# $rounds rounds - -&function_begin_B("_aesni_set_encrypt_key"); - &test ("eax","eax"); - &jz (&label("bad_pointer")); - &test ($key,$key); - &jz (&label("bad_pointer")); - - &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey - &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 - &lea ($key,&DWP(16,$key)); - &cmp ($rounds,256); - &je (&label("14rounds")); - &cmp ($rounds,192); - &je (&label("12rounds")); - &cmp ($rounds,128); - &jne (&label("bad_keybits")); - -&set_label("10rounds",16); - &mov ($rounds,9); - &$movekey (&QWP(-16,$key),"xmm0"); # round 0 - &aeskeygenassist("xmm1","xmm0",0x01); # round 1 - &call (&label("key_128_cold")); - &aeskeygenassist("xmm1","xmm0",0x2); # round 2 - &call (&label("key_128")); - &aeskeygenassist("xmm1","xmm0",0x04); # round 3 - &call (&label("key_128")); - &aeskeygenassist("xmm1","xmm0",0x08); # round 4 - &call (&label("key_128")); - &aeskeygenassist("xmm1","xmm0",0x10); # round 5 - &call (&label("key_128")); - &aeskeygenassist("xmm1","xmm0",0x20); # round 6 - &call (&label("key_128")); - &aeskeygenassist("xmm1","xmm0",0x40); # round 7 - &call (&label("key_128")); - &aeskeygenassist("xmm1","xmm0",0x80); # round 8 - &call (&label("key_128")); - &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 - &call (&label("key_128")); - &aeskeygenassist("xmm1","xmm0",0x36); # round 10 - &call (&label("key_128")); - &$movekey (&QWP(0,$key),"xmm0"); - &mov (&DWP(80,$key),$rounds); - &xor ("eax","eax"); - &ret(); - -&set_label("key_128",16); - &$movekey (&QWP(0,$key),"xmm0"); - &lea ($key,&DWP(16,$key)); -&set_label("key_128_cold"); - &shufps ("xmm4","xmm0",0b00010000); - &xorps ("xmm0","xmm4"); - &shufps ("xmm4","xmm0",0b10001100); - &xorps ("xmm0","xmm4"); - &shufps ("xmm1","xmm1",0b11111111); # critical path - &xorps ("xmm0","xmm1"); - &ret(); - -&set_label("12rounds",16); - &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey - &mov ($rounds,11); - &$movekey (&QWP(-16,$key),"xmm0"); # round 0 - &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 - &call (&label("key_192a_cold")); - &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 - &call (&label("key_192b")); - &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 - &call (&label("key_192a")); - &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 - &call (&label("key_192b")); - &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 - &call (&label("key_192a")); - &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 - &call (&label("key_192b")); - &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 - &call (&label("key_192a")); - &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 - &call (&label("key_192b")); - &$movekey (&QWP(0,$key),"xmm0"); - &mov (&DWP(48,$key),$rounds); - &xor ("eax","eax"); - &ret(); - -&set_label("key_192a",16); - &$movekey (&QWP(0,$key),"xmm0"); - &lea ($key,&DWP(16,$key)); -&set_label("key_192a_cold",16); - &movaps ("xmm5","xmm2"); -&set_label("key_192b_warm"); - &shufps ("xmm4","xmm0",0b00010000); - &movdqa ("xmm3","xmm2"); - &xorps ("xmm0","xmm4"); - &shufps ("xmm4","xmm0",0b10001100); - &pslldq ("xmm3",4); - &xorps ("xmm0","xmm4"); - &pshufd ("xmm1","xmm1",0b01010101); # critical path - &pxor ("xmm2","xmm3"); - &pxor ("xmm0","xmm1"); - &pshufd ("xmm3","xmm0",0b11111111); - &pxor ("xmm2","xmm3"); - &ret(); - -&set_label("key_192b",16); - &movaps ("xmm3","xmm0"); - &shufps ("xmm5","xmm0",0b01000100); - &$movekey
(&QWP(0,$key),"xmm5"); - &shufps ("xmm3","xmm2",0b01001110); - &$movekey (&QWP(16,$key),"xmm3"); - &lea ($key,&DWP(32,$key)); - &jmp (&label("key_192b_warm")); - -&set_label("14rounds",16); - &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey - &mov ($rounds,13); - &lea ($key,&DWP(16,$key)); - &$movekey (&QWP(-32,$key),"xmm0"); # round 0 - &$movekey (&QWP(-16,$key),"xmm2"); # round 1 - &aeskeygenassist("xmm1","xmm2",0x01); # round 2 - &call (&label("key_256a_cold")); - &aeskeygenassist("xmm1","xmm0",0x01); # round 3 - &call (&label("key_256b")); - &aeskeygenassist("xmm1","xmm2",0x02); # round 4 - &call (&label("key_256a")); - &aeskeygenassist("xmm1","xmm0",0x02); # round 5 - &call (&label("key_256b")); - &aeskeygenassist("xmm1","xmm2",0x04); # round 6 - &call (&label("key_256a")); - &aeskeygenassist("xmm1","xmm0",0x04); # round 7 - &call (&label("key_256b")); - &aeskeygenassist("xmm1","xmm2",0x08); # round 8 - &call (&label("key_256a")); - &aeskeygenassist("xmm1","xmm0",0x08); # round 9 - &call (&label("key_256b")); - &aeskeygenassist("xmm1","xmm2",0x10); # round 10 - &call (&label("key_256a")); - &aeskeygenassist("xmm1","xmm0",0x10); # round 11 - &call (&label("key_256b")); - &aeskeygenassist("xmm1","xmm2",0x20); # round 12 - &call (&label("key_256a")); - &aeskeygenassist("xmm1","xmm0",0x20); # round 13 - &call (&label("key_256b")); - &aeskeygenassist("xmm1","xmm2",0x40); # round 14 - &call (&label("key_256a")); - &$movekey (&QWP(0,$key),"xmm0"); - &mov (&DWP(16,$key),$rounds); - &xor ("eax","eax"); - &ret(); - -&set_label("key_256a",16); - &$movekey (&QWP(0,$key),"xmm2"); - &lea ($key,&DWP(16,$key)); -&set_label("key_256a_cold"); - &shufps ("xmm4","xmm0",0b00010000); - &xorps ("xmm0","xmm4"); - &shufps ("xmm4","xmm0",0b10001100); - &xorps ("xmm0","xmm4"); - &shufps ("xmm1","xmm1",0b11111111); # critical path - &xorps ("xmm0","xmm1"); - &ret(); - -&set_label("key_256b",16); - &$movekey (&QWP(0,$key),"xmm0"); - &lea ($key,&DWP(16,$key)); - - &shufps ("xmm4","xmm2",0b00010000); - &xorps ("xmm2","xmm4"); - &shufps ("xmm4","xmm2",0b10001100); - &xorps ("xmm2","xmm4"); - &shufps ("xmm1","xmm1",0b10101010); # critical path - &xorps ("xmm2","xmm1"); - &ret(); - -&set_label("bad_pointer",4); - &mov ("eax",-1); - &ret (); -&set_label("bad_keybits",4); - &mov ("eax",-2); - &ret (); -&function_end_B("_aesni_set_encrypt_key"); - -# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, -# AES_KEY *key) -&function_begin_B("${PREFIX}_set_encrypt_key"); - &mov ("eax",&wparam(0)); - &mov ($rounds,&wparam(1)); - &mov ($key,&wparam(2)); - &call ("_aesni_set_encrypt_key"); - &ret (); -&function_end_B("${PREFIX}_set_encrypt_key"); - -# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, -# AES_KEY *key) -&function_begin_B("${PREFIX}_set_decrypt_key"); - &mov ("eax",&wparam(0)); - &mov ($rounds,&wparam(1)); - &mov ($key,&wparam(2)); - &call ("_aesni_set_encrypt_key"); - &mov ($key,&wparam(2)); - &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key - &test ("eax","eax"); - &jnz (&label("dec_key_ret")); - &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule - - &$movekey ("xmm0",&QWP(0,$key)); # just swap - &$movekey ("xmm1",&QWP(0,"eax")); - &$movekey (&QWP(0,"eax"),"xmm0"); - &$movekey (&QWP(0,$key),"xmm1"); - &lea ($key,&DWP(16,$key)); - &lea ("eax",&DWP(-16,"eax")); - -&set_label("dec_key_inverse"); - &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse - &$movekey ("xmm1",&QWP(0,"eax")); - &aesimc ("xmm0","xmm0"); - &aesimc ("xmm1","xmm1"); - &lea 
($key,&DWP(16,$key)); - &lea ("eax",&DWP(-16,"eax")); - &$movekey (&QWP(16,"eax"),"xmm0"); - &$movekey (&QWP(-16,$key),"xmm1"); - &cmp ("eax",$key); - &ja (&label("dec_key_inverse")); - - &$movekey ("xmm0",&QWP(0,$key)); # inverse middle - &aesimc ("xmm0","xmm0"); - &$movekey (&QWP(0,$key),"xmm0"); - - &xor ("eax","eax"); # return success -&set_label("dec_key_ret"); - &ret (); -&function_end_B("${PREFIX}_set_decrypt_key"); -&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); - -&asm_finish(); +../openssl/./crypto/aes/asm/aesni-x86.pl
\ No newline at end of file diff --git a/devel/perlasm/aesni-x86_64.pl b/devel/perlasm/aesni-x86_64.pl index 4a10fe6bd2..288d126b80 100644..120000 --- a/devel/perlasm/aesni-x86_64.pl +++ b/devel/perlasm/aesni-x86_64.pl @@ -1,3331 +1 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements support for Intel AES-NI extension. In -# OpenSSL context it's used with Intel engine, but can also be used as -# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for -# details]. -# -# Performance. -# -# Given aes(enc|dec) instructions' latency, asymptotic performance for -# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte -# processed with 128-bit key. And given their throughput, asymptotic -# performance for parallelizable modes is 1.25 cycles per byte. Being -# an asymptotic limit it's not something you commonly achieve in reality, -# but how close does one get? Below are results collected for -# different modes and block sizes. Pairs of numbers are for en-/ -# decryption. -# -# 16-byte 64-byte 256-byte 1-KB 8-KB -# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 -# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 -# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 -# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 -# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 -# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 -# -# ECB, CTR, CBC and CCM results are free from EVP overhead. This means -# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni -# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. -# The results were collected with a specially crafted speed.c benchmark -# in order to compare them with results reported in "Intel Advanced -# Encryption Standard (AES) New Instruction Set" White Paper Revision -# 3.0 dated May 2010. All above results are consistently better. This -# module also provides better performance for block sizes smaller than -# 128 bytes in points *not* represented in the above table. -# -# Looking at the results for 8-KB buffer. -# -# CFB and OFB results are far from the limit, because the implementation -# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on -# single-block aesni_encrypt, which is not the most optimal way to go. -# CBC encrypt result is unexpectedly high and there is no documented -# explanation for it. Seemingly there is a small penalty for feeding -# the result back to the AES unit the way it's done in CBC mode. There is -# nothing one can do and the result appears optimal. CCM result is -# identical to CBC, because CBC-MAC is essentially CBC encrypt without -# saving output. CCM CTR "stays invisible," because it's neatly -# interleaved with CBC-MAC. This provides ~30% improvement over a -# "straightforward" CCM implementation with CTR and CBC-MAC performed -# disjointly. Parallelizable modes practically achieve the theoretical -# limit. -# -# Looking at how results vary with buffer size. -# -# Curves are practically saturated at 1-KB buffer size. In most cases -# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
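# Aside: a quick sanity check of the asymptotic figures quoted above,
# assuming the Westmere-era numbers given further below (6-cycle
# aes[enc|dec] latency, one instruction accepted every 2nd cycle,
# 10 rounds per 16-byte block with a 128-bit key). A stand-alone
# sketch; the variable names are illustrative:
my ($lat, $issue, $nrounds, $blk) = (6, 2, 10, 16);
printf("serial (CBC encrypt) limit: %.2f cycles/byte\n", $lat*$nrounds/$blk);	# 3.75
printf("parallel (ECB/CTR) limit: %.2f cycles/byte\n", $issue*$nrounds/$blk);	# 1.25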
-# CTR curve doesn't follow this pattern and is the "slowest" changing one -# with "256-byte" result being 87% of "8-KB." This is because overhead -# in CTR mode is most computationally intensive. Small-block CCM -# decrypt is slower than encrypt, because first CTR and last CBC-MAC -# iterations can't be interleaved. -# -# Results for 192- and 256-bit keys. -# -# EVP-free results were observed to scale perfectly with number of -# rounds for larger block sizes, i.e. 192-bit result being 10/12 times -# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences -# are a tad smaller, because the above mentioned penalty biases all -# results by the same constant value. In a similar way function call -# overhead affects small-block performance, as well as OFB and CFB -# results. Differences are not large, most common coefficients are -# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one -# observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... - -# January 2011 -# -# While Westmere processor features 6 cycles latency for aes[enc|dec] -# instructions, which can be scheduled every second cycle, Sandy -# Bridge spends 8 cycles per instruction, but it can schedule them -# every cycle. This means that code targeting Westmere would perform -# suboptimally on Sandy Bridge. Therefore this update. -# -# In addition, non-parallelizable CBC encrypt (as well as CCM) is -# optimized. Relative improvement might appear modest, 8% on Westmere, -# but in absolute terms it's 3.77 cycles per byte encrypted with -# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers -# should be compared to asymptotic limits of 3.75 for Westmere and -# 5.00 for Sandy Bridge. Actually, the fact that they get this close -# to asymptotic limits is quite amazing. Indeed, the limit is -# calculated as latency times number of rounds, 10 for 128-bit key, -# and divided by 16, the number of bytes in block, or in other words -# it accounts *solely* for aesenc instructions. But there are extra -# instructions, and numbers so close to the asymptotic limits mean -# that it's as if it takes as little as *one* additional cycle to -# execute all of them. How is it possible? It is possible thanks to -# out-of-order execution logic, which manages to overlap post- -# processing of previous block, things like saving the output, with -# actual encryption of current block, as well as pre-processing of -# current block, things like fetching input and xor-ing it with -# 0-round element of the key schedule, with actual encryption of -# previous block. Keep this in mind... -# -# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher -# performance is achieved by interleaving instructions working on -# independent blocks. In which case asymptotic limit for such modes -# can be obtained by dividing above mentioned numbers by AES -# instructions' interleave factor. Westmere can execute at most 3 -# instructions at a time, meaning that optimal interleave factor is 3, -# and that's where the "magic" number of 1.25 comes from. "Optimal -# interleave factor" means that increase of interleave factor does -# not improve performance. The formula has proven to reflect reality -# pretty well on Westmere... Sandy Bridge on the other hand can -# execute up to 8 AES instructions at a time, so how does varying -# interleave factor affect the performance?
Here is a table for ECB -# (numbers are cycles per byte processed with 128-bit key): -# -# instruction interleave factor 3x 6x 8x -# theoretical asymptotic limit 1.67 0.83 0.625 -# measured performance for 8KB block 1.05 0.86 0.84 -# -# "as if" interleave factor 4.7x 5.8x 6.0x -# -# Further data for other parallelizable modes: -# -# CBC decrypt 1.16 0.93 0.74 -# CTR 1.14 0.91 0.74 -# -# Well, given the 3x column it's probably inappropriate to call the limit -# asymptotic, if it can be surpassed, isn't it? What happens there? -# Rewind to CBC paragraph for the answer. Yes, out-of-order execution -# magic is responsible for this. Processor overlaps not only the -# additional instructions with AES ones, but even AES instructions -# processing adjacent triplets of independent blocks. In the 6x case -# additional instructions still claim a disproportionately small amount -# of additional cycles, but in the 8x case the number of instructions must be -# a tad too high for out-of-order logic to cope with, and AES unit -# remains underutilized... As you can see 8x interleave is hardly -# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl -# utilizes 6x interleave because of limited register bank capacity. -# -# Higher interleave factors do have negative impact on Westmere -# performance. While for ECB mode it's negligible ~1.5%, other -# parallelizables perform ~5% worse, which is outweighed by ~25% -# improvement on Sandy Bridge. To balance regression on Westmere -# CTR mode was implemented with 6x aesenc interleave factor. - -# April 2011 -# -# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing -# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like -# in CTR mode AES instruction interleave factor was chosen to be 6x. - -###################################################################### -# For reference, AMD Bulldozer spends 5.77 cycles per byte processed -# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70 -# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec] -# instruction latency is 9 cycles and that they can be issued every -# cycle. - -$PREFIX="aesni"; # if $PREFIX is set to "AES", the script - # generates drop-in replacement for - # crypto/aes/asm/aes-x86_64.pl:-) - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -$movkey = $PREFIX eq "aesni" ? "movups" : "movups"; -@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order - -$code=".text\n"; - -$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! -# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... -$inp="%rdi"; -$out="%rsi"; -$len="%rdx"; -$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! -$ivp="%r8"; # cbc, ctr, ... - -$rnds_="%r10d"; # backup copy for $rounds -$key_="%r11"; # backup copy for $key - -# %xmm register layout -$rndkey0="%xmm0"; $rndkey1="%xmm1"; -$inout0="%xmm2"; $inout1="%xmm3"; -$inout2="%xmm4"; $inout3="%xmm5"; -$inout4="%xmm6"; $inout5="%xmm7"; -$inout6="%xmm8"; $inout7="%xmm9"; - -$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
-$in0="%xmm8"; $iv="%xmm9"; - -# Inline version of internal aesni_[en|de]crypt1. -# -# Why folded loop? Because aes[enc|dec] is slow enough to accommodate -# cycles which take care of loop variables... -{ my $sn; -sub aesni_generate1 { -my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); -++$sn; -$code.=<<___; - $movkey ($key),$rndkey0 - $movkey 16($key),$rndkey1 -___ -$code.=<<___ if (defined($ivec)); - xorps $rndkey0,$ivec - lea 32($key),$key - xorps $ivec,$inout -___ -$code.=<<___ if (!defined($ivec)); - lea 32($key),$key - xorps $rndkey0,$inout -___ -$code.=<<___; -.Loop_${p}1_$sn: - aes${p} $rndkey1,$inout - dec $rounds - $movkey ($key),$rndkey1 - lea 16($key),$key - jnz .Loop_${p}1_$sn # loop body is 16 bytes - aes${p}last $rndkey1,$inout -___ -}} -# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); -# -{ my ($inp,$out,$key) = @_4args; - -$code.=<<___; -.globl ${PREFIX}_encrypt -.type ${PREFIX}_encrypt,\@abi-omnipotent -.align 16 -${PREFIX}_encrypt: - movups ($inp),$inout0 # load input - mov 240($key),$rounds # key->rounds -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - movups $inout0,($out) # output - ret -.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt - -.globl ${PREFIX}_decrypt -.type ${PREFIX}_decrypt,\@abi-omnipotent -.align 16 -${PREFIX}_decrypt: - movups ($inp),$inout0 # load input - mov 240($key),$rounds # key->rounds -___ - &aesni_generate1("dec",$key,$rounds); -$code.=<<___; - movups $inout0,($out) # output - ret -.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt -___ -} - -# _aesni_[en|de]cryptN are private interfaces, N denotes interleave -# factor. Why 3x subroutine were originally used in loops? Even though -# aes[enc|dec] latency was originally 6, it could be scheduled only -# every *2nd* cycle. Thus 3x interleave was the one providing optimal -# utilization, i.e. when subroutine's throughput is virtually same as -# of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. -# aes[enc|dec] latency in next processor generation is 8, but the -# instructions can be scheduled every cycle. Optimal interleave for -# new processor is therefore 8x... -sub aesni_generate3 { -my $dir=shift; -# As already mentioned it takes in $key and $rounds, which are *not* -# preserved. $inout[0-2] is cipher/clear text... -$code.=<<___; -.type _aesni_${dir}rypt3,\@abi-omnipotent -.align 16 -_aesni_${dir}rypt3: - $movkey ($key),$rndkey0 - shr \$1,$rounds - $movkey 16($key),$rndkey1 - lea 32($key),$key - xorps $rndkey0,$inout0 - xorps $rndkey0,$inout1 - xorps $rndkey0,$inout2 - $movkey ($key),$rndkey0 - -.L${dir}_loop3: - aes${dir} $rndkey1,$inout0 - aes${dir} $rndkey1,$inout1 - dec $rounds - aes${dir} $rndkey1,$inout2 - $movkey 16($key),$rndkey1 - aes${dir} $rndkey0,$inout0 - aes${dir} $rndkey0,$inout1 - lea 32($key),$key - aes${dir} $rndkey0,$inout2 - $movkey ($key),$rndkey0 - jnz .L${dir}_loop3 - - aes${dir} $rndkey1,$inout0 - aes${dir} $rndkey1,$inout1 - aes${dir} $rndkey1,$inout2 - aes${dir}last $rndkey0,$inout0 - aes${dir}last $rndkey0,$inout1 - aes${dir}last $rndkey0,$inout2 - ret -.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 -___ -} -# 4x interleave is implemented to improve small block performance, -# most notably [and naturally] 4 block by ~30%. One can argue that one -# should have implemented 5x as well, but improvement would be <20%, -# so it's not worth it... 
-sub aesni_generate4 { -my $dir=shift; -# As already mentioned it takes in $key and $rounds, which are *not* -# preserved. $inout[0-3] is cipher/clear text... -$code.=<<___; -.type _aesni_${dir}rypt4,\@abi-omnipotent -.align 16 -_aesni_${dir}rypt4: - $movkey ($key),$rndkey0 - shr \$1,$rounds - $movkey 16($key),$rndkey1 - lea 32($key),$key - xorps $rndkey0,$inout0 - xorps $rndkey0,$inout1 - xorps $rndkey0,$inout2 - xorps $rndkey0,$inout3 - $movkey ($key),$rndkey0 - -.L${dir}_loop4: - aes${dir} $rndkey1,$inout0 - aes${dir} $rndkey1,$inout1 - dec $rounds - aes${dir} $rndkey1,$inout2 - aes${dir} $rndkey1,$inout3 - $movkey 16($key),$rndkey1 - aes${dir} $rndkey0,$inout0 - aes${dir} $rndkey0,$inout1 - lea 32($key),$key - aes${dir} $rndkey0,$inout2 - aes${dir} $rndkey0,$inout3 - $movkey ($key),$rndkey0 - jnz .L${dir}_loop4 - - aes${dir} $rndkey1,$inout0 - aes${dir} $rndkey1,$inout1 - aes${dir} $rndkey1,$inout2 - aes${dir} $rndkey1,$inout3 - aes${dir}last $rndkey0,$inout0 - aes${dir}last $rndkey0,$inout1 - aes${dir}last $rndkey0,$inout2 - aes${dir}last $rndkey0,$inout3 - ret -.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 -___ -} -sub aesni_generate6 { -my $dir=shift; -# As already mentioned it takes in $key and $rounds, which are *not* -# preserved. $inout[0-5] is cipher/clear text... -$code.=<<___; -.type _aesni_${dir}rypt6,\@abi-omnipotent -.align 16 -_aesni_${dir}rypt6: - $movkey ($key),$rndkey0 - shr \$1,$rounds - $movkey 16($key),$rndkey1 - lea 32($key),$key - xorps $rndkey0,$inout0 - pxor $rndkey0,$inout1 - aes${dir} $rndkey1,$inout0 - pxor $rndkey0,$inout2 - aes${dir} $rndkey1,$inout1 - pxor $rndkey0,$inout3 - aes${dir} $rndkey1,$inout2 - pxor $rndkey0,$inout4 - aes${dir} $rndkey1,$inout3 - pxor $rndkey0,$inout5 - dec $rounds - aes${dir} $rndkey1,$inout4 - $movkey ($key),$rndkey0 - aes${dir} $rndkey1,$inout5 - jmp .L${dir}_loop6_enter -.align 16 -.L${dir}_loop6: - aes${dir} $rndkey1,$inout0 - aes${dir} $rndkey1,$inout1 - dec $rounds - aes${dir} $rndkey1,$inout2 - aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey1,$inout4 - aes${dir} $rndkey1,$inout5 -.L${dir}_loop6_enter: # happens to be 16-byte aligned - $movkey 16($key),$rndkey1 - aes${dir} $rndkey0,$inout0 - aes${dir} $rndkey0,$inout1 - lea 32($key),$key - aes${dir} $rndkey0,$inout2 - aes${dir} $rndkey0,$inout3 - aes${dir} $rndkey0,$inout4 - aes${dir} $rndkey0,$inout5 - $movkey ($key),$rndkey0 - jnz .L${dir}_loop6 - - aes${dir} $rndkey1,$inout0 - aes${dir} $rndkey1,$inout1 - aes${dir} $rndkey1,$inout2 - aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey1,$inout4 - aes${dir} $rndkey1,$inout5 - aes${dir}last $rndkey0,$inout0 - aes${dir}last $rndkey0,$inout1 - aes${dir}last $rndkey0,$inout2 - aes${dir}last $rndkey0,$inout3 - aes${dir}last $rndkey0,$inout4 - aes${dir}last $rndkey0,$inout5 - ret -.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 -___ -} -sub aesni_generate8 { -my $dir=shift; -# As already mentioned it takes in $key and $rounds, which are *not* -# preserved. $inout[0-7] is cipher/clear text... 
-$code.=<<___; -.type _aesni_${dir}rypt8,\@abi-omnipotent -.align 16 -_aesni_${dir}rypt8: - $movkey ($key),$rndkey0 - shr \$1,$rounds - $movkey 16($key),$rndkey1 - lea 32($key),$key - xorps $rndkey0,$inout0 - xorps $rndkey0,$inout1 - aes${dir} $rndkey1,$inout0 - pxor $rndkey0,$inout2 - aes${dir} $rndkey1,$inout1 - pxor $rndkey0,$inout3 - aes${dir} $rndkey1,$inout2 - pxor $rndkey0,$inout4 - aes${dir} $rndkey1,$inout3 - pxor $rndkey0,$inout5 - dec $rounds - aes${dir} $rndkey1,$inout4 - pxor $rndkey0,$inout6 - aes${dir} $rndkey1,$inout5 - pxor $rndkey0,$inout7 - $movkey ($key),$rndkey0 - aes${dir} $rndkey1,$inout6 - aes${dir} $rndkey1,$inout7 - $movkey 16($key),$rndkey1 - jmp .L${dir}_loop8_enter -.align 16 -.L${dir}_loop8: - aes${dir} $rndkey1,$inout0 - aes${dir} $rndkey1,$inout1 - dec $rounds - aes${dir} $rndkey1,$inout2 - aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey1,$inout4 - aes${dir} $rndkey1,$inout5 - aes${dir} $rndkey1,$inout6 - aes${dir} $rndkey1,$inout7 - $movkey 16($key),$rndkey1 -.L${dir}_loop8_enter: # happens to be 16-byte aligned - aes${dir} $rndkey0,$inout0 - aes${dir} $rndkey0,$inout1 - lea 32($key),$key - aes${dir} $rndkey0,$inout2 - aes${dir} $rndkey0,$inout3 - aes${dir} $rndkey0,$inout4 - aes${dir} $rndkey0,$inout5 - aes${dir} $rndkey0,$inout6 - aes${dir} $rndkey0,$inout7 - $movkey ($key),$rndkey0 - jnz .L${dir}_loop8 - - aes${dir} $rndkey1,$inout0 - aes${dir} $rndkey1,$inout1 - aes${dir} $rndkey1,$inout2 - aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey1,$inout4 - aes${dir} $rndkey1,$inout5 - aes${dir} $rndkey1,$inout6 - aes${dir} $rndkey1,$inout7 - aes${dir}last $rndkey0,$inout0 - aes${dir}last $rndkey0,$inout1 - aes${dir}last $rndkey0,$inout2 - aes${dir}last $rndkey0,$inout3 - aes${dir}last $rndkey0,$inout4 - aes${dir}last $rndkey0,$inout5 - aes${dir}last $rndkey0,$inout6 - aes${dir}last $rndkey0,$inout7 - ret -.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 -___ -} -&aesni_generate3("enc") if ($PREFIX eq "aesni"); -&aesni_generate3("dec"); -&aesni_generate4("enc") if ($PREFIX eq "aesni"); -&aesni_generate4("dec"); -&aesni_generate6("enc") if ($PREFIX eq "aesni"); -&aesni_generate6("dec"); -&aesni_generate8("enc") if ($PREFIX eq "aesni"); -&aesni_generate8("dec"); - -if ($PREFIX eq "aesni") { -######################################################################## -# void aesni_ecb_encrypt (const void *in, void *out, -# size_t length, const AES_KEY *key, -# int enc); -$code.=<<___; -.globl aesni_ecb_encrypt -.type aesni_ecb_encrypt,\@function,5 -.align 16 -aesni_ecb_encrypt: - and \$-16,$len - jz .Lecb_ret - - mov 240($key),$rounds # key->rounds - $movkey ($key),$rndkey0 - mov $key,$key_ # backup $key - mov $rounds,$rnds_ # backup $rounds - test %r8d,%r8d # 5th argument - jz .Lecb_decrypt -#--------------------------- ECB ENCRYPT ------------------------------# - cmp \$0x80,$len - jb .Lecb_enc_tail - - movdqu ($inp),$inout0 - movdqu 0x10($inp),$inout1 - movdqu 0x20($inp),$inout2 - movdqu 0x30($inp),$inout3 - movdqu 0x40($inp),$inout4 - movdqu 0x50($inp),$inout5 - movdqu 0x60($inp),$inout6 - movdqu 0x70($inp),$inout7 - lea 0x80($inp),$inp - sub \$0x80,$len - jmp .Lecb_enc_loop8_enter -.align 16 -.Lecb_enc_loop8: - movups $inout0,($out) - mov $key_,$key # restore $key - movdqu ($inp),$inout0 - mov $rnds_,$rounds # restore $rounds - movups $inout1,0x10($out) - movdqu 0x10($inp),$inout1 - movups $inout2,0x20($out) - movdqu 0x20($inp),$inout2 - movups $inout3,0x30($out) - movdqu 0x30($inp),$inout3 - movups $inout4,0x40($out) - movdqu 0x40($inp),$inout4 - movups 
$inout5,0x50($out) - movdqu 0x50($inp),$inout5 - movups $inout6,0x60($out) - movdqu 0x60($inp),$inout6 - movups $inout7,0x70($out) - lea 0x80($out),$out - movdqu 0x70($inp),$inout7 - lea 0x80($inp),$inp -.Lecb_enc_loop8_enter: - - call _aesni_encrypt8 - - sub \$0x80,$len - jnc .Lecb_enc_loop8 - - movups $inout0,($out) - mov $key_,$key # restore $key - movups $inout1,0x10($out) - mov $rnds_,$rounds # restore $rounds - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - movups $inout5,0x50($out) - movups $inout6,0x60($out) - movups $inout7,0x70($out) - lea 0x80($out),$out - add \$0x80,$len - jz .Lecb_ret - -.Lecb_enc_tail: - movups ($inp),$inout0 - cmp \$0x20,$len - jb .Lecb_enc_one - movups 0x10($inp),$inout1 - je .Lecb_enc_two - movups 0x20($inp),$inout2 - cmp \$0x40,$len - jb .Lecb_enc_three - movups 0x30($inp),$inout3 - je .Lecb_enc_four - movups 0x40($inp),$inout4 - cmp \$0x60,$len - jb .Lecb_enc_five - movups 0x50($inp),$inout5 - je .Lecb_enc_six - movdqu 0x60($inp),$inout6 - call _aesni_encrypt8 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - movups $inout5,0x50($out) - movups $inout6,0x60($out) - jmp .Lecb_ret -.align 16 -.Lecb_enc_one: -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - movups $inout0,($out) - jmp .Lecb_ret -.align 16 -.Lecb_enc_two: - xorps $inout2,$inout2 - call _aesni_encrypt3 - movups $inout0,($out) - movups $inout1,0x10($out) - jmp .Lecb_ret -.align 16 -.Lecb_enc_three: - call _aesni_encrypt3 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - jmp .Lecb_ret -.align 16 -.Lecb_enc_four: - call _aesni_encrypt4 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - jmp .Lecb_ret -.align 16 -.Lecb_enc_five: - xorps $inout5,$inout5 - call _aesni_encrypt6 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - jmp .Lecb_ret -.align 16 -.Lecb_enc_six: - call _aesni_encrypt6 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - movups $inout5,0x50($out) - jmp .Lecb_ret -#--------------------------- ECB DECRYPT ------------------------------# -.align 16 -.Lecb_decrypt: - cmp \$0x80,$len - jb .Lecb_dec_tail - - movdqu ($inp),$inout0 - movdqu 0x10($inp),$inout1 - movdqu 0x20($inp),$inout2 - movdqu 0x30($inp),$inout3 - movdqu 0x40($inp),$inout4 - movdqu 0x50($inp),$inout5 - movdqu 0x60($inp),$inout6 - movdqu 0x70($inp),$inout7 - lea 0x80($inp),$inp - sub \$0x80,$len - jmp .Lecb_dec_loop8_enter -.align 16 -.Lecb_dec_loop8: - movups $inout0,($out) - mov $key_,$key # restore $key - movdqu ($inp),$inout0 - mov $rnds_,$rounds # restore $rounds - movups $inout1,0x10($out) - movdqu 0x10($inp),$inout1 - movups $inout2,0x20($out) - movdqu 0x20($inp),$inout2 - movups $inout3,0x30($out) - movdqu 0x30($inp),$inout3 - movups $inout4,0x40($out) - movdqu 0x40($inp),$inout4 - movups $inout5,0x50($out) - movdqu 0x50($inp),$inout5 - movups $inout6,0x60($out) - movdqu 0x60($inp),$inout6 - movups $inout7,0x70($out) - lea 0x80($out),$out - movdqu 0x70($inp),$inout7 - lea 0x80($inp),$inp -.Lecb_dec_loop8_enter: - - call _aesni_decrypt8 - - $movkey ($key_),$rndkey0 - sub \$0x80,$len - jnc .Lecb_dec_loop8 - - movups $inout0,($out) - mov $key_,$key # restore $key - movups $inout1,0x10($out) - mov $rnds_,$rounds # restore 
$rounds - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - movups $inout5,0x50($out) - movups $inout6,0x60($out) - movups $inout7,0x70($out) - lea 0x80($out),$out - add \$0x80,$len - jz .Lecb_ret - -.Lecb_dec_tail: - movups ($inp),$inout0 - cmp \$0x20,$len - jb .Lecb_dec_one - movups 0x10($inp),$inout1 - je .Lecb_dec_two - movups 0x20($inp),$inout2 - cmp \$0x40,$len - jb .Lecb_dec_three - movups 0x30($inp),$inout3 - je .Lecb_dec_four - movups 0x40($inp),$inout4 - cmp \$0x60,$len - jb .Lecb_dec_five - movups 0x50($inp),$inout5 - je .Lecb_dec_six - movups 0x60($inp),$inout6 - $movkey ($key),$rndkey0 - call _aesni_decrypt8 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - movups $inout5,0x50($out) - movups $inout6,0x60($out) - jmp .Lecb_ret -.align 16 -.Lecb_dec_one: -___ - &aesni_generate1("dec",$key,$rounds); -$code.=<<___; - movups $inout0,($out) - jmp .Lecb_ret -.align 16 -.Lecb_dec_two: - xorps $inout2,$inout2 - call _aesni_decrypt3 - movups $inout0,($out) - movups $inout1,0x10($out) - jmp .Lecb_ret -.align 16 -.Lecb_dec_three: - call _aesni_decrypt3 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - jmp .Lecb_ret -.align 16 -.Lecb_dec_four: - call _aesni_decrypt4 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - jmp .Lecb_ret -.align 16 -.Lecb_dec_five: - xorps $inout5,$inout5 - call _aesni_decrypt6 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - jmp .Lecb_ret -.align 16 -.Lecb_dec_six: - call _aesni_decrypt6 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - movups $inout5,0x50($out) - -.Lecb_ret: - ret -.size aesni_ecb_encrypt,.-aesni_ecb_encrypt -___ - -{ -###################################################################### -# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, -# size_t blocks, const AES_KEY *key, -# const char *ivec,char *cmac); -# -# Handles only complete blocks, operates on 64-bit counter and -# does not update *ivec! 
Nor does it finalize CMAC value -# (see engine/eng_aesni.c for details) -# -{ -my $cmac="%r9"; # 6th argument - -my $increment="%xmm6"; -my $bswap_mask="%xmm7"; - -$code.=<<___; -.globl aesni_ccm64_encrypt_blocks -.type aesni_ccm64_encrypt_blocks,\@function,6 -.align 16 -aesni_ccm64_encrypt_blocks: -___ -$code.=<<___ if ($win64); - lea -0x58(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) - movaps %xmm8,0x20(%rsp) - movaps %xmm9,0x30(%rsp) -.Lccm64_enc_body: -___ -$code.=<<___; - mov 240($key),$rounds # key->rounds - movdqu ($ivp),$iv - movdqa .Lincrement64(%rip),$increment - movdqa .Lbswap_mask(%rip),$bswap_mask - - shr \$1,$rounds - lea 0($key),$key_ - movdqu ($cmac),$inout1 - movdqa $iv,$inout0 - mov $rounds,$rnds_ - pshufb $bswap_mask,$iv - jmp .Lccm64_enc_outer -.align 16 -.Lccm64_enc_outer: - $movkey ($key_),$rndkey0 - mov $rnds_,$rounds - movups ($inp),$in0 # load inp - - xorps $rndkey0,$inout0 # counter - $movkey 16($key_),$rndkey1 - xorps $in0,$rndkey0 - lea 32($key_),$key - xorps $rndkey0,$inout1 # cmac^=inp - $movkey ($key),$rndkey0 - -.Lccm64_enc2_loop: - aesenc $rndkey1,$inout0 - dec $rounds - aesenc $rndkey1,$inout1 - $movkey 16($key),$rndkey1 - aesenc $rndkey0,$inout0 - lea 32($key),$key - aesenc $rndkey0,$inout1 - $movkey 0($key),$rndkey0 - jnz .Lccm64_enc2_loop - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 - paddq $increment,$iv - aesenclast $rndkey0,$inout0 - aesenclast $rndkey0,$inout1 - - dec $len - lea 16($inp),$inp - xorps $inout0,$in0 # inp ^= E(iv) - movdqa $iv,$inout0 - movups $in0,($out) # save output - lea 16($out),$out - pshufb $bswap_mask,$inout0 - jnz .Lccm64_enc_outer - - movups $inout1,($cmac) -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - lea 0x58(%rsp),%rsp -.Lccm64_enc_ret: -___ -$code.=<<___; - ret -.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks -___ -###################################################################### -$code.=<<___; -.globl aesni_ccm64_decrypt_blocks -.type aesni_ccm64_decrypt_blocks,\@function,6 -.align 16 -aesni_ccm64_decrypt_blocks: -___ -$code.=<<___ if ($win64); - lea -0x58(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) - movaps %xmm8,0x20(%rsp) - movaps %xmm9,0x30(%rsp) -.Lccm64_dec_body: -___ -$code.=<<___; - mov 240($key),$rounds # key->rounds - movups ($ivp),$iv - movdqu ($cmac),$inout1 - movdqa .Lincrement64(%rip),$increment - movdqa .Lbswap_mask(%rip),$bswap_mask - - movaps $iv,$inout0 - mov $rounds,$rnds_ - mov $key,$key_ - pshufb $bswap_mask,$iv -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - movups ($inp),$in0 # load inp - paddq $increment,$iv - lea 16($inp),$inp - jmp .Lccm64_dec_outer -.align 16 -.Lccm64_dec_outer: - xorps $inout0,$in0 # inp ^= E(iv) - movdqa $iv,$inout0 - mov $rnds_,$rounds - movups $in0,($out) # save output - lea 16($out),$out - pshufb $bswap_mask,$inout0 - - sub \$1,$len - jz .Lccm64_dec_break - - $movkey ($key_),$rndkey0 - shr \$1,$rounds - $movkey 16($key_),$rndkey1 - xorps $rndkey0,$in0 - lea 32($key_),$key - xorps $rndkey0,$inout0 - xorps $in0,$inout1 # cmac^=out - $movkey ($key),$rndkey0 - -.Lccm64_dec2_loop: - aesenc $rndkey1,$inout0 - dec $rounds - aesenc $rndkey1,$inout1 - $movkey 16($key),$rndkey1 - aesenc $rndkey0,$inout0 - lea 32($key),$key - aesenc $rndkey0,$inout1 - $movkey 0($key),$rndkey0 - jnz .Lccm64_dec2_loop - movups ($inp),$in0 # load inp - paddq $increment,$iv - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 - lea 16($inp),$inp - 
aesenclast $rndkey0,$inout0 - aesenclast $rndkey0,$inout1 - jmp .Lccm64_dec_outer - -.align 16 -.Lccm64_dec_break: - #xorps $in0,$inout1 # cmac^=out -___ - &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); -$code.=<<___; - movups $inout1,($cmac) -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - lea 0x58(%rsp),%rsp -.Lccm64_dec_ret: -___ -$code.=<<___; - ret -.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks -___ -} -###################################################################### -# void aesni_ctr32_encrypt_blocks (const void *in, void *out, -# size_t blocks, const AES_KEY *key, -# const char *ivec); -# -# Handles only complete blocks, operates on 32-bit counter and -# does not update *ivec! (see crypto/modes/ctr128.c for details) -# -# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, -# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. -# Keywords are full unroll and modulo-schedule counter calculations -# with zero-round key xor. -{ -my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); -my ($key0,$ctr)=("${key_}d","${ivp}d"); -my $frame_size = 0x80 + ($win64?160:0); - -$code.=<<___; -.globl aesni_ctr32_encrypt_blocks -.type aesni_ctr32_encrypt_blocks,\@function,5 -.align 16 -aesni_ctr32_encrypt_blocks: - lea (%rsp),%rax - push %rbp - sub \$$frame_size,%rsp - and \$-16,%rsp # Linux kernel stack can be incorrectly seeded -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) -.Lctr32_body: -___ -$code.=<<___; - lea -8(%rax),%rbp - - cmp \$1,$len - je .Lctr32_one_shortcut - - movdqu ($ivp),$inout0 - movdqu ($key),$rndkey0 - mov 12($ivp),$ctr # counter LSB - pxor $rndkey0,$inout0 - mov 12($key),$key0 # 0-round key LSB - movdqa $inout0,0x00(%rsp) # populate counter block - bswap $ctr - movdqa $inout0,$inout1 - movdqa $inout0,$inout2 - movdqa $inout0,$inout3 - movdqa $inout0,0x40(%rsp) - movdqa $inout0,0x50(%rsp) - movdqa $inout0,0x60(%rsp) - movdqa $inout0,0x70(%rsp) - - mov 240($key),$rounds # key->rounds - - lea 1($ctr),%r9 - lea 2($ctr),%r10 - bswap %r9d - bswap %r10d - xor $key0,%r9d - xor $key0,%r10d - pinsrd \$3,%r9d,$inout1 - lea 3($ctr),%r9 - movdqa $inout1,0x10(%rsp) - pinsrd \$3,%r10d,$inout2 - bswap %r9d - lea 4($ctr),%r10 - movdqa $inout2,0x20(%rsp) - xor $key0,%r9d - bswap %r10d - pinsrd \$3,%r9d,$inout3 - xor $key0,%r10d - movdqa $inout3,0x30(%rsp) - lea 5($ctr),%r9 - mov %r10d,0x40+12(%rsp) - bswap %r9d - lea 6($ctr),%r10 - xor $key0,%r9d - bswap %r10d - mov %r9d,0x50+12(%rsp) - xor $key0,%r10d - lea 7($ctr),%r9 - mov %r10d,0x60+12(%rsp) - bswap %r9d - xor $key0,%r9d - mov %r9d,0x70+12(%rsp) - - $movkey 0x10($key),$rndkey1 - - movdqa 0x40(%rsp),$inout4 - movdqa 0x50(%rsp),$inout5 - - cmp \$8,$len - jb .Lctr32_tail - - lea 0x80($key),$key # size optimization - sub \$8,$len - jmp .Lctr32_loop8 - -.align 32 -.Lctr32_loop8: - add \$8,$ctr - movdqa 0x60(%rsp),$inout6 - aesenc $rndkey1,$inout0 - mov $ctr,%r9d - movdqa 0x70(%rsp),$inout7 - aesenc $rndkey1,$inout1 - bswap %r9d - $movkey 0x20-0x80($key),$rndkey0 - aesenc $rndkey1,$inout2 - xor $key0,%r9d - aesenc $rndkey1,$inout3 - mov %r9d,0x00+12(%rsp) - lea 1($ctr),%r9 - aesenc $rndkey1,$inout4 - aesenc $rndkey1,$inout5 - aesenc 
$rndkey1,$inout6 - aesenc $rndkey1,$inout7 - $movkey 0x30-0x80($key),$rndkey1 -___ -for($i=2;$i<8;$i++) { -my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; -$code.=<<___; - aesenc $rndkeyx,$inout0 - aesenc $rndkeyx,$inout1 - bswap %r9d - aesenc $rndkeyx,$inout2 - xor $key0,%r9d - aesenc $rndkeyx,$inout3 - mov %r9d,`0x10*($i-1)`+12(%rsp) - lea $i($ctr),%r9 - aesenc $rndkeyx,$inout4 - aesenc $rndkeyx,$inout5 - aesenc $rndkeyx,$inout6 - aesenc $rndkeyx,$inout7 - $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx -___ -} -$code.=<<___; - aesenc $rndkey0,$inout0 - aesenc $rndkey0,$inout1 - bswap %r9d - aesenc $rndkey0,$inout2 - xor $key0,%r9d - aesenc $rndkey0,$inout3 - mov %r9d,0x70+12(%rsp) - aesenc $rndkey0,$inout4 - aesenc $rndkey0,$inout5 - aesenc $rndkey0,$inout6 - movdqu 0x00($inp),$in0 - aesenc $rndkey0,$inout7 - $movkey 0xa0-0x80($key),$rndkey0 - - cmp \$11,$rounds - jb .Lctr32_enc_done - - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 - aesenc $rndkey1,$inout2 - aesenc $rndkey1,$inout3 - aesenc $rndkey1,$inout4 - aesenc $rndkey1,$inout5 - aesenc $rndkey1,$inout6 - aesenc $rndkey1,$inout7 - $movkey 0xb0-0x80($key),$rndkey1 - - aesenc $rndkey0,$inout0 - aesenc $rndkey0,$inout1 - aesenc $rndkey0,$inout2 - aesenc $rndkey0,$inout3 - aesenc $rndkey0,$inout4 - aesenc $rndkey0,$inout5 - aesenc $rndkey0,$inout6 - aesenc $rndkey0,$inout7 - $movkey 0xc0-0x80($key),$rndkey0 - je .Lctr32_enc_done - - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 - aesenc $rndkey1,$inout2 - aesenc $rndkey1,$inout3 - aesenc $rndkey1,$inout4 - aesenc $rndkey1,$inout5 - aesenc $rndkey1,$inout6 - aesenc $rndkey1,$inout7 - $movkey 0xd0-0x80($key),$rndkey1 - - aesenc $rndkey0,$inout0 - aesenc $rndkey0,$inout1 - aesenc $rndkey0,$inout2 - aesenc $rndkey0,$inout3 - aesenc $rndkey0,$inout4 - aesenc $rndkey0,$inout5 - aesenc $rndkey0,$inout6 - aesenc $rndkey0,$inout7 - $movkey 0xe0-0x80($key),$rndkey0 - -.Lctr32_enc_done: - movdqu 0x10($inp),$in1 - pxor $rndkey0,$in0 - movdqu 0x20($inp),$in2 - pxor $rndkey0,$in1 - movdqu 0x30($inp),$in3 - pxor $rndkey0,$in2 - movdqu 0x40($inp),$in4 - pxor $rndkey0,$in3 - movdqu 0x50($inp),$in5 - pxor $rndkey0,$in4 - aesenc $rndkey1,$inout0 - pxor $rndkey0,$in5 - aesenc $rndkey1,$inout1 - aesenc $rndkey1,$inout2 - aesenc $rndkey1,$inout3 - aesenc $rndkey1,$inout4 - aesenc $rndkey1,$inout5 - aesenc $rndkey1,$inout6 - aesenc $rndkey1,$inout7 - movdqu 0x60($inp),$rndkey1 - - aesenclast $in0,$inout0 - pxor $rndkey0,$rndkey1 - movdqu 0x70($inp),$in0 - lea 0x80($inp),$inp - aesenclast $in1,$inout1 - pxor $rndkey0,$in0 - movdqa 0x00(%rsp),$in1 # load next counter block - aesenclast $in2,$inout2 - movdqa 0x10(%rsp),$in2 - aesenclast $in3,$inout3 - movdqa 0x20(%rsp),$in3 - aesenclast $in4,$inout4 - movdqa 0x30(%rsp),$in4 - aesenclast $in5,$inout5 - movdqa 0x40(%rsp),$in5 - aesenclast $rndkey1,$inout6 - movdqa 0x50(%rsp),$rndkey0 - aesenclast $in0,$inout7 - $movkey 0x10-0x80($key),$rndkey1 - - movups $inout0,($out) # store output - movdqa $in1,$inout0 - movups $inout1,0x10($out) - movdqa $in2,$inout1 - movups $inout2,0x20($out) - movdqa $in3,$inout2 - movups $inout3,0x30($out) - movdqa $in4,$inout3 - movups $inout4,0x40($out) - movdqa $in5,$inout4 - movups $inout5,0x50($out) - movdqa $rndkey0,$inout5 - movups $inout6,0x60($out) - movups $inout7,0x70($out) - lea 0x80($out),$out - - sub \$8,$len - jnc .Lctr32_loop8 - - add \$8,$len - jz .Lctr32_done - lea -0x80($key),$key - -.Lctr32_tail: - lea 16($key),$key - cmp \$4,$len - jb .Lctr32_loop3 - je .Lctr32_loop4 - - movdqa 0x60(%rsp),$inout6 - pxor $inout7,$inout7 - 
- $movkey 16($key),$rndkey0 - aesenc $rndkey1,$inout0 - lea 16($key),$key - aesenc $rndkey1,$inout1 - shr \$1,$rounds - aesenc $rndkey1,$inout2 - dec $rounds - aesenc $rndkey1,$inout3 - movups ($inp),$in0 - aesenc $rndkey1,$inout4 - movups 0x10($inp),$in1 - aesenc $rndkey1,$inout5 - movups 0x20($inp),$in2 - aesenc $rndkey1,$inout6 - $movkey 16($key),$rndkey1 - - call .Lenc_loop8_enter - - movdqu 0x30($inp),$in3 - pxor $in0,$inout0 - movdqu 0x40($inp),$in0 - pxor $in1,$inout1 - movdqu $inout0,($out) - pxor $in2,$inout2 - movdqu $inout1,0x10($out) - pxor $in3,$inout3 - movdqu $inout2,0x20($out) - pxor $in0,$inout4 - movdqu $inout3,0x30($out) - movdqu $inout4,0x40($out) - cmp \$6,$len - jb .Lctr32_done - - movups 0x50($inp),$in1 - xorps $in1,$inout5 - movups $inout5,0x50($out) - je .Lctr32_done - - movups 0x60($inp),$in2 - xorps $in2,$inout6 - movups $inout6,0x60($out) - jmp .Lctr32_done - -.align 32 -.Lctr32_loop4: - aesenc $rndkey1,$inout0 - lea 16($key),$key - aesenc $rndkey1,$inout1 - aesenc $rndkey1,$inout2 - aesenc $rndkey1,$inout3 - $movkey ($key),$rndkey1 - dec $rounds - jnz .Lctr32_loop4 - aesenclast $rndkey1,$inout0 - movups ($inp),$in0 - aesenclast $rndkey1,$inout1 - movups 0x10($inp),$in1 - aesenclast $rndkey1,$inout2 - movups 0x20($inp),$in2 - aesenclast $rndkey1,$inout3 - movups 0x30($inp),$in3 - - xorps $in0,$inout0 - movups $inout0,($out) - xorps $in1,$inout1 - movups $inout1,0x10($out) - pxor $in2,$inout2 - movdqu $inout2,0x20($out) - pxor $in3,$inout3 - movdqu $inout3,0x30($out) - jmp .Lctr32_done - -.align 32 -.Lctr32_loop3: - aesenc $rndkey1,$inout0 - lea 16($key),$key - aesenc $rndkey1,$inout1 - aesenc $rndkey1,$inout2 - $movkey ($key),$rndkey1 - dec $rounds - jnz .Lctr32_loop3 - aesenclast $rndkey1,$inout0 - aesenclast $rndkey1,$inout1 - aesenclast $rndkey1,$inout2 - - movups ($inp),$in0 - xorps $in0,$inout0 - movups $inout0,($out) - cmp \$2,$len - jb .Lctr32_done - - movups 0x10($inp),$in1 - xorps $in1,$inout1 - movups $inout1,0x10($out) - je .Lctr32_done - - movups 0x20($inp),$in2 - xorps $in2,$inout2 - movups $inout2,0x20($out) - jmp .Lctr32_done - -.align 16 -.Lctr32_one_shortcut: - movups ($ivp),$inout0 - movups ($inp),$in0 - mov 240($key),$rounds # key->rounds -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - xorps $in0,$inout0 - movups $inout0,($out) - jmp .Lctr32_done - -.align 16 -.Lctr32_done: -___ -$code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps -0x90(%rbp),%xmm7 - movaps -0x80(%rbp),%xmm8 - movaps -0x70(%rbp),%xmm9 - movaps -0x60(%rbp),%xmm10 - movaps -0x50(%rbp),%xmm11 - movaps -0x40(%rbp),%xmm12 - movaps -0x30(%rbp),%xmm13 - movaps -0x20(%rbp),%xmm14 - movaps -0x10(%rbp),%xmm15 -___ -$code.=<<___; - lea (%rbp),%rsp - pop %rbp -.Lctr32_epilogue: - ret -.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks -___ -} - -###################################################################### -# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, -# const AES_KEY *key1, const AES_KEY *key2 -# const unsigned char iv[16]); -# -{ -my @tweak=map("%xmm$_",(10..15)); -my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); -my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); -my $frame_size = 0x70 + ($win64?160:0); - -$code.=<<___; -.globl aesni_xts_encrypt -.type aesni_xts_encrypt,\@function,6 -.align 16 -aesni_xts_encrypt: - lea (%rsp),%rax - push %rbp - sub \$$frame_size,%rsp - and \$-16,%rsp # Linux kernel stack can be incorrectly seeded -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) - movaps %xmm7,-0x98(%rax) - 
movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) -.Lxts_enc_body: -___ -$code.=<<___; - lea -8(%rax),%rbp - movups ($ivp),@tweak[5] # load clear-text tweak - mov 240(%r8),$rounds # key2->rounds - mov 240($key),$rnds_ # key1->rounds -___ - # generate the tweak - &aesni_generate1("enc",$key2,$rounds,@tweak[5]); -$code.=<<___; - $movkey ($key),$rndkey0 # zero round key - mov $key,$key_ # backup $key - mov $rnds_,$rounds # backup $rounds - shl \$4,$rnds_ - mov $len,$len_ # backup $len - and \$-16,$len - - $movkey 16($key,$rnds_),$rndkey1 # last round key - mov $rounds,$rnds_ - - movdqa .Lxts_magic(%rip),$twmask - pshufd \$0x5f,@tweak[5],$twres - pxor $rndkey0,$rndkey1 -___ - # alternative tweak calculation algorithm is based on suggestions - # by Shay Gueron. psrad doesn't conflict with AES-NI instructions - # and should help in the future... - for ($i=0;$i<4;$i++) { - $code.=<<___; - movdqa $twres,$twtmp - paddd $twres,$twres - movdqa @tweak[5],@tweak[$i] - psrad \$31,$twtmp # broadcast upper bits - paddq @tweak[5],@tweak[5] - pand $twmask,$twtmp - pxor $rndkey0,@tweak[$i] - pxor $twtmp,@tweak[5] -___ - } -$code.=<<___; - movdqa @tweak[5],@tweak[4] - psrad \$31,$twres - paddq @tweak[5],@tweak[5] - pand $twmask,$twres - pxor $rndkey0,@tweak[4] - pxor $twres,@tweak[5] - movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] - - sub \$16*6,$len - jc .Lxts_enc_short - - shr \$1,$rounds - sub \$3,$rounds - $movkey 16($key_),$rndkey1 - mov $rounds,$rnds_ - lea .Lxts_magic(%rip),%r8 - jmp .Lxts_enc_grandloop - -.align 32 -.Lxts_enc_grandloop: - movdqu `16*0`($inp),$inout0 # load input - movdqa $rndkey0,$twmask - movdqu `16*1`($inp),$inout1 - pxor @tweak[0],$inout0 - movdqu `16*2`($inp),$inout2 - pxor @tweak[1],$inout1 - aesenc $rndkey1,$inout0 - movdqu `16*3`($inp),$inout3 - pxor @tweak[2],$inout2 - aesenc $rndkey1,$inout1 - movdqu `16*4`($inp),$inout4 - pxor @tweak[3],$inout3 - aesenc $rndkey1,$inout2 - movdqu `16*5`($inp),$inout5 - pxor @tweak[5],$twmask # round[0]^=tweak[5] - movdqa 0x60(%rsp),$twres # load round[0]^round[last] - pxor @tweak[4],$inout4 - aesenc $rndkey1,$inout3 - $movkey 32($key_),$rndkey0 - lea `16*6`($inp),$inp - pxor $twmask,$inout5 - - pxor $twres,@tweak[0] - aesenc $rndkey1,$inout4 - pxor $twres,@tweak[1] - movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key - aesenc $rndkey1,$inout5 - $movkey 48($key_),$rndkey1 - - aesenc $rndkey0,$inout0 - pxor $twres,@tweak[2] - movdqa @tweak[1],`16*1`(%rsp) - aesenc $rndkey0,$inout1 - pxor $twres,@tweak[3] - movdqa @tweak[2],`16*2`(%rsp) - aesenc $rndkey0,$inout2 - pxor $twres,@tweak[4] - aesenc $rndkey0,$inout3 - pxor $twres,$twmask - movdqa @tweak[4],`16*4`(%rsp) - aesenc $rndkey0,$inout4 - movdqa $twmask,`16*5`(%rsp) - aesenc $rndkey0,$inout5 - $movkey 64($key_),$rndkey0 - lea 64($key_),$key - pshufd \$0x5f,@tweak[5],$twres - jmp .Lxts_enc_loop6 -.align 32 -.Lxts_enc_loop6: - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 - aesenc $rndkey1,$inout2 - aesenc $rndkey1,$inout3 - aesenc $rndkey1,$inout4 - aesenc $rndkey1,$inout5 - $movkey 16($key),$rndkey1 - lea 32($key),$key - - aesenc $rndkey0,$inout0 - aesenc $rndkey0,$inout1 - aesenc $rndkey0,$inout2 - aesenc $rndkey0,$inout3 - aesenc $rndkey0,$inout4 - aesenc $rndkey0,$inout5 - $movkey ($key),$rndkey0 - dec $rounds - jnz .Lxts_enc_loop6 - - movdqa (%r8),$twmask - movdqa $twres,$twtmp - paddd 
$twres,$twres - aesenc $rndkey1,$inout0 - paddq @tweak[5],@tweak[5] - psrad \$31,$twtmp - aesenc $rndkey1,$inout1 - pand $twmask,$twtmp - $movkey ($key_),@tweak[0] # load round[0] - aesenc $rndkey1,$inout2 - aesenc $rndkey1,$inout3 - pxor $twtmp,@tweak[5] - aesenc $rndkey1,$inout4 - movaps @tweak[0],@tweak[1] # copy round[0] - aesenc $rndkey1,$inout5 - $movkey 16($key),$rndkey1 - - movdqa $twres,$twtmp - paddd $twres,$twres - aesenc $rndkey0,$inout0 - pxor @tweak[5],@tweak[0] - psrad \$31,$twtmp - aesenc $rndkey0,$inout1 - paddq @tweak[5],@tweak[5] - pand $twmask,$twtmp - aesenc $rndkey0,$inout2 - aesenc $rndkey0,$inout3 - pxor $twtmp,@tweak[5] - aesenc $rndkey0,$inout4 - movaps @tweak[1],@tweak[2] - aesenc $rndkey0,$inout5 - $movkey 32($key),$rndkey0 - - movdqa $twres,$twtmp - paddd $twres,$twres - aesenc $rndkey1,$inout0 - pxor @tweak[5],@tweak[1] - psrad \$31,$twtmp - aesenc $rndkey1,$inout1 - paddq @tweak[5],@tweak[5] - pand $twmask,$twtmp - aesenc $rndkey1,$inout2 - movdqa @tweak[3],`16*3`(%rsp) - aesenc $rndkey1,$inout3 - pxor $twtmp,@tweak[5] - aesenc $rndkey1,$inout4 - movaps @tweak[2],@tweak[3] - aesenc $rndkey1,$inout5 - $movkey 48($key),$rndkey1 - - movdqa $twres,$twtmp - paddd $twres,$twres - aesenc $rndkey0,$inout0 - pxor @tweak[5],@tweak[2] - psrad \$31,$twtmp - aesenc $rndkey0,$inout1 - paddq @tweak[5],@tweak[5] - pand $twmask,$twtmp - aesenc $rndkey0,$inout2 - aesenc $rndkey0,$inout3 - pxor $twtmp,@tweak[5] - aesenc $rndkey0,$inout4 - movaps @tweak[3],@tweak[4] - aesenc $rndkey0,$inout5 - - movdqa $twres,$rndkey0 - paddd $twres,$twres - aesenc $rndkey1,$inout0 - pxor @tweak[5],@tweak[3] - psrad \$31,$rndkey0 - aesenc $rndkey1,$inout1 - paddq @tweak[5],@tweak[5] - pand $twmask,$rndkey0 - aesenc $rndkey1,$inout2 - aesenc $rndkey1,$inout3 - pxor $rndkey0,@tweak[5] - $movkey ($key_),$rndkey0 - aesenc $rndkey1,$inout4 - aesenc $rndkey1,$inout5 - $movkey 16($key_),$rndkey1 - - pxor @tweak[5],@tweak[4] - psrad \$31,$twres - aesenclast `16*0`(%rsp),$inout0 - paddq @tweak[5],@tweak[5] - pand $twmask,$twres - aesenclast `16*1`(%rsp),$inout1 - aesenclast `16*2`(%rsp),$inout2 - pxor $twres,@tweak[5] - aesenclast `16*3`(%rsp),$inout3 - aesenclast `16*4`(%rsp),$inout4 - aesenclast `16*5`(%rsp),$inout5 - mov $rnds_,$rounds # restore $rounds - - lea `16*6`($out),$out - movups $inout0,`-16*6`($out) # write output - movups $inout1,`-16*5`($out) - movups $inout2,`-16*4`($out) - movups $inout3,`-16*3`($out) - movups $inout4,`-16*2`($out) - movups $inout5,`-16*1`($out) - sub \$16*6,$len - jnc .Lxts_enc_grandloop - - lea 7($rounds,$rounds),$rounds # restore original value - mov $key_,$key # restore $key - mov $rounds,$rnds_ # backup $rounds - -.Lxts_enc_short: - pxor $rndkey0,@tweak[0] - add \$16*6,$len - jz .Lxts_enc_done - - pxor $rndkey0,@tweak[1] - cmp \$0x20,$len - jb .Lxts_enc_one - pxor $rndkey0,@tweak[2] - je .Lxts_enc_two - - pxor $rndkey0,@tweak[3] - cmp \$0x40,$len - jb .Lxts_enc_three - pxor $rndkey0,@tweak[4] - je .Lxts_enc_four - - movdqu ($inp),$inout0 - movdqu 16*1($inp),$inout1 - movdqu 16*2($inp),$inout2 - pxor @tweak[0],$inout0 - movdqu 16*3($inp),$inout3 - pxor @tweak[1],$inout1 - movdqu 16*4($inp),$inout4 - lea 16*5($inp),$inp - pxor @tweak[2],$inout2 - pxor @tweak[3],$inout3 - pxor @tweak[4],$inout4 - - call _aesni_encrypt6 - - xorps @tweak[0],$inout0 - movdqa @tweak[5],@tweak[0] - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - movdqu $inout0,($out) - xorps @tweak[3],$inout3 - movdqu $inout1,16*1($out) - xorps @tweak[4],$inout4 - movdqu $inout2,16*2($out) - movdqu 
$inout3,16*3($out) - movdqu $inout4,16*4($out) - lea 16*5($out),$out - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_one: - movups ($inp),$inout0 - lea 16*1($inp),$inp - xorps @tweak[0],$inout0 -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - xorps @tweak[0],$inout0 - movdqa @tweak[1],@tweak[0] - movups $inout0,($out) - lea 16*1($out),$out - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_two: - movups ($inp),$inout0 - movups 16($inp),$inout1 - lea 32($inp),$inp - xorps @tweak[0],$inout0 - xorps @tweak[1],$inout1 - - call _aesni_encrypt3 - - xorps @tweak[0],$inout0 - movdqa @tweak[2],@tweak[0] - xorps @tweak[1],$inout1 - movups $inout0,($out) - movups $inout1,16*1($out) - lea 16*2($out),$out - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_three: - movups ($inp),$inout0 - movups 16*1($inp),$inout1 - movups 16*2($inp),$inout2 - lea 16*3($inp),$inp - xorps @tweak[0],$inout0 - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - - call _aesni_encrypt3 - - xorps @tweak[0],$inout0 - movdqa @tweak[3],@tweak[0] - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - movups $inout0,($out) - movups $inout1,16*1($out) - movups $inout2,16*2($out) - lea 16*3($out),$out - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_four: - movups ($inp),$inout0 - movups 16*1($inp),$inout1 - movups 16*2($inp),$inout2 - xorps @tweak[0],$inout0 - movups 16*3($inp),$inout3 - lea 16*4($inp),$inp - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - xorps @tweak[3],$inout3 - - call _aesni_encrypt4 - - pxor @tweak[0],$inout0 - movdqa @tweak[4],@tweak[0] - pxor @tweak[1],$inout1 - pxor @tweak[2],$inout2 - movdqu $inout0,($out) - pxor @tweak[3],$inout3 - movdqu $inout1,16*1($out) - movdqu $inout2,16*2($out) - movdqu $inout3,16*3($out) - lea 16*4($out),$out - jmp .Lxts_enc_done - -.align 16 -.Lxts_enc_done: - and \$15,$len_ - jz .Lxts_enc_ret - mov $len_,$len - -.Lxts_enc_steal: - movzb ($inp),%eax # borrow $rounds ... - movzb -16($out),%ecx # ... 
and $key - lea 1($inp),$inp - mov %al,-16($out) - mov %cl,0($out) - lea 1($out),$out - sub \$1,$len - jnz .Lxts_enc_steal - - sub $len_,$out # rewind $out - mov $key_,$key # restore $key - mov $rnds_,$rounds # restore $rounds - - movups -16($out),$inout0 - xorps @tweak[0],$inout0 -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - xorps @tweak[0],$inout0 - movups $inout0,-16($out) - -.Lxts_enc_ret: -___ -$code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps -0x90(%rbp),%xmm7 - movaps -0x80(%rbp),%xmm8 - movaps -0x70(%rbp),%xmm9 - movaps -0x60(%rbp),%xmm10 - movaps -0x50(%rbp),%xmm11 - movaps -0x40(%rbp),%xmm12 - movaps -0x30(%rbp),%xmm13 - movaps -0x20(%rbp),%xmm14 - movaps -0x10(%rbp),%xmm15 -___ -$code.=<<___; - lea (%rbp),%rsp - pop %rbp -.Lxts_enc_epilogue: - ret -.size aesni_xts_encrypt,.-aesni_xts_encrypt -___ - -$code.=<<___; -.globl aesni_xts_decrypt -.type aesni_xts_decrypt,\@function,6 -.align 16 -aesni_xts_decrypt: - lea (%rsp),%rax - push %rbp - sub \$$frame_size,%rsp - and \$-16,%rsp # Linux kernel stack can be incorrectly seeded -___ -$code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) -.Lxts_dec_body: -___ -$code.=<<___; - lea -8(%rax),%rbp - movups ($ivp),@tweak[5] # load clear-text tweak - mov 240($key2),$rounds # key2->rounds - mov 240($key),$rnds_ # key1->rounds -___ - # generate the tweak - &aesni_generate1("enc",$key2,$rounds,@tweak[5]); -$code.=<<___; - xor %eax,%eax # if ($len%16) len-=16; - test \$15,$len - setnz %al - shl \$4,%rax - sub %rax,$len - - $movkey ($key),$rndkey0 # zero round key - mov $key,$key_ # backup $key - mov $rnds_,$rounds # backup $rounds - shl \$4,$rnds_ - mov $len,$len_ # backup $len - and \$-16,$len - - $movkey 16($key,$rnds_),$rndkey1 # last round key - mov $rounds,$rnds_ - - movdqa .Lxts_magic(%rip),$twmask - pshufd \$0x5f,@tweak[5],$twres - pxor $rndkey0,$rndkey1 -___ - for ($i=0;$i<4;$i++) { - $code.=<<___; - movdqa $twres,$twtmp - paddd $twres,$twres - movdqa @tweak[5],@tweak[$i] - psrad \$31,$twtmp # broadcast upper bits - paddq @tweak[5],@tweak[5] - pand $twmask,$twtmp - pxor $rndkey0,@tweak[$i] - pxor $twtmp,@tweak[5] -___ - } -$code.=<<___; - movdqa @tweak[5],@tweak[4] - psrad \$31,$twres - paddq @tweak[5],@tweak[5] - pand $twmask,$twres - pxor $rndkey0,@tweak[4] - pxor $twres,@tweak[5] - movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] - - sub \$16*6,$len - jc .Lxts_dec_short - - shr \$1,$rounds - sub \$3,$rounds - $movkey 16($key_),$rndkey1 - mov $rounds,$rnds_ - lea .Lxts_magic(%rip),%r8 - jmp .Lxts_dec_grandloop - -.align 32 -.Lxts_dec_grandloop: - movdqu `16*0`($inp),$inout0 # load input - movdqa $rndkey0,$twmask - movdqu `16*1`($inp),$inout1 - pxor @tweak[0],$inout0 - movdqu `16*2`($inp),$inout2 - pxor @tweak[1],$inout1 - aesdec $rndkey1,$inout0 - movdqu `16*3`($inp),$inout3 - pxor @tweak[2],$inout2 - aesdec $rndkey1,$inout1 - movdqu `16*4`($inp),$inout4 - pxor @tweak[3],$inout3 - aesdec $rndkey1,$inout2 - movdqu `16*5`($inp),$inout5 - pxor @tweak[5],$twmask # round[0]^=tweak[5] - movdqa 0x60(%rsp),$twres # load round[0]^round[last] - pxor @tweak[4],$inout4 - aesdec $rndkey1,$inout3 - $movkey 32($key_),$rndkey0 - lea `16*6`($inp),$inp - pxor $twmask,$inout5 - - pxor $twres,@tweak[0] - aesdec $rndkey1,$inout4 - pxor $twres,@tweak[1] - movdqa 
@tweak[0],`16*0`(%rsp) # put aside tweaks^last round key - aesdec $rndkey1,$inout5 - $movkey 48($key_),$rndkey1 - - aesdec $rndkey0,$inout0 - pxor $twres,@tweak[2] - movdqa @tweak[1],`16*1`(%rsp) - aesdec $rndkey0,$inout1 - pxor $twres,@tweak[3] - movdqa @tweak[2],`16*2`(%rsp) - aesdec $rndkey0,$inout2 - pxor $twres,@tweak[4] - aesdec $rndkey0,$inout3 - pxor $twres,$twmask - movdqa @tweak[4],`16*4`(%rsp) - aesdec $rndkey0,$inout4 - movdqa $twmask,`16*5`(%rsp) - aesdec $rndkey0,$inout5 - $movkey 64($key_),$rndkey0 - lea 64($key_),$key - pshufd \$0x5f,@tweak[5],$twres - jmp .Lxts_dec_loop6 -.align 32 -.Lxts_dec_loop6: - aesdec $rndkey1,$inout0 - aesdec $rndkey1,$inout1 - aesdec $rndkey1,$inout2 - aesdec $rndkey1,$inout3 - aesdec $rndkey1,$inout4 - aesdec $rndkey1,$inout5 - $movkey 16($key),$rndkey1 - lea 32($key),$key - - aesdec $rndkey0,$inout0 - aesdec $rndkey0,$inout1 - aesdec $rndkey0,$inout2 - aesdec $rndkey0,$inout3 - aesdec $rndkey0,$inout4 - aesdec $rndkey0,$inout5 - $movkey ($key),$rndkey0 - dec $rounds - jnz .Lxts_dec_loop6 - - movdqa (%r8),$twmask - movdqa $twres,$twtmp - paddd $twres,$twres - aesdec $rndkey1,$inout0 - paddq @tweak[5],@tweak[5] - psrad \$31,$twtmp - aesdec $rndkey1,$inout1 - pand $twmask,$twtmp - $movkey ($key_),@tweak[0] # load round[0] - aesdec $rndkey1,$inout2 - aesdec $rndkey1,$inout3 - pxor $twtmp,@tweak[5] - aesdec $rndkey1,$inout4 - movaps @tweak[0],@tweak[1] # copy round[0] - aesdec $rndkey1,$inout5 - $movkey 16($key),$rndkey1 - - movdqa $twres,$twtmp - paddd $twres,$twres - aesdec $rndkey0,$inout0 - pxor @tweak[5],@tweak[0] - psrad \$31,$twtmp - aesdec $rndkey0,$inout1 - paddq @tweak[5],@tweak[5] - pand $twmask,$twtmp - aesdec $rndkey0,$inout2 - aesdec $rndkey0,$inout3 - pxor $twtmp,@tweak[5] - aesdec $rndkey0,$inout4 - movaps @tweak[1],@tweak[2] - aesdec $rndkey0,$inout5 - $movkey 32($key),$rndkey0 - - movdqa $twres,$twtmp - paddd $twres,$twres - aesdec $rndkey1,$inout0 - pxor @tweak[5],@tweak[1] - psrad \$31,$twtmp - aesdec $rndkey1,$inout1 - paddq @tweak[5],@tweak[5] - pand $twmask,$twtmp - aesdec $rndkey1,$inout2 - movdqa @tweak[3],`16*3`(%rsp) - aesdec $rndkey1,$inout3 - pxor $twtmp,@tweak[5] - aesdec $rndkey1,$inout4 - movaps @tweak[2],@tweak[3] - aesdec $rndkey1,$inout5 - $movkey 48($key),$rndkey1 - - movdqa $twres,$twtmp - paddd $twres,$twres - aesdec $rndkey0,$inout0 - pxor @tweak[5],@tweak[2] - psrad \$31,$twtmp - aesdec $rndkey0,$inout1 - paddq @tweak[5],@tweak[5] - pand $twmask,$twtmp - aesdec $rndkey0,$inout2 - aesdec $rndkey0,$inout3 - pxor $twtmp,@tweak[5] - aesdec $rndkey0,$inout4 - movaps @tweak[3],@tweak[4] - aesdec $rndkey0,$inout5 - - movdqa $twres,$rndkey0 - paddd $twres,$twres - aesdec $rndkey1,$inout0 - pxor @tweak[5],@tweak[3] - psrad \$31,$rndkey0 - aesdec $rndkey1,$inout1 - paddq @tweak[5],@tweak[5] - pand $twmask,$rndkey0 - aesdec $rndkey1,$inout2 - aesdec $rndkey1,$inout3 - pxor $rndkey0,@tweak[5] - $movkey ($key_),$rndkey0 - aesdec $rndkey1,$inout4 - aesdec $rndkey1,$inout5 - $movkey 16($key_),$rndkey1 - - pxor @tweak[5],@tweak[4] - psrad \$31,$twres - aesdeclast `16*0`(%rsp),$inout0 - paddq @tweak[5],@tweak[5] - pand $twmask,$twres - aesdeclast `16*1`(%rsp),$inout1 - aesdeclast `16*2`(%rsp),$inout2 - pxor $twres,@tweak[5] - aesdeclast `16*3`(%rsp),$inout3 - aesdeclast `16*4`(%rsp),$inout4 - aesdeclast `16*5`(%rsp),$inout5 - mov $rnds_,$rounds # restore $rounds - - lea `16*6`($out),$out - movups $inout0,`-16*6`($out) # write output - movups $inout1,`-16*5`($out) - movups $inout2,`-16*4`($out) - movups $inout3,`-16*3`($out) - 
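# [Editorial sketch - not part of the original diff.] The recurring
# psrad/pand/paddq/pxor pattern above multiplies the 128-bit tweak by
# alpha in GF(2^128): shift left one bit and, whenever a bit falls off
# the top, reduce by x^128+x^7+x^2+x+1 (the 0x87 in .Lxts_magic).
# A plain-Perl model of one such step, assuming a 64-bit perl:
sub xts_mul_alpha {
    my ($lo, $hi) = @_;                  # tweak as two 64-bit halves
    my $carry = ($hi >> 63) & 1;         # bit shifted out of bit 127
    $hi = (($hi << 1) | (($lo >> 63) & 1)) & 0xFFFFFFFFFFFFFFFF;
    $lo = (($lo << 1) & 0xFFFFFFFFFFFFFFFF) ^ ($carry ? 0x87 : 0);
    return ($lo, $hi);
}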
movups $inout4,`-16*2`($out) - movups $inout5,`-16*1`($out) - sub \$16*6,$len - jnc .Lxts_dec_grandloop - - lea 7($rounds,$rounds),$rounds # restore original value - mov $key_,$key # restore $key - mov $rounds,$rnds_ # backup $rounds - -.Lxts_dec_short: - pxor $rndkey0,@tweak[0] - pxor $rndkey0,@tweak[1] - add \$16*6,$len - jz .Lxts_dec_done - - pxor $rndkey0,@tweak[2] - cmp \$0x20,$len - jb .Lxts_dec_one - pxor $rndkey0,@tweak[3] - je .Lxts_dec_two - - pxor $rndkey0,@tweak[4] - cmp \$0x40,$len - jb .Lxts_dec_three - je .Lxts_dec_four - - movdqu ($inp),$inout0 - movdqu 16*1($inp),$inout1 - movdqu 16*2($inp),$inout2 - pxor @tweak[0],$inout0 - movdqu 16*3($inp),$inout3 - pxor @tweak[1],$inout1 - movdqu 16*4($inp),$inout4 - lea 16*5($inp),$inp - pxor @tweak[2],$inout2 - pxor @tweak[3],$inout3 - pxor @tweak[4],$inout4 - - call _aesni_decrypt6 - - xorps @tweak[0],$inout0 - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - movdqu $inout0,($out) - xorps @tweak[3],$inout3 - movdqu $inout1,16*1($out) - xorps @tweak[4],$inout4 - movdqu $inout2,16*2($out) - pxor $twtmp,$twtmp - movdqu $inout3,16*3($out) - pcmpgtd @tweak[5],$twtmp - movdqu $inout4,16*4($out) - lea 16*5($out),$out - pshufd \$0x13,$twtmp,@tweak[1] # $twres - and \$15,$len_ - jz .Lxts_dec_ret - - movdqa @tweak[5],@tweak[0] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - pand $twmask,@tweak[1] # isolate carry and residue - pxor @tweak[5],@tweak[1] - jmp .Lxts_dec_done2 - -.align 16 -.Lxts_dec_one: - movups ($inp),$inout0 - lea 16*1($inp),$inp - xorps @tweak[0],$inout0 -___ - &aesni_generate1("dec",$key,$rounds); -$code.=<<___; - xorps @tweak[0],$inout0 - movdqa @tweak[1],@tweak[0] - movups $inout0,($out) - movdqa @tweak[2],@tweak[1] - lea 16*1($out),$out - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_two: - movups ($inp),$inout0 - movups 16($inp),$inout1 - lea 32($inp),$inp - xorps @tweak[0],$inout0 - xorps @tweak[1],$inout1 - - call _aesni_decrypt3 - - xorps @tweak[0],$inout0 - movdqa @tweak[2],@tweak[0] - xorps @tweak[1],$inout1 - movdqa @tweak[3],@tweak[1] - movups $inout0,($out) - movups $inout1,16*1($out) - lea 16*2($out),$out - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_three: - movups ($inp),$inout0 - movups 16*1($inp),$inout1 - movups 16*2($inp),$inout2 - lea 16*3($inp),$inp - xorps @tweak[0],$inout0 - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - - call _aesni_decrypt3 - - xorps @tweak[0],$inout0 - movdqa @tweak[3],@tweak[0] - xorps @tweak[1],$inout1 - movdqa @tweak[4],@tweak[1] - xorps @tweak[2],$inout2 - movups $inout0,($out) - movups $inout1,16*1($out) - movups $inout2,16*2($out) - lea 16*3($out),$out - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_four: - movups ($inp),$inout0 - movups 16*1($inp),$inout1 - movups 16*2($inp),$inout2 - xorps @tweak[0],$inout0 - movups 16*3($inp),$inout3 - lea 16*4($inp),$inp - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - xorps @tweak[3],$inout3 - - call _aesni_decrypt4 - - pxor @tweak[0],$inout0 - movdqa @tweak[4],@tweak[0] - pxor @tweak[1],$inout1 - movdqa @tweak[5],@tweak[1] - pxor @tweak[2],$inout2 - movdqu $inout0,($out) - pxor @tweak[3],$inout3 - movdqu $inout1,16*1($out) - movdqu $inout2,16*2($out) - movdqu $inout3,16*3($out) - lea 16*4($out),$out - jmp .Lxts_dec_done - -.align 16 -.Lxts_dec_done: - and \$15,$len_ - jz .Lxts_dec_ret -.Lxts_dec_done2: - mov $len_,$len - mov $key_,$key # restore $key - mov $rnds_,$rounds # restore $rounds - - movups ($inp),$inout0 - xorps @tweak[1],$inout0 -___ - &aesni_generate1("dec",$key,$rounds); -$code.=<<___; - xorps @tweak[1],$inout0 - 
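# [Editorial sketch - not part of the original diff.] The .Lxts_*_steal
# loops below implement XTS ciphertext stealing for a trailing partial
# block: the last full ciphertext block donates its tail bytes to pad
# the partial block, and its head becomes the short final output. For
# encryption, with $encrypt a hypothetical one-block XTS encryptor
# already bound to the last tweak:
sub xts_steal_encrypt_tail {
    my ($last_full_ct, $partial_pt, $encrypt) = @_;  # partial: 1..15 bytes
    my $n    = length $partial_pt;
    my $head = substr($last_full_ct, 0, $n);  # emitted as the partial block
    my $tail = substr($last_full_ct, $n);     # "stolen" padding bytes
    my $new_full = $encrypt->($partial_pt . $tail);
    return ($new_full, $head);
}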
movups $inout0,($out) - -.Lxts_dec_steal: - movzb 16($inp),%eax # borrow $rounds ... - movzb ($out),%ecx # ... and $key - lea 1($inp),$inp - mov %al,($out) - mov %cl,16($out) - lea 1($out),$out - sub \$1,$len - jnz .Lxts_dec_steal - - sub $len_,$out # rewind $out - mov $key_,$key # restore $key - mov $rnds_,$rounds # restore $rounds - - movups ($out),$inout0 - xorps @tweak[0],$inout0 -___ - &aesni_generate1("dec",$key,$rounds); -$code.=<<___; - xorps @tweak[0],$inout0 - movups $inout0,($out) - -.Lxts_dec_ret: -___ -$code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps -0x90(%rbp),%xmm7 - movaps -0x80(%rbp),%xmm8 - movaps -0x70(%rbp),%xmm9 - movaps -0x60(%rbp),%xmm10 - movaps -0x50(%rbp),%xmm11 - movaps -0x40(%rbp),%xmm12 - movaps -0x30(%rbp),%xmm13 - movaps -0x20(%rbp),%xmm14 - movaps -0x10(%rbp),%xmm15 -___ -$code.=<<___; - lea (%rbp),%rsp - pop %rbp -.Lxts_dec_epilogue: - ret -.size aesni_xts_decrypt,.-aesni_xts_decrypt -___ -} }} - -######################################################################## -# void $PREFIX_cbc_encrypt (const void *inp, void *out, -# size_t length, const AES_KEY *key, -# unsigned char *ivp,const int enc); -{ -my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt -my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); -my $inp_=$key_; - -$code.=<<___; -.globl ${PREFIX}_cbc_encrypt -.type ${PREFIX}_cbc_encrypt,\@function,6 -.align 16 -${PREFIX}_cbc_encrypt: - test $len,$len # check length - jz .Lcbc_ret - - mov 240($key),$rnds_ # key->rounds - mov $key,$key_ # backup $key - test %r9d,%r9d # 6th argument - jz .Lcbc_decrypt -#--------------------------- CBC ENCRYPT ------------------------------# - movups ($ivp),$inout0 # load iv as initial state - mov $rnds_,$rounds - cmp \$16,$len - jb .Lcbc_enc_tail - sub \$16,$len - jmp .Lcbc_enc_loop -.align 16 -.Lcbc_enc_loop: - movups ($inp),$inout1 # load input - lea 16($inp),$inp - #xorps $inout1,$inout0 -___ - &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); -$code.=<<___; - mov $rnds_,$rounds # restore $rounds - mov $key_,$key # restore $key - movups $inout0,0($out) # store output - lea 16($out),$out - sub \$16,$len - jnc .Lcbc_enc_loop - add \$16,$len - jnz .Lcbc_enc_tail - movups $inout0,($ivp) - jmp .Lcbc_ret - -.Lcbc_enc_tail: - mov $len,%rcx # zaps $key - xchg $inp,$out # $inp is %rsi and $out is %rdi now - .long 0x9066A4F3 # rep movsb - mov \$16,%ecx # zero tail - sub $len,%rcx - xor %eax,%eax - .long 0x9066AAF3 # rep stosb - lea -16(%rdi),%rdi # rewind $out by 1 block - mov $rnds_,$rounds # restore $rounds - mov %rdi,%rsi # $inp and $out are the same - mov $key_,$key # restore $key - xor $len,$len # len=16 - jmp .Lcbc_enc_loop # one more spin -#--------------------------- CBC DECRYPT ------------------------------# -.align 16 -.Lcbc_decrypt: - lea (%rsp),%rax - push %rbp - sub \$$frame_size,%rsp - and \$-16,%rsp # Linux kernel stack can be incorrectly seeded -___ -$code.=<<___ if ($win64); - movaps %xmm6,0x10(%rsp) - movaps %xmm7,0x20(%rsp) - movaps %xmm8,0x30(%rsp) - movaps %xmm9,0x40(%rsp) - movaps %xmm10,0x50(%rsp) - movaps %xmm11,0x60(%rsp) - movaps %xmm12,0x70(%rsp) - movaps %xmm13,0x80(%rsp) - movaps %xmm14,0x90(%rsp) - movaps %xmm15,0xa0(%rsp) -.Lcbc_decrypt_body: -___ -$code.=<<___; - lea -8(%rax),%rbp - movups ($ivp),$iv - mov $rnds_,$rounds - cmp \$0x50,$len - jbe .Lcbc_dec_tail - - $movkey ($key),$rndkey0 - movdqu 0x00($inp),$inout0 # load input - movdqu 0x10($inp),$inout1 - movdqa $inout0,$in0 - movdqu 0x20($inp),$inout2 - movdqa $inout1,$in1 - movdqu 0x30($inp),$inout3 - 
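# [Editorial sketch - not part of the original diff.] The decrypt path
# below keeps copies of the ciphertext blocks ($in0..$in4, $iv) because
# CBC decryption xors each decrypted block with the *previous*
# ciphertext block: P[i] = D(C[i]) ^ C[i-1]. A minimal model, with
# $decrypt a hypothetical one-block decryptor:
sub cbc_decrypt_blocks {
    my ($iv, $decrypt, @ct) = @_;        # @ct: 16-byte ciphertext blocks
    my @pt;
    for my $c (@ct) {
        push @pt, $decrypt->($c) ^ $iv;  # perl xors strings bytewise
        $iv = $c;                        # chain: this block is the next IV
    }
    return @pt;                          # caller keeps $iv as the new IV
}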
movdqa $inout2,$in2 - movdqu 0x40($inp),$inout4 - movdqa $inout3,$in3 - movdqu 0x50($inp),$inout5 - movdqa $inout4,$in4 - cmp \$0x70,$len - jbe .Lcbc_dec_six_or_seven - - sub \$0x70,$len - lea 0x70($key),$key # size optimization - jmp .Lcbc_dec_loop8_enter -.align 16 -.Lcbc_dec_loop8: - movups $inout7,($out) - lea 0x10($out),$out -.Lcbc_dec_loop8_enter: - movdqu 0x60($inp),$inout6 - pxor $rndkey0,$inout0 - movdqu 0x70($inp),$inout7 - pxor $rndkey0,$inout1 - $movkey 0x10-0x70($key),$rndkey1 - pxor $rndkey0,$inout2 - xor $inp_,$inp_ - cmp \$0x70,$len # is there at least 0x60 bytes ahead? - pxor $rndkey0,$inout3 - pxor $rndkey0,$inout4 - pxor $rndkey0,$inout5 - pxor $rndkey0,$inout6 - - aesdec $rndkey1,$inout0 - pxor $rndkey0,$inout7 - $movkey 0x20-0x70($key),$rndkey0 - aesdec $rndkey1,$inout1 - aesdec $rndkey1,$inout2 - aesdec $rndkey1,$inout3 - aesdec $rndkey1,$inout4 - aesdec $rndkey1,$inout5 - setnc ${inp_}b - aesdec $rndkey1,$inout6 - shl \$7,$inp_ - aesdec $rndkey1,$inout7 - add $inp,$inp_ - $movkey 0x30-0x70($key),$rndkey1 -___ -for($i=1;$i<12;$i++) { -my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; -$code.=<<___; - aesdec $rndkeyx,$inout0 - aesdec $rndkeyx,$inout1 - aesdec $rndkeyx,$inout2 - aesdec $rndkeyx,$inout3 - aesdec $rndkeyx,$inout4 - aesdec $rndkeyx,$inout5 - aesdec $rndkeyx,$inout6 - aesdec $rndkeyx,$inout7 - $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx -___ -$code.=<<___ if ($i==7); - cmp \$11,$rounds - jb .Lcbc_dec_done -___ -$code.=<<___ if ($i==9); - je .Lcbc_dec_done -___ -} -$code.=<<___; -.Lcbc_dec_done: - aesdec $rndkey1,$inout0 - pxor $rndkey0,$iv - aesdec $rndkey1,$inout1 - pxor $rndkey0,$in0 - aesdec $rndkey1,$inout2 - pxor $rndkey0,$in1 - aesdec $rndkey1,$inout3 - pxor $rndkey0,$in2 - aesdec $rndkey1,$inout4 - pxor $rndkey0,$in3 - aesdec $rndkey1,$inout5 - pxor $rndkey0,$in4 - aesdec $rndkey1,$inout6 - aesdec $rndkey1,$inout7 - movdqu 0x50($inp),$rndkey1 - - aesdeclast $iv,$inout0 - movdqu 0x60($inp),$iv # borrow $iv - pxor $rndkey0,$rndkey1 - aesdeclast $in0,$inout1 - pxor $rndkey0,$iv - movdqu 0x70($inp),$rndkey0 # next IV - lea 0x80($inp),$inp - aesdeclast $in1,$inout2 - movdqu 0x00($inp_),$in0 - aesdeclast $in2,$inout3 - movdqu 0x10($inp_),$in1 - aesdeclast $in3,$inout4 - movdqu 0x20($inp_),$in2 - aesdeclast $in4,$inout5 - movdqu 0x30($inp_),$in3 - aesdeclast $rndkey1,$inout6 - movdqu 0x40($inp_),$in4 - aesdeclast $iv,$inout7 - movdqa $rndkey0,$iv # return $iv - movdqu 0x50($inp_),$rndkey1 - $movkey -0x70($key),$rndkey0 - - movups $inout0,($out) # store output - movdqa $in0,$inout0 - movups $inout1,0x10($out) - movdqa $in1,$inout1 - movups $inout2,0x20($out) - movdqa $in2,$inout2 - movups $inout3,0x30($out) - movdqa $in3,$inout3 - movups $inout4,0x40($out) - movdqa $in4,$inout4 - movups $inout5,0x50($out) - movdqa $rndkey1,$inout5 - movups $inout6,0x60($out) - lea 0x70($out),$out - - sub \$0x80,$len - ja .Lcbc_dec_loop8 - - movaps $inout7,$inout0 - lea -0x70($key),$key - add \$0x70,$len - jle .Lcbc_dec_tail_collected - movups $inout7,($out) - lea 0x10($out),$out - cmp \$0x50,$len - jbe .Lcbc_dec_tail - - movaps $in0,$inout0 -.Lcbc_dec_six_or_seven: - cmp \$0x60,$len - ja .Lcbc_dec_seven - - movaps $inout5,$inout6 - call _aesni_decrypt6 - pxor $iv,$inout0 # ^= IV - movaps $inout6,$iv - pxor $in0,$inout1 - movdqu $inout0,($out) - pxor $in1,$inout2 - movdqu $inout1,0x10($out) - pxor $in2,$inout3 - movdqu $inout2,0x20($out) - pxor $in3,$inout4 - movdqu $inout3,0x30($out) - pxor $in4,$inout5 - movdqu $inout4,0x40($out) - lea 0x50($out),$out - movdqa $inout5,$inout0 - jmp 
.Lcbc_dec_tail_collected - -.align 16 -.Lcbc_dec_seven: - movups 0x60($inp),$inout6 - xorps $inout7,$inout7 - call _aesni_decrypt8 - movups 0x50($inp),$inout7 - pxor $iv,$inout0 # ^= IV - movups 0x60($inp),$iv - pxor $in0,$inout1 - movdqu $inout0,($out) - pxor $in1,$inout2 - movdqu $inout1,0x10($out) - pxor $in2,$inout3 - movdqu $inout2,0x20($out) - pxor $in3,$inout4 - movdqu $inout3,0x30($out) - pxor $in4,$inout5 - movdqu $inout4,0x40($out) - pxor $inout7,$inout6 - movdqu $inout5,0x50($out) - lea 0x60($out),$out - movdqa $inout6,$inout0 - jmp .Lcbc_dec_tail_collected - -.Lcbc_dec_tail: - movups ($inp),$inout0 - sub \$0x10,$len - jbe .Lcbc_dec_one - - movups 0x10($inp),$inout1 - movaps $inout0,$in0 - sub \$0x10,$len - jbe .Lcbc_dec_two - - movups 0x20($inp),$inout2 - movaps $inout1,$in1 - sub \$0x10,$len - jbe .Lcbc_dec_three - - movups 0x30($inp),$inout3 - movaps $inout2,$in2 - sub \$0x10,$len - jbe .Lcbc_dec_four - - movups 0x40($inp),$inout4 - movaps $inout3,$in3 - movaps $inout4,$in4 - xorps $inout5,$inout5 - call _aesni_decrypt6 - pxor $iv,$inout0 - movaps $in4,$iv - pxor $in0,$inout1 - movdqu $inout0,($out) - pxor $in1,$inout2 - movdqu $inout1,0x10($out) - pxor $in2,$inout3 - movdqu $inout2,0x20($out) - pxor $in3,$inout4 - movdqu $inout3,0x30($out) - lea 0x40($out),$out - movdqa $inout4,$inout0 - sub \$0x10,$len - jmp .Lcbc_dec_tail_collected - -.align 16 -.Lcbc_dec_one: - movaps $inout0,$in0 -___ - &aesni_generate1("dec",$key,$rounds); -$code.=<<___; - xorps $iv,$inout0 - movaps $in0,$iv - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_two: - movaps $inout1,$in1 - xorps $inout2,$inout2 - call _aesni_decrypt3 - pxor $iv,$inout0 - movaps $in1,$iv - pxor $in0,$inout1 - movdqu $inout0,($out) - movdqa $inout1,$inout0 - lea 0x10($out),$out - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_three: - movaps $inout2,$in2 - call _aesni_decrypt3 - pxor $iv,$inout0 - movaps $in2,$iv - pxor $in0,$inout1 - movdqu $inout0,($out) - pxor $in1,$inout2 - movdqu $inout1,0x10($out) - movdqa $inout2,$inout0 - lea 0x20($out),$out - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_four: - movaps $inout3,$in3 - call _aesni_decrypt4 - pxor $iv,$inout0 - movaps $in3,$iv - pxor $in0,$inout1 - movdqu $inout0,($out) - pxor $in1,$inout2 - movdqu $inout1,0x10($out) - pxor $in2,$inout3 - movdqu $inout2,0x20($out) - movdqa $inout3,$inout0 - lea 0x30($out),$out - jmp .Lcbc_dec_tail_collected - -.align 16 -.Lcbc_dec_tail_collected: - movups $iv,($ivp) - and \$15,$len - jnz .Lcbc_dec_tail_partial - movups $inout0,($out) - jmp .Lcbc_dec_ret -.align 16 -.Lcbc_dec_tail_partial: - movaps $inout0,(%rsp) - mov \$16,%rcx - mov $out,%rdi - sub $len,%rcx - lea (%rsp),%rsi - .long 0x9066A4F3 # rep movsb - -.Lcbc_dec_ret: -___ -$code.=<<___ if ($win64); - movaps 0x10(%rsp),%xmm6 - movaps 0x20(%rsp),%xmm7 - movaps 0x30(%rsp),%xmm8 - movaps 0x40(%rsp),%xmm9 - movaps 0x50(%rsp),%xmm10 - movaps 0x60(%rsp),%xmm11 - movaps 0x70(%rsp),%xmm12 - movaps 0x80(%rsp),%xmm13 - movaps 0x90(%rsp),%xmm14 - movaps 0xa0(%rsp),%xmm15 -___ -$code.=<<___; - lea (%rbp),%rsp - pop %rbp -.Lcbc_ret: - ret -.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt -___ -} -# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, -# int bits, AES_KEY *key) -{ my ($inp,$bits,$key) = @_4args; - $bits =~ s/%r/%e/; - -$code.=<<___; -.globl ${PREFIX}_set_decrypt_key -.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent -.align 16 -${PREFIX}_set_decrypt_key: - .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 - call __aesni_set_encrypt_key - shl \$4,$bits # rounds-1 
after _aesni_set_encrypt_key - test %eax,%eax - jnz .Ldec_key_ret - lea 16($key,$bits),$inp # points at the end of key schedule - - $movkey ($key),%xmm0 # just swap - $movkey ($inp),%xmm1 - $movkey %xmm0,($inp) - $movkey %xmm1,($key) - lea 16($key),$key - lea -16($inp),$inp - -.Ldec_key_inverse: - $movkey ($key),%xmm0 # swap and inverse - $movkey ($inp),%xmm1 - aesimc %xmm0,%xmm0 - aesimc %xmm1,%xmm1 - lea 16($key),$key - lea -16($inp),$inp - $movkey %xmm0,16($inp) - $movkey %xmm1,-16($key) - cmp $key,$inp - ja .Ldec_key_inverse - - $movkey ($key),%xmm0 # inverse middle - aesimc %xmm0,%xmm0 - $movkey %xmm0,($inp) -.Ldec_key_ret: - add \$8,%rsp - ret -.LSEH_end_set_decrypt_key: -.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key -___ - -# This is based on submission by -# -# Huang Ying <ying.huang@intel.com> -# Vinodh Gopal <vinodh.gopal@intel.com> -# Kahraman Akdemir -# -# Aggressively optimized with respect to aeskeygenassist's critical path, -# and contained in %xmm0-5 to meet the Win64 ABI requirement. -# -$code.=<<___; -.globl ${PREFIX}_set_encrypt_key -.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent -.align 16 -${PREFIX}_set_encrypt_key: -__aesni_set_encrypt_key: - .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 - mov \$-1,%rax - test $inp,$inp - jz .Lenc_key_ret - test $key,$key - jz .Lenc_key_ret - - movups ($inp),%xmm0 # pull first 128 bits of *userKey - xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 - lea 16($key),%rax - cmp \$256,$bits - je .L14rounds - cmp \$192,$bits - je .L12rounds - cmp \$128,$bits - jne .Lbad_keybits - -.L10rounds: - mov \$9,$bits # 10 rounds for 128-bit key - $movkey %xmm0,($key) # round 0 - aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 - call .Lkey_expansion_128_cold - aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 - call .Lkey_expansion_128 - aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 - call .Lkey_expansion_128 - aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 - call .Lkey_expansion_128 - aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 - call .Lkey_expansion_128 - aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 - call .Lkey_expansion_128 - aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 - call .Lkey_expansion_128 - aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 - call .Lkey_expansion_128 - aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 - call .Lkey_expansion_128 - aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 - call .Lkey_expansion_128 - $movkey %xmm0,(%rax) - mov $bits,80(%rax) # 240(%rdx) - xor %eax,%eax - jmp .Lenc_key_ret - -.align 16 -.L12rounds: - movq 16($inp),%xmm2 # remaining 1/3 of *userKey - mov \$11,$bits # 12 rounds for 192 - $movkey %xmm0,($key) # round 0 - aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 - call .Lkey_expansion_192a_cold - aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 - call .Lkey_expansion_192b - aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 - call .Lkey_expansion_192a - aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 - call .Lkey_expansion_192b - aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 - call .Lkey_expansion_192a - aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 - call .Lkey_expansion_192b - aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 - call .Lkey_expansion_192a - aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 - call .Lkey_expansion_192b - $movkey %xmm0,(%rax) - mov $bits,48(%rax) # 240(%rdx) - xor %rax,%rax - jmp .Lenc_key_ret - -.align 16 -.L14rounds: - movups 16($inp),%xmm2 # remaining half of *userKey - mov \$13,$bits # 14 rounds for 256 - lea 16(%rax),%rax - $movkey %xmm0,($key) # round 0 - $movkey %xmm2,16($key) # round 1 - 
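# [Editorial sketch - not part of the original diff.] The immediates fed
# to aeskeygenassist in the 128-bit path above are the AES round
# constants, produced by repeated doubling in GF(2^8) modulo
# x^8+x^4+x^3+x+1 (0x11b) - hence the jump from 0x80 to 0x1b, 0x36:
sub aes_rcons {
    my ($count) = @_;
    my @rcon = (0x01);
    push @rcon, (($rcon[-1] << 1) ^ (($rcon[-1] & 0x80) ? 0x11b : 0)) & 0xff
        for 2 .. $count;
    return @rcon;    # 0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36
}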
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 - call .Lkey_expansion_256a_cold - aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 - call .Lkey_expansion_256b - aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 - call .Lkey_expansion_256a - aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 - call .Lkey_expansion_256b - aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 - call .Lkey_expansion_256a - aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 - call .Lkey_expansion_256b - aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 - call .Lkey_expansion_256a - aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 - call .Lkey_expansion_256b - aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 - call .Lkey_expansion_256a - aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 - call .Lkey_expansion_256b - aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 - call .Lkey_expansion_256a - aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 - call .Lkey_expansion_256b - aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 - call .Lkey_expansion_256a - $movkey %xmm0,(%rax) - mov $bits,16(%rax) # 240(%rdx) - xor %rax,%rax - jmp .Lenc_key_ret - -.align 16 -.Lbad_keybits: - mov \$-2,%rax -.Lenc_key_ret: - add \$8,%rsp - ret -.LSEH_end_set_encrypt_key: - -.align 16 -.Lkey_expansion_128: - $movkey %xmm0,(%rax) - lea 16(%rax),%rax -.Lkey_expansion_128_cold: - shufps \$0b00010000,%xmm0,%xmm4 - xorps %xmm4, %xmm0 - shufps \$0b10001100,%xmm0,%xmm4 - xorps %xmm4, %xmm0 - shufps \$0b11111111,%xmm1,%xmm1 # critical path - xorps %xmm1,%xmm0 - ret - -.align 16 -.Lkey_expansion_192a: - $movkey %xmm0,(%rax) - lea 16(%rax),%rax -.Lkey_expansion_192a_cold: - movaps %xmm2, %xmm5 -.Lkey_expansion_192b_warm: - shufps \$0b00010000,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps \$0b10001100,%xmm0,%xmm4 - pslldq \$4,%xmm3 - xorps %xmm4,%xmm0 - pshufd \$0b01010101,%xmm1,%xmm1 # critical path - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd \$0b11111111,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - ret - -.align 16 -.Lkey_expansion_192b: - movaps %xmm0,%xmm3 - shufps \$0b01000100,%xmm0,%xmm5 - $movkey %xmm5,(%rax) - shufps \$0b01001110,%xmm2,%xmm3 - $movkey %xmm3,16(%rax) - lea 32(%rax),%rax - jmp .Lkey_expansion_192b_warm - -.align 16 -.Lkey_expansion_256a: - $movkey %xmm2,(%rax) - lea 16(%rax),%rax -.Lkey_expansion_256a_cold: - shufps \$0b00010000,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps \$0b10001100,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps \$0b11111111,%xmm1,%xmm1 # critical path - xorps %xmm1,%xmm0 - ret - -.align 16 -.Lkey_expansion_256b: - $movkey %xmm0,(%rax) - lea 16(%rax),%rax - - shufps \$0b00010000,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps \$0b10001100,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps \$0b10101010,%xmm1,%xmm1 # critical path - xorps %xmm1,%xmm2 - ret -.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key -.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key -___ -} - -$code.=<<___; -.align 64 -.Lbswap_mask: - .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.Lincrement32: - .long 6,6,6,0 -.Lincrement64: - .long 1,0,0,0 -.Lxts_magic: - .long 0x87,0,1,0 -.Lincrement1: - .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 - -.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" -.align 64 -___ - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -___ -$code.=<<___ if ($PREFIX eq "aesni"); -.type ecb_se_handler,\@abi-omnipotent -.align 16 -ecb_se_handler: - push %rsi - push %rdi - push %rbx - push 
%rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 152($context),%rax # pull context->Rsp - - jmp .Lcommon_seh_tail -.size ecb_se_handler,.-ecb_se_handler - -.type ccm64_se_handler,\@abi-omnipotent -.align 16 -ccm64_se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - lea 0(%rax),%rsi # %xmm save area - lea 512($context),%rdi # &context.Xmm6 - mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) - .long 0xa548f3fc # cld; rep movsq - lea 0x58(%rax),%rax # adjust stack pointer - - jmp .Lcommon_seh_tail -.size ccm64_se_handler,.-ccm64_se_handler - -.type ctr_xts_se_handler,\@abi-omnipotent -.align 16 -ctr_xts_se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - mov 160($context),%rax # pull context->Rbp - lea -0xa0(%rax),%rsi # %xmm save area - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) - .long 0xa548f3fc # cld; rep movsq - - jmp .Lcommon_rbp_tail -.size ctr_xts_se_handler,.-ctr_xts_se_handler -___ -$code.=<<___; -.type cbc_se_handler,\@abi-omnipotent -.align 16 -cbc_se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 152($context),%rax # pull context->Rsp - mov 248($context),%rbx # pull context->Rip - - lea .Lcbc_decrypt(%rip),%r10 - cmp %r10,%rbx # context->Rip<"prologue" label - jb .Lcommon_seh_tail - - lea .Lcbc_decrypt_body(%rip),%r10 - cmp %r10,%rbx # context->Rip<cbc_decrypt_body - jb .Lrestore_cbc_rax - - lea .Lcbc_ret(%rip),%r10 - cmp %r10,%rbx # context->Rip>="epilogue" label - jae .Lcommon_seh_tail - - lea 16(%rax),%rsi # %xmm save area - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) - .long 0xa548f3fc # cld; rep movsq - -.Lcommon_rbp_tail: - mov 160($context),%rax # pull context->Rbp - mov (%rax),%rbp # restore saved %rbp - lea 8(%rax),%rax # adjust stack pointer - mov %rbp,160($context) # restore context->Rbp - jmp .Lcommon_seh_tail - -.Lrestore_cbc_rax: - mov 120($context),%rax - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 
0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size cbc_se_handler,.-cbc_se_handler - -.section .pdata -.align 4 -___ -$code.=<<___ if ($PREFIX eq "aesni"); - .rva .LSEH_begin_aesni_ecb_encrypt - .rva .LSEH_end_aesni_ecb_encrypt - .rva .LSEH_info_ecb - - .rva .LSEH_begin_aesni_ccm64_encrypt_blocks - .rva .LSEH_end_aesni_ccm64_encrypt_blocks - .rva .LSEH_info_ccm64_enc - - .rva .LSEH_begin_aesni_ccm64_decrypt_blocks - .rva .LSEH_end_aesni_ccm64_decrypt_blocks - .rva .LSEH_info_ccm64_dec - - .rva .LSEH_begin_aesni_ctr32_encrypt_blocks - .rva .LSEH_end_aesni_ctr32_encrypt_blocks - .rva .LSEH_info_ctr32 - - .rva .LSEH_begin_aesni_xts_encrypt - .rva .LSEH_end_aesni_xts_encrypt - .rva .LSEH_info_xts_enc - - .rva .LSEH_begin_aesni_xts_decrypt - .rva .LSEH_end_aesni_xts_decrypt - .rva .LSEH_info_xts_dec -___ -$code.=<<___; - .rva .LSEH_begin_${PREFIX}_cbc_encrypt - .rva .LSEH_end_${PREFIX}_cbc_encrypt - .rva .LSEH_info_cbc - - .rva ${PREFIX}_set_decrypt_key - .rva .LSEH_end_set_decrypt_key - .rva .LSEH_info_key - - .rva ${PREFIX}_set_encrypt_key - .rva .LSEH_end_set_encrypt_key - .rva .LSEH_info_key -.section .xdata -.align 8 -___ -$code.=<<___ if ($PREFIX eq "aesni"); -.LSEH_info_ecb: - .byte 9,0,0,0 - .rva ecb_se_handler -.LSEH_info_ccm64_enc: - .byte 9,0,0,0 - .rva ccm64_se_handler - .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] -.LSEH_info_ccm64_dec: - .byte 9,0,0,0 - .rva ccm64_se_handler - .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] -.LSEH_info_ctr32: - .byte 9,0,0,0 - .rva ctr_xts_se_handler - .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] -.LSEH_info_xts_enc: - .byte 9,0,0,0 - .rva ctr_xts_se_handler - .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] -.LSEH_info_xts_dec: - .byte 9,0,0,0 - .rva ctr_xts_se_handler - .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] -___ -$code.=<<___; -.LSEH_info_cbc: - .byte 9,0,0,0 - .rva cbc_se_handler -.LSEH_info_key: - .byte 0x01,0x04,0x01,0x00 - .byte 0x04,0x02,0x00,0x00 # sub rsp,8 -___ -} - -sub rex { - local *opcode=shift; - my ($dst,$src)=@_; - my $rex=0; - - $rex|=0x04 if($dst>=8); - $rex|=0x01 if($src>=8); - push @opcode,$rex|0x40 if($rex); -} - -sub aesni { - my $line=shift; - my @opcode=(0x66); - - if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - rex(\@opcode,$4,$3); - push @opcode,0x0f,0x3a,0xdf; - push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M - my $c=$2; - push @opcode,$c=~/^0/?oct($c):$c; - return ".byte\t".join(',',@opcode); - } - elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my %opcodelet = ( - "aesimc" => 0xdb, - "aesenc" => 0xdc, "aesenclast" => 0xdd, - "aesdec" => 0xde, "aesdeclast" => 0xdf - ); - return undef if (!defined($opcodelet{$1})); - rex(\@opcode,$3,$2); - push @opcode,0x0f,0x38,$opcodelet{$1}; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - return ".byte\t".join(',',@opcode); - } - elsif 
($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { - my %opcodelet = ( - "aesenc" => 0xdc, "aesenclast" => 0xdd, - "aesdec" => 0xde, "aesdeclast" => 0xdf - ); - return undef if (!defined($opcodelet{$1})); - my $off = $2; - push @opcode,0x44 if ($3>=8); - push @opcode,0x0f,0x38,$opcodelet{$1}; - push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M - push @opcode,($off=~/^0/?oct($off):$off)&0xff; - return ".byte\t".join(',',@opcode); - } - return $line; -} - -$code =~ s/\`([^\`]*)\`/eval($1)/gem; -$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; - -print $code; - -close STDOUT; +../openssl/./crypto/aes/asm/aesni-x86_64.pl
\ No newline at end of file diff --git a/devel/perlasm/cbc.pl b/devel/perlasm/cbc.pl deleted file mode 100644 index 24561e759a..0000000000 --- a/devel/perlasm/cbc.pl +++ /dev/null @@ -1,349 +0,0 @@ -#!/usr/local/bin/perl - -# void des_ncbc_encrypt(input, output, length, schedule, ivec, enc) -# des_cblock (*input); -# des_cblock (*output); -# long length; -# des_key_schedule schedule; -# des_cblock (*ivec); -# int enc; -# -# calls -# des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT); -# -#&cbc("des_ncbc_encrypt","des_encrypt",0); -#&cbc("BF_cbc_encrypt","BF_encrypt","BF_encrypt", -# 1,4,5,3,5,-1); -#&cbc("des_ncbc_encrypt","des_encrypt","des_encrypt", -# 0,4,5,3,5,-1); -#&cbc("des_ede3_cbc_encrypt","des_encrypt3","des_decrypt3", -# 0,6,7,3,4,5); -# -# When doing a cipher that needs big-endian order, -# for encrypt, the iv is kept in big-endian form, -# while for decrypt, it is kept in little-endian form. -sub cbc - { - local($name,$enc_func,$dec_func,$swap,$iv_off,$enc_off,$p1,$p2,$p3)=@_; - # name is the function name - # enc_func and dec_func are the functions to call for encrypt/decrypt - # swap is true if byte order needs to be reversed - # iv_off is parameter number for the iv - # enc_off is parameter number for the encrypt/decrypt flag - # p1,p2,p3 are the offsets for parameters to be passed to the - # underlying calls. - - &function_begin_B($name,""); - &comment(""); - - $in="esi"; - $out="edi"; - $count="ebp"; - - &push("ebp"); - &push("ebx"); - &push("esi"); - &push("edi"); - - $data_off=4; - $data_off+=4 if ($p1 > 0); - $data_off+=4 if ($p2 > 0); - $data_off+=4 if ($p3 > 0); - - &mov($count, &wparam(2)); # length - - &comment("getting iv ptr from parameter $iv_off"); - &mov("ebx", &wparam($iv_off)); # Get iv ptr - - &mov($in, &DWP(0,"ebx","",0));# iv[0] - &mov($out, &DWP(4,"ebx","",0));# iv[1] - - &push($out); - &push($in); - &push($out); # used in decrypt for iv[1] - &push($in); # used in decrypt for iv[0] - - &mov("ebx", "esp"); # This is the address of tin[2] - - &mov($in, &wparam(0)); # in - &mov($out, &wparam(1)); # out - - # We have loaded them all, now let's push things - &comment("getting encrypt flag from parameter $enc_off"); - &mov("ecx", &wparam($enc_off)); # Get enc flag - if ($p3 > 0) - { - &comment("get and push parameter $p3"); - if ($enc_off != $p3) - { &mov("eax", &wparam($p3)); &push("eax"); } - else { &push("ecx"); } - } - if ($p2 > 0) - { - &comment("get and push parameter $p2"); - if ($enc_off != $p2) - { &mov("eax", &wparam($p2)); &push("eax"); } - else { &push("ecx"); } - } - if ($p1 > 0) - { - &comment("get and push parameter $p1"); - if ($enc_off != $p1) - { &mov("eax", &wparam($p1)); &push("eax"); } - else { &push("ecx"); } - } - &push("ebx"); # push data/iv - - &cmp("ecx",0); - &jz(&label("decrypt")); - - &and($count,0xfffffff8); - &mov("eax", &DWP($data_off,"esp","",0)); # load iv[0] - &mov("ebx", &DWP($data_off+4,"esp","",0)); # load iv[1] - - &jz(&label("encrypt_finish")); - - ############################################################# - - &set_label("encrypt_loop"); - # encrypt start - # "eax" and "ebx" hold iv (or the last cipher text) - - &mov("ecx", &DWP(0,$in,"",0)); # load first 4 bytes - &mov("edx", &DWP(4,$in,"",0)); # second 4 bytes - - &xor("eax", "ecx"); - &xor("ebx", "edx"); - - &bswap("eax") if $swap; - &bswap("ebx") if $swap; - - &mov(&DWP($data_off,"esp","",0), "eax"); # put in array for call - &mov(&DWP($data_off+4,"esp","",0), "ebx"); # - - &call($enc_func); - - &mov("eax", &DWP($data_off,"esp","",0)); - &mov("ebx", 
&DWP($data_off+4,"esp","",0)); - - &bswap("eax") if $swap; - &bswap("ebx") if $swap; - - &mov(&DWP(0,$out,"",0),"eax"); - &mov(&DWP(4,$out,"",0),"ebx"); - - # eax and ebx are the next iv. - - &add($in, 8); - &add($out, 8); - - &sub($count, 8); - &jnz(&label("encrypt_loop")); - -###################################################################3 - &set_label("encrypt_finish"); - &mov($count, &wparam(2)); # length - &and($count, 7); - &jz(&label("finish")); - &call(&label("PIC_point")); -&set_label("PIC_point"); - &blindpop("edx"); - &lea("ecx",&DWP(&label("cbc_enc_jmp_table")."-".&label("PIC_point"),"edx")); - &mov($count,&DWP(0,"ecx",$count,4)); - &add($count,"edx"); - &xor("ecx","ecx"); - &xor("edx","edx"); - #&mov($count,&DWP(&label("cbc_enc_jmp_table"),"",$count,4)); - &jmp_ptr($count); - -&set_label("ej7"); - &movb(&HB("edx"), &BP(6,$in,"",0)); - &shl("edx",8); -&set_label("ej6"); - &movb(&HB("edx"), &BP(5,$in,"",0)); -&set_label("ej5"); - &movb(&LB("edx"), &BP(4,$in,"",0)); -&set_label("ej4"); - &mov("ecx", &DWP(0,$in,"",0)); - &jmp(&label("ejend")); -&set_label("ej3"); - &movb(&HB("ecx"), &BP(2,$in,"",0)); - &shl("ecx",8); -&set_label("ej2"); - &movb(&HB("ecx"), &BP(1,$in,"",0)); -&set_label("ej1"); - &movb(&LB("ecx"), &BP(0,$in,"",0)); -&set_label("ejend"); - - &xor("eax", "ecx"); - &xor("ebx", "edx"); - - &bswap("eax") if $swap; - &bswap("ebx") if $swap; - - &mov(&DWP($data_off,"esp","",0), "eax"); # put in array for call - &mov(&DWP($data_off+4,"esp","",0), "ebx"); # - - &call($enc_func); - - &mov("eax", &DWP($data_off,"esp","",0)); - &mov("ebx", &DWP($data_off+4,"esp","",0)); - - &bswap("eax") if $swap; - &bswap("ebx") if $swap; - - &mov(&DWP(0,$out,"",0),"eax"); - &mov(&DWP(4,$out,"",0),"ebx"); - - &jmp(&label("finish")); - - ############################################################# - ############################################################# - &set_label("decrypt",1); - # decrypt start - &and($count,0xfffffff8); - # The next 2 instructions are only for if the jz is taken - &mov("eax", &DWP($data_off+8,"esp","",0)); # get iv[0] - &mov("ebx", &DWP($data_off+12,"esp","",0)); # get iv[1] - &jz(&label("decrypt_finish")); - - &set_label("decrypt_loop"); - &mov("eax", &DWP(0,$in,"",0)); # load first 4 bytes - &mov("ebx", &DWP(4,$in,"",0)); # second 4 bytes - - &bswap("eax") if $swap; - &bswap("ebx") if $swap; - - &mov(&DWP($data_off,"esp","",0), "eax"); # put back - &mov(&DWP($data_off+4,"esp","",0), "ebx"); # - - &call($dec_func); - - &mov("eax", &DWP($data_off,"esp","",0)); # get return - &mov("ebx", &DWP($data_off+4,"esp","",0)); # - - &bswap("eax") if $swap; - &bswap("ebx") if $swap; - - &mov("ecx", &DWP($data_off+8,"esp","",0)); # get iv[0] - &mov("edx", &DWP($data_off+12,"esp","",0)); # get iv[1] - - &xor("ecx", "eax"); - &xor("edx", "ebx"); - - &mov("eax", &DWP(0,$in,"",0)); # get old cipher text, - &mov("ebx", &DWP(4,$in,"",0)); # next iv actually - - &mov(&DWP(0,$out,"",0),"ecx"); - &mov(&DWP(4,$out,"",0),"edx"); - - &mov(&DWP($data_off+8,"esp","",0), "eax"); # save iv - &mov(&DWP($data_off+12,"esp","",0), "ebx"); # - - &add($in, 8); - &add($out, 8); - - &sub($count, 8); - &jnz(&label("decrypt_loop")); -############################ ENDIT #######################3 - &set_label("decrypt_finish"); - &mov($count, &wparam(2)); # length - &and($count, 7); - &jz(&label("finish")); - - &mov("eax", &DWP(0,$in,"",0)); # load first 4 bytes - &mov("ebx", &DWP(4,$in,"",0)); # second 4 bytes - - &bswap("eax") if $swap; - &bswap("ebx") if $swap; - - &mov(&DWP($data_off,"esp","",0), 
"eax"); # put back - &mov(&DWP($data_off+4,"esp","",0), "ebx"); # - - &call($dec_func); - - &mov("eax", &DWP($data_off,"esp","",0)); # get return - &mov("ebx", &DWP($data_off+4,"esp","",0)); # - - &bswap("eax") if $swap; - &bswap("ebx") if $swap; - - &mov("ecx", &DWP($data_off+8,"esp","",0)); # get iv[0] - &mov("edx", &DWP($data_off+12,"esp","",0)); # get iv[1] - - &xor("ecx", "eax"); - &xor("edx", "ebx"); - - # this is for when we exit - &mov("eax", &DWP(0,$in,"",0)); # get old cipher text, - &mov("ebx", &DWP(4,$in,"",0)); # next iv actually - -&set_label("dj7"); - &rotr("edx", 16); - &movb(&BP(6,$out,"",0), &LB("edx")); - &shr("edx",16); -&set_label("dj6"); - &movb(&BP(5,$out,"",0), &HB("edx")); -&set_label("dj5"); - &movb(&BP(4,$out,"",0), &LB("edx")); -&set_label("dj4"); - &mov(&DWP(0,$out,"",0), "ecx"); - &jmp(&label("djend")); -&set_label("dj3"); - &rotr("ecx", 16); - &movb(&BP(2,$out,"",0), &LB("ecx")); - &shl("ecx",16); -&set_label("dj2"); - &movb(&BP(1,$in,"",0), &HB("ecx")); -&set_label("dj1"); - &movb(&BP(0,$in,"",0), &LB("ecx")); -&set_label("djend"); - - # final iv is still in eax:ebx - &jmp(&label("finish")); - - -############################ FINISH #######################3 - &set_label("finish",1); - &mov("ecx", &wparam($iv_off)); # Get iv ptr - - ################################################# - $total=16+4; - $total+=4 if ($p1 > 0); - $total+=4 if ($p2 > 0); - $total+=4 if ($p3 > 0); - &add("esp",$total); - - &mov(&DWP(0,"ecx","",0), "eax"); # save iv - &mov(&DWP(4,"ecx","",0), "ebx"); # save iv - - &function_end_A($name); - - &align(64); - &set_label("cbc_enc_jmp_table"); - &data_word("0"); - &data_word(&label("ej1")."-".&label("PIC_point")); - &data_word(&label("ej2")."-".&label("PIC_point")); - &data_word(&label("ej3")."-".&label("PIC_point")); - &data_word(&label("ej4")."-".&label("PIC_point")); - &data_word(&label("ej5")."-".&label("PIC_point")); - &data_word(&label("ej6")."-".&label("PIC_point")); - &data_word(&label("ej7")."-".&label("PIC_point")); - # not used - #&set_label("cbc_dec_jmp_table",1); - #&data_word("0"); - #&data_word(&label("dj1")."-".&label("PIC_point")); - #&data_word(&label("dj2")."-".&label("PIC_point")); - #&data_word(&label("dj3")."-".&label("PIC_point")); - #&data_word(&label("dj4")."-".&label("PIC_point")); - #&data_word(&label("dj5")."-".&label("PIC_point")); - #&data_word(&label("dj6")."-".&label("PIC_point")); - #&data_word(&label("dj7")."-".&label("PIC_point")); - &align(64); - - &function_end_B($name); - - } - -1; diff --git a/devel/perlasm/cbc.pl.license b/devel/perlasm/cbc.pl.license deleted file mode 120000 index cd301a44ab..0000000000 --- a/devel/perlasm/cbc.pl.license +++ /dev/null @@ -1 +0,0 @@ -license.txt
\ No newline at end of file diff --git a/devel/perlasm/e_padlock-x86.pl b/devel/perlasm/e_padlock-x86.pl index 4148468c41..4b6c45aaa6 100644..120000 --- a/devel/perlasm/e_padlock-x86.pl +++ b/devel/perlasm/e_padlock-x86.pl @@ -1,606 +1 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# September 2011 -# -# Assembler helpers for Padlock engine. Compared to the original engine -# version relying on inline assembler and compiled with gcc 3.4.6, it -# was measured to provide ~100% improvement on misaligned data in ECB -# mode and ~75% in CBC mode. For aligned data the improvement can be -# observed for short inputs only, e.g. 45% for 64-byte messages in -# ECB mode, 20% in CBC. The difference in performance for aligned vs. -# misaligned data depends on the misalignment and is either ~1.8x or 2.9x. -# These are approximately the same factors as for hardware support, so -# there is little reason to rely on the latter. On the contrary, it -# might actually hurt performance in a mixture of aligned and misaligned -# buffers, because a) if you choose to flip the 'align' flag in the -# control word on a per-buffer basis, then you'd have to reload the key -# context, which incurs a penalty; b) if you choose to set the 'align' -# flag permanently, it limits performance even for aligned data to ~1/2. -# All above-mentioned results were collected on a 1.5GHz C7. Nano, on the -# other hand, handles unaligned data more gracefully. Depending on the -# algorithm and how misaligned the data is, hardware can be up to 70% more -# efficient than the software alignment procedures below; nor does the -# 'align' flag have any effect on aligned performance [if it has any -# meaning at all]. The suggestion is therefore to unconditionally set the -# 'align' flag on Nano for optimal performance. 
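# [Editorial sketch - not part of the original diff.] padlock_capability()
# below matches the CPUID vendor string "CentaurHauls"; %ebx/%edx/%ecx
# each hold four bytes of it in little-endian order, which is why the
# source literals appear reversed ('tneC', 'Hrua', 'slua'):
sub vendor_dwords {
    my ($vendor) = @_;                   # e.g. "CentaurHauls"
    return map { "0x".unpack("H*", scalar reverse $_) } unpack "(a4)3", $vendor;
}
# (vendor_dwords("CentaurHauls"))[0] eq "0x746e6543", the value the code
# spells as "0x".unpack("H*",'tneC') and compares against %ebx.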
- -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../crypto/perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],$0); - -%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata -$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 - -$ctx="edx"; -$out="edi"; -$inp="esi"; -$len="ecx"; -$chunk="ebx"; - -&function_begin_B("padlock_capability"); - &push ("ebx"); - &pushf (); - &pop ("eax"); - &mov ("ecx","eax"); - &xor ("eax",1<<21); - &push ("eax"); - &popf (); - &pushf (); - &pop ("eax"); - &xor ("ecx","eax"); - &xor ("eax","eax"); - &bt ("ecx",21); - &jnc (&label("noluck")); - &cpuid (); - &xor ("eax","eax"); - &cmp ("ebx","0x".unpack("H*",'tneC')); - &jne (&label("noluck")); - &cmp ("edx","0x".unpack("H*",'Hrua')); - &jne (&label("noluck")); - &cmp ("ecx","0x".unpack("H*",'slua')); - &jne (&label("noluck")); - &mov ("eax",0xC0000000); - &cpuid (); - &mov ("edx","eax"); - &xor ("eax","eax"); - &cmp ("edx",0xC0000001); - &jb (&label("noluck")); - &mov ("eax",1); - &cpuid (); - &or ("eax",0x0f); - &xor ("ebx","ebx"); - &and ("eax",0x0fff); - &cmp ("eax",0x06ff); # check for Nano - &sete ("bl"); - &mov ("eax",0xC0000001); - &push ("ebx"); - &cpuid (); - &pop ("ebx"); - &mov ("eax","edx"); - &shl ("ebx",4); # bit#4 denotes Nano - &and ("eax",0xffffffef); - &or ("eax","ebx") -&set_label("noluck"); - &pop ("ebx"); - &ret (); -&function_end_B("padlock_capability") - -&function_begin_B("padlock_key_bswap"); - &mov ("edx",&wparam(0)); - &mov ("ecx",&DWP(240,"edx")); -&set_label("bswap_loop"); - &mov ("eax",&DWP(0,"edx")); - &bswap ("eax"); - &mov (&DWP(0,"edx"),"eax"); - &lea ("edx",&DWP(4,"edx")); - &sub ("ecx",1); - &jnz (&label("bswap_loop")); - &ret (); -&function_end_B("padlock_key_bswap"); - -# This is heuristic key context tracing. At first one -# believes that one should use atomic swap instructions, -# but it's not actually necessary. Point is that if -# padlock_saved_context was changed by another thread -# after we've read it and before we compare it with ctx, -# our key *shall* be reloaded upon thread context switch -# and we are therefore set in either case... -&static_label("padlock_saved_context"); - -&function_begin_B("padlock_verify_context"); - &mov ($ctx,&wparam(0)); - &lea ("eax",($::win32 or $::coff) ? 
&DWP(&label("padlock_saved_context")) : - &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point"))); - &pushf (); - &call ("_padlock_verify_ctx"); -&set_label("verify_pic_point"); - &lea ("esp",&DWP(4,"esp")); - &ret (); -&function_end_B("padlock_verify_context"); - -&function_begin_B("_padlock_verify_ctx"); - &add ("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context - &bt (&DWP(4,"esp"),30); # eflags - &jnc (&label("verified")); - &cmp ($ctx,&DWP(0,"eax")); - &je (&label("verified")); - &pushf (); - &popf (); -&set_label("verified"); - &mov (&DWP(0,"eax"),$ctx); - &ret (); -&function_end_B("_padlock_verify_ctx"); - -&function_begin_B("padlock_reload_key"); - &pushf (); - &popf (); - &ret (); -&function_end_B("padlock_reload_key"); - -&function_begin_B("padlock_aes_block"); - &push ("edi"); - &push ("esi"); - &push ("ebx"); - &mov ($out,&wparam(0)); # must be 16-byte aligned - &mov ($inp,&wparam(1)); # must be 16-byte aligned - &mov ($ctx,&wparam(2)); - &mov ($len,1); - &lea ("ebx",&DWP(32,$ctx)); # key - &lea ($ctx,&DWP(16,$ctx)); # control word - &data_byte(0xf3,0x0f,0xa7,0xc8); # rep xcryptecb - &pop ("ebx"); - &pop ("esi"); - &pop ("edi"); - &ret (); -&function_end_B("padlock_aes_block"); - -sub generate_mode { -my ($mode,$opcode) = @_; -# int padlock_$mode_encrypt(void *out, const void *inp, -# struct padlock_cipher_data *ctx, size_t len); -&function_begin("padlock_${mode}_encrypt"); - &mov ($out,&wparam(0)); - &mov ($inp,&wparam(1)); - &mov ($ctx,&wparam(2)); - &mov ($len,&wparam(3)); - &test ($ctx,15); - &jnz (&label("${mode}_abort")); - &test ($len,15); - &jnz (&label("${mode}_abort")); - &lea ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) : - &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point"))); - &pushf (); - &cld (); - &call ("_padlock_verify_ctx"); -&set_label("${mode}_pic_point"); - &lea ($ctx,&DWP(16,$ctx)); # control word - &xor ("eax","eax"); - if ($mode eq "ctr32") { - &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter - } else { - &xor ("ebx","ebx"); - &test (&DWP(0,$ctx),1<<5); # align bit in control word - &jnz (&label("${mode}_aligned")); - &test ($out,0x0f); - &setz ("al"); # !out_misaligned - &test ($inp,0x0f); - &setz ("bl"); # !inp_misaligned - &test ("eax","ebx"); - &jnz (&label("${mode}_aligned")); - &neg ("eax"); - } - &mov ($chunk,$PADLOCK_CHUNK); - ¬ ("eax"); # out_misaligned?-1:0 - &lea ("ebp",&DWP(-24,"esp")); - &cmp ($len,$chunk); - &cmovc ($chunk,$len); # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len - &and ("eax",$chunk); # out_misaligned?chunk:0 - &mov ($chunk,$len); - &neg ("eax"); - &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK - &lea ("esp",&DWP(0,"eax","ebp")); # alloca - &mov ("eax",$PADLOCK_CHUNK); - &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK - &mov ("eax","ebp"); - &and ("ebp",-16); - &and ("esp",-16); - &mov (&DWP(16,"ebp"),"eax"); - if ($PADLOCK_PREFETCH{$mode}) { - &cmp ($len,$chunk); - &ja (&label("${mode}_loop")); - &mov ("eax",$inp); # check if prefetch crosses page - &cmp ("ebp","esp"); - &cmove ("eax",$out); - &add ("eax",$len); - &neg ("eax"); - &and ("eax",0xfff); # distance to page boundary - &cmp ("eax",$PADLOCK_PREFETCH{$mode}); - &mov ("eax",-$PADLOCK_PREFETCH{$mode}); - &cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1 - &and ($chunk,"eax"); - &jz (&label("${mode}_unaligned_tail")); - } - &jmp (&label("${mode}_loop")); - -&set_label("${mode}_loop",16); - &mov (&DWP(0,"ebp"),$out); # save parameters - &mov 
(&DWP(4,"ebp"),$inp); - &mov (&DWP(8,"ebp"),$len); - &mov ($len,$chunk); - &mov (&DWP(12,"ebp"),$chunk); # chunk - if ($mode eq "ctr32") { - &mov ("ecx",&DWP(-4,$ctx)); - &xor ($out,$out); - &mov ("eax",&DWP(-8,$ctx)); # borrow $len -&set_label("${mode}_prepare"); - &mov (&DWP(12,"esp",$out),"ecx"); - &bswap ("ecx"); - &movq (&QWP(0,"esp",$out),"mm0"); - &inc ("ecx"); - &mov (&DWP(8,"esp",$out),"eax"); - &bswap ("ecx"); - &lea ($out,&DWP(16,$out)); - &cmp ($out,$chunk); - &jb (&label("${mode}_prepare")); - - &mov (&DWP(-4,$ctx),"ecx"); - &lea ($inp,&DWP(0,"esp")); - &lea ($out,&DWP(0,"esp")); - &mov ($len,$chunk); - } else { - &test ($out,0x0f); # out_misaligned - &cmovnz ($out,"esp"); - &test ($inp,0x0f); # inp_misaligned - &jz (&label("${mode}_inp_aligned")); - &shr ($len,2); - &data_byte(0xf3,0xa5); # rep movsl - &sub ($out,$chunk); - &mov ($len,$chunk); - &mov ($inp,$out); -&set_label("${mode}_inp_aligned"); - } - &lea ("eax",&DWP(-16,$ctx)); # ivp - &lea ("ebx",&DWP(16,$ctx)); # key - &shr ($len,4); # len/=AES_BLOCK_SIZE - &data_byte(0xf3,0x0f,0xa7,$opcode); # rep xcrypt* - if ($mode !~ /ecb|ctr/) { - &movaps ("xmm0",&QWP(0,"eax")); - &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv - } - &mov ($out,&DWP(0,"ebp")); # restore parameters - &mov ($chunk,&DWP(12,"ebp")); - if ($mode eq "ctr32") { - &mov ($inp,&DWP(4,"ebp")); - &xor ($len,$len); -&set_label("${mode}_xor"); - &movups ("xmm1",&QWP(0,$inp,$len)); - &lea ($len,&DWP(16,$len)); - &pxor ("xmm1",&QWP(-16,"esp",$len)); - &movups (&QWP(-16,$out,$len),"xmm1"); - &cmp ($len,$chunk); - &jb (&label("${mode}_xor")); - } else { - &test ($out,0x0f); - &jz (&label("${mode}_out_aligned")); - &mov ($len,$chunk); - &lea ($inp,&DWP(0,"esp")); - &shr ($len,2); - &data_byte(0xf3,0xa5); # rep movsl - &sub ($out,$chunk); -&set_label("${mode}_out_aligned"); - &mov ($inp,&DWP(4,"ebp")); - } - &mov ($len,&DWP(8,"ebp")); - &add ($out,$chunk); - &add ($inp,$chunk); - &sub ($len,$chunk); - &mov ($chunk,$PADLOCK_CHUNK); - if (!$PADLOCK_PREFETCH{$mode}) { - &jnz (&label("${mode}_loop")); - } else { - &jz (&label("${mode}_break")); - &cmp ($len,$chunk); - &jae (&label("${mode}_loop")); - -&set_label("${mode}_unaligned_tail"); - &xor ("eax","eax"); - &cmp ("esp","ebp"); - &cmove ("eax",$len); - &sub ("esp","eax"); # alloca - &mov ("eax", $out); # save parameters - &mov ($chunk,$len); - &shr ($len,2); - &lea ($out,&DWP(0,"esp")); - &data_byte(0xf3,0xa5); # rep movsl - &mov ($inp,"esp"); - &mov ($out,"eax"); # restore parameters - &mov ($len,$chunk); - &jmp (&label("${mode}_loop")); - -&set_label("${mode}_break",16); - } - if ($mode ne "ctr32") { - &cmp ("esp","ebp"); - &je (&label("${mode}_done")); - } - &pxor ("xmm0","xmm0"); - &lea ("eax",&DWP(0,"esp")); -&set_label("${mode}_bzero"); - &movaps (&QWP(0,"eax"),"xmm0"); - &lea ("eax",&DWP(16,"eax")); - &cmp ("ebp","eax"); - &ja (&label("${mode}_bzero")); - -&set_label("${mode}_done"); - &mov ("ebp",&DWP(16,"ebp")); - &lea ("esp",&DWP(24,"ebp")); - if ($mode ne "ctr32") { - &jmp (&label("${mode}_exit")); - -&set_label("${mode}_aligned",16); - if ($PADLOCK_PREFETCH{$mode}) { - &lea ("ebp",&DWP(0,$inp,$len)); - &neg ("ebp"); - &and ("ebp",0xfff); # distance to page boundary - &xor ("eax","eax"); - &cmp ("ebp",$PADLOCK_PREFETCH{$mode}); - &mov ("ebp",$PADLOCK_PREFETCH{$mode}-1); - &cmovae ("ebp","eax"); - &and ("ebp",$len); # remainder - &sub ($len,"ebp"); - &jz (&label("${mode}_aligned_tail")); - } - &lea ("eax",&DWP(-16,$ctx)); # ivp - &lea ("ebx",&DWP(16,$ctx)); # key - &shr ($len,4); # 
len/=AES_BLOCK_SIZE - &data_byte(0xf3,0x0f,0xa7,$opcode); # rep xcrypt* - if ($mode ne "ecb") { - &movaps ("xmm0",&QWP(0,"eax")); - &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv - } - if ($PADLOCK_PREFETCH{$mode}) { - &test ("ebp","ebp"); - &jz (&label("${mode}_exit")); - -&set_label("${mode}_aligned_tail"); - &mov ($len,"ebp"); - &lea ("ebp",&DWP(-24,"esp")); - &mov ("esp","ebp"); - &mov ("eax","ebp"); - &sub ("esp",$len); - &and ("ebp",-16); - &and ("esp",-16); - &mov (&DWP(16,"ebp"),"eax"); - &mov ("eax", $out); # save parameters - &mov ($chunk,$len); - &shr ($len,2); - &lea ($out,&DWP(0,"esp")); - &data_byte(0xf3,0xa5); # rep movsl - &mov ($inp,"esp"); - &mov ($out,"eax"); # restore parameters - &mov ($len,$chunk); - &jmp (&label("${mode}_loop")); - } -&set_label("${mode}_exit"); } - &mov ("eax",1); - &lea ("esp",&DWP(4,"esp")); # popf - &emms () if ($mode eq "ctr32"); -&set_label("${mode}_abort"); -&function_end("padlock_${mode}_encrypt"); -} - -&generate_mode("ecb",0xc8); -&generate_mode("cbc",0xd0); -&generate_mode("cfb",0xe0); -&generate_mode("ofb",0xe8); -&generate_mode("ctr32",0xc8); # yes, it implements its own CTR with ECB opcode, - # because hardware CTR was introduced later - # and even has errata on certain C7 stepping. - # its own implementation *always* works, though - # ~15% slower than dedicated hardware... - -&function_begin_B("padlock_xstore"); - &push ("edi"); - &mov ("edi",&wparam(0)); - &mov ("edx",&wparam(1)); - &data_byte(0x0f,0xa7,0xc0); # xstore - &pop ("edi"); - &ret (); -&function_end_B("padlock_xstore"); - -&function_begin_B("_win32_segv_handler"); - &mov ("eax",1); # ExceptionContinueSearch - &mov ("edx",&wparam(0)); # *ExceptionRecord - &mov ("ecx",&wparam(2)); # *ContextRecord - &cmp (&DWP(0,"edx"),0xC0000005); # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION - &jne (&label("ret")); - &add (&DWP(184,"ecx"),4); # skip over rep sha* - &mov ("eax",0); # ExceptionContinueExecution -&set_label("ret"); - &ret (); -&function_end_B("_win32_segv_handler"); -&safeseh("_win32_segv_handler") if ($::win32); - -&function_begin_B("padlock_sha1_oneshot"); - &push ("edi"); - &push ("esi"); - &xor ("eax","eax"); - &mov ("edi",&wparam(0)); - &mov ("esi",&wparam(1)); - &mov ("ecx",&wparam(2)); - if ($::win32 or $::coff) { - &push (&::islabel("_win32_segv_handler")); - &data_byte(0x64,0xff,0x30); # push %fs:(%eax) - &data_byte(0x64,0x89,0x20); # mov %esp,%fs:(%eax) - } - &mov ("edx","esp"); # put aside %esp - &add ("esp",-128); # 32 is enough but spec says 128 - &movups ("xmm0",&QWP(0,"edi")); # copy-in context - &and ("esp",-16); - &mov ("eax",&DWP(16,"edi")); - &movaps (&QWP(0,"esp"),"xmm0"); - &mov ("edi","esp"); - &mov (&DWP(16,"esp"),"eax"); - &xor ("eax","eax"); - &data_byte(0xf3,0x0f,0xa6,0xc8); # rep xsha1 - &movaps ("xmm0",&QWP(0,"esp")); - &mov ("eax",&DWP(16,"esp")); - &mov ("esp","edx"); # restore %esp - if ($::win32 or $::coff) { - &data_byte(0x64,0x8f,0x05,0,0,0,0); # pop %fs:0 - &lea ("esp",&DWP(4,"esp")); - } - &mov ("edi",&wparam(0)); - &movups (&QWP(0,"edi"),"xmm0"); # copy-out context - &mov (&DWP(16,"edi"),"eax"); - &pop ("esi"); - &pop ("edi"); - &ret (); -&function_end_B("padlock_sha1_oneshot"); - -&function_begin_B("padlock_sha1_blocks"); - &push ("edi"); - &push ("esi"); - &mov ("edi",&wparam(0)); - &mov ("esi",&wparam(1)); - &mov ("edx","esp"); # put aside %esp - &mov ("ecx",&wparam(2)); - &add ("esp",-128); - &movups ("xmm0",&QWP(0,"edi")); # copy-in context - &and ("esp",-16); - &mov ("eax",&DWP(16,"edi")); - &movaps 
(&QWP(0,"esp"),"xmm0"); - &mov ("edi","esp"); - &mov (&DWP(16,"esp"),"eax"); - &mov ("eax",-1); - &data_byte(0xf3,0x0f,0xa6,0xc8); # rep xsha1 - &movaps ("xmm0",&QWP(0,"esp")); - &mov ("eax",&DWP(16,"esp")); - &mov ("esp","edx"); # restore %esp - &mov ("edi",&wparam(0)); - &movups (&QWP(0,"edi"),"xmm0"); # copy-out context - &mov (&DWP(16,"edi"),"eax"); - &pop ("esi"); - &pop ("edi"); - &ret (); -&function_end_B("padlock_sha1_blocks"); - -&function_begin_B("padlock_sha256_oneshot"); - &push ("edi"); - &push ("esi"); - &xor ("eax","eax"); - &mov ("edi",&wparam(0)); - &mov ("esi",&wparam(1)); - &mov ("ecx",&wparam(2)); - if ($::win32 or $::coff) { - &push (&::islabel("_win32_segv_handler")); - &data_byte(0x64,0xff,0x30); # push %fs:(%eax) - &data_byte(0x64,0x89,0x20); # mov %esp,%fs:(%eax) - } - &mov ("edx","esp"); # put aside %esp - &add ("esp",-128); - &movups ("xmm0",&QWP(0,"edi")); # copy-in context - &and ("esp",-16); - &movups ("xmm1",&QWP(16,"edi")); - &movaps (&QWP(0,"esp"),"xmm0"); - &mov ("edi","esp"); - &movaps (&QWP(16,"esp"),"xmm1"); - &xor ("eax","eax"); - &data_byte(0xf3,0x0f,0xa6,0xd0); # rep xsha256 - &movaps ("xmm0",&QWP(0,"esp")); - &movaps ("xmm1",&QWP(16,"esp")); - &mov ("esp","edx"); # restore %esp - if ($::win32 or $::coff) { - &data_byte(0x64,0x8f,0x05,0,0,0,0); # pop %fs:0 - &lea ("esp",&DWP(4,"esp")); - } - &mov ("edi",&wparam(0)); - &movups (&QWP(0,"edi"),"xmm0"); # copy-out context - &movups (&QWP(16,"edi"),"xmm1"); - &pop ("esi"); - &pop ("edi"); - &ret (); -&function_end_B("padlock_sha256_oneshot"); - -&function_begin_B("padlock_sha256_blocks"); - &push ("edi"); - &push ("esi"); - &mov ("edi",&wparam(0)); - &mov ("esi",&wparam(1)); - &mov ("ecx",&wparam(2)); - &mov ("edx","esp"); # put aside %esp - &add ("esp",-128); - &movups ("xmm0",&QWP(0,"edi")); # copy-in context - &and ("esp",-16); - &movups ("xmm1",&QWP(16,"edi")); - &movaps (&QWP(0,"esp"),"xmm0"); - &mov ("edi","esp"); - &movaps (&QWP(16,"esp"),"xmm1"); - &mov ("eax",-1); - &data_byte(0xf3,0x0f,0xa6,0xd0); # rep xsha256 - &movaps ("xmm0",&QWP(0,"esp")); - &movaps ("xmm1",&QWP(16,"esp")); - &mov ("esp","edx"); # restore %esp - &mov ("edi",&wparam(0)); - &movups (&QWP(0,"edi"),"xmm0"); # copy-out context - &movups (&QWP(16,"edi"),"xmm1"); - &pop ("esi"); - &pop ("edi"); - &ret (); -&function_end_B("padlock_sha256_blocks"); - -&function_begin_B("padlock_sha512_blocks"); - &push ("edi"); - &push ("esi"); - &mov ("edi",&wparam(0)); - &mov ("esi",&wparam(1)); - &mov ("ecx",&wparam(2)); - &mov ("edx","esp"); # put aside %esp - &add ("esp",-128); - &movups ("xmm0",&QWP(0,"edi")); # copy-in context - &and ("esp",-16); - &movups ("xmm1",&QWP(16,"edi")); - &movups ("xmm2",&QWP(32,"edi")); - &movups ("xmm3",&QWP(48,"edi")); - &movaps (&QWP(0,"esp"),"xmm0"); - &mov ("edi","esp"); - &movaps (&QWP(16,"esp"),"xmm1"); - &movaps (&QWP(32,"esp"),"xmm2"); - &movaps (&QWP(48,"esp"),"xmm3"); - &data_byte(0xf3,0x0f,0xa6,0xe0); # rep xsha512 - &movaps ("xmm0",&QWP(0,"esp")); - &movaps ("xmm1",&QWP(16,"esp")); - &movaps ("xmm2",&QWP(32,"esp")); - &movaps ("xmm3",&QWP(48,"esp")); - &mov ("esp","edx"); # restore %esp - &mov ("edi",&wparam(0)); - &movups (&QWP(0,"edi"),"xmm0"); # copy-out context - &movups (&QWP(16,"edi"),"xmm1"); - &movups (&QWP(32,"edi"),"xmm2"); - &movups (&QWP(48,"edi"),"xmm3"); - &pop ("esi"); - &pop ("edi"); - &ret (); -&function_end_B("padlock_sha512_blocks"); - -&asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>"); -&align (16); - -&dataseg(); -# Essentially this variable belongs in thread 
local storage. -# Having this variable global on the other hand can only cause -# a few bogus key reloads [if any at all on a single-CPU system], -# so we accept the penalty... -&set_label("padlock_saved_context",4); -&data_word(0); - -&asm_finish(); +../openssl/./engines/asm/e_padlock-x86.pl
\ No newline at end of file diff --git a/devel/perlasm/e_padlock-x86_64.pl b/devel/perlasm/e_padlock-x86_64.pl index f8ba1e909f..1546fabede 100644..120000 --- a/devel/perlasm/e_padlock-x86_64.pl +++ b/devel/perlasm/e_padlock-x86_64.pl @@ -1,567 +1 @@ -#!/usr/bin/env perl - -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# September 2011 -# -# Assembler helpers for Padlock engine. See also e_padlock-x86.pl for -# details. - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -$code=".text\n"; - -%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata -$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20 - -$ctx="%rdx"; -$out="%rdi"; -$inp="%rsi"; -$len="%rcx"; -$chunk="%rbx"; - -($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order - -$code.=<<___; -.globl padlock_capability -.type padlock_capability,\@abi-omnipotent -.align 16 -padlock_capability: - mov %rbx,%r8 - xor %eax,%eax - cpuid - xor %eax,%eax - cmp \$`"0x".unpack("H*",'tneC')`,%ebx - jne .Lnoluck - cmp \$`"0x".unpack("H*",'Hrua')`,%edx - jne .Lnoluck - cmp \$`"0x".unpack("H*",'slua')`,%ecx - jne .Lnoluck - mov \$0xC0000000,%eax - cpuid - mov %eax,%edx - xor %eax,%eax - cmp \$0xC0000001,%edx - jb .Lnoluck - mov \$0xC0000001,%eax - cpuid - mov %edx,%eax - and \$0xffffffef,%eax - or \$0x10,%eax # set Nano bit#4 -.Lnoluck: - mov %r8,%rbx - ret -.size padlock_capability,.-padlock_capability - -.globl padlock_key_bswap -.type padlock_key_bswap,\@abi-omnipotent,0 -.align 16 -padlock_key_bswap: - mov 240($arg1),%edx -.Lbswap_loop: - mov ($arg1),%eax - bswap %eax - mov %eax,($arg1) - lea 4($arg1),$arg1 - sub \$1,%edx - jnz .Lbswap_loop - ret -.size padlock_key_bswap,.-padlock_key_bswap - -.globl padlock_verify_context -.type padlock_verify_context,\@abi-omnipotent -.align 16 -padlock_verify_context: - mov $arg1,$ctx - pushf - lea .Lpadlock_saved_context(%rip),%rax - call _padlock_verify_ctx - lea 8(%rsp),%rsp - ret -.size padlock_verify_context,.-padlock_verify_context - -.type _padlock_verify_ctx,\@abi-omnipotent -.align 16 -_padlock_verify_ctx: - mov 8(%rsp),%r8 - bt \$30,%r8 - jnc .Lverified - cmp (%rax),$ctx - je .Lverified - pushf - popf -.Lverified: - mov $ctx,(%rax) - ret -.size _padlock_verify_ctx,.-_padlock_verify_ctx - -.globl padlock_reload_key -.type padlock_reload_key,\@abi-omnipotent -.align 16 -padlock_reload_key: - pushf - popf - ret -.size padlock_reload_key,.-padlock_reload_key - -.globl padlock_aes_block -.type padlock_aes_block,\@function,3 -.align 16 -padlock_aes_block: - mov %rbx,%r8 - mov \$1,$len - lea 32($ctx),%rbx # key - lea 16($ctx),$ctx # control word - .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb - mov %r8,%rbx - ret -.size padlock_aes_block,.-padlock_aes_block - -.globl 
padlock_xstore -.type padlock_xstore,\@function,2 -.align 16 -padlock_xstore: - mov %esi,%edx - .byte 0x0f,0xa7,0xc0 # xstore - ret -.size padlock_xstore,.-padlock_xstore - -.globl padlock_sha1_oneshot -.type padlock_sha1_oneshot,\@function,3 -.align 16 -padlock_sha1_oneshot: - mov %rdx,%rcx - mov %rdi,%rdx # put aside %rdi - movups (%rdi),%xmm0 # copy-in context - sub \$128+8,%rsp - mov 16(%rdi),%eax - movaps %xmm0,(%rsp) - mov %rsp,%rdi - mov %eax,16(%rsp) - xor %rax,%rax - .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 - movaps (%rsp),%xmm0 - mov 16(%rsp),%eax - add \$128+8,%rsp - movups %xmm0,(%rdx) # copy-out context - mov %eax,16(%rdx) - ret -.size padlock_sha1_oneshot,.-padlock_sha1_oneshot - -.globl padlock_sha1_blocks -.type padlock_sha1_blocks,\@function,3 -.align 16 -padlock_sha1_blocks: - mov %rdx,%rcx - mov %rdi,%rdx # put aside %rdi - movups (%rdi),%xmm0 # copy-in context - sub \$128+8,%rsp - mov 16(%rdi),%eax - movaps %xmm0,(%rsp) - mov %rsp,%rdi - mov %eax,16(%rsp) - mov \$-1,%rax - .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1 - movaps (%rsp),%xmm0 - mov 16(%rsp),%eax - add \$128+8,%rsp - movups %xmm0,(%rdx) # copy-out context - mov %eax,16(%rdx) - ret -.size padlock_sha1_blocks,.-padlock_sha1_blocks - -.globl padlock_sha256_oneshot -.type padlock_sha256_oneshot,\@function,3 -.align 16 -padlock_sha256_oneshot: - mov %rdx,%rcx - mov %rdi,%rdx # put aside %rdi - movups (%rdi),%xmm0 # copy-in context - sub \$128+8,%rsp - movups 16(%rdi),%xmm1 - movaps %xmm0,(%rsp) - mov %rsp,%rdi - movaps %xmm1,16(%rsp) - xor %rax,%rax - .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 - movaps (%rsp),%xmm0 - movaps 16(%rsp),%xmm1 - add \$128+8,%rsp - movups %xmm0,(%rdx) # copy-out context - movups %xmm1,16(%rdx) - ret -.size padlock_sha256_oneshot,.-padlock_sha256_oneshot - -.globl padlock_sha256_blocks -.type padlock_sha256_blocks,\@function,3 -.align 16 -padlock_sha256_blocks: - mov %rdx,%rcx - mov %rdi,%rdx # put aside %rdi - movups (%rdi),%xmm0 # copy-in context - sub \$128+8,%rsp - movups 16(%rdi),%xmm1 - movaps %xmm0,(%rsp) - mov %rsp,%rdi - movaps %xmm1,16(%rsp) - mov \$-1,%rax - .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256 - movaps (%rsp),%xmm0 - movaps 16(%rsp),%xmm1 - add \$128+8,%rsp - movups %xmm0,(%rdx) # copy-out context - movups %xmm1,16(%rdx) - ret -.size padlock_sha256_blocks,.-padlock_sha256_blocks - -.globl padlock_sha512_blocks -.type padlock_sha512_blocks,\@function,3 -.align 16 -padlock_sha512_blocks: - mov %rdx,%rcx - mov %rdi,%rdx # put aside %rdi - movups (%rdi),%xmm0 # copy-in context - sub \$128+8,%rsp - movups 16(%rdi),%xmm1 - movups 32(%rdi),%xmm2 - movups 48(%rdi),%xmm3 - movaps %xmm0,(%rsp) - mov %rsp,%rdi - movaps %xmm1,16(%rsp) - movaps %xmm2,32(%rsp) - movaps %xmm3,48(%rsp) - .byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512 - movaps (%rsp),%xmm0 - movaps 16(%rsp),%xmm1 - movaps 32(%rsp),%xmm2 - movaps 48(%rsp),%xmm3 - add \$128+8,%rsp - movups %xmm0,(%rdx) # copy-out context - movups %xmm1,16(%rdx) - movups %xmm2,32(%rdx) - movups %xmm3,48(%rdx) - ret -.size padlock_sha512_blocks,.-padlock_sha512_blocks -___ - -sub generate_mode { -my ($mode,$opcode) = @_; -# int padlock_$mode_encrypt(void *out, const void *inp, -# struct padlock_cipher_data *ctx, size_t len); -$code.=<<___; -.globl padlock_${mode}_encrypt -.type padlock_${mode}_encrypt,\@function,4 -.align 16 -padlock_${mode}_encrypt: - push %rbp - push %rbx - - xor %eax,%eax - test \$15,$ctx - jnz .L${mode}_abort - test \$15,$len - jnz .L${mode}_abort - lea .Lpadlock_saved_context(%rip),%rax - pushf - cld - call _padlock_verify_ctx - lea 
16($ctx),$ctx # control word - xor %eax,%eax - xor %ebx,%ebx - testl \$`1<<5`,($ctx) # align bit in control word - jnz .L${mode}_aligned - test \$0x0f,$out - setz %al # !out_misaligned - test \$0x0f,$inp - setz %bl # !inp_misaligned - test %ebx,%eax - jnz .L${mode}_aligned - neg %rax - mov \$$PADLOCK_CHUNK,$chunk - not %rax # out_misaligned?-1:0 - lea (%rsp),%rbp - cmp $chunk,$len - cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len - and $chunk,%rax # out_misaligned?chunk:0 - mov $len,$chunk - neg %rax - and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK - lea (%rax,%rbp),%rsp - mov \$$PADLOCK_CHUNK,%rax - cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK -___ -$code.=<<___ if ($mode eq "ctr32"); -.L${mode}_reenter: - mov -4($ctx),%eax # pull 32-bit counter - bswap %eax - neg %eax - and \$`$PADLOCK_CHUNK/16-1`,%eax - mov \$$PADLOCK_CHUNK,$chunk - shl \$4,%eax - cmovz $chunk,%rax - cmp %rax,$len - cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK - cmovbe $len,$chunk -___ -$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); - cmp $chunk,$len - ja .L${mode}_loop - mov $inp,%rax # check if prefetch crosses page - cmp %rsp,%rbp - cmove $out,%rax - add $len,%rax - neg %rax - and \$0xfff,%rax # distance to page boundary - cmp \$$PADLOCK_PREFETCH{$mode},%rax - mov \$-$PADLOCK_PREFETCH{$mode},%rax - cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1 - and %rax,$chunk - jz .L${mode}_unaligned_tail -___ -$code.=<<___; - jmp .L${mode}_loop -.align 16 -.L${mode}_loop: - cmp $len,$chunk # ctr32 artefact - cmova $len,$chunk # ctr32 artefact - mov $out,%r8 # save parameters - mov $inp,%r9 - mov $len,%r10 - mov $chunk,$len - mov $chunk,%r11 - test \$0x0f,$out # out_misaligned - cmovnz %rsp,$out - test \$0x0f,$inp # inp_misaligned - jz .L${mode}_inp_aligned - shr \$3,$len - .byte 0xf3,0x48,0xa5 # rep movsq - sub $chunk,$out - mov $chunk,$len - mov $out,$inp -.L${mode}_inp_aligned: - lea -16($ctx),%rax # ivp - lea 16($ctx),%rbx # key - shr \$4,$len - .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* -___ -$code.=<<___ if ($mode !~ /ecb|ctr/); - movdqa (%rax),%xmm0 - movdqa %xmm0,-16($ctx) # copy [or refresh] iv -___ -$code.=<<___ if ($mode eq "ctr32"); - mov -4($ctx),%eax # pull 32-bit counter - test \$0xffff0000,%eax - jnz .L${mode}_no_carry - bswap %eax - add \$0x10000,%eax - bswap %eax - mov %eax,-4($ctx) -.L${mode}_no_carry: -___ -$code.=<<___; - mov %r8,$out # restore parameters - mov %r11,$chunk - test \$0x0f,$out - jz .L${mode}_out_aligned - mov $chunk,$len - lea (%rsp),$inp - shr \$3,$len - .byte 0xf3,0x48,0xa5 # rep movsq - sub $chunk,$out -.L${mode}_out_aligned: - mov %r9,$inp - mov %r10,$len - add $chunk,$out - add $chunk,$inp - sub $chunk,$len - mov \$$PADLOCK_CHUNK,$chunk -___ - if (!$PADLOCK_PREFETCH{$mode}) { -$code.=<<___; - jnz .L${mode}_loop -___ - } else { -$code.=<<___; - jz .L${mode}_break - cmp $chunk,$len - jae .L${mode}_loop -___ -$code.=<<___ if ($mode eq "ctr32"); - mov $len,$chunk - mov $inp,%rax # check if prefetch crosses page - cmp %rsp,%rbp - cmove $out,%rax - add $len,%rax - neg %rax - and \$0xfff,%rax # distance to page boundary - cmp \$$PADLOCK_PREFETCH{$mode},%rax - mov \$-$PADLOCK_PREFETCH{$mode},%rax - cmovae $chunk,%rax - and %rax,$chunk - jnz .L${mode}_loop -___ -$code.=<<___; -.L${mode}_unaligned_tail: - xor %eax,%eax - cmp %rsp,%rbp - cmove $len,%rax - mov $out,%r8 # save parameters - mov $len,$chunk - sub %rax,%rsp # alloca - shr \$3,$len - lea (%rsp),$out - .byte 0xf3,0x48,0xa5 # rep movsq - mov %rsp,$inp - mov %r8, $out # restore parameters - mov 
$chunk,$len - jmp .L${mode}_loop -.align 16 -.L${mode}_break: -___ - } -$code.=<<___; - cmp %rbp,%rsp - je .L${mode}_done - - pxor %xmm0,%xmm0 - lea (%rsp),%rax -.L${mode}_bzero: - movaps %xmm0,(%rax) - lea 16(%rax),%rax - cmp %rax,%rbp - ja .L${mode}_bzero - -.L${mode}_done: - lea (%rbp),%rsp - jmp .L${mode}_exit - -.align 16 -.L${mode}_aligned: -___ -$code.=<<___ if ($mode eq "ctr32"); - mov -4($ctx),%eax # pull 32-bit counter - bswap %eax - neg %eax - and \$0xffff,%eax - mov \$`16*0x10000`,$chunk - shl \$4,%eax - cmovz $chunk,%rax - cmp %rax,$len - cmova %rax,$chunk # don't let counter cross 2^16 - cmovbe $len,$chunk - jbe .L${mode}_aligned_skip - -.L${mode}_aligned_loop: - mov $len,%r10 # save parameters - mov $chunk,$len - mov $chunk,%r11 - - lea -16($ctx),%rax # ivp - lea 16($ctx),%rbx # key - shr \$4,$len # len/=AES_BLOCK_SIZE - .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* - - mov -4($ctx),%eax # pull 32-bit counter - bswap %eax - add \$0x10000,%eax - bswap %eax - mov %eax,-4($ctx) - - mov %r10,$len # restore parameters - sub %r11,$len - mov \$`16*0x10000`,$chunk - jz .L${mode}_exit - cmp $chunk,$len - jae .L${mode}_aligned_loop - -.L${mode}_aligned_skip: -___ -$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); - lea ($inp,$len),%rbp - neg %rbp - and \$0xfff,%rbp # distance to page boundary - xor %eax,%eax - cmp \$$PADLOCK_PREFETCH{$mode},%rbp - mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp - cmovae %rax,%rbp - and $len,%rbp # remainder - sub %rbp,$len - jz .L${mode}_aligned_tail -___ -$code.=<<___; - lea -16($ctx),%rax # ivp - lea 16($ctx),%rbx # key - shr \$4,$len # len/=AES_BLOCK_SIZE - .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt* -___ -$code.=<<___ if ($mode !~ /ecb|ctr/); - movdqa (%rax),%xmm0 - movdqa %xmm0,-16($ctx) # copy [or refresh] iv -___ -$code.=<<___ if ($PADLOCK_PREFETCH{$mode}); - test %rbp,%rbp # check remainder - jz .L${mode}_exit - -.L${mode}_aligned_tail: - mov $out,%r8 - mov %rbp,$chunk - mov %rbp,$len - lea (%rsp),%rbp - sub $len,%rsp - shr \$3,$len - lea (%rsp),$out - .byte 0xf3,0x48,0xa5 # rep movsq - lea (%r8),$out - lea (%rsp),$inp - mov $chunk,$len - jmp .L${mode}_loop -___ -$code.=<<___; -.L${mode}_exit: - mov \$1,%eax - lea 8(%rsp),%rsp -.L${mode}_abort: - pop %rbx - pop %rbp - ret -.size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt -___ -} - -&generate_mode("ecb",0xc8); -&generate_mode("cbc",0xd0); -&generate_mode("cfb",0xe0); -&generate_mode("ofb",0xe8); -&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR... - -$code.=<<___; -.asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>" -.align 16 -.data -.align 8 -.Lpadlock_saved_context: - .quad 0 -___ -$code =~ s/\`([^\`]*)\`/eval($1)/gem; - -print $code; - -close STDOUT; +../openssl/./engines/asm/e_padlock-x86_64.pl
\ No newline at end of file diff --git a/devel/perlasm/ghash-x86.pl b/devel/perlasm/ghash-x86.pl index e6b9663c13..5a234e3ec8 100644..120000 --- a/devel/perlasm/ghash-x86.pl +++ b/devel/perlasm/ghash-x86.pl @@ -1,1391 +1 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# March, May, June 2010 -# -# The module implements "4-bit" GCM GHASH function and underlying -# single multiplication operation in GF(2^128). "4-bit" means that it -# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two -# code paths: vanilla x86 and vanilla SSE. Former will be executed on -# 486 and Pentium, latter on all others. SSE GHASH features so called -# "528B" variant of "4-bit" method utilizing additional 256+16 bytes -# of per-key storage [+512 bytes shared table]. Performance results -# are for streamed GHASH subroutine and are expressed in cycles per -# processed byte, less is better: -# -# gcc 2.95.3(*) SSE assembler x86 assembler -# -# Pentium 105/111(**) - 50 -# PIII 68 /75 12.2 24 -# P4 125/125 17.8 84(***) -# Opteron 66 /70 10.1 30 -# Core2 54 /67 8.4 18 -# Atom 105/105 16.8 53 -# VIA Nano 69 /71 13.0 27 -# -# (*) gcc 3.4.x was observed to generate few percent slower code, -# which is one of reasons why 2.95.3 results were chosen, -# another reason is lack of 3.4.x results for older CPUs; -# comparison with SSE results is not completely fair, because C -# results are for vanilla "256B" implementation, while -# assembler results are for "528B";-) -# (**) second number is result for code compiled with -fPIC flag, -# which is actually more relevant, because assembler code is -# position-independent; -# (***) see comment in non-MMX routine for further details; -# -# To summarize, it's >2-5 times faster than gcc-generated code. To -# anchor it to something else SHA1 assembler processes one byte in -# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE -# in particular, see comment at the end of the file... - -# May 2010 -# -# Add PCLMULQDQ version performing at 2.10 cycles per processed byte. -# The question is how close is it to theoretical limit? The pclmulqdq -# instruction latency appears to be 14 cycles and there can't be more -# than 2 of them executing at any given time. This means that single -# Karatsuba multiplication would take 28 cycles *plus* few cycles for -# pre- and post-processing. Then multiplication has to be followed by -# modulo-reduction. Given that aggregated reduction method [see -# "Carry-less Multiplication and Its Usage for Computing the GCM Mode" -# white paper by Intel] allows you to perform reduction only once in -# a while we can assume that asymptotic performance can be estimated -# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction -# and Naggr is the aggregation factor. -# -# Before we proceed to this implementation let's have closer look at -# the best-performing code suggested by Intel in their white paper. -# By tracing inter-register dependencies Tmod is estimated as ~19 -# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per -# processed byte. 
As implied, this is quite an optimistic estimate, -# because it does not account for Karatsuba pre- and post-processing, -# which for a single multiplication is ~5 cycles. Unfortunately Intel -# does not provide performance data for GHASH alone. But benchmarking -# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt -# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that -# the result accounts even for pre-computing of degrees of the hash -# key H, but its portion is negligible at 16KB buffer size. -# -# Moving on to the implementation in question. Tmod is estimated as -# ~13 cycles and Naggr is 2, giving asymptotic performance of ... -# 2.16. How is it possible that measured performance is better than -# optimistic theoretical estimate? There is one thing Intel failed -# to recognize. By serializing GHASH with CTR in same subroutine -# former's performance is really limited to above (Tmul + Tmod/Naggr) -# equation. But if GHASH procedure is detached, the modulo-reduction -# can be interleaved with Naggr-1 multiplications at instruction level -# and under ideal conditions even disappear from the equation. So that -# optimistic theoretical estimate for this implementation is ... -# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic, -# at least for such small Naggr. I'd argue that (28+Tproc/Naggr), -# where Tproc is time required for Karatsuba pre- and post-processing, -# is more realistic estimate. In this case it gives ... 1.91 cycles. -# Or in other words, depending on how well we can interleave reduction -# and one of the two multiplications the performance should be between -# 1.91 and 2.16. As already mentioned, this implementation processes -# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart -# - in 2.02. x86_64 performance is better, because larger register -# bank allows to interleave reduction and multiplication better. -# -# Does it make sense to increase Naggr? To start with it's virtually -# impossible in 32-bit mode, because of limited register bank -# capacity. Otherwise improvement has to be weighed against slower -# setup, as well as code size and complexity increase. As even -# optimistic estimate doesn't promise 30% performance improvement, -# there are currently no plans to increase Naggr. -# -# Special thanks to David Woodhouse <dwmw2@infradead.org> for -# providing access to a Westmere-based system on behalf of Intel -# Open Source Technology Centre. - -# January 2010 -# -# Tweaked to optimize transitions between integer and FP operations -# on same XMM register, PCLMULQDQ subroutine was measured to process -# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. -# The minor regression on Westmere is outweighed by ~15% improvement -# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in -# similar manner resulted in almost 20% degradation on Sandy Bridge, -# where original 64-bit code processes one byte in 1.95 cycles. - -##################################################################### -# For reference, AMD Bulldozer processes one byte in 1.98 cycles in -# 32-bit mode and 1.89 in 64-bit. - -# February 2013 -# -# Overhaul: aggregate Karatsuba post-processing, improve ILP in -# reduction_alg9. Resulting performance is 1.96 cycles per byte on -# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. 
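To anchor the discussion above, the operation all of these code paths accelerate is a single multiplication in GF(2^128) with the GCM polynomial. The following C fragment is an illustrative bitwise reference in the bit-reflected convention of NIST SP 800-38D (the names here are ours, not part of this module); the "4-bit" tables below compute the same product four bits at a time, and the PCLMULQDQ path computes it via carry-less multiplication plus the reduction discussed above:

#include <stdint.h>
#include <string.h>

/* Z = X * H in GF(2^128) mod x^128+x^7+x^2+x+1, bit-reflected order. */
static void gf128_mul(uint8_t Z[16], const uint8_t X[16], const uint8_t H[16])
{
    uint8_t V[16];
    int i, j;

    memcpy(V, H, 16);
    memset(Z, 0, 16);
    for (i = 0; i < 128; i++) {
        if ((X[i >> 3] >> (7 - (i & 7))) & 1)   /* bit i of X, MSB first */
            for (j = 0; j < 16; j++)
                Z[j] ^= V[j];
        j = V[15] & 1;                          /* bit shifted out by V*=x */
        for (int k = 15; k > 0; k--)            /* V *= x: shift right ... */
            V[k] = (uint8_t)((V[k] >> 1) | (V[k - 1] << 7));
        V[0] >>= 1;
        if (j)
            V[0] ^= 0xE1;   /* ... and reduce; 0xE1 is the reflected 0x87 */
    }
}

The "4-bit" method precomputes i*H for the 16 nibble values, replacing the 128 shift-and-add iterations with 32 table lookups plus the rem_4bit correction for bits shifted off the low end, which is what the Htable and rem_4bit handling below implements.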
- -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); - -$sse2=0; -for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } - -($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); -$inp = "edi"; -$Htbl = "esi"; - -$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse - # than unrolled, which has to be weighed against - # 2.5x x86-specific code size reduction. - -sub x86_loop { - my $off = shift; - my $rem = "eax"; - - &mov ($Zhh,&DWP(4,$Htbl,$Zll)); - &mov ($Zhl,&DWP(0,$Htbl,$Zll)); - &mov ($Zlh,&DWP(12,$Htbl,$Zll)); - &mov ($Zll,&DWP(8,$Htbl,$Zll)); - &xor ($rem,$rem); # avoid partial register stalls on PIII - - # shrd practically kills P4, 2.5x deterioration, but P4 has - # MMX code-path to execute. shrd runs a tad faster [than twice - # the shifts, move's and or's] on pre-MMX Pentium (as well as - # PIII and Core2), *but* minimizes code size, spares register - # and thus allows to fold the loop... - if (!$unroll) { - my $cnt = $inp; - &mov ($cnt,15); - &jmp (&label("x86_loop")); - &set_label("x86_loop",16); - for($i=1;$i<=2;$i++) { - &mov (&LB($rem),&LB($Zll)); - &shrd ($Zll,$Zlh,4); - &and (&LB($rem),0xf); - &shrd ($Zlh,$Zhl,4); - &shrd ($Zhl,$Zhh,4); - &shr ($Zhh,4); - &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); - - &mov (&LB($rem),&BP($off,"esp",$cnt)); - if ($i&1) { - &and (&LB($rem),0xf0); - } else { - &shl (&LB($rem),4); - } - - &xor ($Zll,&DWP(8,$Htbl,$rem)); - &xor ($Zlh,&DWP(12,$Htbl,$rem)); - &xor ($Zhl,&DWP(0,$Htbl,$rem)); - &xor ($Zhh,&DWP(4,$Htbl,$rem)); - - if ($i&1) { - &dec ($cnt); - &js (&label("x86_break")); - } else { - &jmp (&label("x86_loop")); - } - } - &set_label("x86_break",16); - } else { - for($i=1;$i<32;$i++) { - &comment($i); - &mov (&LB($rem),&LB($Zll)); - &shrd ($Zll,$Zlh,4); - &and (&LB($rem),0xf); - &shrd ($Zlh,$Zhl,4); - &shrd ($Zhl,$Zhh,4); - &shr ($Zhh,4); - &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); - - if ($i&1) { - &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); - &and (&LB($rem),0xf0); - } else { - &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); - &shl (&LB($rem),4); - } - - &xor ($Zll,&DWP(8,$Htbl,$rem)); - &xor ($Zlh,&DWP(12,$Htbl,$rem)); - &xor ($Zhl,&DWP(0,$Htbl,$rem)); - &xor ($Zhh,&DWP(4,$Htbl,$rem)); - } - } - &bswap ($Zll); - &bswap ($Zlh); - &bswap ($Zhl); - if (!$x86only) { - &bswap ($Zhh); - } else { - &mov ("eax",$Zhh); - &bswap ("eax"); - &mov ($Zhh,"eax"); - } -} - -if ($unroll) { - &function_begin_B("_x86_gmult_4bit_inner"); - &x86_loop(4); - &ret (); - &function_end_B("_x86_gmult_4bit_inner"); -} - -sub deposit_rem_4bit { - my $bias = shift; - - &mov (&DWP($bias+0, "esp"),0x0000<<16); - &mov (&DWP($bias+4, "esp"),0x1C20<<16); - &mov (&DWP($bias+8, "esp"),0x3840<<16); - &mov (&DWP($bias+12,"esp"),0x2460<<16); - &mov (&DWP($bias+16,"esp"),0x7080<<16); - &mov (&DWP($bias+20,"esp"),0x6CA0<<16); - &mov (&DWP($bias+24,"esp"),0x48C0<<16); - &mov (&DWP($bias+28,"esp"),0x54E0<<16); - &mov (&DWP($bias+32,"esp"),0xE100<<16); - &mov (&DWP($bias+36,"esp"),0xFD20<<16); - &mov (&DWP($bias+40,"esp"),0xD940<<16); - &mov (&DWP($bias+44,"esp"),0xC560<<16); - &mov (&DWP($bias+48,"esp"),0x9180<<16); - &mov (&DWP($bias+52,"esp"),0x8DA0<<16); - &mov (&DWP($bias+56,"esp"),0xA9C0<<16); - &mov (&DWP($bias+60,"esp"),0xB5E0<<16); -} - -$suffix = $x86only ? 
"" : "_x86"; - -&function_begin("gcm_gmult_4bit".$suffix); - &stack_push(16+4+1); # +1 for stack alignment - &mov ($inp,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable - - &mov ($Zhh,&DWP(0,$inp)); # load Xi[16] - &mov ($Zhl,&DWP(4,$inp)); - &mov ($Zlh,&DWP(8,$inp)); - &mov ($Zll,&DWP(12,$inp)); - - &deposit_rem_4bit(16); - - &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack - &mov (&DWP(4,"esp"),$Zhl); - &mov (&DWP(8,"esp"),$Zlh); - &mov (&DWP(12,"esp"),$Zll); - &shr ($Zll,20); - &and ($Zll,0xf0); - - if ($unroll) { - &call ("_x86_gmult_4bit_inner"); - } else { - &x86_loop(0); - &mov ($inp,&wparam(0)); - } - - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(0,$inp),$Zhh); - &stack_pop(16+4+1); -&function_end("gcm_gmult_4bit".$suffix); - -&function_begin("gcm_ghash_4bit".$suffix); - &stack_push(16+4+1); # +1 for 64-bit alignment - &mov ($Zll,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable - &mov ($inp,&wparam(2)); # load in - &mov ("ecx",&wparam(3)); # load len - &add ("ecx",$inp); - &mov (&wparam(3),"ecx"); - - &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] - &mov ($Zhl,&DWP(4,$Zll)); - &mov ($Zlh,&DWP(8,$Zll)); - &mov ($Zll,&DWP(12,$Zll)); - - &deposit_rem_4bit(16); - - &set_label("x86_outer_loop",16); - &xor ($Zll,&DWP(12,$inp)); # xor with input - &xor ($Zlh,&DWP(8,$inp)); - &xor ($Zhl,&DWP(4,$inp)); - &xor ($Zhh,&DWP(0,$inp)); - &mov (&DWP(12,"esp"),$Zll); # dump it on stack - &mov (&DWP(8,"esp"),$Zlh); - &mov (&DWP(4,"esp"),$Zhl); - &mov (&DWP(0,"esp"),$Zhh); - - &shr ($Zll,20); - &and ($Zll,0xf0); - - if ($unroll) { - &call ("_x86_gmult_4bit_inner"); - } else { - &x86_loop(0); - &mov ($inp,&wparam(2)); - } - &lea ($inp,&DWP(16,$inp)); - &cmp ($inp,&wparam(3)); - &mov (&wparam(2),$inp) if (!$unroll); - &jb (&label("x86_outer_loop")); - - &mov ($inp,&wparam(0)); # load Xi - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(0,$inp),$Zhh); - &stack_pop(16+4+1); -&function_end("gcm_ghash_4bit".$suffix); - -if (!$x86only) {{{ - -&static_label("rem_4bit"); - -if (!$sse2) {{ # pure-MMX "May" version... - -$S=12; # shift factor for rem_4bit - -&function_begin_B("_mmx_gmult_4bit_inner"); -# MMX version performs 3.5 times better on P4 (see comment in non-MMX -# routine for further details), 100% better on Opteron, ~70% better -# on Core2 and PIII... In other words effort is considered to be well -# spent... Since initial release the loop was unrolled in order to -# "liberate" register previously used as loop counter. Instead it's -# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. -# The path involves move of Z.lo from MMX to integer register, -# effective address calculation and finally merge of value to Z.hi. -# Reference to rem_4bit is scheduled so late that I had to >>4 -# rem_4bit elements. This resulted in 20-45% procent improvement -# on contemporary µ-archs. -{ - my $cnt; - my $rem_4bit = "eax"; - my @rem = ($Zhh,$Zll); - my $nhi = $Zhl; - my $nlo = $Zlh; - - my ($Zlo,$Zhi) = ("mm0","mm1"); - my $tmp = "mm2"; - - &xor ($nlo,$nlo); # avoid partial register stalls on PIII - &mov ($nhi,$Zll); - &mov (&LB($nlo),&LB($nhi)); - &shl (&LB($nlo),4); - &and ($nhi,0xf0); - &movq ($Zlo,&QWP(8,$Htbl,$nlo)); - &movq ($Zhi,&QWP(0,$Htbl,$nlo)); - &movd ($rem[0],$Zlo); - - for ($cnt=28;$cnt>=-2;$cnt--) { - my $odd = $cnt&1; - my $nix = $odd ? 
$nlo : $nhi; - - &shl (&LB($nlo),4) if ($odd); - &psrlq ($Zlo,4); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &pxor ($Zlo,&QWP(8,$Htbl,$nix)); - &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); - &psllq ($tmp,60); - &and ($nhi,0xf0) if ($odd); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); - &and ($rem[0],0xf); - &pxor ($Zhi,&QWP(0,$Htbl,$nix)); - &mov ($nhi,$nlo) if (!$odd && $cnt>=0); - &movd ($rem[1],$Zlo); - &pxor ($Zlo,$tmp); - - push (@rem,shift(@rem)); # "rotate" registers - } - - &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] - - &psrlq ($Zlo,32); # lower part of Zlo is already there - &movd ($Zhl,$Zhi); - &psrlq ($Zhi,32); - &movd ($Zlh,$Zlo); - &movd ($Zhh,$Zhi); - &shl ($inp,4); # compensate for rem_4bit[i] being >>4 - - &bswap ($Zll); - &bswap ($Zhl); - &bswap ($Zlh); - &xor ($Zhh,$inp); - &bswap ($Zhh); - - &ret (); -} -&function_end_B("_mmx_gmult_4bit_inner"); - -&function_begin("gcm_gmult_4bit_mmx"); - &mov ($inp,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable - - &call (&label("pic_point")); - &set_label("pic_point"); - &blindpop("eax"); - &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); - - &movz ($Zll,&BP(15,$inp)); - - &call ("_mmx_gmult_4bit_inner"); - - &mov ($inp,&wparam(0)); # load Xi - &emms (); - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(0,$inp),$Zhh); -&function_end("gcm_gmult_4bit_mmx"); - -# Streamed version performs 20% better on P4, 7% on Opteron, -# 10% on Core2 and PIII... -&function_begin("gcm_ghash_4bit_mmx"); - &mov ($Zhh,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable - &mov ($inp,&wparam(2)); # load in - &mov ($Zlh,&wparam(3)); # load len - - &call (&label("pic_point")); - &set_label("pic_point"); - &blindpop("eax"); - &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); - - &add ($Zlh,$inp); - &mov (&wparam(3),$Zlh); # len to point at the end of input - &stack_push(4+1); # +1 for stack alignment - - &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] - &mov ($Zhl,&DWP(4,$Zhh)); - &mov ($Zlh,&DWP(8,$Zhh)); - &mov ($Zhh,&DWP(0,$Zhh)); - &jmp (&label("mmx_outer_loop")); - - &set_label("mmx_outer_loop",16); - &xor ($Zll,&DWP(12,$inp)); - &xor ($Zhl,&DWP(4,$inp)); - &xor ($Zlh,&DWP(8,$inp)); - &xor ($Zhh,&DWP(0,$inp)); - &mov (&wparam(2),$inp); - &mov (&DWP(12,"esp"),$Zll); - &mov (&DWP(4,"esp"),$Zhl); - &mov (&DWP(8,"esp"),$Zlh); - &mov (&DWP(0,"esp"),$Zhh); - - &mov ($inp,"esp"); - &shr ($Zll,24); - - &call ("_mmx_gmult_4bit_inner"); - - &mov ($inp,&wparam(2)); - &lea ($inp,&DWP(16,$inp)); - &cmp ($inp,&wparam(3)); - &jb (&label("mmx_outer_loop")); - - &mov ($inp,&wparam(0)); # load Xi - &emms (); - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(0,$inp),$Zhh); - - &stack_pop(4+1); -&function_end("gcm_ghash_4bit_mmx"); - -}} else {{ # "June" MMX version... - # ... has slower "April" gcm_gmult_4bit_mmx with folded - # loop. This is done to conserve code size... -$S=16; # shift factor for rem_4bit - -sub mmx_loop() { -# MMX version performs 2.8 times better on P4 (see comment in non-MMX -# routine for further details), 40% better on Opteron and Core2, 50% -# better on PIII... In other words effort is considered to be well -# spent... 
- my $inp = shift; - my $rem_4bit = shift; - my $cnt = $Zhh; - my $nhi = $Zhl; - my $nlo = $Zlh; - my $rem = $Zll; - - my ($Zlo,$Zhi) = ("mm0","mm1"); - my $tmp = "mm2"; - - &xor ($nlo,$nlo); # avoid partial register stalls on PIII - &mov ($nhi,$Zll); - &mov (&LB($nlo),&LB($nhi)); - &mov ($cnt,14); - &shl (&LB($nlo),4); - &and ($nhi,0xf0); - &movq ($Zlo,&QWP(8,$Htbl,$nlo)); - &movq ($Zhi,&QWP(0,$Htbl,$nlo)); - &movd ($rem,$Zlo); - &jmp (&label("mmx_loop")); - - &set_label("mmx_loop",16); - &psrlq ($Zlo,4); - &and ($rem,0xf); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); - &mov (&LB($nlo),&BP(0,$inp,$cnt)); - &psllq ($tmp,60); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); - &dec ($cnt); - &movd ($rem,$Zlo); - &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); - &mov ($nhi,$nlo); - &pxor ($Zlo,$tmp); - &js (&label("mmx_break")); - - &shl (&LB($nlo),4); - &and ($rem,0xf); - &psrlq ($Zlo,4); - &and ($nhi,0xf0); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); - &psllq ($tmp,60); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); - &movd ($rem,$Zlo); - &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); - &pxor ($Zlo,$tmp); - &jmp (&label("mmx_loop")); - - &set_label("mmx_break",16); - &shl (&LB($nlo),4); - &and ($rem,0xf); - &psrlq ($Zlo,4); - &and ($nhi,0xf0); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); - &psllq ($tmp,60); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); - &movd ($rem,$Zlo); - &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); - &pxor ($Zlo,$tmp); - - &psrlq ($Zlo,4); - &and ($rem,0xf); - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); - &psllq ($tmp,60); - &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); - &movd ($rem,$Zlo); - &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); - &pxor ($Zlo,$tmp); - - &psrlq ($Zlo,32); # lower part of Zlo is already there - &movd ($Zhl,$Zhi); - &psrlq ($Zhi,32); - &movd ($Zlh,$Zlo); - &movd ($Zhh,$Zhi); - - &bswap ($Zll); - &bswap ($Zhl); - &bswap ($Zlh); - &bswap ($Zhh); -} - -&function_begin("gcm_gmult_4bit_mmx"); - &mov ($inp,&wparam(0)); # load Xi - &mov ($Htbl,&wparam(1)); # load Htable - - &call (&label("pic_point")); - &set_label("pic_point"); - &blindpop("eax"); - &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); - - &movz ($Zll,&BP(15,$inp)); - - &mmx_loop($inp,"eax"); - - &emms (); - &mov (&DWP(12,$inp),$Zll); - &mov (&DWP(4,$inp),$Zhl); - &mov (&DWP(8,$inp),$Zlh); - &mov (&DWP(0,$inp),$Zhh); -&function_end("gcm_gmult_4bit_mmx"); - -###################################################################### -# Below subroutine is "528B" variant of "4-bit" GCM GHASH function -# (see gcm128.c for details). It provides further 20-40% performance -# improvement over above mentioned "May" version. - -&static_label("rem_8bit"); - -&function_begin("gcm_ghash_4bit_mmx"); -{ my ($Zlo,$Zhi) = ("mm7","mm6"); - my $rem_8bit = "esi"; - my $Htbl = "ebx"; - - # parameter block - &mov ("eax",&wparam(0)); # Xi - &mov ("ebx",&wparam(1)); # Htable - &mov ("ecx",&wparam(2)); # inp - &mov ("edx",&wparam(3)); # len - &mov ("ebp","esp"); # original %esp - &call (&label("pic_point")); - &set_label ("pic_point"); - &blindpop ($rem_8bit); - &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit)); - - &sub ("esp",512+16+16); # allocate stack frame... 
- &and ("esp",-64); # ...and align it - &sub ("esp",16); # place for (u8)(H[]<<4) - - &add ("edx","ecx"); # pointer to the end of input - &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi - &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len - &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp - - { my @lo = ("mm0","mm1","mm2"); - my @hi = ("mm3","mm4","mm5"); - my @tmp = ("mm6","mm7"); - my ($off1,$off2,$i) = (0,0,); - - &add ($Htbl,128); # optimize for size - &lea ("edi",&DWP(16+128,"esp")); - &lea ("ebp",&DWP(16+256+128,"esp")); - - # decompose Htable (low and high parts are kept separately), - # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... - for ($i=0;$i<18;$i++) { - - &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); - &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); - &psllq ($tmp[1],60) if ($i>1); - &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); - &por ($lo[2],$tmp[1]) if ($i>1); - &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); - &psrlq ($lo[1],4) if ($i>0 && $i<17); - &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); - &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); - &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); - &psrlq ($hi[1],4) if ($i>0 && $i<17); - &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); - &shl ("edx",4) if ($i<16); - &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); - - unshift (@lo,pop(@lo)); # "rotate" registers - unshift (@hi,pop(@hi)); - unshift (@tmp,pop(@tmp)); - $off1 += 8 if ($i>0); - $off2 += 8 if ($i>1); - } - } - - &movq ($Zhi,&QWP(0,"eax")); - &mov ("ebx",&DWP(8,"eax")); - &mov ("edx",&DWP(12,"eax")); # load Xi - -&set_label("outer",16); - { my $nlo = "eax"; - my $dat = "edx"; - my @nhi = ("edi","ebp"); - my @rem = ("ebx","ecx"); - my @red = ("mm0","mm1","mm2"); - my $tmp = "mm3"; - - &xor ($dat,&DWP(12,"ecx")); # merge input data - &xor ("ebx",&DWP(8,"ecx")); - &pxor ($Zhi,&QWP(0,"ecx")); - &lea ("ecx",&DWP(16,"ecx")); # inp+=16 - #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi - &mov (&DWP(528+8,"esp"),"ebx"); - &movq (&QWP(528+0,"esp"),$Zhi); - &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp - - &xor ($nlo,$nlo); - &rol ($dat,8); - &mov (&LB($nlo),&LB($dat)); - &mov ($nhi[1],$nlo); - &and (&LB($nlo),0x0f); - &shr ($nhi[1],4); - &pxor ($red[0],$red[0]); - &rol ($dat,8); # next byte - &pxor ($red[1],$red[1]); - &pxor ($red[2],$red[2]); - - # Just like in "May" verson modulo-schedule for critical path in - # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' - # is scheduled so late that rem_8bit[] has to be shifted *right* - # by 16, which is why last argument to pinsrw is 2, which - # corresponds to <<32=<<48>>16... 
- for ($j=11,$i=0;$i<15;$i++) { - - if ($i>0) { - &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] - &rol ($dat,8); # next byte - &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); - - &pxor ($Zlo,$tmp); - &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); - &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) - } else { - &movq ($Zlo,&QWP(16,"esp",$nlo,8)); - &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); - } - - &mov (&LB($nlo),&LB($dat)); - &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0); - - &movd ($rem[0],$Zlo); - &movz ($rem[1],&LB($rem[1])) if ($i>0); - &psrlq ($Zlo,8); # Z>>=8 - - &movq ($tmp,$Zhi); - &mov ($nhi[0],$nlo); - &psrlq ($Zhi,8); - - &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 - &and (&LB($nlo),0x0f); - &psllq ($tmp,56); - - &pxor ($Zhi,$red[1]) if ($i>1); - &shr ($nhi[0],4); - &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); - - unshift (@red,pop(@red)); # "rotate" registers - unshift (@rem,pop(@rem)); - unshift (@nhi,pop(@nhi)); - } - - &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] - &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); - &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) - - &pxor ($Zlo,$tmp); - &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); - &movz ($rem[1],&LB($rem[1])); - - &pxor ($red[2],$red[2]); # clear 2nd word - &psllq ($red[1],4); - - &movd ($rem[0],$Zlo); - &psrlq ($Zlo,4); # Z>>=4 - - &movq ($tmp,$Zhi); - &psrlq ($Zhi,4); - &shl ($rem[0],4); # rem<<4 - - &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] - &psllq ($tmp,60); - &movz ($rem[0],&LB($rem[0])); - - &pxor ($Zlo,$tmp); - &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); - - &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); - &pxor ($Zhi,$red[1]); - - &movd ($dat,$Zlo); - &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 - - &psllq ($red[0],12); # correct by <<16>>4 - &pxor ($Zhi,$red[0]); - &psrlq ($Zlo,32); - &pxor ($Zhi,$red[2]); - - &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp - &movd ("ebx",$Zlo); - &movq ($tmp,$Zhi); # 01234567 - &psllw ($Zhi,8); # 1.3.5.7. - &psrlw ($tmp,8); # .0.2.4.6 - &por ($Zhi,$tmp); # 10325476 - &bswap ($dat); - &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 - &bswap ("ebx"); - - &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? - &jne (&label("outer")); - } - - &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi - &mov (&DWP(12,"eax"),"edx"); - &mov (&DWP(8,"eax"),"ebx"); - &movq (&QWP(0,"eax"),$Zhi); - - &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp - &emms (); -} -&function_end("gcm_ghash_4bit_mmx"); -}} - -if ($sse2) {{ -###################################################################### -# PCLMULQDQ version. - -$Xip="eax"; -$Htbl="edx"; -$const="ecx"; -$inp="esi"; -$len="ebx"; - -($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; -($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); -($Xn,$Xhn)=("xmm6","xmm7"); - -&static_label("bswap"); - -sub clmul64x64_T2 { # minimal "register" pressure -my ($Xhi,$Xi,$Hkey,$HK)=@_; - - &movdqa ($Xhi,$Xi); # - &pshufd ($T1,$Xi,0b01001110); - &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK)); - &pxor ($T1,$Xi); # - &pxor ($T2,$Hkey) if (!defined($HK)); - $HK=$T2 if (!defined($HK)); - - &pclmulqdq ($Xi,$Hkey,0x00); ####### - &pclmulqdq ($Xhi,$Hkey,0x11); ####### - &pclmulqdq ($T1,$HK,0x00); ####### - &xorps ($T1,$Xi); # - &xorps ($T1,$Xhi); # - - &movdqa ($T2,$T1); # - &psrldq ($T1,8); - &pslldq ($T2,8); # - &pxor ($Xhi,$T1); - &pxor ($Xi,$T2); # -} - -sub clmul64x64_T3 { -# Even though this subroutine offers visually better ILP, it -# was empirically found to be a tad slower than above version. 
-# At least in gcm_ghash_clmul context. But it's just as well, -# because loop modulo-scheduling is possible only thanks to -# minimized "register" pressure... -my ($Xhi,$Xi,$Hkey)=@_; - - &movdqa ($T1,$Xi); # - &movdqa ($Xhi,$Xi); - &pclmulqdq ($Xi,$Hkey,0x00); ####### - &pclmulqdq ($Xhi,$Hkey,0x11); ####### - &pshufd ($T2,$T1,0b01001110); # - &pshufd ($T3,$Hkey,0b01001110); - &pxor ($T2,$T1); # - &pxor ($T3,$Hkey); - &pclmulqdq ($T2,$T3,0x00); ####### - &pxor ($T2,$Xi); # - &pxor ($T2,$Xhi); # - - &movdqa ($T3,$T2); # - &psrldq ($T2,8); - &pslldq ($T3,8); # - &pxor ($Xhi,$T2); - &pxor ($Xi,$T3); # -} - -if (1) { # Algorithm 9 with <<1 twist. - # Reduction is shorter and uses only two - # temporary registers, which makes it better - # candidate for interleaving with 64x64 - # multiplication. Pre-modulo-scheduled loop - # was found to be ~20% faster than Algorithm 5 - # below. Algorithm 9 was therefore chosen for - # further optimization... - -sub reduction_alg9 { # 17/11 times faster than Intel version -my ($Xhi,$Xi) = @_; - - # 1st phase - &movdqa ($T2,$Xi); # - &movdqa ($T1,$Xi); - &psllq ($Xi,5); - &pxor ($T1,$Xi); # - &psllq ($Xi,1); - &pxor ($Xi,$T1); # - &psllq ($Xi,57); # - &movdqa ($T1,$Xi); # - &pslldq ($Xi,8); - &psrldq ($T1,8); # - &pxor ($Xi,$T2); - &pxor ($Xhi,$T1); # - - # 2nd phase - &movdqa ($T2,$Xi); - &psrlq ($Xi,1); - &pxor ($Xhi,$T2); # - &pxor ($T2,$Xi); - &psrlq ($Xi,5); - &pxor ($Xi,$T2); # - &psrlq ($Xi,1); # - &pxor ($Xi,$Xhi) # -} - -&function_begin_B("gcm_init_clmul"); - &mov ($Htbl,&wparam(0)); - &mov ($Xip,&wparam(1)); - - &call (&label("pic")); -&set_label("pic"); - &blindpop ($const); - &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); - - &movdqu ($Hkey,&QWP(0,$Xip)); - &pshufd ($Hkey,$Hkey,0b01001110);# dword swap - - # <<1 twist - &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword - &movdqa ($T1,$Hkey); - &psllq ($Hkey,1); - &pxor ($T3,$T3); # - &psrlq ($T1,63); - &pcmpgtd ($T3,$T2); # broadcast carry bit - &pslldq ($T1,8); - &por ($Hkey,$T1); # H<<=1 - - # magic reduction - &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial - &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial - - # calculate H^2 - &movdqa ($Xi,$Hkey); - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); - &reduction_alg9 ($Xhi,$Xi); - - &pshufd ($T1,$Hkey,0b01001110); - &pshufd ($T2,$Xi,0b01001110); - &pxor ($T1,$Hkey); # Karatsuba pre-processing - &movdqu (&QWP(0,$Htbl),$Hkey); # save H - &pxor ($T2,$Xi); # Karatsuba pre-processing - &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 - &palignr ($T2,$T1,8); # low part is H.lo^H.hi - &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt" - - &ret (); -&function_end_B("gcm_init_clmul"); - -&function_begin_B("gcm_gmult_clmul"); - &mov ($Xip,&wparam(0)); - &mov ($Htbl,&wparam(1)); - - &call (&label("pic")); -&set_label("pic"); - &blindpop ($const); - &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); - - &movdqu ($Xi,&QWP(0,$Xip)); - &movdqa ($T3,&QWP(0,$const)); - &movups ($Hkey,&QWP(0,$Htbl)); - &pshufb ($Xi,$T3); - &movups ($T2,&QWP(32,$Htbl)); - - &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); - &reduction_alg9 ($Xhi,$Xi); - - &pshufb ($Xi,$T3); - &movdqu (&QWP(0,$Xip),$Xi); - - &ret (); -&function_end_B("gcm_gmult_clmul"); - -&function_begin("gcm_ghash_clmul"); - &mov ($Xip,&wparam(0)); - &mov ($Htbl,&wparam(1)); - &mov ($inp,&wparam(2)); - &mov ($len,&wparam(3)); - - &call (&label("pic")); -&set_label("pic"); - &blindpop ($const); - &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); - - &movdqu ($Xi,&QWP(0,$Xip)); - &movdqa 
($T3,&QWP(0,$const)); - &movdqu ($Hkey,&QWP(0,$Htbl)); - &pshufb ($Xi,$T3); - - &sub ($len,0x10); - &jz (&label("odd_tail")); - - ####### - # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = - # [(H*Ii+1) + (H*Xi+1)] mod P = - # [(H*Ii+1) + H^2*(Ii+Xi)] mod P - # - &movdqu ($T1,&QWP(0,$inp)); # Ii - &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 - &pshufb ($T1,$T3); - &pshufb ($Xn,$T3); - &movdqu ($T3,&QWP(32,$Htbl)); - &pxor ($Xi,$T1); # Ii+Xi - - &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 - &movdqa ($Xhn,$Xn); - &pxor ($T1,$Xn); # - - &pclmulqdq ($Xn,$Hkey,0x00); ####### - &pclmulqdq ($Xhn,$Hkey,0x11); ####### - &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 - &pclmulqdq ($T1,$T3,0x00); ####### - - &lea ($inp,&DWP(32,$inp)); # i+=2 - &sub ($len,0x20); - &jbe (&label("even_tail")); - &jmp (&label("mod_loop")); - -&set_label("mod_loop",32); - &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) - &movdqa ($Xhi,$Xi); - &pxor ($T2,$Xi); # - - &pclmulqdq ($Xi,$Hkey,0x00); ####### - &pclmulqdq ($Xhi,$Hkey,0x11); ####### - &movups ($Hkey,&QWP(0,$Htbl)); # load H - &pclmulqdq ($T2,$T3,0x10); ####### - &movdqa ($T3,&QWP(0,$const)); - - &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) - &xorps ($Xhi,$Xhn); - &movdqu ($Xhn,&QWP(0,$inp)); # Ii - &pxor ($T1,$Xi); # aggregated Karatsuba post-processing - &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 - &pxor ($T1,$Xhi); # - - &pxor ($T2,$T1); # - &pshufb ($Xhn,$T3); - - &movdqa ($T1,$T2); # - &psrldq ($T2,8); - &pslldq ($T1,8); # - &pxor ($Xhi,$T2); - &pxor ($Xi,$T1); # - &pshufb ($Xn,$T3); - &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early - - &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 - &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase - &movdqa ($T1,$Xi); - &psllq ($Xi,5); - &pxor ($T1,$Xi); # - &psllq ($Xi,1); - &pxor ($Xi,$T1); # - &movups ($T3,&QWP(32,$Htbl)); - &pclmulqdq ($Xn,$Hkey,0x00); ####### - &psllq ($Xi,57); # - &movdqa ($T1,$Xi); # - &pslldq ($Xi,8); - &psrldq ($T1,8); # - &pxor ($Xi,$T2); - &pxor ($Xhi,$T1); # - &pshufd ($T1,$Xhn,0b01001110); - &movdqa ($T2,$Xi); # 2nd phase - &psrlq ($Xi,1); - &pxor ($T1,$Xhn); - &pclmulqdq ($Xhn,$Hkey,0x11); ####### - &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 - &pxor ($Xhi,$T2); # - &pxor ($T2,$Xi); - &psrlq ($Xi,5); - &pxor ($Xi,$T2); # - &psrlq ($Xi,1); # - &pxor ($Xi,$Xhi); # - &pclmulqdq ($T1,$T3,0x00); ####### - - &lea ($inp,&DWP(32,$inp)); - &sub ($len,0x20); - &ja (&label("mod_loop")); - -&set_label("even_tail"); - &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) - &movdqa ($Xhi,$Xi); - &pxor ($T2,$Xi); # - - &pclmulqdq ($Xi,$Hkey,0x00); ####### - &pclmulqdq ($Xhi,$Hkey,0x11); ####### - &pclmulqdq ($T2,$T3,0x10); ####### - &movdqa ($T3,&QWP(0,$const)); - - &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) - &xorps ($Xhi,$Xhn); - &pxor ($T1,$Xi); # aggregated Karatsuba post-processing - &pxor ($T1,$Xhi); # - - &pxor ($T2,$T1); # - - &movdqa ($T1,$T2); # - &psrldq ($T2,8); - &pslldq ($T1,8); # - &pxor ($Xhi,$T2); - &pxor ($Xi,$T1); # - - &reduction_alg9 ($Xhi,$Xi); - - &test ($len,$len); - &jnz (&label("done")); - - &movups ($Hkey,&QWP(0,$Htbl)); # load H -&set_label("odd_tail"); - &movdqu ($T1,&QWP(0,$inp)); # Ii - &pshufb ($T1,$T3); - &pxor ($Xi,$T1); # Ii+Xi - - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) - &reduction_alg9 ($Xhi,$Xi); - -&set_label("done"); - &pshufb ($Xi,$T3); - &movdqu (&QWP(0,$Xip),$Xi); -&function_end("gcm_ghash_clmul"); - -} else { # Algorithm 5. Kept for reference purposes. 
- -sub reduction_alg5 { # 19/16 times faster than Intel version -my ($Xhi,$Xi)=@_; - - # <<1 - &movdqa ($T1,$Xi); # - &movdqa ($T2,$Xhi); - &pslld ($Xi,1); - &pslld ($Xhi,1); # - &psrld ($T1,31); - &psrld ($T2,31); # - &movdqa ($T3,$T1); - &pslldq ($T1,4); - &psrldq ($T3,12); # - &pslldq ($T2,4); - &por ($Xhi,$T3); # - &por ($Xi,$T1); - &por ($Xhi,$T2); # - - # 1st phase - &movdqa ($T1,$Xi); - &movdqa ($T2,$Xi); - &movdqa ($T3,$Xi); # - &pslld ($T1,31); - &pslld ($T2,30); - &pslld ($Xi,25); # - &pxor ($T1,$T2); - &pxor ($T1,$Xi); # - &movdqa ($T2,$T1); # - &pslldq ($T1,12); - &psrldq ($T2,4); # - &pxor ($T3,$T1); - - # 2nd phase - &pxor ($Xhi,$T3); # - &movdqa ($Xi,$T3); - &movdqa ($T1,$T3); - &psrld ($Xi,1); # - &psrld ($T1,2); - &psrld ($T3,7); # - &pxor ($Xi,$T1); - &pxor ($Xhi,$T2); - &pxor ($Xi,$T3); # - &pxor ($Xi,$Xhi); # -} - -&function_begin_B("gcm_init_clmul"); - &mov ($Htbl,&wparam(0)); - &mov ($Xip,&wparam(1)); - - &call (&label("pic")); -&set_label("pic"); - &blindpop ($const); - &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); - - &movdqu ($Hkey,&QWP(0,$Xip)); - &pshufd ($Hkey,$Hkey,0b01001110);# dword swap - - # calculate H^2 - &movdqa ($Xi,$Hkey); - &clmul64x64_T3 ($Xhi,$Xi,$Hkey); - &reduction_alg5 ($Xhi,$Xi); - - &movdqu (&QWP(0,$Htbl),$Hkey); # save H - &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 - - &ret (); -&function_end_B("gcm_init_clmul"); - -&function_begin_B("gcm_gmult_clmul"); - &mov ($Xip,&wparam(0)); - &mov ($Htbl,&wparam(1)); - - &call (&label("pic")); -&set_label("pic"); - &blindpop ($const); - &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); - - &movdqu ($Xi,&QWP(0,$Xip)); - &movdqa ($Xn,&QWP(0,$const)); - &movdqu ($Hkey,&QWP(0,$Htbl)); - &pshufb ($Xi,$Xn); - - &clmul64x64_T3 ($Xhi,$Xi,$Hkey); - &reduction_alg5 ($Xhi,$Xi); - - &pshufb ($Xi,$Xn); - &movdqu (&QWP(0,$Xip),$Xi); - - &ret (); -&function_end_B("gcm_gmult_clmul"); - -&function_begin("gcm_ghash_clmul"); - &mov ($Xip,&wparam(0)); - &mov ($Htbl,&wparam(1)); - &mov ($inp,&wparam(2)); - &mov ($len,&wparam(3)); - - &call (&label("pic")); -&set_label("pic"); - &blindpop ($const); - &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); - - &movdqu ($Xi,&QWP(0,$Xip)); - &movdqa ($T3,&QWP(0,$const)); - &movdqu ($Hkey,&QWP(0,$Htbl)); - &pshufb ($Xi,$T3); - - &sub ($len,0x10); - &jz (&label("odd_tail")); - - ####### - # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = - # [(H*Ii+1) + (H*Xi+1)] mod P = - # [(H*Ii+1) + H^2*(Ii+Xi)] mod P - # - &movdqu ($T1,&QWP(0,$inp)); # Ii - &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 - &pshufb ($T1,$T3); - &pshufb ($Xn,$T3); - &pxor ($Xi,$T1); # Ii+Xi - - &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 - &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 - - &sub ($len,0x20); - &lea ($inp,&DWP(32,$inp)); # i+=2 - &jbe (&label("even_tail")); - -&set_label("mod_loop"); - &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) - &movdqu ($Hkey,&QWP(0,$Htbl)); # load H - - &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) - &pxor ($Xhi,$Xhn); - - &reduction_alg5 ($Xhi,$Xi); - - ####### - &movdqa ($T3,&QWP(0,$const)); - &movdqu ($T1,&QWP(0,$inp)); # Ii - &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 - &pshufb ($T1,$T3); - &pshufb ($Xn,$T3); - &pxor ($Xi,$T1); # Ii+Xi - - &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 - &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 - - &sub ($len,0x20); - &lea ($inp,&DWP(32,$inp)); - &ja (&label("mod_loop")); - -&set_label("even_tail"); - &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) - - &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) - &pxor ($Xhi,$Xhn); - - &reduction_alg5 ($Xhi,$Xi); - - 
&movdqa ($T3,&QWP(0,$const)); - &test ($len,$len); - &jnz (&label("done")); - - &movdqu ($Hkey,&QWP(0,$Htbl)); # load H -&set_label("odd_tail"); - &movdqu ($T1,&QWP(0,$inp)); # Ii - &pshufb ($T1,$T3); - &pxor ($Xi,$T1); # Ii+Xi - - &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) - &reduction_alg5 ($Xhi,$Xi); - - &movdqa ($T3,&QWP(0,$const)); -&set_label("done"); - &pshufb ($Xi,$T3); - &movdqu (&QWP(0,$Xip),$Xi); -&function_end("gcm_ghash_clmul"); - -} - -&set_label("bswap",64); - &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); - &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial -&set_label("rem_8bit",64); - &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); - &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); - &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); - &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); - &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); - &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); - &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); - &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); - &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); - &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); - &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); - &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); - &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); - &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); - &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); - &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); - &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); - &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); - &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); - &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); - &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); - &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); - &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); - &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); - &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); - &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); - &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); - &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); - &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); - &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); - &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); - &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); -}} # $sse2 - -&set_label("rem_4bit",64); - &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); - &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); - &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); - &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); -}}} # !$x86only - -&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); -&asm_finish(); - -# A question was risen about choice of vanilla MMX. Or rather why wasn't -# SSE2 chosen instead? 
In addition to the fact that MMX runs on legacy
-# CPUs such as PIII, the "4-bit" MMX version was observed to provide better
-# performance than the *corresponding* SSE2 one even on contemporary CPUs.
-# SSE2 results were provided by Peter-Michael Hager. He maintains an SSE2
-# implementation featuring a full range of lookup-table sizes, but with
-# per-invocation lookup-table setup. The latter means that the table size
-# is chosen depending on how much data is to be hashed in each given call:
-# more data - larger table. The best reported result for Core2 is ~4 cycles
-# per processed byte out of a 64KB block. This number even accounts for
-# the 64KB table setup overhead. As discussed in gcm128.c, we choose to be
-# more conservative with respect to lookup-table sizes, but how do the
-# results compare? The minimalistic "256B" MMX version delivers ~11 cycles
-# on the same platform. As also discussed in gcm128.c, the next-in-line
-# "8-bit Shoup's" or "4KB" method should deliver twice the performance of
-# the "256B" one, in other words no worse than ~6 cycles per byte. It
-# should also be noted that in the SSE2 case the improvement can be "super-
-# linear," i.e. more than twice, mostly because >>8 maps to a single
-# instruction on an SSE2 register. This is unlike the "4-bit" case, where
-# >>4 maps to the same number of instructions in both the MMX and SSE2
-# cases. The bottom line is that a switch to SSE2 is considered justifiable
-# only if we choose to implement the "8-bit" method...
+../openssl/./crypto/modes/asm/ghash-x86.pl
\ No newline at end of file
diff --git a/devel/perlasm/ghash-x86_64.pl b/devel/perlasm/ghash-x86_64.pl
index 7904248070..0a2efd6c53 100644..120000
--- a/devel/perlasm/ghash-x86_64.pl
+++ b/devel/perlasm/ghash-x86_64.pl
@@ -1,1736 +1 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March, June 2010
-#
-# The module implements the "4-bit" GCM GHASH function and the
-# underlying single multiplication operation in GF(2^128). "4-bit"
-# means that it uses a 256-byte per-key table [+128 bytes of shared
-# table]. The GHASH function features a so-called "528B" variant
-# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
-# of shared table].
-# Performance results are for this streamed GHASH subroutine and are
-# expressed in cycles per processed byte, less is better:
-#
-#		gcc 3.4.x(*)	assembler
-#
-# P4		28.6		14.0		+100%
-# Opteron	19.3		7.7		+150%
-# Core2		17.8		8.1(**)		+120%
-# Atom		31.6		16.8		+88%
-# VIA Nano	21.8		10.1		+115%
-#
-# (*)	comparison is not completely fair, because C results are
-#	for the vanilla "256B" implementation, while assembler results
-#	are for "528B";-)
-# (**)	it's a mystery [to me] why the Core2 result is not the same as
-#	for Opteron;
-
-# May 2010
-#
-# Add a PCLMULQDQ version performing at 2.02 cycles per processed byte.
-# See ghash-x86.pl for background information and details about coding
-# techniques.
-#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
-
-# December 2012
-#
-# Overhaul: aggregate Karatsuba post-processing, improve ILP in
-# reduction_alg9, increase reduction aggregate factor to 4x. As for
-# the latter: ghash-x86.pl argues that it makes less sense to
-# increase the aggregate factor. Then why increase it here? The
-# critical path consists of 3 independent pclmulqdq instructions,
-# Karatsuba post-processing and reduction. "On top" of this we lay down
-# aggregated multiplication operations, triplets of independent
-# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
-# little sense to aggregate more multiplications than it takes to
-# perform the remaining non-multiplication operations. 2x is a
-# near-optimal coefficient for contemporary Intel CPUs (hence the
-# modest improvement coefficient), but not for Bulldozer. The latter is
-# because its logical SIMD operations are twice as slow as Intel's, so
-# the critical path is longer. A CPU with a higher pclmulqdq issue rate
-# would also benefit from a higher aggregate factor...
-#
-# Westmere	1.76(+14%)
-# Sandy Bridge	1.79(+9%)
-# Ivy Bridge	1.79(+8%)
-# Haswell	0.55(+93%) (if system doesn't support AVX)
-# Bulldozer	1.52(+25%)
-
-# March 2013
-#
-# ... the 8x aggregate factor AVX code path uses the reduction
-# algorithm suggested by Shay Gueron[1]. Even though contemporary
-# AVX-capable CPUs such as Sandy and Ivy Bridge can execute it, the
-# code performs sub-optimally in comparison to the above-mentioned
-# version. But thanks to Ilya Albrekht and Max Locktyukhin of Intel
-# Corp. we know that it performs at 0.41 cycles per byte on a Haswell
-# processor.
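# ---------------------------------------------------------------------
# Editorial aside, not part of the original module: in the "4-bit"
# method, every 4-bit right shift of the accumulator spills four
# polynomial bits past the edge, and a small shared table (.Lrem_4bit
# below) folds them back in. Empirically those constants are just the
# reflected polynomial byte 0xE1 at four successive bit offsets; a
# short sketch that regenerates the table (the offsets are an
# observation checked against the table, not taken from the module):

use strict; use warnings;

for my $n (0 .. 15) {                 # n = the four bits shifted out
    my $v = 0;
    for my $k (0 .. 3) {
        $v ^= 0xE1 << (5 + $k) if ($n >> $k) & 1;
    }
    printf "0x%04X%s", $v, ($n % 8 == 7 ? "\n" : " ");
}
# prints: 0x0000 0x1C20 0x3840 0x2460 0x7080 0x6CA0 0x48C0 0x54E0
#         0xE100 0xFD20 0xD940 0xC560 0x9180 0x8DA0 0xA9C0 0xB5E0
# ---------------------------------------------------------------------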
-# -# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); -} - -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.09) + ($1>=2.10); -} - -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); -} - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -$do4xaggr=1; - -# common register layout -$nlo="%rax"; -$nhi="%rbx"; -$Zlo="%r8"; -$Zhi="%r9"; -$tmp="%r10"; -$rem_4bit = "%r11"; - -$Xi="%rdi"; -$Htbl="%rsi"; - -# per-function register layout -$cnt="%rcx"; -$rem="%rdx"; - -sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or - $r =~ s/%[er]([sd]i)/%\1l/ or - $r =~ s/%[er](bp)/%\1l/ or - $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } - -sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; - my $arg = pop; - $arg = "\$$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; -} - -{ my $N; - sub loop() { - my $inp = shift; - - $N++; -$code.=<<___; - xor $nlo,$nlo - xor $nhi,$nhi - mov `&LB("$Zlo")`,`&LB("$nlo")` - mov `&LB("$Zlo")`,`&LB("$nhi")` - shl \$4,`&LB("$nlo")` - mov \$14,$cnt - mov 8($Htbl,$nlo),$Zlo - mov ($Htbl,$nlo),$Zhi - and \$0xf0,`&LB("$nhi")` - mov $Zlo,$rem - jmp .Loop$N - -.align 16 -.Loop$N: - shr \$4,$Zlo - and \$0xf,$rem - mov $Zhi,$tmp - mov ($inp,$cnt),`&LB("$nlo")` - shr \$4,$Zhi - xor 8($Htbl,$nhi),$Zlo - shl \$60,$tmp - xor ($Htbl,$nhi),$Zhi - mov `&LB("$nlo")`,`&LB("$nhi")` - xor ($rem_4bit,$rem,8),$Zhi - mov $Zlo,$rem - shl \$4,`&LB("$nlo")` - xor $tmp,$Zlo - dec $cnt - js .Lbreak$N - - shr \$4,$Zlo - and \$0xf,$rem - mov $Zhi,$tmp - shr \$4,$Zhi - xor 8($Htbl,$nlo),$Zlo - shl \$60,$tmp - xor ($Htbl,$nlo),$Zhi - and \$0xf0,`&LB("$nhi")` - xor ($rem_4bit,$rem,8),$Zhi - mov $Zlo,$rem - xor $tmp,$Zlo - jmp .Loop$N - -.align 16 -.Lbreak$N: - shr \$4,$Zlo - and \$0xf,$rem - mov $Zhi,$tmp - shr \$4,$Zhi - xor 8($Htbl,$nlo),$Zlo - shl \$60,$tmp - xor ($Htbl,$nlo),$Zhi - and \$0xf0,`&LB("$nhi")` - xor ($rem_4bit,$rem,8),$Zhi - mov $Zlo,$rem - xor $tmp,$Zlo - - shr \$4,$Zlo - and \$0xf,$rem - mov $Zhi,$tmp - shr \$4,$Zhi - xor 8($Htbl,$nhi),$Zlo - shl \$60,$tmp - xor ($Htbl,$nhi),$Zhi - xor $tmp,$Zlo - xor ($rem_4bit,$rem,8),$Zhi - - bswap $Zlo - bswap $Zhi -___ -}} - -$code=<<___; -.text - -.globl gcm_gmult_4bit -.type gcm_gmult_4bit,\@function,2 -.align 16 -gcm_gmult_4bit: - push %rbx - push %rbp # %rbp and %r12 are pushed exclusively in - push %r12 # order to reuse Win64 exception handler... 
-.Lgmult_prologue: - - movzb 15($Xi),$Zlo - lea .Lrem_4bit(%rip),$rem_4bit -___ - &loop ($Xi); -$code.=<<___; - mov $Zlo,8($Xi) - mov $Zhi,($Xi) - - mov 16(%rsp),%rbx - lea 24(%rsp),%rsp -.Lgmult_epilogue: - ret -.size gcm_gmult_4bit,.-gcm_gmult_4bit -___ - -# per-function register layout -$inp="%rdx"; -$len="%rcx"; -$rem_8bit=$rem_4bit; - -$code.=<<___; -.globl gcm_ghash_4bit -.type gcm_ghash_4bit,\@function,4 -.align 16 -gcm_ghash_4bit: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - sub \$280,%rsp -.Lghash_prologue: - mov $inp,%r14 # reassign couple of args - mov $len,%r15 -___ -{ my $inp="%r14"; - my $dat="%edx"; - my $len="%r15"; - my @nhi=("%ebx","%ecx"); - my @rem=("%r12","%r13"); - my $Hshr4="%rbp"; - - &sub ($Htbl,-128); # size optimization - &lea ($Hshr4,"16+128(%rsp)"); - { my @lo =($nlo,$nhi); - my @hi =($Zlo,$Zhi); - - &xor ($dat,$dat); - for ($i=0,$j=-2;$i<18;$i++,$j++) { - &mov ("$j(%rsp)",&LB($dat)) if ($i>1); - &or ($lo[0],$tmp) if ($i>1); - &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); - &shr ($lo[1],4) if ($i>0 && $i<17); - &mov ($tmp,$hi[1]) if ($i>0 && $i<17); - &shr ($hi[1],4) if ($i>0 && $i<17); - &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); - &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); - &shl (&LB($dat),4) if ($i>0 && $i<17); - &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); - &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); - &shl ($tmp,60) if ($i>0 && $i<17); - - push (@lo,shift(@lo)); - push (@hi,shift(@hi)); - } - } - &add ($Htbl,-128); - &mov ($Zlo,"8($Xi)"); - &mov ($Zhi,"0($Xi)"); - &add ($len,$inp); # pointer to the end of data - &lea ($rem_8bit,".Lrem_8bit(%rip)"); - &jmp (".Louter_loop"); - -$code.=".align 16\n.Louter_loop:\n"; - &xor ($Zhi,"($inp)"); - &mov ("%rdx","8($inp)"); - &lea ($inp,"16($inp)"); - &xor ("%rdx",$Zlo); - &mov ("($Xi)",$Zhi); - &mov ("8($Xi)","%rdx"); - &shr ("%rdx",32); - - &xor ($nlo,$nlo); - &rol ($dat,8); - &mov (&LB($nlo),&LB($dat)); - &movz ($nhi[0],&LB($dat)); - &shl (&LB($nlo),4); - &shr ($nhi[0],4); - - for ($j=11,$i=0;$i<15;$i++) { - &rol ($dat,8); - &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); - &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); - &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); - &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); - - &mov (&LB($nlo),&LB($dat)); - &xor ($Zlo,$tmp) if ($i>0); - &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); - - &movz ($nhi[1],&LB($dat)); - &shl (&LB($nlo),4); - &movzb ($rem[0],"(%rsp,$nhi[0])"); - - &shr ($nhi[1],4) if ($i<14); - &and ($nhi[1],0xf0) if ($i==14); - &shl ($rem[1],48) if ($i>0); - &xor ($rem[0],$Zlo); - - &mov ($tmp,$Zhi); - &xor ($Zhi,$rem[1]) if ($i>0); - &shr ($Zlo,8); - - &movz ($rem[0],&LB($rem[0])); - &mov ($dat,"$j($Xi)") if (--$j%4==0); - &shr ($Zhi,8); - - &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); - &shl ($tmp,56); - &xor ($Zhi,"($Hshr4,$nhi[0],8)"); - - unshift (@nhi,pop(@nhi)); # "rotate" registers - unshift (@rem,pop(@rem)); - } - &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); - &xor ($Zlo,"8($Htbl,$nlo)"); - &xor ($Zhi,"($Htbl,$nlo)"); - - &shl ($rem[1],48); - &xor ($Zlo,$tmp); - - &xor ($Zhi,$rem[1]); - &movz ($rem[0],&LB($Zlo)); - &shr ($Zlo,4); - - &mov ($tmp,$Zhi); - &shl (&LB($rem[0]),4); - &shr ($Zhi,4); - - &xor ($Zlo,"8($Htbl,$nhi[0])"); - &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); - &shl ($tmp,60); - - &xor ($Zhi,"($Htbl,$nhi[0])"); - &xor ($Zlo,$tmp); - &shl ($rem[0],48); - - &bswap ($Zlo); - &xor ($Zhi,$rem[0]); - - &bswap ($Zhi); - &cmp ($inp,$len); - &jb (".Louter_loop"); -} -$code.=<<___; - mov $Zlo,8($Xi) - mov $Zhi,($Xi) - - lea 280(%rsp),%rsi - mov 
0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lghash_epilogue: - ret -.size gcm_ghash_4bit,.-gcm_ghash_4bit -___ - -###################################################################### -# PCLMULQDQ version. - -@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order - -($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; -($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); - -sub clmul64x64_T2 { # minimal register pressure -my ($Xhi,$Xi,$Hkey,$HK)=@_; - -if (!defined($HK)) { $HK = $T2; -$code.=<<___; - movdqa $Xi,$Xhi # - pshufd \$0b01001110,$Xi,$T1 - pshufd \$0b01001110,$Hkey,$T2 - pxor $Xi,$T1 # - pxor $Hkey,$T2 -___ -} else { -$code.=<<___; - movdqa $Xi,$Xhi # - pshufd \$0b01001110,$Xi,$T1 - pxor $Xi,$T1 # -___ -} -$code.=<<___; - pclmulqdq \$0x00,$Hkey,$Xi ####### - pclmulqdq \$0x11,$Hkey,$Xhi ####### - pclmulqdq \$0x00,$HK,$T1 ####### - pxor $Xi,$T1 # - pxor $Xhi,$T1 # - - movdqa $T1,$T2 # - psrldq \$8,$T1 - pslldq \$8,$T2 # - pxor $T1,$Xhi - pxor $T2,$Xi # -___ -} - -sub reduction_alg9 { # 17/11 times faster than Intel version -my ($Xhi,$Xi) = @_; - -$code.=<<___; - # 1st phase - movdqa $Xi,$T2 # - movdqa $Xi,$T1 - psllq \$5,$Xi - pxor $Xi,$T1 # - psllq \$1,$Xi - pxor $T1,$Xi # - psllq \$57,$Xi # - movdqa $Xi,$T1 # - pslldq \$8,$Xi - psrldq \$8,$T1 # - pxor $T2,$Xi - pxor $T1,$Xhi # - - # 2nd phase - movdqa $Xi,$T2 - psrlq \$1,$Xi - pxor $T2,$Xhi # - pxor $Xi,$T2 - psrlq \$5,$Xi - pxor $T2,$Xi # - psrlq \$1,$Xi # - pxor $Xhi,$Xi # -___ -} - -{ my ($Htbl,$Xip)=@_4args; - my $HK="%xmm6"; - -$code.=<<___; -.globl gcm_init_clmul -.type gcm_init_clmul,\@abi-omnipotent -.align 16 -gcm_init_clmul: -.L_init_clmul: -___ -$code.=<<___ if ($win64); -.LSEH_begin_gcm_init_clmul: - # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) -___ -$code.=<<___; - movdqu ($Xip),$Hkey - pshufd \$0b01001110,$Hkey,$Hkey # dword swap - - # <<1 twist - pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword - movdqa $Hkey,$T1 - psllq \$1,$Hkey - pxor $T3,$T3 # - psrlq \$63,$T1 - pcmpgtd $T2,$T3 # broadcast carry bit - pslldq \$8,$T1 - por $T1,$Hkey # H<<=1 - - # magic reduction - pand .L0x1c2_polynomial(%rip),$T3 - pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial - - # calculate H^2 - pshufd \$0b01001110,$Hkey,$HK - movdqa $Hkey,$Xi - pxor $Hkey,$HK -___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); - &reduction_alg9 ($Xhi,$Xi); -$code.=<<___; - pshufd \$0b01001110,$Hkey,$T1 - pshufd \$0b01001110,$Xi,$T2 - pxor $Hkey,$T1 # Karatsuba pre-processing - movdqu $Hkey,0x00($Htbl) # save H - pxor $Xi,$T2 # Karatsuba pre-processing - movdqu $Xi,0x10($Htbl) # save H^2 - palignr \$8,$T1,$T2 # low part is H.lo^H.hi... - movdqu $T2,0x20($Htbl) # save Karatsuba "salt" -___ -if ($do4xaggr) { - &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 - &reduction_alg9 ($Xhi,$Xi); -$code.=<<___; - movdqa $Xi,$T3 -___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 - &reduction_alg9 ($Xhi,$Xi); -$code.=<<___; - pshufd \$0b01001110,$T3,$T1 - pshufd \$0b01001110,$Xi,$T2 - pxor $T3,$T1 # Karatsuba pre-processing - movdqu $T3,0x30($Htbl) # save H^3 - pxor $Xi,$T2 # Karatsuba pre-processing - movdqu $Xi,0x40($Htbl) # save H^4 - palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... 
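	# (editorial note, not in the original: these "salt" rows cache
	#  H^n.lo^H^n.hi so that the middle Karatsuba product
	#  (x.lo^x.hi)*(H^n.lo^H^n.hi) can skip a per-block pxor at
	#  ghash time)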
- movdqu $T2,0x50($Htbl) # save Karatsuba "salt" -___ -} -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - lea 0x18(%rsp),%rsp -.LSEH_end_gcm_init_clmul: -___ -$code.=<<___; - ret -.size gcm_init_clmul,.-gcm_init_clmul -___ -} - -{ my ($Xip,$Htbl)=@_4args; - -$code.=<<___; -.globl gcm_gmult_clmul -.type gcm_gmult_clmul,\@abi-omnipotent -.align 16 -gcm_gmult_clmul: -.L_gmult_clmul: - movdqu ($Xip),$Xi - movdqa .Lbswap_mask(%rip),$T3 - movdqu ($Htbl),$Hkey - movdqu 0x20($Htbl),$T2 - pshufb $T3,$Xi -___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); -$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); - # experimental alternative. special thing about is that there - # no dependency between the two multiplications... - mov \$`0xE1<<1`,%eax - mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff - mov \$0x07,%r11d - movq %rax,$T1 - movq %r10,$T2 - movq %r11,$T3 # borrow $T3 - pand $Xi,$T3 - pshufb $T3,$T2 # ($Xi&7)·0xE0 - movq %rax,$T3 - pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) - pxor $Xi,$T2 - pslldq \$15,$T2 - paddd $T2,$T2 # <<(64+56+1) - pxor $T2,$Xi - pclmulqdq \$0x01,$T3,$Xi - movdqa .Lbswap_mask(%rip),$T3 # reload $T3 - psrldq \$1,$T1 - pxor $T1,$Xhi - pslldq \$7,$Xi - pxor $Xhi,$Xi -___ -$code.=<<___; - pshufb $T3,$Xi - movdqu $Xi,($Xip) - ret -.size gcm_gmult_clmul,.-gcm_gmult_clmul -___ -} - -{ my ($Xip,$Htbl,$inp,$len)=@_4args; - my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10)); - -$code.=<<___; -.globl gcm_ghash_clmul -.type gcm_ghash_clmul,\@abi-omnipotent -.align 32 -gcm_ghash_clmul: -.L_ghash_clmul: -___ -$code.=<<___ if ($win64); - lea -0x88(%rsp),%rax -.LSEH_begin_gcm_ghash_clmul: - # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp - .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) - .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) - .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) - .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) - .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) - .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) - .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) - .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) - .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) - .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) -___ -$code.=<<___; - movdqa .Lbswap_mask(%rip),$T3 - mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff - - movdqu ($Xip),$Xi - movdqu ($Htbl),$Hkey - movdqu 0x20($Htbl),$HK - pshufb $T3,$Xi - - sub \$0x10,$len - jz .Lodd_tail - - movdqu 0x10($Htbl),$Hkey2 -___ -if ($do4xaggr) { -my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); - -$code.=<<___; - cmp \$0x30,$len - jb .Lskip4x - - sub \$0x30,$len - movdqu 0x30($Htbl),$Hkey3 - movdqu 0x40($Htbl),$Hkey4 - - ####### - # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P - # - movdqu 0x30($inp),$Xln - movdqu 0x20($inp),$Xl - pshufb $T3,$Xln - pshufb $T3,$Xl - movdqa $Xln,$Xhn - pshufd \$0b01001110,$Xln,$Xmn - pxor $Xln,$Xmn - pclmulqdq \$0x00,$Hkey,$Xln - pclmulqdq \$0x11,$Hkey,$Xhn - pclmulqdq \$0x00,$HK,$Xmn - - movdqa $Xl,$Xh - pshufd \$0b01001110,$Xl,$Xm - pxor $Xl,$Xm - pclmulqdq \$0x00,$Hkey2,$Xl - pclmulqdq \$0x11,$Hkey2,$Xh - xorps $Xl,$Xln - pclmulqdq \$0x10,$HK,$Xm - xorps $Xh,$Xhn - movups 0x50($Htbl),$HK - xorps $Xm,$Xmn - - movdqu 0x10($inp),$Xl - movdqu 0($inp),$T1 - pshufb $T3,$Xl - pshufb $T3,$T1 - movdqa $Xl,$Xh - pshufd \$0b01001110,$Xl,$Xm - pxor $T1,$Xi - pxor $Xl,$Xm - pclmulqdq \$0x00,$Hkey3,$Xl - movdqa $Xi,$Xhi - pshufd \$0b01001110,$Xi,$T1 - pxor $Xi,$T1 
- pclmulqdq \$0x11,$Hkey3,$Xh - xorps $Xl,$Xln - pclmulqdq \$0x00,$HK,$Xm - xorps $Xh,$Xhn - - lea 0x40($inp),$inp - sub \$0x40,$len - jc .Ltail4x - - jmp .Lmod4_loop -.align 32 -.Lmod4_loop: - pclmulqdq \$0x00,$Hkey4,$Xi - xorps $Xm,$Xmn - movdqu 0x30($inp),$Xl - pshufb $T3,$Xl - pclmulqdq \$0x11,$Hkey4,$Xhi - xorps $Xln,$Xi - movdqu 0x20($inp),$Xln - movdqa $Xl,$Xh - pshufd \$0b01001110,$Xl,$Xm - pclmulqdq \$0x10,$HK,$T1 - xorps $Xhn,$Xhi - pxor $Xl,$Xm - pshufb $T3,$Xln - movups 0x20($Htbl),$HK - pclmulqdq \$0x00,$Hkey,$Xl - xorps $Xmn,$T1 - movdqa $Xln,$Xhn - pshufd \$0b01001110,$Xln,$Xmn - - pxor $Xi,$T1 # aggregated Karatsuba post-processing - pxor $Xln,$Xmn - pxor $Xhi,$T1 # - movdqa $T1,$T2 # - pslldq \$8,$T1 - pclmulqdq \$0x11,$Hkey,$Xh - psrldq \$8,$T2 # - pxor $T1,$Xi - movdqa .L7_mask(%rip),$T1 - pxor $T2,$Xhi # - movq %rax,$T2 - - pand $Xi,$T1 # 1st phase - pshufb $T1,$T2 # - pclmulqdq \$0x00,$HK,$Xm - pxor $Xi,$T2 # - psllq \$57,$T2 # - movdqa $T2,$T1 # - pslldq \$8,$T2 - pclmulqdq \$0x00,$Hkey2,$Xln - psrldq \$8,$T1 # - pxor $T2,$Xi - pxor $T1,$Xhi # - movdqu 0($inp),$T1 - - movdqa $Xi,$T2 # 2nd phase - psrlq \$1,$Xi - pclmulqdq \$0x11,$Hkey2,$Xhn - xorps $Xl,$Xln - movdqu 0x10($inp),$Xl - pshufb $T3,$Xl - pclmulqdq \$0x10,$HK,$Xmn - xorps $Xh,$Xhn - movups 0x50($Htbl),$HK - pshufb $T3,$T1 - pxor $T2,$Xhi # - pxor $Xi,$T2 - psrlq \$5,$Xi - - movdqa $Xl,$Xh - pxor $Xm,$Xmn - pshufd \$0b01001110,$Xl,$Xm - pxor $Xl,$Xm - pclmulqdq \$0x00,$Hkey3,$Xl - pxor $T2,$Xi # - pxor $T1,$Xhi - psrlq \$1,$Xi # - pclmulqdq \$0x11,$Hkey3,$Xh - xorps $Xl,$Xln - pxor $Xhi,$Xi # - - pclmulqdq \$0x00,$HK,$Xm - xorps $Xh,$Xhn - - movdqa $Xi,$Xhi - pshufd \$0b01001110,$Xi,$T1 - pxor $Xi,$T1 - - lea 0x40($inp),$inp - sub \$0x40,$len - jnc .Lmod4_loop - -.Ltail4x: - pclmulqdq \$0x00,$Hkey4,$Xi - xorps $Xm,$Xmn - pclmulqdq \$0x11,$Hkey4,$Xhi - xorps $Xln,$Xi - pclmulqdq \$0x10,$HK,$T1 - xorps $Xhn,$Xhi - pxor $Xi,$Xhi # aggregated Karatsuba post-processing - pxor $Xmn,$T1 - - pxor $Xhi,$T1 # - pxor $Xi,$Xhi - - movdqa $T1,$T2 # - psrldq \$8,$T1 - pslldq \$8,$T2 # - pxor $T1,$Xhi - pxor $T2,$Xi # -___ - &reduction_alg9($Xhi,$Xi); -$code.=<<___; - add \$0x40,$len - jz .Ldone - movdqu 0x20($Htbl),$HK - sub \$0x10,$len - jz .Lodd_tail -.Lskip4x: -___ -} -$code.=<<___; - ####### - # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = - # [(H*Ii+1) + (H*Xi+1)] mod P = - # [(H*Ii+1) + H^2*(Ii+Xi)] mod P - # - movdqu ($inp),$T1 # Ii - movdqu 16($inp),$Xln # Ii+1 - pshufb $T3,$T1 - pshufb $T3,$Xln - pxor $T1,$Xi # Ii+Xi - - movdqa $Xln,$Xhn - pshufd \$0b01001110,$Xln,$T1 - pxor $Xln,$T1 - pclmulqdq \$0x00,$Hkey,$Xln - pclmulqdq \$0x11,$Hkey,$Xhn - pclmulqdq \$0x00,$HK,$T1 - - lea 32($inp),$inp # i+=2 - sub \$0x20,$len - jbe .Leven_tail - jmp .Lmod_loop - -.align 32 -.Lmod_loop: - movdqa $Xi,$Xhi - pshufd \$0b01001110,$Xi,$T2 # - pxor $Xi,$T2 # - - pclmulqdq \$0x00,$Hkey2,$Xi - pclmulqdq \$0x11,$Hkey2,$Xhi - pclmulqdq \$0x10,$HK,$T2 - - pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) - pxor $Xhn,$Xhi - movdqu ($inp),$Xhn # Ii - pshufb $T3,$Xhn - movdqu 16($inp),$Xln # Ii+1 - - pxor $Xi,$T1 # aggregated Karatsuba post-processing - pxor $Xhi,$T1 - pxor $Xhn,$Xhi # "Ii+Xi", consume early - pxor $T1,$T2 - pshufb $T3,$Xln - movdqa $T2,$T1 # - psrldq \$8,$T1 - pslldq \$8,$T2 # - pxor $T1,$Xhi - pxor $T2,$Xi # - - movdqa $Xln,$Xhn # - - movdqa $Xi,$T2 # 1st phase - movdqa $Xi,$T1 - psllq \$5,$Xi - pclmulqdq \$0x00,$Hkey,$Xln ####### - pxor $Xi,$T1 # - psllq \$1,$Xi - pxor $T1,$Xi # - psllq \$57,$Xi # - movdqa $Xi,$T1 # - pslldq \$8,$Xi - psrldq 
\$8,$T1 # - pxor $T2,$Xi - pxor $T1,$Xhi # - pshufd \$0b01001110,$Xhn,$T1 - pxor $Xhn,$T1 # - - pclmulqdq \$0x11,$Hkey,$Xhn ####### - movdqa $Xi,$T2 # 2nd phase - psrlq \$1,$Xi - pxor $T2,$Xhi # - pxor $Xi,$T2 - psrlq \$5,$Xi - pxor $T2,$Xi # - psrlq \$1,$Xi # - pclmulqdq \$0x00,$HK,$T1 ####### - pxor $Xhi,$Xi # - - lea 32($inp),$inp - sub \$0x20,$len - ja .Lmod_loop - -.Leven_tail: - movdqa $Xi,$Xhi - pshufd \$0b01001110,$Xi,$T2 # - pxor $Xi,$T2 # - - pclmulqdq \$0x00,$Hkey2,$Xi - pclmulqdq \$0x11,$Hkey2,$Xhi - pclmulqdq \$0x10,$HK,$T2 - - pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) - pxor $Xhn,$Xhi - pxor $Xi,$T1 - pxor $Xhi,$T1 - pxor $T1,$T2 - movdqa $T2,$T1 # - psrldq \$8,$T1 - pslldq \$8,$T2 # - pxor $T1,$Xhi - pxor $T2,$Xi # -___ - &reduction_alg9 ($Xhi,$Xi); -$code.=<<___; - test $len,$len - jnz .Ldone - -.Lodd_tail: - movdqu ($inp),$T1 # Ii - pshufb $T3,$T1 - pxor $T1,$Xi # Ii+Xi -___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) - &reduction_alg9 ($Xhi,$Xi); -$code.=<<___; -.Ldone: - pshufb $T3,$Xi - movdqu $Xi,($Xip) -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - movaps 0x40(%rsp),%xmm10 - movaps 0x50(%rsp),%xmm11 - movaps 0x60(%rsp),%xmm12 - movaps 0x70(%rsp),%xmm13 - movaps 0x80(%rsp),%xmm14 - movaps 0x90(%rsp),%xmm15 - lea 0xa8(%rsp),%rsp -.LSEH_end_gcm_ghash_clmul: -___ -$code.=<<___; - ret -.size gcm_ghash_clmul,.-gcm_ghash_clmul -___ -} - -$code.=<<___; -.globl gcm_init_avx -.type gcm_init_avx,\@abi-omnipotent -.align 32 -gcm_init_avx: -___ -if ($avx) { -my ($Htbl,$Xip)=@_4args; -my $HK="%xmm6"; - -$code.=<<___ if ($win64); -.LSEH_begin_gcm_init_avx: - # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) -___ -$code.=<<___; - vzeroupper - - vmovdqu ($Xip),$Hkey - vpshufd \$0b01001110,$Hkey,$Hkey # dword swap - - # <<1 twist - vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword - vpsrlq \$63,$Hkey,$T1 - vpsllq \$1,$Hkey,$Hkey - vpxor $T3,$T3,$T3 # - vpcmpgtd $T2,$T3,$T3 # broadcast carry bit - vpslldq \$8,$T1,$T1 - vpor $T1,$Hkey,$Hkey # H<<=1 - - # magic reduction - vpand .L0x1c2_polynomial(%rip),$T3,$T3 - vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial - - vpunpckhqdq $Hkey,$Hkey,$HK - vmovdqa $Hkey,$Xi - vpxor $Hkey,$HK,$HK - mov \$4,%r10 # up to H^8 - jmp .Linit_start_avx -___ - -sub clmul64x64_avx { -my ($Xhi,$Xi,$Hkey,$HK)=@_; - -if (!defined($HK)) { $HK = $T2; -$code.=<<___; - vpunpckhqdq $Xi,$Xi,$T1 - vpunpckhqdq $Hkey,$Hkey,$T2 - vpxor $Xi,$T1,$T1 # - vpxor $Hkey,$T2,$T2 -___ -} else { -$code.=<<___; - vpunpckhqdq $Xi,$Xi,$T1 - vpxor $Xi,$T1,$T1 # -___ -} -$code.=<<___; - vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### - vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### - vpclmulqdq \$0x00,$HK,$T1,$T1 ####### - vpxor $Xi,$Xhi,$T2 # - vpxor $T2,$T1,$T1 # - - vpslldq \$8,$T1,$T2 # - vpsrldq \$8,$T1,$T1 - vpxor $T2,$Xi,$Xi # - vpxor $T1,$Xhi,$Xhi -___ -} - -sub reduction_avx { -my ($Xhi,$Xi) = @_; - -$code.=<<___; - vpsllq \$57,$Xi,$T1 # 1st phase - vpsllq \$62,$Xi,$T2 - vpxor $T1,$T2,$T2 # - vpsllq \$63,$Xi,$T1 - vpxor $T1,$T2,$T2 # - vpslldq \$8,$T2,$T1 # - vpsrldq \$8,$T2,$T2 - vpxor $T1,$Xi,$Xi # - vpxor $T2,$Xhi,$Xhi - - vpsrlq \$1,$Xi,$T2 # 2nd phase - vpxor $Xi,$Xhi,$Xhi - vpxor $T2,$Xi,$Xi # - vpsrlq \$5,$T2,$T2 - vpxor $T2,$Xi,$Xi # - vpsrlq \$1,$Xi,$Xi # - vpxor $Xhi,$Xi,$Xi # -___ -} - -$code.=<<___; -.align 32 -.Linit_loop_avx: - vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 
- vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" -___ - &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 - &reduction_avx ($Xhi,$Xi); -$code.=<<___; -.Linit_start_avx: - vmovdqa $Xi,$T3 -___ - &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 - &reduction_avx ($Xhi,$Xi); -$code.=<<___; - vpshufd \$0b01001110,$T3,$T1 - vpshufd \$0b01001110,$Xi,$T2 - vpxor $T3,$T1,$T1 # Karatsuba pre-processing - vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 - vpxor $Xi,$T2,$T2 # Karatsuba pre-processing - vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 - lea 0x30($Htbl),$Htbl - sub \$1,%r10 - jnz .Linit_loop_avx - - vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped - vmovdqu $T3,-0x10($Htbl) - - vzeroupper -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - lea 0x18(%rsp),%rsp -.LSEH_end_gcm_init_avx: -___ -$code.=<<___; - ret -.size gcm_init_avx,.-gcm_init_avx -___ -} else { -$code.=<<___; - jmp .L_init_clmul -.size gcm_init_avx,.-gcm_init_avx -___ -} - -$code.=<<___; -.globl gcm_gmult_avx -.type gcm_gmult_avx,\@abi-omnipotent -.align 32 -gcm_gmult_avx: - jmp .L_gmult_clmul -.size gcm_gmult_avx,.-gcm_gmult_avx -___ - -$code.=<<___; -.globl gcm_ghash_avx -.type gcm_ghash_avx,\@abi-omnipotent -.align 32 -gcm_ghash_avx: -___ -if ($avx) { -my ($Xip,$Htbl,$inp,$len)=@_4args; -my ($Xlo,$Xhi,$Xmi, - $Zlo,$Zhi,$Zmi, - $Hkey,$HK,$T1,$T2, - $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); - -$code.=<<___ if ($win64); - lea -0x88(%rsp),%rax -.LSEH_begin_gcm_ghash_avx: - # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp - .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) - .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) - .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) - .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) - .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) - .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) - .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) - .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) - .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) - .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) -___ -$code.=<<___; - vzeroupper - - vmovdqu ($Xip),$Xi # load $Xi - lea .L0x1c2_polynomial(%rip),%r10 - lea 0x40($Htbl),$Htbl # size optimization - vmovdqu .Lbswap_mask(%rip),$bswap - vpshufb $bswap,$Xi,$Xi - cmp \$0x80,$len - jb .Lshort_avx - sub \$0x80,$len - - vmovdqu 0x70($inp),$Ii # I[7] - vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 - vpshufb $bswap,$Ii,$Ii - vmovdqu 0x20-0x40($Htbl),$HK - - vpunpckhqdq $Ii,$Ii,$T2 - vmovdqu 0x60($inp),$Ij # I[6] - vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo - vpxor $Ii,$T2,$T2 - vpshufb $bswap,$Ij,$Ij - vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi - vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 - vpunpckhqdq $Ij,$Ij,$T1 - vmovdqu 0x50($inp),$Ii # I[5] - vpclmulqdq \$0x00,$HK,$T2,$Xmi - vpxor $Ij,$T1,$T1 - - vpshufb $bswap,$Ii,$Ii - vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo - vpunpckhqdq $Ii,$Ii,$T2 - vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi - vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 - vpxor $Ii,$T2,$T2 - vmovdqu 0x40($inp),$Ij # I[4] - vpclmulqdq \$0x10,$HK,$T1,$Zmi - vmovdqu 0x50-0x40($Htbl),$HK - - vpshufb $bswap,$Ij,$Ij - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo - vpxor $Xhi,$Zhi,$Zhi - vpunpckhqdq $Ij,$Ij,$T1 - vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi - vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T2,$Xmi - vpxor $Ij,$T1,$T1 - - vmovdqu 0x30($inp),$Ii # I[3] - vpxor $Zlo,$Xlo,$Xlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo - vpxor 
$Zhi,$Xhi,$Xhi - vpshufb $bswap,$Ii,$Ii - vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi - vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 - vpxor $Zmi,$Xmi,$Xmi - vpunpckhqdq $Ii,$Ii,$T2 - vpclmulqdq \$0x10,$HK,$T1,$Zmi - vmovdqu 0x80-0x40($Htbl),$HK - vpxor $Ii,$T2,$T2 - - vmovdqu 0x20($inp),$Ij # I[2] - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo - vpxor $Xhi,$Zhi,$Zhi - vpshufb $bswap,$Ij,$Ij - vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi - vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 - vpxor $Xmi,$Zmi,$Zmi - vpunpckhqdq $Ij,$Ij,$T1 - vpclmulqdq \$0x00,$HK,$T2,$Xmi - vpxor $Ij,$T1,$T1 - - vmovdqu 0x10($inp),$Ii # I[1] - vpxor $Zlo,$Xlo,$Xlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo - vpxor $Zhi,$Xhi,$Xhi - vpshufb $bswap,$Ii,$Ii - vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi - vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 - vpxor $Zmi,$Xmi,$Xmi - vpunpckhqdq $Ii,$Ii,$T2 - vpclmulqdq \$0x10,$HK,$T1,$Zmi - vmovdqu 0xb0-0x40($Htbl),$HK - vpxor $Ii,$T2,$T2 - - vmovdqu ($inp),$Ij # I[0] - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo - vpxor $Xhi,$Zhi,$Zhi - vpshufb $bswap,$Ij,$Ij - vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi - vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x10,$HK,$T2,$Xmi - - lea 0x80($inp),$inp - cmp \$0x80,$len - jb .Ltail_avx - - vpxor $Xi,$Ij,$Ij # accumulate $Xi - sub \$0x80,$len - jmp .Loop8x_avx - -.align 32 -.Loop8x_avx: - vpunpckhqdq $Ij,$Ij,$T1 - vmovdqu 0x70($inp),$Ii # I[7] - vpxor $Xlo,$Zlo,$Zlo - vpxor $Ij,$T1,$T1 - vpclmulqdq \$0x00,$Hkey,$Ij,$Xi - vpshufb $bswap,$Ii,$Ii - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x11,$Hkey,$Ij,$Xo - vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 - vpunpckhqdq $Ii,$Ii,$T2 - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T1,$Tred - vmovdqu 0x20-0x40($Htbl),$HK - vpxor $Ii,$T2,$T2 - - vmovdqu 0x60($inp),$Ij # I[6] - vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo - vpxor $Zlo,$Xi,$Xi # collect result - vpshufb $bswap,$Ij,$Ij - vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi - vxorps $Zhi,$Xo,$Xo - vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 - vpunpckhqdq $Ij,$Ij,$T1 - vpclmulqdq \$0x00,$HK, $T2,$Xmi - vpxor $Zmi,$Tred,$Tred - vxorps $Ij,$T1,$T1 - - vmovdqu 0x50($inp),$Ii # I[5] - vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing - vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo - vpxor $Xo,$Tred,$Tred - vpslldq \$8,$Tred,$T2 - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi - vpsrldq \$8,$Tred,$Tred - vpxor $T2, $Xi, $Xi - vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 - vpshufb $bswap,$Ii,$Ii - vxorps $Tred,$Xo, $Xo - vpxor $Xhi,$Zhi,$Zhi - vpunpckhqdq $Ii,$Ii,$T2 - vpclmulqdq \$0x10,$HK, $T1,$Zmi - vmovdqu 0x50-0x40($Htbl),$HK - vpxor $Ii,$T2,$T2 - vpxor $Xmi,$Zmi,$Zmi - - vmovdqu 0x40($inp),$Ij # I[4] - vpalignr \$8,$Xi,$Xi,$Tred # 1st phase - vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo - vpshufb $bswap,$Ij,$Ij - vpxor $Zlo,$Xlo,$Xlo - vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi - vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Zhi,$Xhi,$Xhi - vpclmulqdq \$0x00,$HK, $T2,$Xmi - vxorps $Ij,$T1,$T1 - vpxor $Zmi,$Xmi,$Xmi - - vmovdqu 0x30($inp),$Ii # I[3] - vpclmulqdq \$0x10,(%r10),$Xi,$Xi - vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo - vpshufb $bswap,$Ii,$Ii - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi - vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 - vpunpckhqdq $Ii,$Ii,$T2 - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x10,$HK, $T1,$Zmi - vmovdqu 0x80-0x40($Htbl),$HK - vpxor $Ii,$T2,$T2 - vpxor $Xmi,$Zmi,$Zmi - - vmovdqu 0x20($inp),$Ij # I[2] - vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo - vpshufb $bswap,$Ij,$Ij - vpxor $Zlo,$Xlo,$Xlo - vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi - vmovdqu 0x70-0x40($Htbl),$Hkey 
# $Hkey^6 - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Zhi,$Xhi,$Xhi - vpclmulqdq \$0x00,$HK, $T2,$Xmi - vpxor $Ij,$T1,$T1 - vpxor $Zmi,$Xmi,$Xmi - vxorps $Tred,$Xi,$Xi - - vmovdqu 0x10($inp),$Ii # I[1] - vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase - vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo - vpshufb $bswap,$Ii,$Ii - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi - vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 - vpclmulqdq \$0x10,(%r10),$Xi,$Xi - vxorps $Xo,$Tred,$Tred - vpunpckhqdq $Ii,$Ii,$T2 - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x10,$HK, $T1,$Zmi - vmovdqu 0xb0-0x40($Htbl),$HK - vpxor $Ii,$T2,$T2 - vpxor $Xmi,$Zmi,$Zmi - - vmovdqu ($inp),$Ij # I[0] - vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo - vpshufb $bswap,$Ij,$Ij - vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi - vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 - vpxor $Tred,$Ij,$Ij - vpclmulqdq \$0x10,$HK, $T2,$Xmi - vpxor $Xi,$Ij,$Ij # accumulate $Xi - - lea 0x80($inp),$inp - sub \$0x80,$len - jnc .Loop8x_avx - - add \$0x80,$len - jmp .Ltail_no_xor_avx - -.align 32 -.Lshort_avx: - vmovdqu -0x10($inp,$len),$Ii # very last word - lea ($inp,$len),$inp - vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 - vmovdqu 0x20-0x40($Htbl),$HK - vpshufb $bswap,$Ii,$Ij - - vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, - vmovdqa $Xhi,$Zhi # $Zhi and - vmovdqa $Xmi,$Zmi # $Zmi - sub \$0x10,$len - jz .Ltail_avx - - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo - vpxor $Ij,$T1,$T1 - vmovdqu -0x20($inp),$Ii - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi - vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 - vpshufb $bswap,$Ii,$Ij - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T1,$Xmi - vpsrldq \$8,$HK,$HK - sub \$0x10,$len - jz .Ltail_avx - - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo - vpxor $Ij,$T1,$T1 - vmovdqu -0x30($inp),$Ii - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi - vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 - vpshufb $bswap,$Ii,$Ij - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T1,$Xmi - vmovdqu 0x50-0x40($Htbl),$HK - sub \$0x10,$len - jz .Ltail_avx - - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo - vpxor $Ij,$T1,$T1 - vmovdqu -0x40($inp),$Ii - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi - vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 - vpshufb $bswap,$Ii,$Ij - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T1,$Xmi - vpsrldq \$8,$HK,$HK - sub \$0x10,$len - jz .Ltail_avx - - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo - vpxor $Ij,$T1,$T1 - vmovdqu -0x50($inp),$Ii - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi - vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 - vpshufb $bswap,$Ii,$Ij - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T1,$Xmi - vmovdqu 0x80-0x40($Htbl),$HK - sub \$0x10,$len - jz .Ltail_avx - - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo - vpxor $Ij,$T1,$T1 - vmovdqu -0x60($inp),$Ii - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi - vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 - vpshufb $bswap,$Ii,$Ij - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T1,$Xmi - vpsrldq \$8,$HK,$HK - sub \$0x10,$len - jz .Ltail_avx - - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo - vpxor $Ij,$T1,$T1 - vmovdqu -0x70($inp),$Ii - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi - vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 - vpshufb $bswap,$Ii,$Ij - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T1,$Xmi - vmovq 0xb8-0x40($Htbl),$HK - sub \$0x10,$len - jmp 
.Ltail_avx - -.align 32 -.Ltail_avx: - vpxor $Xi,$Ij,$Ij # accumulate $Xi -.Ltail_no_xor_avx: - vpunpckhqdq $Ij,$Ij,$T1 - vpxor $Xlo,$Zlo,$Zlo - vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo - vpxor $Ij,$T1,$T1 - vpxor $Xhi,$Zhi,$Zhi - vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi - vpxor $Xmi,$Zmi,$Zmi - vpclmulqdq \$0x00,$HK,$T1,$Xmi - - vmovdqu (%r10),$Tred - - vpxor $Xlo,$Zlo,$Xi - vpxor $Xhi,$Zhi,$Xo - vpxor $Xmi,$Zmi,$Zmi - - vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing - vpxor $Xo, $Zmi,$Zmi - vpslldq \$8, $Zmi,$T2 - vpsrldq \$8, $Zmi,$Zmi - vpxor $T2, $Xi, $Xi - vpxor $Zmi,$Xo, $Xo - - vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase - vpalignr \$8,$Xi,$Xi,$Xi - vpxor $T2,$Xi,$Xi - - vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase - vpalignr \$8,$Xi,$Xi,$Xi - vpxor $Xo,$Xi,$Xi - vpxor $T2,$Xi,$Xi - - cmp \$0,$len - jne .Lshort_avx - - vpshufb $bswap,$Xi,$Xi - vmovdqu $Xi,($Xip) - vzeroupper -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - movaps 0x40(%rsp),%xmm10 - movaps 0x50(%rsp),%xmm11 - movaps 0x60(%rsp),%xmm12 - movaps 0x70(%rsp),%xmm13 - movaps 0x80(%rsp),%xmm14 - movaps 0x90(%rsp),%xmm15 - lea 0xa8(%rsp),%rsp -.LSEH_end_gcm_ghash_avx: -___ -$code.=<<___; - ret -.size gcm_ghash_avx,.-gcm_ghash_avx -___ -} else { -$code.=<<___; - jmp .L_ghash_clmul -.size gcm_ghash_avx,.-gcm_ghash_avx -___ -} - -$code.=<<___; -.align 64 -.Lbswap_mask: - .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.L0x1c2_polynomial: - .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -.L7_mask: - .long 7,0,7,0 -.L7_mask_poly: - .long 7,0,`0xE1<<1`,0 -.align 64 -.type .Lrem_4bit,\@object -.Lrem_4bit: - .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` - .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` - .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` - .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` -.type .Lrem_8bit,\@object -.Lrem_8bit: - .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E - .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E - .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E - .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E - .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E - .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E - .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E - .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E - .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE - .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE - .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE - .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE - .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E - .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E - .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE - .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE - .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E - .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E - .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E - .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E - .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E - .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E - .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E - .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E - .value 
0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE - .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE - .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE - .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE - .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E - .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E - .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE - .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE - -.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 64 -___ - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lin_prologue - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lin_prologue - - lea 24(%rax),%rax # adjust "rsp" - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - -.Lin_prologue: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$`1232/8`,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size se_handler,.-se_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_gcm_gmult_4bit - .rva .LSEH_end_gcm_gmult_4bit - .rva .LSEH_info_gcm_gmult_4bit - - .rva .LSEH_begin_gcm_ghash_4bit - .rva .LSEH_end_gcm_ghash_4bit - .rva .LSEH_info_gcm_ghash_4bit - - .rva .LSEH_begin_gcm_init_clmul - .rva .LSEH_end_gcm_init_clmul - .rva .LSEH_info_gcm_init_clmul - - .rva .LSEH_begin_gcm_ghash_clmul - .rva .LSEH_end_gcm_ghash_clmul - .rva .LSEH_info_gcm_ghash_clmul -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_gcm_init_avx - .rva .LSEH_end_gcm_init_avx - .rva .LSEH_info_gcm_init_clmul - - .rva .LSEH_begin_gcm_ghash_avx - .rva .LSEH_end_gcm_ghash_avx - .rva .LSEH_info_gcm_ghash_clmul -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_gcm_gmult_4bit: - .byte 9,0,0,0 - .rva 
se_handler - .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData -.LSEH_info_gcm_ghash_4bit: - .byte 9,0,0,0 - .rva se_handler - .rva .Lghash_prologue,.Lghash_epilogue # HandlerData -.LSEH_info_gcm_init_clmul: - .byte 0x01,0x08,0x03,0x00 - .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 - .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18 -.LSEH_info_gcm_ghash_clmul: - .byte 0x01,0x33,0x16,0x00 - .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 - .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 - .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 - .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 - .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 - .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 - .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 - .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 - .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 - .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 - .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 -___ -} - -$code =~ s/\`([^\`]*)\`/eval($1)/gem; - -print $code; - -close STDOUT; +../openssl/./crypto/modes/asm/ghash-x86_64.pl
\ No newline at end of file diff --git a/devel/perlasm/openssl-cpuid-x86.pl b/devel/perlasm/openssl-cpuid-x86.pl deleted file mode 100644 index ef1216a8b2..0000000000 --- a/devel/perlasm/openssl-cpuid-x86.pl +++ /dev/null @@ -1,477 +0,0 @@ -#!/usr/bin/env perl - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC, "${dir}perlasm", "perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],"x86cpuid"); - -for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } - -&function_begin("OPENSSL_ia32_cpuid"); - &xor ("edx","edx"); - &pushf (); - &pop ("eax"); - &mov ("ecx","eax"); - &xor ("eax",1<<21); - &push ("eax"); - &popf (); - &pushf (); - &pop ("eax"); - &xor ("ecx","eax"); - &xor ("eax","eax"); - &bt ("ecx",21); - &jnc (&label("nocpuid")); - &mov ("esi",&wparam(0)); - &mov (&DWP(8,"esi"),"eax"); # clear 3rd word - &cpuid (); - &mov ("edi","eax"); # max value for standard query level - - &xor ("eax","eax"); - &cmp ("ebx",0x756e6547); # "Genu" - &setne (&LB("eax")); - &mov ("ebp","eax"); - &cmp ("edx",0x49656e69); # "ineI" - &setne (&LB("eax")); - &or ("ebp","eax"); - &cmp ("ecx",0x6c65746e); # "ntel" - &setne (&LB("eax")); - &or ("ebp","eax"); # 0 indicates Intel CPU - &jz (&label("intel")); - - &cmp ("ebx",0x68747541); # "Auth" - &setne (&LB("eax")); - &mov ("esi","eax"); - &cmp ("edx",0x69746E65); # "enti" - &setne (&LB("eax")); - &or ("esi","eax"); - &cmp ("ecx",0x444D4163); # "cAMD" - &setne (&LB("eax")); - &or ("esi","eax"); # 0 indicates AMD CPU - &jnz (&label("intel")); - - # AMD specific - &mov ("eax",0x80000000); - &cpuid (); - &cmp ("eax",0x80000001); - &jb (&label("intel")); - &mov ("esi","eax"); - &mov ("eax",0x80000001); - &cpuid (); - &or ("ebp","ecx"); - &and ("ebp",1<<11|1); # isolate XOP bit - &cmp ("esi",0x80000008); - &jb (&label("intel")); - - &mov ("eax",0x80000008); - &cpuid (); - &movz ("esi",&LB("ecx")); # number of cores - 1 - &inc ("esi"); # number of cores - - &mov ("eax",1); - &xor ("ecx","ecx"); - &cpuid (); - &bt ("edx",28); - &jnc (&label("generic")); - &shr ("ebx",16); - &and ("ebx",0xff); - &cmp ("ebx","esi"); - &ja (&label("generic")); - &and ("edx",0xefffffff); # clear hyper-threading bit - &jmp (&label("generic")); - -&set_label("intel"); - &cmp ("edi",7); - &jb (&label("cacheinfo")); - - &mov ("esi",&wparam(0)); - &mov ("eax",7); - &xor ("ecx","ecx"); - &cpuid (); - &mov (&DWP(8,"esi"),"ebx"); - -&set_label("cacheinfo"); - &cmp ("edi",4); - &mov ("edi",-1); - &jb (&label("nocacheinfo")); - - &mov ("eax",4); - &mov ("ecx",0); # query L1D - &cpuid (); - &mov ("edi","eax"); - &shr ("edi",14); - &and ("edi",0xfff); # number of cores -1 per L1D - -&set_label("nocacheinfo"); - &mov ("eax",1); - &xor ("ecx","ecx"); - &cpuid (); - &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0 - &cmp ("ebp",0); - &jne (&label("notintel")); - &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs - &and (&HB("eax"),15); # familiy ID - &cmp (&HB("eax"),15); # P4? 
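 # (editorial note, not in the original: after cpuid(1) the family ID
 #  occupies EAX bits 11:8, i.e. the low nibble of %ah; family 0xf is
 #  NetBurst, so this pair of instructions singles out Pentium 4 in
 #  order to engage RC4_CHAR below)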
- &jne (&label("notintel")); - &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR -&set_label("notintel"); - &bt ("edx",28); # test hyper-threading bit - &jnc (&label("generic")); - &and ("edx",0xefffffff); - &cmp ("edi",0); - &je (&label("generic")); - - &or ("edx",0x10000000); - &shr ("ebx",16); - &cmp (&LB("ebx"),1); - &ja (&label("generic")); - &and ("edx",0xefffffff); # clear hyper-threading bit if not - -&set_label("generic"); - &and ("ebp",1<<11); # isolate AMD XOP flag - &and ("ecx",0xfffff7ff); # force 11th bit to 0 - &mov ("esi","edx"); - &or ("ebp","ecx"); # merge AMD XOP flag - - &bt ("ecx",27); # check OSXSAVE bit - &jnc (&label("clear_avx")); - &xor ("ecx","ecx"); - &data_byte(0x0f,0x01,0xd0); # xgetbv - &and ("eax",6); - &cmp ("eax",6); - &je (&label("done")); - &cmp ("eax",2); - &je (&label("clear_avx")); -&set_label("clear_xmm"); - &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits - &and ("esi",0xfeffffff); # clear FXSR -&set_label("clear_avx"); - &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits - &mov ("edi",&wparam(0)); - &and (&DWP(8,"edi"),0xffffffdf); # clear AVX2 -&set_label("done"); - &mov ("eax","esi"); - &mov ("edx","ebp"); -&set_label("nocpuid"); -&function_end("OPENSSL_ia32_cpuid"); - -&external_label("OPENSSL_ia32cap_P"); - -&function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD"); - &xor ("eax","eax"); - &xor ("edx","edx"); - &picmeup("ecx","OPENSSL_ia32cap_P"); - &bt (&DWP(0,"ecx"),4); - &jnc (&label("notsc")); - &rdtsc (); -&set_label("notsc"); - &ret (); -&function_end_B("OPENSSL_rdtsc"); - -# This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host], -# but it's safe to call it on any [supported] 32-bit platform... -# Just check for [non-]zero return value... -&function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD"); - &picmeup("ecx","OPENSSL_ia32cap_P"); - &bt (&DWP(0,"ecx"),4); - &jnc (&label("nohalt")); # no TSC - - &data_word(0x9058900e); # push %cs; pop %eax - &and ("eax",3); - &jnz (&label("nohalt")); # not enough privileges - - &pushf (); - &pop ("eax"); - &bt ("eax",9); - &jnc (&label("nohalt")); # interrupts are disabled - - &rdtsc (); - &push ("edx"); - &push ("eax"); - &halt (); - &rdtsc (); - - &sub ("eax",&DWP(0,"esp")); - &sbb ("edx",&DWP(4,"esp")); - &add ("esp",8); - &ret (); - -&set_label("nohalt"); - &xor ("eax","eax"); - &xor ("edx","edx"); - &ret (); -&function_end_B("OPENSSL_instrument_halt"); - -# Essentially there is only one use for this function. Under DJGPP: -# -# #include <go32.h> -# ... -# i=OPENSSL_far_spin(_dos_ds,0x46c); -# ... -# to obtain the number of spins till closest timer interrupt. 
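# (editorial note, not in the original: 0x46c in the snippet above is
#  the BIOS data-area timer-tick dword, so the returned spin count
#  measures how long remains until the next 18.2 Hz timer interrupt)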
- -&function_begin_B("OPENSSL_far_spin"); - &pushf (); - &pop ("eax"); - &bt ("eax",9); - &jnc (&label("nospin")); # interrupts are disabled - - &mov ("eax",&DWP(4,"esp")); - &mov ("ecx",&DWP(8,"esp")); - &data_word (0x90d88e1e); # push %ds, mov %eax,%ds - &xor ("eax","eax"); - &mov ("edx",&DWP(0,"ecx")); - &jmp (&label("spin")); - - &align (16); -&set_label("spin"); - &inc ("eax"); - &cmp ("edx",&DWP(0,"ecx")); - &je (&label("spin")); - - &data_word (0x1f909090); # pop %ds - &ret (); - -&set_label("nospin"); - &xor ("eax","eax"); - &xor ("edx","edx"); - &ret (); -&function_end_B("OPENSSL_far_spin"); - -&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD"); - &xor ("eax","eax"); - &xor ("edx","edx"); - &picmeup("ecx","OPENSSL_ia32cap_P"); - &mov ("ecx",&DWP(0,"ecx")); - &bt (&DWP(0,"ecx"),1); - &jnc (&label("no_x87")); - if ($sse2) { - &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits - &cmp ("ecx",1<<26|1<<24); - &jne (&label("no_sse2")); - &pxor ("xmm0","xmm0"); - &pxor ("xmm1","xmm1"); - &pxor ("xmm2","xmm2"); - &pxor ("xmm3","xmm3"); - &pxor ("xmm4","xmm4"); - &pxor ("xmm5","xmm5"); - &pxor ("xmm6","xmm6"); - &pxor ("xmm7","xmm7"); - &set_label("no_sse2"); - } - # just a bunch of fldz to zap the fp/mm bank followed by finit... - &data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b); -&set_label("no_x87"); - &lea ("eax",&DWP(4,"esp")); - &ret (); -&function_end_B("OPENSSL_wipe_cpu"); - -&function_begin_B("OPENSSL_atomic_add"); - &mov ("edx",&DWP(4,"esp")); # fetch the pointer, 1st arg - &mov ("ecx",&DWP(8,"esp")); # fetch the increment, 2nd arg - &push ("ebx"); - &nop (); - &mov ("eax",&DWP(0,"edx")); -&set_label("spin"); - &lea ("ebx",&DWP(0,"eax","ecx")); - &nop (); - &data_word(0x1ab10ff0); # lock; cmpxchg %ebx,(%edx) # %eax is envolved and is always reloaded - &jne (&label("spin")); - &mov ("eax","ebx"); # OpenSSL expects the new value - &pop ("ebx"); - &ret (); -&function_end_B("OPENSSL_atomic_add"); - -# This function can become handy under Win32 in situations when -# we don't know which calling convention, __stdcall or __cdecl(*), -# indirect callee is using. In C it can be deployed as -# -#ifdef OPENSSL_CPUID_OBJ -# type OPENSSL_indirect_call(void *f,...); -# ... -# OPENSSL_indirect_call(func,[up to $max arguments]); -#endif -# -# (*) it's designed to work even for __fastcall if number of -# arguments is 1 or 2! -&function_begin_B("OPENSSL_indirect_call"); - { - my ($max,$i)=(7,); # $max has to be chosen as 4*n-1 - # in order to preserve eventual - # stack alignment - &push ("ebp"); - &mov ("ebp","esp"); - &sub ("esp",$max*4); - &mov ("ecx",&DWP(12,"ebp")); - &mov (&DWP(0,"esp"),"ecx"); - &mov ("edx",&DWP(16,"ebp")); - &mov (&DWP(4,"esp"),"edx"); - for($i=2;$i<$max;$i++) - { - # Some copies will be redundant/bogus... - &mov ("eax",&DWP(12+$i*4,"ebp")); - &mov (&DWP(0+$i*4,"esp"),"eax"); - } - &call_ptr (&DWP(8,"ebp"));# make the call... - &mov ("esp","ebp"); # ... and just restore the stack pointer - # without paying attention to what we called, - # (__cdecl *func) or (__stdcall *one). 
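 # (editorial note, not in the original: restoring %esp from %ebp is
 #  what makes the call convention-agnostic - a __stdcall callee pops
 #  its own arguments with "ret N", a __cdecl one does not, and either
 #  way the frame-pointer copy discards the difference)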
- &pop ("ebp"); - &ret (); - } -&function_end_B("OPENSSL_indirect_call"); - -&function_begin_B("OPENSSL_cleanse"); - &mov ("edx",&wparam(0)); - &mov ("ecx",&wparam(1)); - &xor ("eax","eax"); - &cmp ("ecx",7); - &jae (&label("lot")); - &cmp ("ecx",0); - &je (&label("ret")); -&set_label("little"); - &mov (&BP(0,"edx"),"al"); - &sub ("ecx",1); - &lea ("edx",&DWP(1,"edx")); - &jnz (&label("little")); -&set_label("ret"); - &ret (); - -&set_label("lot",16); - &test ("edx",3); - &jz (&label("aligned")); - &mov (&BP(0,"edx"),"al"); - &lea ("ecx",&DWP(-1,"ecx")); - &lea ("edx",&DWP(1,"edx")); - &jmp (&label("lot")); -&set_label("aligned"); - &mov (&DWP(0,"edx"),"eax"); - &lea ("ecx",&DWP(-4,"ecx")); - &test ("ecx",-4); - &lea ("edx",&DWP(4,"edx")); - &jnz (&label("aligned")); - &cmp ("ecx",0); - &jne (&label("little")); - &ret (); -&function_end_B("OPENSSL_cleanse"); - -{ -my $lasttick = "esi"; -my $lastdiff = "ebx"; -my $out = "edi"; -my $cnt = "ecx"; -my $max = "ebp"; - -&function_begin("OPENSSL_instrument_bus"); - &mov ("eax",0); - if ($sse2) { - &picmeup("edx","OPENSSL_ia32cap_P"); - &bt (&DWP(0,"edx"),4); - &jnc (&label("nogo")); # no TSC - &bt (&DWP(0,"edx"),19); - &jnc (&label("nogo")); # no CLFLUSH - - &mov ($out,&wparam(0)); # load arguments - &mov ($cnt,&wparam(1)); - - # collect 1st tick - &rdtsc (); - &mov ($lasttick,"eax"); # lasttick = tick - &mov ($lastdiff,0); # lastdiff = 0 - &clflush(&DWP(0,$out)); - &data_byte(0xf0); # lock - &add (&DWP(0,$out),$lastdiff); - &jmp (&label("loop")); - -&set_label("loop",16); - &rdtsc (); - &mov ("edx","eax"); # put aside tick (yes, I neglect edx) - &sub ("eax",$lasttick); # diff - &mov ($lasttick,"edx"); # lasttick = tick - &mov ($lastdiff,"eax"); # lastdiff = diff - &clflush(&DWP(0,$out)); - &data_byte(0xf0); # lock - &add (&DWP(0,$out),"eax"); # accumulate diff - &lea ($out,&DWP(4,$out)); # ++$out - &sub ($cnt,1); # --$cnt - &jnz (&label("loop")); - - &mov ("eax",&wparam(1)); -&set_label("nogo"); - } -&function_end("OPENSSL_instrument_bus"); - -&function_begin("OPENSSL_instrument_bus2"); - &mov ("eax",0); - if ($sse2) { - &picmeup("edx","OPENSSL_ia32cap_P"); - &bt (&DWP(0,"edx"),4); - &jnc (&label("nogo")); # no TSC - &bt (&DWP(0,"edx"),19); - &jnc (&label("nogo")); # no CLFLUSH - - &mov ($out,&wparam(0)); # load arguments - &mov ($cnt,&wparam(1)); - &mov ($max,&wparam(2)); - - &rdtsc (); # collect 1st tick - &mov ($lasttick,"eax"); # lasttick = tick - &mov ($lastdiff,0); # lastdiff = 0 - - &clflush(&DWP(0,$out)); - &data_byte(0xf0); # lock - &add (&DWP(0,$out),$lastdiff); - - &rdtsc (); # collect 1st diff - &mov ("edx","eax"); # put aside tick (yes, I neglect edx) - &sub ("eax",$lasttick); # diff - &mov ($lasttick,"edx"); # lasttick = tick - &mov ($lastdiff,"eax"); # lastdiff = diff - &jmp (&label("loop2")); - -&set_label("loop2",16); - &clflush(&DWP(0,$out)); - &data_byte(0xf0); # lock - &add (&DWP(0,$out),"eax"); # accumulate diff - - &sub ($max,1); - &jz (&label("done2")); - - &rdtsc (); - &mov ("edx","eax"); # put aside tick (yes, I neglect edx) - &sub ("eax",$lasttick); # diff - &mov ($lasttick,"edx"); # lasttick = tick - &cmp ("eax",$lastdiff); - &mov ($lastdiff,"eax"); # lastdiff = diff - &mov ("edx",0); - &setne ("dl"); - &sub ($cnt,"edx"); # conditional --$cnt - &lea ($out,&DWP(0,$out,"edx",4)); # conditional ++$out - &jnz (&label("loop2")); - -&set_label("done2"); - &mov ("eax",&wparam(1)); - &sub ("eax",$cnt); -&set_label("nogo"); - } -&function_end("OPENSSL_instrument_bus2"); -} - -&function_begin_B("OPENSSL_ia32_rdrand"); - &mov 
("ecx",8); -&set_label("loop"); - &rdrand ("eax"); - &jc (&label("break")); - &loop (&label("loop")); -&set_label("break"); - &cmp ("eax",0); - &cmove ("eax","ecx"); - &ret (); -&function_end_B("OPENSSL_ia32_rdrand"); - -&initseg("OPENSSL_cpuid_setup"); - -&hidden("OPENSSL_cpuid_setup"); -&hidden("OPENSSL_ia32cap_P"); - -&asm_finish(); diff --git a/devel/perlasm/openssl-cpuid-x86.pl.license b/devel/perlasm/openssl-cpuid-x86.pl.license deleted file mode 120000 index cd301a44ab..0000000000 --- a/devel/perlasm/openssl-cpuid-x86.pl.license +++ /dev/null @@ -1 +0,0 @@ -license.txt
\ No newline at end of file diff --git a/devel/perlasm/ppc-xlate.pl b/devel/perlasm/ppc-xlate.pl index c075d5fce0..e85ad2ef95 100755..120000 --- a/devel/perlasm/ppc-xlate.pl +++ b/devel/perlasm/ppc-xlate.pl @@ -1,180 +1 @@ -#!/usr/bin/env perl - -# PowerPC assembler distiller by <appro>. - -my $flavour = shift; -my $output = shift; -open STDOUT,">$output" || die "can't open $output: $!"; - -my %GLOBALS; -my $dotinlocallabels=($flavour=~/linux/)?1:0; - -################################################################ -# directives which need special treatment on different platforms -################################################################ -my $globl = sub { - my $junk = shift; - my $name = shift; - my $global = \$GLOBALS{$name}; - my $ret; - - $name =~ s|^[\.\_]||; - - SWITCH: for ($flavour) { - /aix/ && do { $name = ".$name"; - last; - }; - /osx/ && do { $name = "_$name"; - last; - }; - /linux.*32/ && do { $ret .= ".globl $name\n"; - $ret .= ".type $name,\@function"; - last; - }; - /linux.*64/ && do { $ret .= ".globl $name\n"; - $ret .= ".type $name,\@function\n"; - $ret .= ".section \".opd\",\"aw\"\n"; - $ret .= ".align 3\n"; - $ret .= "$name:\n"; - $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; - $ret .= ".previous\n"; - - $name = ".$name"; - last; - }; - } - - $ret = ".globl $name" if (!$ret); - $$global = $name; - $ret; -}; -my $text = sub { - ($flavour =~ /aix/) ? ".csect" : ".text"; -}; -my $machine = sub { - my $junk = shift; - my $arch = shift; - if ($flavour =~ /osx/) - { $arch =~ s/\"//g; - $arch = ($flavour=~/64/) ? "ppc970-64" : "ppc970" if ($arch eq "any"); - } - ".machine $arch"; -}; -my $size = sub { - if ($flavour =~ /linux/) - { shift; - my $name = shift; $name =~ s|^[\.\_]||; - my $ret = ".size $name,.-".($flavour=~/64/?".":"").$name; - $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64/); - $ret; - } - else - { ""; } -}; -my $asciz = sub { - shift; - my $line = join(",",@_); - if ($line =~ /^"(.*)"$/) - { ".byte " . join(",",unpack("C*",$1),0) . "\n.align 2"; } - else - { ""; } -}; -my $quad = sub { - shift; - my @ret; - my ($hi,$lo); - for (@_) { - if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io) - { $hi=$1?"0x$1":"0"; $lo="0x$2"; } - elsif (/^([0-9]+)$/o) - { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl - else - { $hi=undef; $lo=$_; } - - if (defined($hi)) - { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); } - else - { push(@ret,".quad $lo"); } - } - join("\n",@ret); -}; - -################################################################ -# simplified mnemonics not handled by at least one assembler -################################################################ -my $cmplw = sub { - my $f = shift; - my $cr = 0; $cr = shift if ($#_>1); - # Some out-of-date 32-bit GNU assembler just can't handle cmplw... - ($flavour =~ /linux.*32/) ? - " .long ".sprintf "0x%x",31<<26|$cr<<23|$_[0]<<16|$_[1]<<11|64 : - " cmplw ".join(',',$cr,@_); -}; -my $bdnz = sub { - my $f = shift; - my $bo = $f=~/[\+\-]/ ? 16+9 : 16; # optional "to be taken" hint - " bc $bo,0,".shift; -} if ($flavour!~/linux/); -my $bltlr = sub { - my $f = shift; - my $bo = $f=~/\-/ ? 12+2 : 12; # optional "not to be taken" hint - ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints - " .long ".sprintf "0x%x",19<<26|$bo<<21|16<<1 : - " bclr $bo,0"; -}; -my $bnelr = sub { - my $f = shift; - my $bo = $f=~/\-/ ? 4+2 : 4; # optional "not to be taken" hint - ($flavour =~ /linux/) ? 
# GNU as doesn't allow most recent hints - " .long ".sprintf "0x%x",19<<26|$bo<<21|2<<16|16<<1 : - " bclr $bo,2"; -}; -my $beqlr = sub { - my $f = shift; - my $bo = $f=~/-/ ? 12+2 : 12; # optional "not to be taken" hint - ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints - " .long ".sprintf "0x%X",19<<26|$bo<<21|2<<16|16<<1 : - " bclr $bo,2"; -}; -# GNU assembler can't handle extrdi rA,rS,16,48, or when sum of last two -# arguments is 64, with "operand out of range" error. -my $extrdi = sub { - my ($f,$ra,$rs,$n,$b) = @_; - $b = ($b+$n)&63; $n = 64-$n; - " rldicl $ra,$rs,$b,$n"; -}; - -while($line=<>) { - - $line =~ s|[#!;].*$||; # get rid of asm-style comments... - $line =~ s|/\*.*\*/||; # ... and C-style comments... - $line =~ s|^\s+||; # ... and skip white spaces in beginning... - $line =~ s|\s+$||; # ... and at the end - - { - $line =~ s|\b\.L(\w+)|L$1|g; # common denominator for Locallabel - $line =~ s|\bL(\w+)|\.L$1|g if ($dotinlocallabels); - } - - { - $line =~ s|(^[\.\w]+)\:\s*||; - my $label = $1; - printf "%s:",($GLOBALS{$label} or $label) if ($label); - } - - { - $line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||; - my $c = $1; $c = "\t" if ($c eq ""); - my $mnemonic = $2; - my $f = $3; - my $opcode = eval("\$$mnemonic"); - $line =~ s|\bc?[rf]([0-9]+)\b|$1|g if ($c ne "." and $flavour !~ /osx/); - if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); } - elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; } - } - - print $line if ($line); - print "\n"; -} - -close STDOUT; +.././openssl/crypto/perlasm/ppc-xlate.pl
\ No newline at end of file
diff --git a/devel/perlasm/sha1-ssse3-x86.pl b/devel/perlasm/sha1-ssse3-x86.pl
index 632dbbe122..97a21e2067 100644..120000
--- a/devel/perlasm/sha1-ssse3-x86.pl
+++ b/devel/perlasm/sha1-ssse3-x86.pl
@@ -1,1266 +1 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# "[Re]written" was achieved in two major overhauls. In 2004 the BODY_*
-# functions were re-implemented to address the P4 performance issue
-# [see commentary below], and in 2006 the rest was rewritten in order
-# to gain freedom to liberate licensing terms.
-
-# January, September 2004.
-#
-# It was noted that the Intel IA-32 C compiler generates code which
-# performs ~30% *faster* on P4 CPUs than the original *hand-coded*
-# SHA1 assembler implementation. To address this problem (and
-# prove that humans are still better than machines:-), the
-# original code was overhauled, which resulted in the following
-# performance changes:
-#
-#		compared with original	compared with Intel cc
-#		assembler impl.		generated code
-# Pentium	-16%			+48%
-# PIII/AMD	+8%			+16%
-# P4		+85%(!)			+45%
-#
-# As you can see Pentium came out as the loser:-( Yet I reckoned that
-# the improvement on P4 outweighs the loss and incorporated this
-# re-tuned code into 0.9.7 and later.
-# ----------------------------------------------------------------
-# <appro@fy.chalmers.se>
-
-# August 2009.
-#
-# George Spelvin tipped me off that F_40_59(b,c,d) can be rewritten as
-# '(c&d) + (b&(c^d))', which allows accumulating partial results
-# and lightens the "pressure" on scratch registers. This resulted in
-# >12% performance improvement on contemporary AMD cores (with no
-# degradation on other CPUs:-). Also, the code was revised to maximize
-# "distance" between instructions producing input to the 'lea'
-# instruction and the 'lea' instruction itself, which is essential for
-# the Intel Atom core and resulted in ~15% improvement.
-
-# October 2010.
-#
-# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
-# is to offload the message schedule, denoted by Wt in the NIST
-# specification, or Xupdate in OpenSSL source, to the SIMD unit. The
-# idea is not novel, and in the SSE2 context was first explored by
-# Dean Gaudet in 2004, see http://arctic.org/~dean/crypto/sha1.html.
-# Since then several things have changed that made it interesting
-# again:
-#
-# a) XMM units became faster and wider;
-# b) the instruction set became more versatile;
-# c) an important observation was made by Max Locktyukhin, which made
-#    it possible to reduce the number of instructions required to
-#    perform the operation in question, for further details see
-#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
-
-# April 2011.
-#
-# Add AVX code path, probably the most controversial... The thing is
-# that switching to AVX alone improves performance by as little as 4%
-# in comparison to the SSSE3 code path. But the result below doesn't
-# look like a 4% improvement... The trouble is that Sandy Bridge
-# decodes 'ro[rl]' as a pair of µ-ops, and it's the additional µ-ops,
-# two per round, that make it run slower than Core2 and Westmere.
But 'sh[rl]d' is decoded -# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with -# equivalent 'sh[rl]d' that is responsible for the impressive 5.1 -# cycles per processed byte. But 'sh[rl]d' is not something that used -# to be fast, nor does it appear to be fast in upcoming Bulldozer -# [according to its optimization manual]. Which is why AVX code path -# is guarded by *both* AVX and synthetic bit denoting Intel CPUs. -# One can argue that it's unfair to AMD, but without 'sh[rl]d' it -# makes no sense to keep the AVX code path. If somebody feels that -# strongly, it's probably more appropriate to discuss possibility of -# using vector rotate XOP on AMD... - -###################################################################### -# Current performance is summarized in following table. Numbers are -# CPU clock cycles spent to process single byte (less is better). -# -# x86 SSSE3 AVX -# Pentium 15.7 - -# PIII 11.5 - -# P4 10.6 - -# AMD K8 7.1 - -# Core2 7.3 6.0/+22% - -# Atom 12.5 9.3(*)/+35% - -# Westmere 7.3 5.5/+33% - -# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% -# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% -# Bulldozer 11.6 6.0/+92% -# VIA Nano 10.6 7.4/+43% -# -# (*) Loop is 1056 instructions long and expected result is ~8.25. -# It remains mystery [to me] why ILP is limited to 1.7. -# -# (**) As per above comment, the result is for AVX *plus* sh[rl]d. - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); - -$xmm=$ymm=0; -for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } - -$ymm=1 if ($xmm && - `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/ && - $1>=2.19); # first version supporting AVX - -$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && - $1>=2.03); # first version supporting AVX - -$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && - `ml 2>&1` =~ /Version ([0-9]+)\./ && - $1>=10); # first version supporting AVX - -&external_label("OPENSSL_ia32cap_P") if ($xmm); - - -$A="eax"; -$B="ebx"; -$C="ecx"; -$D="edx"; -$E="edi"; -$T="esi"; -$tmp1="ebp"; - -@V=($A,$B,$C,$D,$E,$T); - -$alt=0; # 1 denotes alternative IALU implementation, which performs - # 8% *worse* on P4, same on Westmere and Atom, 2% better on - # Sandy Bridge... - -sub BODY_00_15 - { - local($n,$a,$b,$c,$d,$e,$f)=@_; - - &comment("00_15 $n"); - - &mov($f,$c); # f to hold F_00_19(b,c,d) - if ($n==0) { &mov($tmp1,$a); } - else { &mov($a,$tmp1); } - &rotl($tmp1,5); # tmp1=ROTATE(a,5) - &xor($f,$d); - &add($tmp1,$e); # tmp1+=e; - &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded - # with xi, also note that e becomes - # f in next round... 
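# The selector function being assembled here is
# F_00_19(b,c,d) = (b & c) | (~b & d), computed in the equivalent
# branch-free form ((c ^ d) & b) ^ d, which needs one temporary and
# no NOT. The same expression as a one-line plain-Perl reference
# (illustration only, not part of the module):
#
#	sub F_00_19 { my ($b,$c,$d) = @_; (($c ^ $d) & $b) ^ $d }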
- &and($f,$b); - &rotr($b,2); # b=ROTATE(b,30) - &xor($f,$d); # f holds F_00_19(b,c,d) - &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi - - if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round - &add($f,$tmp1); } # f+=tmp1 - else { &add($tmp1,$f); } # f becomes a in next round - &mov($tmp1,$a) if ($alt && $n==15); - } - -sub BODY_16_19 - { - local($n,$a,$b,$c,$d,$e,$f)=@_; - - &comment("16_19 $n"); - -if ($alt) { - &xor($c,$d); - &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d - &xor($f,&swtmp(($n+8)%16)); - &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) - &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd - &rotl($f,1); # f=ROTATE(f,1) - &add($e,$tmp1); # e+=F_00_19(b,c,d) - &xor($c,$d); # restore $c - &mov($tmp1,$a); # b in next round - &rotr($b,$n==16?2:7); # b=ROTATE(b,30) - &mov(&swtmp($n%16),$f); # xi=f - &rotl($a,5); # ROTATE(a,5) - &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e - &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round - &add($f,$a); # f+=ROTATE(a,5) -} else { - &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) - &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &xor($tmp1,$d); - &xor($f,&swtmp(($n+8)%16)); - &and($tmp1,$b); - &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd - &rotl($f,1); # f=ROTATE(f,1) - &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) - &add($e,$tmp1); # e+=F_00_19(b,c,d) - &mov($tmp1,$a); - &rotr($b,2); # b=ROTATE(b,30) - &mov(&swtmp($n%16),$f); # xi=f - &rotl($tmp1,5); # ROTATE(a,5) - &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e - &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round - &add($f,$tmp1); # f+=ROTATE(a,5) -} - } - -sub BODY_20_39 - { - local($n,$a,$b,$c,$d,$e,$f)=@_; - local $K=($n<40)?0x6ed9eba1:0xca62c1d6; - - &comment("20_39 $n"); - -if ($alt) { - &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c - &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) - &xor($f,&swtmp(($n+8)%16)); - &add($e,$tmp1); # e+=F_20_39(b,c,d) - &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd - &rotl($f,1); # f=ROTATE(f,1) - &mov($tmp1,$a); # b in next round - &rotr($b,7); # b=ROTATE(b,30) - &mov(&swtmp($n%16),$f) if($n<77);# xi=f - &rotl($a,5); # ROTATE(a,5) - &xor($b,$c) if($n==39);# warm up for BODY_40_59 - &and($tmp1,$b) if($n==39); - &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY - &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round - &add($f,$a); # f+=ROTATE(a,5) - &rotr($a,5) if ($n==79); -} else { - &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) - &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &xor($tmp1,$c); - &xor($f,&swtmp(($n+8)%16)); - &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) - &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd - &rotl($f,1); # f=ROTATE(f,1) - &add($e,$tmp1); # e+=F_20_39(b,c,d) - &rotr($b,2); # b=ROTATE(b,30) - &mov($tmp1,$a); - &rotl($tmp1,5); # ROTATE(a,5) - &mov(&swtmp($n%16),$f) if($n<77);# xi=f - &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY - &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round - &add($f,$tmp1); # f+=ROTATE(a,5) -} - } - -sub BODY_40_59 - { - local($n,$a,$b,$c,$d,$e,$f)=@_; - - &comment("40_59 $n"); - -if ($alt) { - &add($e,$tmp1); # e+=b&(c^d) - &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &mov($tmp1,$d); - &xor($f,&swtmp(($n+8)%16)); - &xor($c,$d); # restore $c - &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd - &rotl($f,1); # f=ROTATE(f,1) - &and($tmp1,$c); - 
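# Both branches split the majority function with George Spelvin's
# identity from the header: Maj(b,c,d) = (b&c)|(c&d)|(b&d)
# = (c & d) + (b & (c ^ d)), valid because the two terms are never 1
# in the same bit position, which is why "e += b&(c^d)" and "e += c&d"
# can be accumulated separately. A plain-Perl check of the identity
# (illustration only):
#
#	sub Maj       { my ($b,$c,$d) = @_; ($b&$c) | ($c&$d) | ($b&$d) }
#	sub Maj_split { my ($b,$c,$d) = @_; ($c & $d) + ($b & ($c ^ $d)) }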
&rotr($b,7); # b=ROTATE(b,30) - &add($e,$tmp1); # e+=c&d - &mov($tmp1,$a); # b in next round - &mov(&swtmp($n%16),$f); # xi=f - &rotl($a,5); # ROTATE(a,5) - &xor($b,$c) if ($n<59); - &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d) - &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d)) - &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round - &add($f,$a); # f+=ROTATE(a,5) -} else { - &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d) - &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &xor($tmp1,$d); - &xor($f,&swtmp(($n+8)%16)); - &and($tmp1,$b); - &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd - &rotl($f,1); # f=ROTATE(f,1) - &add($tmp1,$e); # b&(c^d)+=e - &rotr($b,2); # b=ROTATE(b,30) - &mov($e,$a); # e becomes volatile - &rotl($e,5); # ROTATE(a,5) - &mov(&swtmp($n%16),$f); # xi=f - &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) - &mov($tmp1,$c); - &add($f,$e); # f+=ROTATE(a,5) - &and($tmp1,$d); - &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round - &add($f,$tmp1); # f+=c&d -} - } - -&function_begin("sha1_block_data_order"); -if ($xmm) { - &static_label("ssse3_shortcut"); - &static_label("avx_shortcut") if ($ymm); - &static_label("K_XX_XX"); - - &call (&label("pic_point")); # make it PIC! - &set_label("pic_point"); - &blindpop($tmp1); - &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point")); - &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); - - &mov ($A,&DWP(0,$T)); - &mov ($D,&DWP(4,$T)); - &test ($D,1<<9); # check SSSE3 bit - &jz (&label("x86")); - &test ($A,1<<24); # check FXSR bit - &jz (&label("x86")); - if ($ymm) { - &and ($D,1<<28); # mask AVX bit - &and ($A,1<<30); # mask "Intel CPU" bit - &or ($A,$D); - &cmp ($A,1<<28|1<<30); - &je (&label("avx_shortcut")); - } - &jmp (&label("ssse3_shortcut")); - &set_label("x86",16); -} - &mov($tmp1,&wparam(0)); # SHA_CTX *c - &mov($T,&wparam(1)); # const void *input - &mov($A,&wparam(2)); # size_t num - &stack_push(16+3); # allocate X[16] - &shl($A,6); - &add($A,$T); - &mov(&wparam(2),$A); # pointer beyond the end of input - &mov($E,&DWP(16,$tmp1));# pre-load E - &jmp(&label("loop")); - -&set_label("loop",16); - - # copy input chunk to X, but reversing byte order! - for ($i=0; $i<16; $i+=4) - { - &mov($A,&DWP(4*($i+0),$T)); - &mov($B,&DWP(4*($i+1),$T)); - &mov($C,&DWP(4*($i+2),$T)); - &mov($D,&DWP(4*($i+3),$T)); - &bswap($A); - &bswap($B); - &bswap($C); - &bswap($D); - &mov(&swtmp($i+0),$A); - &mov(&swtmp($i+1),$B); - &mov(&swtmp($i+2),$C); - &mov(&swtmp($i+3),$D); - } - &mov(&wparam(1),$T); # redundant in 1st spin - - &mov($A,&DWP(0,$tmp1)); # load SHA_CTX - &mov($B,&DWP(4,$tmp1)); - &mov($C,&DWP(8,$tmp1)); - &mov($D,&DWP(12,$tmp1)); - # E is pre-loaded - - for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } - for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); } - for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } - for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } - for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } - - (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check - - &mov($tmp1,&wparam(0)); # re-load SHA_CTX* - &mov($D,&wparam(1)); # D is last "T" and is discarded - - &add($E,&DWP(0,$tmp1)); # E is last "A"... - &add($T,&DWP(4,$tmp1)); - &add($A,&DWP(8,$tmp1)); - &add($B,&DWP(12,$tmp1)); - &add($C,&DWP(16,$tmp1)); - - &mov(&DWP(0,$tmp1),$E); # update SHA_CTX - &add($D,64); # advance input pointer - &mov(&DWP(4,$tmp1),$T); - &cmp($D,&wparam(2)); # have we reached the end yet? 
-	&mov(&DWP(8,$tmp1),$A);
-	&mov($E,$C);			# C is last "E" which needs to be "pre-loaded"
-	&mov(&DWP(12,$tmp1),$B);
-	&mov($T,$D);			# input pointer
-	&mov(&DWP(16,$tmp1),$C);
-	&jb(&label("loop"));
-
-	&stack_pop(16+3);
-&function_end("sha1_block_data_order");
-
-if ($xmm) {
-######################################################################
-# The SSSE3 implementation.
-#
-# %xmm[0-7] are used as a ring @X[] buffer containing quadruples of
-# the last 32 elements of the message schedule or Xupdate outputs.
-# The first 4 quadruples are simply byte-swapped input, the next 4 are
-# calculated according to the method originally suggested by Dean
-# Gaudet (modulo being implemented in SSSE3). Once 8 quadruples or 32
-# elements are collected, it switches to the routine proposed by Max
-# Locktyukhin.
-#
-# Calculations inevitably require temporary registers, and there are
-# no %xmm registers left to spare. For this reason part of the ring
-# buffer, X[2..4] to be specific, is offloaded to a 3-quadruple ring
-# buffer on the stack. Keep in mind that X[2] is an alias for X[-6],
-# X[3] for X[-5], and X[4] for X[-4]...
-#
-# Another notable optimization is aggressive stack frame compression,
-# aiming to minimize the number of 9-byte instructions...
-#
-# Yet another notable optimization is the "jumping" $B variable. It
-# means that there is no register permanently allocated for the $B
-# value. This allows eliminating one instruction from body_20_39...
-#
-my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
-my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
-my @V=($A,$B,$C,$D,$E);
-my $j=0;			# hash round
-my $rx=0;
-my @T=($T,$tmp1);
-my $inp;
-
-my $_rol=sub { &rol(@_) };
-my $_ror=sub { &ror(@_) };
-
-&function_begin("_sha1_block_data_order_ssse3");
-	&call	(&label("pic_point"));	# make it PIC!
- &set_label("pic_point"); - &blindpop($tmp1); - &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); -&set_label("ssse3_shortcut"); - - &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19 - &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39 - &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59 - &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79 - &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask - - &mov ($E,&wparam(0)); # load argument block - &mov ($inp=@T[1],&wparam(1)); - &mov ($D,&wparam(2)); - &mov (@T[0],"esp"); - - # stack frame layout - # - # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area - # X[4]+K X[5]+K X[6]+K X[7]+K - # X[8]+K X[9]+K X[10]+K X[11]+K - # X[12]+K X[13]+K X[14]+K X[15]+K - # - # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area - # X[4] X[5] X[6] X[7] - # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 - # - # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants - # K_40_59 K_40_59 K_40_59 K_40_59 - # K_60_79 K_60_79 K_60_79 K_60_79 - # K_00_19 K_00_19 K_00_19 K_00_19 - # pbswap mask - # - # +192 ctx # argument block - # +196 inp - # +200 end - # +204 esp - &sub ("esp",208); - &and ("esp",-64); - - &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants - &movdqa (&QWP(112+16,"esp"),@X[5]); - &movdqa (&QWP(112+32,"esp"),@X[6]); - &shl ($D,6); # len*64 - &movdqa (&QWP(112+48,"esp"),@X[3]); - &add ($D,$inp); # end of input - &movdqa (&QWP(112+64,"esp"),@X[2]); - &add ($inp,64); - &mov (&DWP(192+0,"esp"),$E); # save argument block - &mov (&DWP(192+4,"esp"),$inp); - &mov (&DWP(192+8,"esp"),$D); - &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp - - &mov ($A,&DWP(0,$E)); # load context - &mov ($B,&DWP(4,$E)); - &mov ($C,&DWP(8,$E)); - &mov ($D,&DWP(12,$E)); - &mov ($E,&DWP(16,$E)); - &mov (@T[0],$B); # magic seed - - &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] - &movdqu (@X[-3&7],&QWP(-48,$inp)); - &movdqu (@X[-2&7],&QWP(-32,$inp)); - &movdqu (@X[-1&7],&QWP(-16,$inp)); - &pshufb (@X[-4&7],@X[2]); # byte swap - &pshufb (@X[-3&7],@X[2]); - &pshufb (@X[-2&7],@X[2]); - &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot - &pshufb (@X[-1&7],@X[2]); - &paddd (@X[-4&7],@X[3]); # add K_00_19 - &paddd (@X[-3&7],@X[3]); - &paddd (@X[-2&7],@X[3]); - &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU - &psubd (@X[-4&7],@X[3]); # restore X[] - &movdqa (&QWP(0+16,"esp"),@X[-3&7]); - &psubd (@X[-3&7],@X[3]); - &movdqa (&QWP(0+32,"esp"),@X[-2&7]); - &mov (@T[1],$C); - &psubd (@X[-2&7],@X[3]); - &xor (@T[1],$D); - &movdqa (@X[0],@X[-3&7]); - &and (@T[0],@T[1]); - &jmp (&label("loop")); - -###################################################################### -# SSE instruction sequence is first broken to groups of indepentent -# instructions, independent in respect to their inputs and shifter -# (not all architectures have more than one). Then IALU instructions -# are "knitted in" between the SSE groups. Distance is maintained for -# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer -# [which allegedly also implements SSSE3]... -# -# Temporary registers usage. X[2] is volatile at the entry and at the -# end is restored from backtrace ring buffer. X[3] is expected to -# contain current K_XX_XX constant and is used to caclulate X[-1]+K -# from previous round, it becomes volatile the moment the value is -# saved to stack for transfer to IALU. X[4] becomes volatile whenever -# X[-4] is accumulated and offloaded to backtrace ring buffer, at the -# end it is loaded with next K_XX_XX [which becomes X[3] in next -# round]... 
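# The scalar recurrence that the Xupdate_* subs below vectorize four
# words at a time is the SHA-1 message schedule
#
#	W[t] = ROTL1( W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16] )
#
# or, as a minimal plain-Perl reference (an illustration, not part of
# the module):
#
#	sub Xupdate {
#	    my ($X,$t) = @_;	# $X: array ref holding the schedule
#	    my $w = $X->[$t-3] ^ $X->[$t-8] ^ $X->[$t-14] ^ $X->[$t-16];
#	    $X->[$t] = (($w << 1) | ($w >> 31)) & 0xffffffff;	# rol 1
#	}
#
# Within one quadruple, W[t-3] is not yet available for the last lane,
# which is what the "X[0]"<<96 extract-and-fix sequence below
# compensates for.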
-# -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 40 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" - &movdqa (@X[2],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - - &paddd (@X[3],@X[-1&7]); - &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer - eval(shift(@insns)); - eval(shift(@insns)); - &psrldq (@X[2],4); # "X[-3]", 3 dwords - eval(shift(@insns)); - eval(shift(@insns)); - &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - &movdqa (@X[4],@X[0]); - &movdqa (@X[2],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &pslldq (@X[4],12); # "X[0]"<<96, extract one dword - &paddd (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &psrld (@X[2],31); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@X[3],@X[4]); - eval(shift(@insns)); - eval(shift(@insns)); - - &psrld (@X[4],30); - &por (@X[0],@X[2]); # "X[0]"<<<=1 - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer - eval(shift(@insns)); - eval(shift(@insns)); - - &pslld (@X[3],2); - &pxor (@X[0],@X[4]); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 - &movdqa (@X[1],@X[-2&7]) if ($Xi<7); - eval(shift(@insns)); - eval(shift(@insns)); - - foreach (@insns) { eval; } # remaining instructions [if any] - - $Xi++; push(@X,shift(@X)); # "rotate" X[] -} - -sub Xupdate_ssse3_32_79() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions - my ($a,$b,$c,$d,$e); - - &movdqa (@X[2],@X[-1&7]) if ($Xi==8); - eval(shift(@insns)); # body_20_39 - &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - &palignr(@X[2],@X[-2&7],8); # compose "X[-6]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" - &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer - eval(shift(@insns)); - eval(shift(@insns)); - if ($Xi%5) { - &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... - } else { # ... 
or load next one - &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); - } - &paddd (@X[3],@X[-1&7]); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &movdqa (@X[2],@X[0]); - &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &pslld (@X[0],2); - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - &psrld (@X[2],30); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &por (@X[0],@X[2]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - &movdqa (@X[3],@X[0]) if ($Xi<19); - eval(shift(@insns)); - - foreach (@insns) { eval; } # remaining instructions - - $Xi++; push(@X,shift(@X)); # "rotate" X[] -} - -sub Xuplast_ssse3_80() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - &paddd (@X[3],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU - - foreach (@insns) { eval; } # remaining instructions - - &mov ($inp=@T[1],&DWP(192+4,"esp")); - &cmp ($inp,&DWP(192+8,"esp")); - &je (&label("done")); - - &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19 - &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask - &movdqu (@X[-4&7],&QWP(0,$inp)); # load input - &movdqu (@X[-3&7],&QWP(16,$inp)); - &movdqu (@X[-2&7],&QWP(32,$inp)); - &movdqu (@X[-1&7],&QWP(48,$inp)); - &add ($inp,64); - &pshufb (@X[-4&7],@X[2]); # byte swap - &mov (&DWP(192+4,"esp"),$inp); - &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot - - $Xi=0; -} - -sub Xloop_ssse3() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &pshufb (@X[($Xi-3)&7],@X[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[($Xi-4)&7],@X[3]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - &psubd (@X[($Xi-4)&7],@X[3]); - - foreach (@insns) { eval; } - $Xi++; -} - -sub Xtail_ssse3() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - foreach (@insns) { eval; } -} - -sub body_00_19 () { # ((c^d)&b)^d - # on start @T[0]=(c^d)&b - return &body_20_39() if ($rx==19); $rx++; - ( - '($a,$b,$c,$d,$e)=@V;'. 
- '&$_ror ($b,$j?7:2);', # $b>>>2 - '&xor (@T[0],$d);', - '&mov (@T[1],$a);', # $b in next round - - '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer - '&xor ($b,$c);', # $c^$d for next round - - '&$_rol ($a,5);', - '&add ($e,@T[0]);', - '&and (@T[1],$b);', # ($b&($c^$d)) for next round - - '&xor ($b,$c);', # restore $b - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); -} - -sub body_20_39 () { # b^d^c - # on entry @T[0]=b^d - return &body_40_59() if ($rx==39); $rx++; - ( - '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer - '&xor (@T[0],$d) if($j==19);'. - '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) - '&mov (@T[1],$a);', # $b in next round - - '&$_rol ($a,5);', - '&add ($e,@T[0]);', - '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round - - '&$_ror ($b,7);', # $b>>>2 - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); -} - -sub body_40_59 () { # ((b^c)&(c^d))^c - # on entry @T[0]=(b^c), (c^=d) - $rx++; - ( - '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer - '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) - '&xor ($c,$d) if ($j>=40);', # restore $c - - '&$_ror ($b,7);', # $b>>>2 - '&mov (@T[1],$a);', # $b for next round - '&xor (@T[0],$c);', - - '&$_rol ($a,5);', - '&add ($e,@T[0]);', - '&xor (@T[1],$c) if ($j==59);'. - '&xor (@T[1],$b) if ($j< 59);', # b^c for next round - - '&xor ($b,$c) if ($j< 59);', # c^d for next round - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); -} - -&set_label("loop",16); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_32_79(\&body_00_19); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" - - $saved_j=$j; @saved_V=@V; - - &Xloop_ssse3(\&body_20_39); - &Xloop_ssse3(\&body_20_39); - &Xloop_ssse3(\&body_20_39); - - &mov (@T[1],&DWP(192,"esp")); # update context - &add ($A,&DWP(0,@T[1])); - &add (@T[0],&DWP(4,@T[1])); # $b - &add ($C,&DWP(8,@T[1])); - &mov (&DWP(0,@T[1]),$A); - &add ($D,&DWP(12,@T[1])); - &mov (&DWP(4,@T[1]),@T[0]); - &add ($E,&DWP(16,@T[1])); - &mov (&DWP(8,@T[1]),$C); - &mov ($B,$C); - &mov (&DWP(12,@T[1]),$D); - &xor ($B,$D); - &mov (&DWP(16,@T[1]),$E); - &and ($B,@T[0]); - &movdqa (@X[0],@X[-3&7]); - &xchg ($B,@T[0]); - - &jmp (&label("loop")); - -&set_label("done",16); $j=$saved_j; @V=@saved_V; - - &Xtail_ssse3(\&body_20_39); - &Xtail_ssse3(\&body_20_39); - &Xtail_ssse3(\&body_20_39); - - &mov (@T[1],&DWP(192,"esp")); # update context - &add ($A,&DWP(0,@T[1])); - &mov ("esp",&DWP(192+12,"esp")); # restore %esp - &add (@T[0],&DWP(4,@T[1])); # $b - &add ($C,&DWP(8,@T[1])); - &mov (&DWP(0,@T[1]),$A); - &add ($D,&DWP(12,@T[1])); - &mov (&DWP(4,@T[1]),@T[0]); - &add ($E,&DWP(16,@T[1])); - &mov (&DWP(8,@T[1]),$C); - &mov (&DWP(12,@T[1]),$D); - &mov (&DWP(16,@T[1]),$E); - -&function_end("_sha1_block_data_order_ssse3"); - -$rx=0; # reset - -if ($ymm) { -my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded -my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 -my @V=($A,$B,$C,$D,$E); -my $j=0; # 
hash round -my @T=($T,$tmp1); -my $inp; - -my $_rol=sub { &shld(@_[0],@_) }; -my $_ror=sub { &shrd(@_[0],@_) }; - -&function_begin("_sha1_block_data_order_avx"); - &call (&label("pic_point")); # make it PIC! - &set_label("pic_point"); - &blindpop($tmp1); - &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); -&set_label("avx_shortcut"); - &vzeroall(); - - &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19 - &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39 - &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59 - &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79 - &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask - - &mov ($E,&wparam(0)); # load argument block - &mov ($inp=@T[1],&wparam(1)); - &mov ($D,&wparam(2)); - &mov (@T[0],"esp"); - - # stack frame layout - # - # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area - # X[4]+K X[5]+K X[6]+K X[7]+K - # X[8]+K X[9]+K X[10]+K X[11]+K - # X[12]+K X[13]+K X[14]+K X[15]+K - # - # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area - # X[4] X[5] X[6] X[7] - # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 - # - # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants - # K_40_59 K_40_59 K_40_59 K_40_59 - # K_60_79 K_60_79 K_60_79 K_60_79 - # K_00_19 K_00_19 K_00_19 K_00_19 - # pbswap mask - # - # +192 ctx # argument block - # +196 inp - # +200 end - # +204 esp - &sub ("esp",208); - &and ("esp",-64); - - &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants - &vmovdqa(&QWP(112+16,"esp"),@X[5]); - &vmovdqa(&QWP(112+32,"esp"),@X[6]); - &shl ($D,6); # len*64 - &vmovdqa(&QWP(112+48,"esp"),@X[3]); - &add ($D,$inp); # end of input - &vmovdqa(&QWP(112+64,"esp"),@X[2]); - &add ($inp,64); - &mov (&DWP(192+0,"esp"),$E); # save argument block - &mov (&DWP(192+4,"esp"),$inp); - &mov (&DWP(192+8,"esp"),$D); - &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp - - &mov ($A,&DWP(0,$E)); # load context - &mov ($B,&DWP(4,$E)); - &mov ($C,&DWP(8,$E)); - &mov ($D,&DWP(12,$E)); - &mov ($E,&DWP(16,$E)); - &mov (@T[0],$B); # magic seed - - &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] - &vmovdqu(@X[-3&7],&QWP(-48,$inp)); - &vmovdqu(@X[-2&7],&QWP(-32,$inp)); - &vmovdqu(@X[-1&7],&QWP(-16,$inp)); - &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap - &vpshufb(@X[-3&7],@X[-3&7],@X[2]); - &vpshufb(@X[-2&7],@X[-2&7],@X[2]); - &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot - &vpshufb(@X[-1&7],@X[-1&7],@X[2]); - &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19 - &vpaddd (@X[1],@X[-3&7],@X[3]); - &vpaddd (@X[2],@X[-2&7],@X[3]); - &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU - &mov (@T[1],$C); - &vmovdqa(&QWP(0+16,"esp"),@X[1]); - &xor (@T[1],$D); - &vmovdqa(&QWP(0+32,"esp"),@X[2]); - &and (@T[0],@T[1]); - &jmp (&label("loop")); - -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 40 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpaddd (@X[3],@X[3],@X[-1&7]); - &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K 
xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@X[2],@X[0],31); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword - &vpaddd (@X[0],@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@X[3],@X[4],30); - &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslld (@X[4],@X[4],2); - &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor (@X[0],@X[0],@X[3]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2 - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX - eval(shift(@insns)); - eval(shift(@insns)); - - foreach (@insns) { eval; } # remaining instructions [if any] - - $Xi++; push(@X,shift(@X)); # "rotate" X[] -} - -sub Xupdate_avx_32_79() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions - my ($a,$b,$c,$d,$e); - - &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]" - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" - &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer - eval(shift(@insns)); - eval(shift(@insns)); - if ($Xi%5) { - &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... - } else { # ... 
or load next one - &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); - } - &vpaddd (@X[3],@X[3],@X[-1&7]); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &vpsrld (@X[2],@X[0],30); - &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpslld (@X[0],@X[0],2); - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - foreach (@insns) { eval; } # remaining instructions - - $Xi++; push(@X,shift(@X)); # "rotate" X[] -} - -sub Xuplast_avx_80() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - &vpaddd (@X[3],@X[3],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU - - foreach (@insns) { eval; } # remaining instructions - - &mov ($inp=@T[1],&DWP(192+4,"esp")); - &cmp ($inp,&DWP(192+8,"esp")); - &je (&label("done")); - - &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19 - &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask - &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input - &vmovdqu(@X[-3&7],&QWP(16,$inp)); - &vmovdqu(@X[-2&7],&QWP(32,$inp)); - &vmovdqu(@X[-1&7],&QWP(48,$inp)); - &add ($inp,64); - &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap - &mov (&DWP(192+4,"esp"),$inp); - &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot - - $Xi=0; -} - -sub Xloop_avx() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - foreach (@insns) { eval; } - $Xi++; -} - -sub Xtail_avx() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - foreach (@insns) { eval; } -} - -&set_label("loop",16); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_32_79(\&body_00_19); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_20_39); - &Xuplast_avx_80(\&body_20_39); # 
can jump to "done" - - $saved_j=$j; @saved_V=@V; - - &Xloop_avx(\&body_20_39); - &Xloop_avx(\&body_20_39); - &Xloop_avx(\&body_20_39); - - &mov (@T[1],&DWP(192,"esp")); # update context - &add ($A,&DWP(0,@T[1])); - &add (@T[0],&DWP(4,@T[1])); # $b - &add ($C,&DWP(8,@T[1])); - &mov (&DWP(0,@T[1]),$A); - &add ($D,&DWP(12,@T[1])); - &mov (&DWP(4,@T[1]),@T[0]); - &add ($E,&DWP(16,@T[1])); - &mov ($B,$C); - &mov (&DWP(8,@T[1]),$C); - &xor ($B,$D); - &mov (&DWP(12,@T[1]),$D); - &and ($B,@T[0]); - &mov (&DWP(16,@T[1]),$E); - &xchg ($B,@T[0]); - - &jmp (&label("loop")); - -&set_label("done",16); $j=$saved_j; @V=@saved_V; - - &Xtail_avx(\&body_20_39); - &Xtail_avx(\&body_20_39); - &Xtail_avx(\&body_20_39); - - &vzeroall(); - - &mov (@T[1],&DWP(192,"esp")); # update context - &add ($A,&DWP(0,@T[1])); - &mov ("esp",&DWP(192+12,"esp")); # restore %esp - &add (@T[0],&DWP(4,@T[1])); # $b - &add ($C,&DWP(8,@T[1])); - &mov (&DWP(0,@T[1]),$A); - &add ($D,&DWP(12,@T[1])); - &mov (&DWP(4,@T[1]),@T[0]); - &add ($E,&DWP(16,@T[1])); - &mov (&DWP(8,@T[1]),$C); - &mov (&DWP(12,@T[1]),$D); - &mov (&DWP(16,@T[1]),$E); -&function_end("_sha1_block_data_order_avx"); -} -&set_label("K_XX_XX",64); -&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19 -&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39 -&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59 -&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79 -&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask -} -&asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); - -&asm_finish(); +../openssl/./crypto/sha/asm/sha1-586.pl
\ No newline at end of file diff --git a/devel/perlasm/sha1-ssse3-x86_64.pl b/devel/perlasm/sha1-ssse3-x86_64.pl index 2c89b1feea..9502f766be 100755..120000 --- a/devel/perlasm/sha1-ssse3-x86_64.pl +++ b/devel/perlasm/sha1-ssse3-x86_64.pl @@ -1,1815 +1 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# sha1_block procedure for x86_64. -# -# It was brought to my attention that on EM64T compiler-generated code -# was far behind 32-bit assembler implementation. This is unlike on -# Opteron where compiler-generated code was only 15% behind 32-bit -# assembler, which originally made it hard to motivate the effort. -# There was suggestion to mechanically translate 32-bit code, but I -# dismissed it, reasoning that x86_64 offers enough register bank -# capacity to fully utilize SHA-1 parallelism. Therefore this fresh -# implementation:-) However! While 64-bit code does perform better -# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, -# x86_64 does offer larger *addressable* bank, but out-of-order core -# reaches for even more registers through dynamic aliasing, and EM64T -# core must have managed to run-time optimize even 32-bit code just as -# good as 64-bit one. Performance improvement is summarized in the -# following table: -# -# gcc 3.4 32-bit asm cycles/byte -# Opteron +45% +20% 6.8 -# Xeon P4 +65% +0% 9.9 -# Core2 +60% +10% 7.0 - -# August 2009. -# -# The code was revised to minimize code size and to maximize -# "distance" between instructions producing input to 'lea' -# instruction and the 'lea' instruction itself, which is essential -# for Intel Atom core. - -# October 2010. -# -# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it -# is to offload message schedule denoted by Wt in NIST specification, -# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module -# for background and implementation details. The only difference from -# 32-bit code is that 64-bit code doesn't have to spill @X[] elements -# to free temporary registers. - -# April 2011. -# -# Add AVX code path. See sha1-586.pl for further information. - -# May 2013. -# -# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions -# and loading pair of consecutive blocks to 256-bit %ymm registers) -# did not provide impressive performance improvement till a crucial -# hint regarding the number of Xupdate iterations to pre-compute in -# advance was provided by Ilya Albrekht of Intel Corp. - -###################################################################### -# Current performance is summarized in following table. Numbers are -# CPU clock cycles spent to process single byte (less is better). 
-# -# x86_64 SSSE3 AVX[2] -# P4 9.8 - -# Opteron 6.65 - -# Core2 6.70 6.05/+11% - -# Westmere 7.08 5.44/+30% - -# Sandy Bridge 7.93 6.16/+28% 4.99/+59% -# Ivy Bridge 6.30 4.63/+36% 4.60/+37% -# Haswell 5.98 4.36/+37% 3.57/+67% -# Bulldozer 10.9 5.95/+82% -# VIA Nano 10.2 7.46/+37% -# Atom 11.0 9.61/+14% - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); -} - -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.09) + ($1>=2.10); -} - -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); -} - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -$ctx="%rdi"; # 1st arg -$inp="%rsi"; # 2nd arg -$num="%rdx"; # 3rd arg - -# reassign arguments in order to produce more compact code -$ctx="%r8"; -$inp="%r9"; -$num="%r10"; - -$t0="%eax"; -$t1="%ebx"; -$t2="%ecx"; -@xi=("%edx","%ebp"); -$A="%esi"; -$B="%edi"; -$C="%r11d"; -$D="%r12d"; -$E="%r13d"; - -@V=($A,$B,$C,$D,$E); - -sub BODY_00_19 { -my ($i,$a,$b,$c,$d,$e)=@_; -my $j=$i+1; -$code.=<<___ if ($i==0); - mov `4*$i`($inp),$xi[0] - bswap $xi[0] - mov $xi[0],`4*$i`(%rsp) -___ -$code.=<<___ if ($i<15); - mov $c,$t0 - mov `4*$j`($inp),$xi[1] - mov $a,$t2 - xor $d,$t0 - bswap $xi[1] - rol \$5,$t2 - lea 0x5a827999($xi[0],$e),$e - and $b,$t0 - mov $xi[1],`4*$j`(%rsp) - add $t2,$e - xor $d,$t0 - rol \$30,$b - add $t0,$e -___ -$code.=<<___ if ($i>=15); - mov `4*($j%16)`(%rsp),$xi[1] - mov $c,$t0 - mov $a,$t2 - xor `4*(($j+2)%16)`(%rsp),$xi[1] - xor $d,$t0 - rol \$5,$t2 - xor `4*(($j+8)%16)`(%rsp),$xi[1] - and $b,$t0 - lea 0x5a827999($xi[0],$e),$e - xor `4*(($j+13)%16)`(%rsp),$xi[1] - xor $d,$t0 - rol \$1,$xi[1] - add $t2,$e - rol \$30,$b - mov $xi[1],`4*($j%16)`(%rsp) - add $t0,$e -___ -unshift(@xi,pop(@xi)); -} - -sub BODY_20_39 { -my ($i,$a,$b,$c,$d,$e)=@_; -my $j=$i+1; -my $K=($i<40)?0x6ed9eba1:0xca62c1d6; -$code.=<<___ if ($i<79); - mov `4*($j%16)`(%rsp),$xi[1] - mov $c,$t0 - mov $a,$t2 - xor `4*(($j+2)%16)`(%rsp),$xi[1] - xor $b,$t0 - rol \$5,$t2 - lea $K($xi[0],$e),$e - xor `4*(($j+8)%16)`(%rsp),$xi[1] - xor $d,$t0 - add $t2,$e - xor `4*(($j+13)%16)`(%rsp),$xi[1] - rol \$30,$b - add $t0,$e - rol \$1,$xi[1] -___ -$code.=<<___ if ($i<76); - mov $xi[1],`4*($j%16)`(%rsp) -___ -$code.=<<___ if ($i==79); - mov $c,$t0 - mov $a,$t2 - xor $b,$t0 - lea $K($xi[0],$e),$e - rol \$5,$t2 - xor $d,$t0 - add $t2,$e - rol \$30,$b - add $t0,$e -___ -unshift(@xi,pop(@xi)); -} - -sub BODY_40_59 { -my ($i,$a,$b,$c,$d,$e)=@_; -my $j=$i+1; -$code.=<<___; - mov `4*($j%16)`(%rsp),$xi[1] - mov $c,$t0 - mov $c,$t1 - xor `4*(($j+2)%16)`(%rsp),$xi[1] - and $d,$t0 - mov $a,$t2 - xor `4*(($j+8)%16)`(%rsp),$xi[1] - xor $d,$t1 - lea 0x8f1bbcdc($xi[0],$e),$e - rol \$5,$t2 - xor `4*(($j+13)%16)`(%rsp),$xi[1] - add $t0,$e - and $b,$t1 - rol \$1,$xi[1] - add $t1,$e - rol \$30,$b - mov $xi[1],`4*($j%16)`(%rsp) - add $t2,$e -___ -unshift(@xi,pop(@xi)); -} - -$code.=<<___; -.text -.extern OPENSSL_ia32cap_P - -.globl 
sha1_block_data_order -.type sha1_block_data_order,\@function,3 -.align 16 -sha1_block_data_order: - mov OPENSSL_ia32cap_P+0(%rip),%r9d - mov OPENSSL_ia32cap_P+4(%rip),%r8d - mov OPENSSL_ia32cap_P+8(%rip),%r10d - test \$`1<<9`,%r8d # check SSSE3 bit - jz .Lialu -___ -$code.=<<___ if ($avx>1); - and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2 - cmp \$`1<<3|1<<5|1<<8`,%r10d - je _avx2_shortcut -___ -$code.=<<___ if ($avx); - and \$`1<<28`,%r8d # mask AVX bit - and \$`1<<30`,%r9d # mask "Intel CPU" bit - or %r9d,%r8d - cmp \$`1<<28|1<<30`,%r8d - je _avx_shortcut -___ -$code.=<<___; - jmp _ssse3_shortcut - -.align 16 -.Lialu: - push %rbx - push %rbp - push %r12 - push %r13 - mov %rsp,%r11 - mov %rdi,$ctx # reassigned argument - sub \$`8+16*4`,%rsp - mov %rsi,$inp # reassigned argument - and \$-64,%rsp - mov %rdx,$num # reassigned argument - mov %r11,`16*4`(%rsp) -.Lprologue: - - mov 0($ctx),$A - mov 4($ctx),$B - mov 8($ctx),$C - mov 12($ctx),$D - mov 16($ctx),$E - jmp .Lloop - -.align 16 -.Lloop: -___ -for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } -for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } -for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } -for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - add 0($ctx),$A - add 4($ctx),$B - add 8($ctx),$C - add 12($ctx),$D - add 16($ctx),$E - mov $A,0($ctx) - mov $B,4($ctx) - mov $C,8($ctx) - mov $D,12($ctx) - mov $E,16($ctx) - - sub \$1,$num - lea `16*4`($inp),$inp - jnz .Lloop - - mov `16*4`(%rsp),%rsi - mov (%rsi),%r13 - mov 8(%rsi),%r12 - mov 16(%rsi),%rbp - mov 24(%rsi),%rbx - lea 32(%rsi),%rsp -.Lepilogue: - ret -.size sha1_block_data_order,.-sha1_block_data_order -___ -{{{ -my $Xi=4; -my @X=map("%xmm$_",(4..7,0..3)); -my @Tx=map("%xmm$_",(8..10)); -my $Kx="%xmm11"; -my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization -my @T=("%esi","%edi"); -my $j=0; -my $rx=0; -my $K_XX_XX="%r11"; - -my $_rol=sub { &rol(@_) }; -my $_ror=sub { &ror(@_) }; - -{ my $sn; -sub align32() { - ++$sn; -$code.=<<___; - jmp .Lalign32_$sn # see "Decoded ICache" in manual -.align 32 -.Lalign32_$sn: -___ -} -} - -$code.=<<___; -.type sha1_block_data_order_ssse3,\@function,3 -.align 16 -sha1_block_data_order_ssse3: -_ssse3_shortcut: - push %rbx - push %rbp - push %r12 - lea `-64-($win64?6*16:0)`(%rsp),%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,64+0(%rsp) - movaps %xmm7,64+16(%rsp) - movaps %xmm8,64+32(%rsp) - movaps %xmm9,64+48(%rsp) - movaps %xmm10,64+64(%rsp) - movaps %xmm11,64+80(%rsp) -.Lprologue_ssse3: -___ -$code.=<<___; - mov %rdi,$ctx # reassigned argument - mov %rsi,$inp # reassigned argument - mov %rdx,$num # reassigned argument - - shl \$6,$num - add $inp,$num - lea K_XX_XX+64(%rip),$K_XX_XX - - mov 0($ctx),$A # load context - mov 4($ctx),$B - mov 8($ctx),$C - mov 12($ctx),$D - mov $B,@T[0] # magic seed - mov 16($ctx),$E - mov $C,@T[1] - xor $D,@T[1] - and @T[1],@T[0] - - movdqa 64($K_XX_XX),@X[2] # pbswap mask - movdqa -64($K_XX_XX),@Tx[1] # K_00_19 - movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] - movdqu 16($inp),@X[-3&7] - movdqu 32($inp),@X[-2&7] - movdqu 48($inp),@X[-1&7] - pshufb @X[2],@X[-4&7] # byte swap - add \$64,$inp - pshufb @X[2],@X[-3&7] - pshufb @X[2],@X[-2&7] - pshufb @X[2],@X[-1&7] - paddd @Tx[1],@X[-4&7] # add K_00_19 - paddd @Tx[1],@X[-3&7] - paddd @Tx[1],@X[-2&7] - movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU - psubd @Tx[1],@X[-4&7] # restore X[] - movdqa @X[-3&7],16(%rsp) - psubd @Tx[1],@X[-3&7] - movdqa @X[-2&7],32(%rsp) - 
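# The movdqa/psubd pairs here implement the X[]+K transfer trick:
# K_00_19 was folded into the registers with paddd, the sum is spilled
# to the stack so the integer rounds can pick up "message word plus
# round constant" with a single memory add, and each psubd then strips
# K again so the registers retain the raw X[] values needed by the
# next Xupdate.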
psubd @Tx[1],@X[-2&7] - jmp .Loop_ssse3 -___ - -sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; - my $arg = pop; - $arg = "\$$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; -} - -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 40 instructions - my ($a,$b,$c,$d,$e); - - &movdqa (@X[0],@X[-3&7]); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@Tx[0],@X[-1&7]); - &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" - eval(shift(@insns)); - eval(shift(@insns)); - - &paddd (@Tx[1],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - &psrldq (@Tx[0],4); # "X[-3]", 3 dwords - eval(shift(@insns)); - eval(shift(@insns)); - &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - &movdqa (@Tx[2],@X[0]); - &movdqa (@Tx[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword - &paddd (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &psrld (@Tx[0],31); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@Tx[1],@Tx[2]); - eval(shift(@insns)); - eval(shift(@insns)); - - &psrld (@Tx[2],30); - &por (@X[0],@Tx[0]); # "X[0]"<<<=1 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &pslld (@Tx[1],2); - &pxor (@X[0],@Tx[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX - eval(shift(@insns)); - eval(shift(@insns)); - - &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 - - foreach (@insns) { eval; } # remaining instructions [if any] - - $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); -} - -sub Xupdate_ssse3_32_79() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions - my ($a,$b,$c,$d,$e); - - &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); - eval(shift(@insns)); # body_20_39 - &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" - eval(shift(@insns)); - eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); - if ($Xi%5) { - &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... - } else { # ... 
or load next one - &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)"); - } - &paddd (@Tx[1],@X[-1&7]); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &movdqa (@Tx[0],@X[0]); - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &pslld (@X[0],2); - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - &psrld (@Tx[0],30); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &por (@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - &movdqa (@Tx[1],@X[0]) if ($Xi<19); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - - foreach (@insns) { eval; } # remaining instructions - - $Xi++; push(@X,shift(@X)); # "rotate" X[] - push(@Tx,shift(@Tx)); -} - -sub Xuplast_ssse3_80() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - &paddd (@Tx[1],@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU - - foreach (@insns) { eval; } # remaining instructions - - &cmp ($inp,$num); - &je (".Ldone_ssse3"); - - unshift(@Tx,pop(@Tx)); - - &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask - &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19 - &movdqu (@X[-4&7],"0($inp)"); # load input - &movdqu (@X[-3&7],"16($inp)"); - &movdqu (@X[-2&7],"32($inp)"); - &movdqu (@X[-1&7],"48($inp)"); - &pshufb (@X[-4&7],@X[2]); # byte swap - &add ($inp,64); - - $Xi=0; -} - -sub Xloop_ssse3() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &pshufb (@X[($Xi-3)&7],@X[2]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[($Xi-4)&7],@Tx[1]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - &psubd (@X[($Xi-4)&7],@Tx[1]); - - foreach (@insns) { eval; } - $Xi++; -} - -sub Xtail_ssse3() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - foreach (@insns) { eval; } -} - -sub body_00_19 () { # ((c^d)&b)^d - # on start @T[0]=(c^d)&b - return &body_20_39() if ($rx==19); $rx++; - ( - '($a,$b,$c,$d,$e)=@V;'. - '&$_ror ($b,$j?7:2)', # $b>>>2 - '&xor (@T[0],$d)', - '&mov (@T[1],$a)', # $b for next round - - '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer - '&xor ($b,$c)', # $c^$d for next round - - '&$_rol ($a,5)', - '&add ($e,@T[0])', - '&and (@T[1],$b)', # ($b&($c^$d)) for next round - - '&xor ($b,$c)', # restore $b - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); -} - -sub body_20_39 () { # b^d^c - # on entry @T[0]=b^d - return &body_40_59() if ($rx==39); $rx++; - ( - '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer - '&xor (@T[0],$d) if($j==19);'. 
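# [Editorial sketch -- not part of the original module.] The Xupdate_ssse3_*
# subs above vectorize the SHA-1 message schedule four dwords per step:
#   W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
# Within a 4-lane step W[i-3] overlaps the lanes being produced, so the
# 16_31 variant xors in only three valid dwords (psrldq) and repairs the
# fourth lane afterwards -- the '("X[0]">>96)<<<2' correction.  Scalar
# reference, reusing rol32 from the sketch further up (sha1_expand is an
# illustrative name):
sub sha1_expand {
    my @W = @_;                                  # W[0..15] of one block
    $W[$_] = rol32($W[$_-3] ^ $W[$_-8] ^ $W[$_-14] ^ $W[$_-16], 1)
        for (16..79);
    return @W;
}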
- '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c) - '&mov (@T[1],$a)', # $b for next round - - '&$_rol ($a,5)', - '&add ($e,@T[0])', - '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round - - '&$_ror ($b,7)', # $b>>>2 - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); -} - -sub body_40_59 () { # ((b^c)&(c^d))^c - # on entry @T[0]=(b^c), (c^=d) - $rx++; - ( - '($a,$b,$c,$d,$e)=@V;'. - '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer - '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d) - '&xor ($c,$d) if ($j>=40)', # restore $c - - '&$_ror ($b,7)', # $b>>>2 - '&mov (@T[1],$a)', # $b for next round - '&xor (@T[0],$c)', - - '&$_rol ($a,5)', - '&add ($e,@T[0])', - '&xor (@T[1],$c) if ($j==59);'. - '&xor (@T[1],$b) if ($j< 59)', # b^c for next round - - '&xor ($b,$c) if ($j< 59)', # c^d for next round - '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' - ); -} -$code.=<<___; -.align 16 -.Loop_ssse3: -___ - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_16_31(\&body_00_19); - &Xupdate_ssse3_32_79(\&body_00_19); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_40_59); - &Xupdate_ssse3_32_79(\&body_20_39); - &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" - - $saved_j=$j; @saved_V=@V; - - &Xloop_ssse3(\&body_20_39); - &Xloop_ssse3(\&body_20_39); - &Xloop_ssse3(\&body_20_39); - -$code.=<<___; - add 0($ctx),$A # update context - add 4($ctx),@T[0] - add 8($ctx),$C - add 12($ctx),$D - mov $A,0($ctx) - add 16($ctx),$E - mov @T[0],4($ctx) - mov @T[0],$B # magic seed - mov $C,8($ctx) - mov $C,@T[1] - mov $D,12($ctx) - xor $D,@T[1] - mov $E,16($ctx) - and @T[1],@T[0] - jmp .Loop_ssse3 - -.align 16 -.Ldone_ssse3: -___ - $j=$saved_j; @V=@saved_V; - - &Xtail_ssse3(\&body_20_39); - &Xtail_ssse3(\&body_20_39); - &Xtail_ssse3(\&body_20_39); - -$code.=<<___; - add 0($ctx),$A # update context - add 4($ctx),@T[0] - add 8($ctx),$C - mov $A,0($ctx) - add 12($ctx),$D - mov @T[0],4($ctx) - add 16($ctx),$E - mov $C,8($ctx) - mov $D,12($ctx) - mov $E,16($ctx) -___ -$code.=<<___ if ($win64); - movaps 64+0(%rsp),%xmm6 - movaps 64+16(%rsp),%xmm7 - movaps 64+32(%rsp),%xmm8 - movaps 64+48(%rsp),%xmm9 - movaps 64+64(%rsp),%xmm10 - movaps 64+80(%rsp),%xmm11 -___ -$code.=<<___; - lea `64+($win64?6*16:0)`(%rsp),%rsi - mov 0(%rsi),%r12 - mov 8(%rsi),%rbp - mov 16(%rsi),%rbx - lea 24(%rsi),%rsp -.Lepilogue_ssse3: - ret -.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 -___ - -if ($avx) { -$Xi=4; # reset variables -@X=map("%xmm$_",(4..7,0..3)); -@Tx=map("%xmm$_",(8..10)); -$j=0; -$rx=0; - -my $done_avx_label=".Ldone_avx"; - -my $_rol=sub { &shld(@_[0],@_) }; -my $_ror=sub { &shrd(@_[0],@_) }; - -$code.=<<___; -.type sha1_block_data_order_avx,\@function,3 -.align 16 -sha1_block_data_order_avx: -_avx_shortcut: - push %rbx - push %rbp - push %r12 - lea `-64-($win64?6*16:0)`(%rsp),%rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,64+0(%rsp) - movaps %xmm7,64+16(%rsp) - movaps %xmm8,64+32(%rsp) - movaps %xmm9,64+48(%rsp) - movaps %xmm10,64+64(%rsp) - movaps %xmm11,64+80(%rsp) -.Lprologue_avx: -___ -$code.=<<___; - mov %rdi,$ctx # reassigned argument - mov %rsi,$inp # reassigned argument 
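# [Editorial sketch -- not part of the original module.] The scheduling idiom
# used throughout this file: each body_* sub returns one round as a list of
# Perl-code strings, and the Xupdate_* subs eval() one of those between every
# SIMD statement, software-pipelining the integer rounds against the vector
# message schedule at generation time.  The pattern reduced to its skeleton
# (interleave is an illustrative name):
sub interleave {
    my ($simd, $scalar) = @_;          # two array-refs of code strings
    my @insns = @$scalar;
    for my $step (@$simd) {
        eval $step;                    # emit one SIMD instruction ...
        eval(shift @insns) if @insns;  # ... then one scalar round op
    }
    eval $_ for @insns;                # drain whatever is left
}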
- mov %rdx,$num # reassigned argument - vzeroupper - - shl \$6,$num - add $inp,$num - lea K_XX_XX+64(%rip),$K_XX_XX - - mov 0($ctx),$A # load context - mov 4($ctx),$B - mov 8($ctx),$C - mov 12($ctx),$D - mov $B,@T[0] # magic seed - mov 16($ctx),$E - mov $C,@T[1] - xor $D,@T[1] - and @T[1],@T[0] - - vmovdqa 64($K_XX_XX),@X[2] # pbswap mask - vmovdqa -64($K_XX_XX),$Kx # K_00_19 - vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] - vmovdqu 16($inp),@X[-3&7] - vmovdqu 32($inp),@X[-2&7] - vmovdqu 48($inp),@X[-1&7] - vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap - add \$64,$inp - vpshufb @X[2],@X[-3&7],@X[-3&7] - vpshufb @X[2],@X[-2&7],@X[-2&7] - vpshufb @X[2],@X[-1&7],@X[-1&7] - vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 - vpaddd $Kx,@X[-3&7],@X[1] - vpaddd $Kx,@X[-2&7],@X[2] - vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU - vmovdqa @X[1],16(%rsp) - vmovdqa @X[2],32(%rsp) - jmp .Loop_avx -___ - -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 40 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpaddd (@Tx[1],$Kx,@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[0],@X[0],31); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword - &vpaddd (@X[0],@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[1],@Tx[2],30); - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslld (@Tx[2],@Tx[2],2); - &vpxor (@X[0],@X[0],@Tx[1]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX - eval(shift(@insns)); - eval(shift(@insns)); - - - foreach (@insns) { eval; } # remaining instructions [if any] - - $Xi++; push(@X,shift(@X)); # "rotate" X[] -} - -sub Xupdate_avx_32_79() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions - my ($a,$b,$c,$d,$e); - - &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" - eval(shift(@insns)); - eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); - &vpaddd (@Tx[1],$Kx,@X[-1&7]); - &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpxor 
(@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - - &vpsrld (@Tx[0],@X[0],30); - &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpslld (@X[0],@X[0],2); - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # ror - eval(shift(@insns)); - - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); # body_20_39 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # rol - eval(shift(@insns)); - - foreach (@insns) { eval; } # remaining instructions - - $Xi++; push(@X,shift(@X)); # "rotate" X[] -} - -sub Xuplast_avx_80() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - &vpaddd (@Tx[1],$Kx,@X[-1&7]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU - - foreach (@insns) { eval; } # remaining instructions - - &cmp ($inp,$num); - &je ($done_avx_label); - - &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask - &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19 - &vmovdqu(@X[-4&7],"0($inp)"); # load input - &vmovdqu(@X[-3&7],"16($inp)"); - &vmovdqu(@X[-2&7],"32($inp)"); - &vmovdqu(@X[-1&7],"48($inp)"); - &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap - &add ($inp,64); - - $Xi=0; -} - -sub Xloop_avx() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - eval(shift(@insns)); - eval(shift(@insns)); - &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU - eval(shift(@insns)); - eval(shift(@insns)); - - foreach (@insns) { eval; } - $Xi++; -} - -sub Xtail_avx() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - foreach (@insns) { eval; } -} - -$code.=<<___; -.align 16 -.Loop_avx: -___ - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_16_31(\&body_00_19); - &Xupdate_avx_32_79(\&body_00_19); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_20_39); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_40_59); - &Xupdate_avx_32_79(\&body_20_39); - &Xuplast_avx_80(\&body_20_39); # can jump to "done" - - $saved_j=$j; @saved_V=@V; - - &Xloop_avx(\&body_20_39); - &Xloop_avx(\&body_20_39); - &Xloop_avx(\&body_20_39); - -$code.=<<___; - add 0($ctx),$A # update context - add 4($ctx),@T[0] - add 8($ctx),$C - add 12($ctx),$D - mov $A,0($ctx) - add 16($ctx),$E - mov @T[0],4($ctx) - mov @T[0],$B # magic seed - mov $C,8($ctx) - mov $C,@T[1] - mov $D,12($ctx) - xor $D,@T[1] - mov $E,16($ctx) - and 
@T[1],@T[0] - jmp .Loop_avx - -.align 16 -$done_avx_label: -___ - $j=$saved_j; @V=@saved_V; - - &Xtail_avx(\&body_20_39); - &Xtail_avx(\&body_20_39); - &Xtail_avx(\&body_20_39); - -$code.=<<___; - vzeroupper - - add 0($ctx),$A # update context - add 4($ctx),@T[0] - add 8($ctx),$C - mov $A,0($ctx) - add 12($ctx),$D - mov @T[0],4($ctx) - add 16($ctx),$E - mov $C,8($ctx) - mov $D,12($ctx) - mov $E,16($ctx) -___ -$code.=<<___ if ($win64); - movaps 64+0(%rsp),%xmm6 - movaps 64+16(%rsp),%xmm7 - movaps 64+32(%rsp),%xmm8 - movaps 64+48(%rsp),%xmm9 - movaps 64+64(%rsp),%xmm10 - movaps 64+80(%rsp),%xmm11 -___ -$code.=<<___; - lea `64+($win64?6*16:0)`(%rsp),%rsi - mov 0(%rsi),%r12 - mov 8(%rsi),%rbp - mov 16(%rsi),%rbx - lea 24(%rsi),%rsp -.Lepilogue_avx: - ret -.size sha1_block_data_order_avx,.-sha1_block_data_order_avx -___ - -if ($avx>1) { -use integer; -$Xi=4; # reset variables -@X=map("%ymm$_",(4..7,0..3)); -@Tx=map("%ymm$_",(8..10)); -$Kx="%ymm11"; -$j=0; - -my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi"); -my ($a5,$t0)=("%r12d","%edi"); - -my ($A,$F,$B,$C,$D,$E)=@ROTX; -my $rx=0; -my $frame="%r13"; - -$code.=<<___; -.type sha1_block_data_order_avx2,\@function,3 -.align 16 -sha1_block_data_order_avx2: -_avx2_shortcut: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - lea (%rsp),%r14 -___ -$code.=<<___ if ($win64); - lea -6*16(%rsp),%rsp - movaps %xmm6,-6*16(%r14) - movaps %xmm7,-5*16(%r14) - movaps %xmm8,-4*16(%r14) - movaps %xmm9,-3*16(%r14) - movaps %xmm10,-2*16(%r14) - movaps %xmm11,-1*16(%r14) -.Lprologue_avx2: -___ -$code.=<<___; - mov %rdi,$ctx # reassigned argument - mov %rsi,$inp # reassigned argument - mov %rdx,$num # reassigned argument - vzeroupper - - lea -640(%rsp),%rsp - shl \$6,$num - lea 64($inp),$frame - and \$-128,%rsp - add $inp,$num - lea K_XX_XX+64(%rip),$K_XX_XX - - mov 0($ctx),$A # load context - cmp $num,$frame - cmovae $inp,$frame # next or same block - mov 4($ctx),$F - mov 8($ctx),$C - mov 12($ctx),$D - mov 16($ctx),$E - vmovdqu 64($K_XX_XX),@X[2] # pbswap mask - - vmovdqu ($inp),%xmm0 - vmovdqu 16($inp),%xmm1 - vmovdqu 32($inp),%xmm2 - vmovdqu 48($inp),%xmm3 - lea 64($inp),$inp - vinserti128 \$1,($frame),@X[-4&7],@X[-4&7] - vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] - vpshufb @X[2],@X[-4&7],@X[-4&7] - vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] - vpshufb @X[2],@X[-3&7],@X[-3&7] - vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] - vpshufb @X[2],@X[-2&7],@X[-2&7] - vmovdqu -64($K_XX_XX),$Kx # K_00_19 - vpshufb @X[2],@X[-1&7],@X[-1&7] - - vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 - vpaddd $Kx,@X[-3&7],@X[1] - vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU - vpaddd $Kx,@X[-2&7],@X[2] - vmovdqu @X[1],32(%rsp) - vpaddd $Kx,@X[-1&7],@X[3] - vmovdqu @X[2],64(%rsp) - vmovdqu @X[3],96(%rsp) -___ -for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31 - use integer; - - &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" - &vpsrld (@Tx[0],@X[0],31); - &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX - &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword - &vpaddd (@X[0],@X[0],@X[0]); - &vpsrld (@Tx[1],@Tx[2],30); - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 - &vpslld (@Tx[2],@Tx[2],2); - &vpxor (@X[0],@X[0],@Tx[1]); - &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 - &vpaddd (@Tx[1],@X[0],$Kx); - 
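# [Editorial sketch -- not part of the original module.] The AVX2 path keeps
# two blocks in flight: the vinserti128 loads above put block N in the low
# and block N+1 in the high 128-bit lane of each ymm register, so X[]+K is
# spilled at a 32-byte stride and the second block's scalar rounds replay
# later from the high lanes.  The bodyx_* displacement below depends on the
# 'use integer' in force in this scope; reproduced standalone it needs an
# explicit int() (illustrative):
for my $j (0 .. 19) {
    my $off = (32 * int($j / 4) + 4 * ($j % 4)) % 256 - 128;
    printf "round %2d: X[]+K at %4d(\$frame)\n", $j, $off;
}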
&vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU - - push(@X,shift(@X)); # "rotate" X[] -} -$code.=<<___; - lea 128(%rsp),$frame - jmp .Loop_avx2 -.align 32 -.Loop_avx2: - rorx \$2,$F,$B - andn $D,$F,$t0 - and $C,$F - xor $t0,$F -___ -sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path - # at start $f=(b&c)^(~b&d), $b>>>=2 - return &bodyx_20_39() if ($rx==19); $rx++; - ( - '($a,$f,$b,$c,$d,$e)=@ROTX;'. - - '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K - '&lea ($frame,"256($frame)") if ($j%32==31);', - '&andn ($t0,$a,$c)', # ~b&d for next round - - '&add ($e,$f)', # e+=(b&c)^(~b&d) - '&rorx ($a5,$a,27)', # a<<<5 - '&rorx ($f,$a,2)', # b>>>2 for next round - '&and ($a,$b)', # b&c for next round - - '&add ($e,$a5)', # e+=a<<<5 - '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round - - 'unshift(@ROTX,pop(@ROTX)); $j++;' - ) -} - -sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path - # on entry $f=b^c^d, $b>>>=2 - return &bodyx_40_59() if ($rx==39); $rx++; - ( - '($a,$f,$b,$c,$d,$e)=@ROTX;'. - - '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K - '&lea ($frame,"256($frame)") if ($j%32==31);', - - '&lea ($e,"($e,$f)")', # e+=b^c^d - '&rorx ($a5,$a,27)', # a<<<5 - '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round - '&xor ($a,$b) if ($j<79)', # b^c for next round - - '&add ($e,$a5)', # e+=a<<<5 - '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round - - 'unshift(@ROTX,pop(@ROTX)); $j++;' - ) -} - -sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path - # on entry $f=((b^c)&(c^d)), $b>>>=2 - $rx++; - ( - '($a,$f,$b,$c,$d,$e)=@ROTX;'. - - '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K - '&lea ($frame,"256($frame)") if ($j%32==31);', - '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c - '&mov ($t0,$b) if ($j<59)', # count on zero latency - '&xor ($t0,$c) if ($j<59)', # c^d for next round - - '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c - '&rorx ($a5,$a,27)', # a<<<5 - '&rorx ($f,$a,2)', # b>>>2 in next round - '&xor ($a,$b)', # b^c for next round - - '&add ($e,$a5)', # e+=a<<<5 - '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round - '&xor ($a,$c) if ($j==59);'. 
# f=b^c^d for next round - - 'unshift(@ROTX,pop(@ROTX)); $j++;' - ) -} - -sub Xupdate_avx2_16_31() # recall that $Xi starts wtih 4 -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions - my ($a,$b,$c,$d,$e); - - &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" - &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[0],@X[0],31); - &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword - &vpaddd (@X[0],@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[1],@Tx[2],30); - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 - eval(shift(@insns)); - eval(shift(@insns)); - - &vpslld (@Tx[2],@Tx[2],2); - &vpxor (@X[0],@X[0],@Tx[1]); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpaddd (@Tx[1],@X[0],$Kx); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU - - foreach (@insns) { eval; } # remaining instructions [if any] - - $Xi++; - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xupdate_avx2_32_79() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions - my ($a,$b,$c,$d,$e); - - &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" - &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" - &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpsrld (@Tx[0],@X[0],30); - &vpslld (@X[0],@X[0],2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - #&vpslld (@X[0],@X[0],2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vpaddd (@Tx[1],@X[0],$Kx); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - - &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU - - foreach (@insns) { eval; } # remaining instructions - - $Xi++; - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xloop_avx2() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions - my ($a,$b,$c,$d,$e); - - foreach (@insns) { eval; } -} - - &align32(); - &Xupdate_avx2_32_79(\&bodyx_00_19); - &Xupdate_avx2_32_79(\&bodyx_00_19); - &Xupdate_avx2_32_79(\&bodyx_00_19); - &Xupdate_avx2_32_79(\&bodyx_00_19); - - &Xupdate_avx2_32_79(\&bodyx_20_39); - &Xupdate_avx2_32_79(\&bodyx_20_39); - 
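# [Editorial sketch -- not part of the original module.] bodyx_00_19 above
# builds Ch flag-free with BMI: andn yields ~b&d while and yields b&c, and
# since those two products are never 1 simultaneously, combining them with
# xor (the '$f=(b&c)^(~b&d)' note) is the same as with or.  Brute-force
# check of that disjointness:
for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
    my ($p, $q) = ($b & $c, ~$b & $d & 1);
    die "not disjoint" if $p & $q;
    die "xor != or"    unless ($p ^ $q) == ($p | $q);
}}}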
&Xupdate_avx2_32_79(\&bodyx_20_39); - &Xupdate_avx2_32_79(\&bodyx_20_39); - - &align32(); - &Xupdate_avx2_32_79(\&bodyx_40_59); - &Xupdate_avx2_32_79(\&bodyx_40_59); - &Xupdate_avx2_32_79(\&bodyx_40_59); - &Xupdate_avx2_32_79(\&bodyx_40_59); - - &Xloop_avx2(\&bodyx_20_39); - &Xloop_avx2(\&bodyx_20_39); - &Xloop_avx2(\&bodyx_20_39); - &Xloop_avx2(\&bodyx_20_39); - -$code.=<<___; - lea 128($inp),$frame - lea 128($inp),%rdi # borrow $t0 - cmp $num,$frame - cmovae $inp,$frame # next or previous block - - # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c - add 0($ctx),@ROTX[0] # update context - add 4($ctx),@ROTX[1] - add 8($ctx),@ROTX[3] - mov @ROTX[0],0($ctx) - add 12($ctx),@ROTX[4] - mov @ROTX[1],4($ctx) - mov @ROTX[0],$A # A=d - add 16($ctx),@ROTX[5] - mov @ROTX[3],$a5 - mov @ROTX[3],8($ctx) - mov @ROTX[4],$D # D=b - #xchg @ROTX[5],$F # F=c, C=f - mov @ROTX[4],12($ctx) - mov @ROTX[1],$F # F=e - mov @ROTX[5],16($ctx) - #mov $F,16($ctx) - mov @ROTX[5],$E # E=c - mov $a5,$C # C=f - #xchg $F,$E # E=c, F=e - - cmp $num,$inp - je .Ldone_avx2 -___ - -$Xi=4; # reset variables -@X=map("%ymm$_",(4..7,0..3)); - -$code.=<<___; - vmovdqu 64($K_XX_XX),@X[2] # pbswap mask - cmp $num,%rdi # borrowed $t0 - ja .Last_avx2 - - vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7] - vmovdqu -48(%rdi),%xmm1 - vmovdqu -32(%rdi),%xmm2 - vmovdqu -16(%rdi),%xmm3 - vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7] - vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] - vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] - vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] - jmp .Last_avx2 - -.align 32 -.Last_avx2: - lea 128+16(%rsp),$frame - rorx \$2,$F,$B - andn $D,$F,$t0 - and $C,$F - xor $t0,$F - sub \$-128,$inp -___ - $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E); - - &Xloop_avx2 (\&bodyx_00_19); - &Xloop_avx2 (\&bodyx_00_19); - &Xloop_avx2 (\&bodyx_00_19); - &Xloop_avx2 (\&bodyx_00_19); - - &Xloop_avx2 (\&bodyx_20_39); - &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19 - &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap - &Xloop_avx2 (\&bodyx_20_39); - &vpshufb (@X[-3&7],@X[-3&7],@X[2]); - &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19 - &Xloop_avx2 (\&bodyx_20_39); - &vmovdqu ("0(%rsp)",@Tx[0]); - &vpshufb (@X[-2&7],@X[-2&7],@X[2]); - &vpaddd (@Tx[1],@X[-3&7],$Kx); - &Xloop_avx2 (\&bodyx_20_39); - &vmovdqu ("32(%rsp)",@Tx[1]); - &vpshufb (@X[-1&7],@X[-1&7],@X[2]); - &vpaddd (@X[2],@X[-2&7],$Kx); - - &Xloop_avx2 (\&bodyx_40_59); - &align32 (); - &vmovdqu ("64(%rsp)",@X[2]); - &vpaddd (@X[3],@X[-1&7],$Kx); - &Xloop_avx2 (\&bodyx_40_59); - &vmovdqu ("96(%rsp)",@X[3]); - &Xloop_avx2 (\&bodyx_40_59); - &Xupdate_avx2_16_31(\&bodyx_40_59); - - &Xupdate_avx2_16_31(\&bodyx_20_39); - &Xupdate_avx2_16_31(\&bodyx_20_39); - &Xupdate_avx2_16_31(\&bodyx_20_39); - &Xloop_avx2 (\&bodyx_20_39); - -$code.=<<___; - lea 128(%rsp),$frame - - # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c - add 0($ctx),@ROTX[0] # update context - add 4($ctx),@ROTX[1] - add 8($ctx),@ROTX[3] - mov @ROTX[0],0($ctx) - add 12($ctx),@ROTX[4] - mov @ROTX[1],4($ctx) - mov @ROTX[0],$A # A=d - add 16($ctx),@ROTX[5] - mov @ROTX[3],$a5 - mov @ROTX[3],8($ctx) - mov @ROTX[4],$D # D=b - #xchg @ROTX[5],$F # F=c, C=f - mov @ROTX[4],12($ctx) - mov @ROTX[1],$F # F=e - mov @ROTX[5],16($ctx) - #mov $F,16($ctx) - mov @ROTX[5],$E # E=c - mov $a5,$C # C=f - #xchg $F,$E # E=c, F=e - - cmp $num,$inp - jbe .Loop_avx2 - -.Ldone_avx2: - vzeroupper -___ -$code.=<<___ if ($win64); - movaps -6*16(%r14),%xmm6 - movaps -5*16(%r14),%xmm7 - movaps -4*16(%r14),%xmm8 - movaps -3*16(%r14),%xmm9 - movaps -2*16(%r14),%xmm10 - movaps 
-1*16(%r14),%xmm11 -___ -$code.=<<___; - lea (%r14),%rsi - mov 0(%rsi),%r14 - mov 8(%rsi),%r13 - mov 16(%rsi),%r12 - mov 24(%rsi),%rbp - mov 32(%rsi),%rbx - lea 40(%rsi),%rsp -.Lepilogue_avx2: - ret -.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 -___ -} -} -$code.=<<___; -.align 64 -K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask -___ -}}} -$code.=<<___; -.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 64 -___ - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - lea .Lprologue(%rip),%r10 - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - lea .Lepilogue(%rip),%r10 - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - mov `16*4`(%rax),%rax # pull saved stack pointer - lea 32(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - - jmp .Lcommon_seh_tail -.size se_handler,.-se_handler - -.type ssse3_handler,\@abi-omnipotent -.align 16 -ssse3_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - lea 64(%rax),%rsi - lea 512($context),%rdi # &context.Xmm6 - mov \$12,%ecx - .long 0xa548f3fc # cld; rep movsq - lea `24+64+6*16`(%rax),%rax # adjust stack pointer - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore cotnext->R12 - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov 
$context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size ssse3_handler,.-ssse3_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_sha1_block_data_order - .rva .LSEH_end_sha1_block_data_order - .rva .LSEH_info_sha1_block_data_order - .rva .LSEH_begin_sha1_block_data_order_ssse3 - .rva .LSEH_end_sha1_block_data_order_ssse3 - .rva .LSEH_info_sha1_block_data_order_ssse3 -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_sha1_block_data_order_avx - .rva .LSEH_end_sha1_block_data_order_avx - .rva .LSEH_info_sha1_block_data_order_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_sha1_block_data_order_avx2 - .rva .LSEH_end_sha1_block_data_order_avx2 - .rva .LSEH_info_sha1_block_data_order_avx2 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_sha1_block_data_order: - .byte 9,0,0,0 - .rva se_handler -.LSEH_info_sha1_block_data_order_ssse3: - .byte 9,0,0,0 - .rva ssse3_handler - .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] -___ -$code.=<<___ if ($avx); -.LSEH_info_sha1_block_data_order_avx: - .byte 9,0,0,0 - .rva ssse3_handler - .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] -___ -$code.=<<___ if ($avx>1); -.LSEH_info_sha1_block_data_order_avx2: - .byte 9,0,0,0 - .rva ssse3_handler - .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] -___ -} - -#################################################################### - -$code =~ s/\`([^\`]*)\`/eval $1/gem; -print $code; -close STDOUT; +../openssl/./crypto/sha/asm/sha1-x86_64.pl
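# [Editorial sketch -- not part of the original module.] The K_XX_XX table
# above holds the four FIPS 180 SHA-1 round constants, floor(2^30*sqrt(n))
# for n = 2, 3, 5, 10, each broadcast across the vector lanes and followed
# by the pshufb byte-swap mask.  Quick derivation check (illustrative):
printf "0x%08x = floor(2^30*sqrt(%d))\n", int(2**30 * sqrt($_)), $_
    for (2, 3, 5, 10);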
\ No newline at end of file diff --git a/devel/perlasm/sha256-ssse3-x86.pl b/devel/perlasm/sha256-ssse3-x86.pl index bd48b638c5..e6dc8db1e2 100644..120000 --- a/devel/perlasm/sha256-ssse3-x86.pl +++ b/devel/perlasm/sha256-ssse3-x86.pl @@ -1,1125 +1 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# SHA256 block transform for x86. September 2007. -# -# Performance improvement over compiler generated code varies from -# 10% to 40% [see below]. Not very impressive on some µ-archs, but -# it's 5 times smaller and optimizies amount of writes. -# -# May 2012. -# -# Optimization including two of Pavel Semjanov's ideas, alternative -# Maj and full unroll, resulted in ~20-25% improvement on most CPUs, -# ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost -# 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not -# on P4, where it kills performance, nor Sandy Bridge, where folded -# loop is approximately as fast... -# -# June 2012. -# -# Add AMD XOP-specific code path, >30% improvement on Bulldozer over -# May version, >60% over original. Add AVX+shrd code path, >25% -# improvement on Sandy Bridge over May version, 60% over original. -# -# May 2013. -# -# Replace AMD XOP code path with SSSE3 to cover more processors. -# (Biggest improvement coefficient is on upcoming Atom Silvermont, -# not shown.) Add AVX+BMI code path. -# -# Performance in clock cycles per processed byte (less is better): -# -# gcc icc x86 asm(*) SIMD x86_64 asm(**) -# Pentium 46 57 40/38 - - -# PIII 36 33 27/24 - - -# P4 41 38 28 - 17.3 -# AMD K8 27 25 19/15.5 - 14.9 -# Core2 26 23 18/15.6 14.3 13.8 -# Westmere 27 - 19/15.7 13.4 12.3 -# Sandy Bridge 25 - 15.9 12.4 11.6 -# Ivy Bridge 24 - 15.0 11.4 10.3 -# Haswell 22 - 13.9 9.46 7.80 -# Bulldozer 36 - 27/22 17.0 13.6 -# VIA Nano 36 - 25/22 16.8 16.5 -# Atom 50 - 30/25 21.9 18.9 -# -# (*) numbers after slash are for unrolled loop, where applicable; -# (**) x86_64 assembly performance is presented for reference -# purposes, results are best-available; - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); - -$xmm=$avx=0; -for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } - -if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); -} - -if ($xmm && !$avx && $ARGV[0] eq "win32n" && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.03) + ($1>=2.10); -} - -if ($xmm && !$avx && $ARGV[0] eq "win32" && - `ml 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); -} - -$unroll_after = 64*4; # If pre-evicted from L1P cache first spin of - # fully unrolled loop was measured to run about - # 3-4x slower. If slowdown coefficient is N and - # unrolled loop is m times faster, then you break - # even at (N-1)/(m-1) blocks. Then it needs to be - # adjusted for probability of code being evicted, - # code size/cache size=1/4. Typical m is 1.15... 
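# [Editorial sketch -- not part of the original module.] Making the
# break-even arithmetic above concrete: with cold-cache slowdown N=3..4 and
# unrolled speedup m=1.15, (N-1)/(m-1) comes to ~13-20 blocks, and the
# quoted code-size/cache-size ratio of 1/4 scales that down to the ~4-block
# $unroll_after threshold actually used:
for my $N (3, 4) {
    my $m = 1.15;
    my $blocks = ($N - 1) / ($m - 1);
    printf "N=%d: break-even %.1f blocks, ~%.1f after 1/4 eviction scaling\n",
           $N, $blocks, $blocks / 4;
}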
- -$A="eax"; -$E="edx"; -$T="ebx"; -$Aoff=&DWP(4,"esp"); -$Boff=&DWP(8,"esp"); -$Coff=&DWP(12,"esp"); -$Doff=&DWP(16,"esp"); -$Eoff=&DWP(20,"esp"); -$Foff=&DWP(24,"esp"); -$Goff=&DWP(28,"esp"); -$Hoff=&DWP(32,"esp"); -$Xoff=&DWP(36,"esp"); -$K256="ebp"; - -sub BODY_16_63() { - &mov ($T,"ecx"); # "ecx" is preloaded - &mov ("esi",&DWP(4*(9+15+16-14),"esp")); - &ror ("ecx",18-7); - &mov ("edi","esi"); - &ror ("esi",19-17); - &xor ("ecx",$T); - &shr ($T,3); - &ror ("ecx",7); - &xor ("esi","edi"); - &xor ($T,"ecx"); # T = sigma0(X[-15]) - &ror ("esi",17); - &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16] - &shr ("edi",10); - &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7] - #&xor ("edi","esi") # sigma1(X[-2]) - # &add ($T,"edi"); # T += sigma1(X[-2]) - # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] - - &BODY_00_15(1); -} -sub BODY_00_15() { - my $in_16_63=shift; - - &mov ("ecx",$E); - &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2]) - &mov ("esi",$Foff); - &ror ("ecx",25-11); - &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) - &mov ("edi",$Goff); - &xor ("ecx",$E); - &xor ("esi","edi"); - &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63); - &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0] - &ror ("ecx",11-6); - &and ("esi",$E); - &mov ($Eoff,$E); # modulo-scheduled - &xor ($E,"ecx"); - &add ($T,$Hoff); # T += h - &xor ("esi","edi"); # Ch(e,f,g) - &ror ($E,6); # Sigma1(e) - &mov ("ecx",$A); - &add ($T,"esi"); # T += Ch(e,f,g) - - &ror ("ecx",22-13); - &add ($T,$E); # T += Sigma1(e) - &mov ("edi",$Boff); - &xor ("ecx",$A); - &mov ($Aoff,$A); # modulo-scheduled - &lea ("esp",&DWP(-4,"esp")); - &ror ("ecx",13-2); - &mov ("esi",&DWP(0,$K256)); - &xor ("ecx",$A); - &mov ($E,$Eoff); # e in next iteration, d in this one - &xor ($A,"edi"); # a ^= b - &ror ("ecx",2); # Sigma0(a) - - &add ($T,"esi"); # T+= K[i] - &mov (&DWP(0,"esp"),$A); # (b^c) in next round - &add ($E,$T); # d += T - &and ($A,&DWP(4,"esp")); # a &= (b^c) - &add ($T,"ecx"); # T += Sigma0(a) - &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) - &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T - &add ($K256,4); - &add ($A,$T); # h += T -} - -&external_label("OPENSSL_ia32cap_P") if (!$i386); - -&function_begin("sha256_block_data_order"); - &mov ("esi",wparam(0)); # ctx - &mov ("edi",wparam(1)); # inp - &mov ("eax",wparam(2)); # num - &mov ("ebx","esp"); # saved sp - - &call (&label("pic_point")); # make it PIC! 
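# [Editorial sketch -- not part of the original module.] BODY_16_63 above
# interleaves each round with one step of the SHA-256 message schedule,
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16],
# folding rotates pairwise (ror 18-7 then ror 7 delivers both ror 18 and
# ror 7 off one live value).  Pure-Perl reference of the four sigmas, with
# illustrative names:
sub ror32  { my ($x,$n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
sub sigma0 { my $x = shift; ror32($x, 7) ^ ror32($x,18) ^ ($x >>  3) }
sub sigma1 { my $x = shift; ror32($x,17) ^ ror32($x,19) ^ ($x >> 10) }
sub Sigma0 { my $x = shift; ror32($x, 2) ^ ror32($x,13) ^ ror32($x,22) }
sub Sigma1 { my $x = shift; ror32($x, 6) ^ ror32($x,11) ^ ror32($x,25) }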
-&set_label("pic_point"); - &blindpop($K256); - &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256)); - - &sub ("esp",16); - &and ("esp",-64); - - &shl ("eax",6); - &add ("eax","edi"); - &mov (&DWP(0,"esp"),"esi"); # ctx - &mov (&DWP(4,"esp"),"edi"); # inp - &mov (&DWP(8,"esp"),"eax"); # inp+num*128 - &mov (&DWP(12,"esp"),"ebx"); # saved sp - if (!$i386) { - &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256")); - &mov ("ecx",&DWP(0,"edx")); - &mov ("ebx",&DWP(4,"edx")); - &test ("ecx",1<<20); # check for P4 - &jnz (&label("loop")); - &and ("ecx",1<<30); # mask "Intel CPU" bit - &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits - &or ("ecx","ebx"); - &and ("ecx",1<<28|1<<30); - &cmp ("ecx",1<<28|1<<30); - if ($xmm) { - &je (&label("AVX")) if ($avx); - &test ("ebx",1<<9); # check for SSSE3 - &jnz (&label("SSSE3")); - } else { - &je (&label("loop_shrd")); - } - if ($unroll_after) { - &sub ("eax","edi"); - &cmp ("eax",$unroll_after); - &jae (&label("unrolled")); - } } - &jmp (&label("loop")); - -sub COMPACT_LOOP() { -my $suffix=shift; - -&set_label("loop$suffix",$suffix?32:16); - # copy input block to stack reversing byte and dword order - for($i=0;$i<4;$i++) { - &mov ("eax",&DWP($i*16+0,"edi")); - &mov ("ebx",&DWP($i*16+4,"edi")); - &mov ("ecx",&DWP($i*16+8,"edi")); - &bswap ("eax"); - &mov ("edx",&DWP($i*16+12,"edi")); - &bswap ("ebx"); - &push ("eax"); - &bswap ("ecx"); - &push ("ebx"); - &bswap ("edx"); - &push ("ecx"); - &push ("edx"); - } - &add ("edi",64); - &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H - &mov (&DWP(4*(9+16)+4,"esp"),"edi"); - - # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack - &mov ($A,&DWP(0,"esi")); - &mov ("ebx",&DWP(4,"esi")); - &mov ("ecx",&DWP(8,"esi")); - &mov ("edi",&DWP(12,"esi")); - # &mov ($Aoff,$A); - &mov ($Boff,"ebx"); - &xor ("ebx","ecx"); - &mov ($Coff,"ecx"); - &mov ($Doff,"edi"); - &mov (&DWP(0,"esp"),"ebx"); # magic - &mov ($E,&DWP(16,"esi")); - &mov ("ebx",&DWP(20,"esi")); - &mov ("ecx",&DWP(24,"esi")); - &mov ("edi",&DWP(28,"esi")); - # &mov ($Eoff,$E); - &mov ($Foff,"ebx"); - &mov ($Goff,"ecx"); - &mov ($Hoff,"edi"); - -&set_label("00_15$suffix",16); - - &BODY_00_15(); - - &cmp ("esi",0xc19bf174); - &jne (&label("00_15$suffix")); - - &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) - &jmp (&label("16_63$suffix")); - -&set_label("16_63$suffix",16); - - &BODY_16_63(); - - &cmp ("esi",0xc67178f2); - &jne (&label("16_63$suffix")); - - &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx - # &mov ($A,$Aoff); - &mov ("ebx",$Boff); - # &mov ("edi",$Coff); - &mov ("ecx",$Doff); - &add ($A,&DWP(0,"esi")); - &add ("ebx",&DWP(4,"esi")); - &add ("edi",&DWP(8,"esi")); - &add ("ecx",&DWP(12,"esi")); - &mov (&DWP(0,"esi"),$A); - &mov (&DWP(4,"esi"),"ebx"); - &mov (&DWP(8,"esi"),"edi"); - &mov (&DWP(12,"esi"),"ecx"); - # &mov ($E,$Eoff); - &mov ("eax",$Foff); - &mov ("ebx",$Goff); - &mov ("ecx",$Hoff); - &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp - &add ($E,&DWP(16,"esi")); - &add ("eax",&DWP(20,"esi")); - &add ("ebx",&DWP(24,"esi")); - &add ("ecx",&DWP(28,"esi")); - &mov (&DWP(16,"esi"),$E); - &mov (&DWP(20,"esi"),"eax"); - &mov (&DWP(24,"esi"),"ebx"); - &mov (&DWP(28,"esi"),"ecx"); - - &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame - &sub ($K256,4*64); # rewind K - - &cmp ("edi",&DWP(8,"esp")); # are we done yet? 
- &jb (&label("loop$suffix")); -} - &COMPACT_LOOP(); - &mov ("esp",&DWP(12,"esp")); # restore sp -&function_end_A(); - if (!$i386 && !$xmm) { - # ~20% improvement on Sandy Bridge - local *ror = sub { &shrd(@_[0],@_) }; - &COMPACT_LOOP("_shrd"); - &mov ("esp",&DWP(12,"esp")); # restore sp -&function_end_A(); - } - -&set_label("K256",64); # Yes! I keep it in the code segment! -@K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, - 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, - 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, - 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, - 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, - 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, - 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, - 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, - 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, - 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, - 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, - 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, - 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, - 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, - 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, - 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); -&data_word(@K256); -&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask -&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); - -($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets -sub off { &DWP(4*(((shift)-$i)&7),"esp"); } - -if (!$i386 && $unroll_after) { -my @AH=($A,$K256); - -&set_label("unrolled",16); - &lea ("esp",&DWP(-96,"esp")); - # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack - &mov ($AH[0],&DWP(0,"esi")); - &mov ($AH[1],&DWP(4,"esi")); - &mov ("ecx",&DWP(8,"esi")); - &mov ("ebx",&DWP(12,"esi")); - #&mov (&DWP(0,"esp"),$AH[0]); - &mov (&DWP(4,"esp"),$AH[1]); - &xor ($AH[1],"ecx"); # magic - &mov (&DWP(8,"esp"),"ecx"); - &mov (&DWP(12,"esp"),"ebx"); - &mov ($E,&DWP(16,"esi")); - &mov ("ebx",&DWP(20,"esi")); - &mov ("ecx",&DWP(24,"esi")); - &mov ("esi",&DWP(28,"esi")); - #&mov (&DWP(16,"esp"),$E); - &mov (&DWP(20,"esp"),"ebx"); - &mov (&DWP(24,"esp"),"ecx"); - &mov (&DWP(28,"esp"),"esi"); - &jmp (&label("grand_loop")); - -&set_label("grand_loop",16); - # copy input block to stack reversing byte order - for($i=0;$i<5;$i++) { - &mov ("ebx",&DWP(12*$i+0,"edi")); - &mov ("ecx",&DWP(12*$i+4,"edi")); - &bswap ("ebx"); - &mov ("esi",&DWP(12*$i+8,"edi")); - &bswap ("ecx"); - &mov (&DWP(32+12*$i+0,"esp"),"ebx"); - &bswap ("esi"); - &mov (&DWP(32+12*$i+4,"esp"),"ecx"); - &mov (&DWP(32+12*$i+8,"esp"),"esi"); - } - &mov ("ebx",&DWP($i*12,"edi")); - &add ("edi",64); - &bswap ("ebx"); - &mov (&DWP(96+4,"esp"),"edi"); - &mov (&DWP(32+12*$i,"esp"),"ebx"); - - my ($t1,$t2) = ("ecx","esi"); - - for ($i=0;$i<64;$i++) { - - if ($i>=16) { - &mov ($T,$t1); # $t1 is preloaded - # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); - &ror ($t1,18-7); - &mov ("edi",$t2); - &ror ($t2,19-17); - &xor ($t1,$T); - &shr ($T,3); - &ror ($t1,7); - &xor ($t2,"edi"); - &xor ($T,$t1); # T = sigma0(X[-15]) - &ror ($t2,17); - &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] - &shr ("edi",10); - &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] - #&xor ("edi",$t2) # sigma1(X[-2]) - # &add ($T,"edi"); # T += sigma1(X[-2]) - # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] - } - &mov ($t1,$E); - &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) - &mov ($t2,&off($f)); - &ror ($E,25-11); - &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) - &mov ("edi",&off($g)); - &xor ($E,$t1); - &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] - &mov (&DWP(32+4*($i&15),"esp"),$T) if 
($i>=16 && $i<62); # save X[0] - &xor ($t2,"edi"); - &ror ($E,11-6); - &and ($t2,$t1); - &mov (&off($e),$t1); # save $E, modulo-scheduled - &xor ($E,$t1); - &add ($T,&off($h)); # T += h - &xor ("edi",$t2); # Ch(e,f,g) - &ror ($E,6); # Sigma1(e) - &mov ($t1,$AH[0]); - &add ($T,"edi"); # T += Ch(e,f,g) - - &ror ($t1,22-13); - &mov ($t2,$AH[0]); - &mov ("edi",&off($b)); - &xor ($t1,$AH[0]); - &mov (&off($a),$AH[0]); # save $A, modulo-scheduled - &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round - &ror ($t1,13-2); - &and ($AH[1],$AH[0]); # (b^c) &= (a^b) - &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(1)+K[i] - &xor ($t1,$t2); - &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) - &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); - &ror ($t1,2); # Sigma0(a) - - &add ($AH[1],$E); # h += T - &add ($E,&off($d)); # d += T - &add ($AH[1],$t1); # h += Sigma0(a) - &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); - - @AH = reverse(@AH); # rotate(a,h) - ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) - } - &mov ("esi",&DWP(96,"esp")); #ctx - #&mov ($AH[0],&DWP(0,"esp")); - &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); - #&mov ("edi", &DWP(8,"esp")); - &mov ("ecx",&DWP(12,"esp")); - &add ($AH[0],&DWP(0,"esi")); - &add ($AH[1],&DWP(4,"esi")); - &add ("edi",&DWP(8,"esi")); - &add ("ecx",&DWP(12,"esi")); - &mov (&DWP(0,"esi"),$AH[0]); - &mov (&DWP(4,"esi"),$AH[1]); - &mov (&DWP(8,"esi"),"edi"); - &mov (&DWP(12,"esi"),"ecx"); - #&mov (&DWP(0,"esp"),$AH[0]); - &mov (&DWP(4,"esp"),$AH[1]); - &xor ($AH[1],"edi"); # magic - &mov (&DWP(8,"esp"),"edi"); - &mov (&DWP(12,"esp"),"ecx"); - #&mov ($E,&DWP(16,"esp")); - &mov ("edi",&DWP(20,"esp")); - &mov ("ebx",&DWP(24,"esp")); - &mov ("ecx",&DWP(28,"esp")); - &add ($E,&DWP(16,"esi")); - &add ("edi",&DWP(20,"esi")); - &add ("ebx",&DWP(24,"esi")); - &add ("ecx",&DWP(28,"esi")); - &mov (&DWP(16,"esi"),$E); - &mov (&DWP(20,"esi"),"edi"); - &mov (&DWP(24,"esi"),"ebx"); - &mov (&DWP(28,"esi"),"ecx"); - #&mov (&DWP(16,"esp"),$E); - &mov (&DWP(20,"esp"),"edi"); - &mov ("edi",&DWP(96+4,"esp")); # inp - &mov (&DWP(24,"esp"),"ebx"); - &mov (&DWP(28,"esp"),"ecx"); - - &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 
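# [Editorial sketch -- not part of the original module.] The "alternative
# Maj" annotated above is one of the two Pavel Semjanov ideas credited in
# the header: Maj(a,b,c) == Ch(a^b,c,b), which lets (a^b) be carried from
# round to round the way SHA-1 carries its (c^d) seed.  Brute-force check
# of the identity:
for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
    my $maj = ($a & $b) | ($a & $c) | ($b & $c);
    my $ch  = (($a ^ $b) & $c) | (~($a ^ $b) & $b & 1);
    die "Maj != Ch(a^b,c,b)" unless $maj == $ch;
}}}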
- &jb (&label("grand_loop")); - - &mov ("esp",&DWP(96+12,"esp")); # restore sp -&function_end_A(); -} - if (!$i386 && $xmm) {{{ -my @X = map("xmm$_",(0..3)); -my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); -my @AH = ($A,$T); - -&set_label("SSSE3",32); - &lea ("esp",&DWP(-96,"esp")); - # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack - &mov ($AH[0],&DWP(0,"esi")); - &mov ($AH[1],&DWP(4,"esi")); - &mov ("ecx",&DWP(8,"esi")); - &mov ("edi",&DWP(12,"esi")); - #&mov (&DWP(0,"esp"),$AH[0]); - &mov (&DWP(4,"esp"),$AH[1]); - &xor ($AH[1],"ecx"); # magic - &mov (&DWP(8,"esp"),"ecx"); - &mov (&DWP(12,"esp"),"edi"); - &mov ($E,&DWP(16,"esi")); - &mov ("edi",&DWP(20,"esi")); - &mov ("ecx",&DWP(24,"esi")); - &mov ("esi",&DWP(28,"esi")); - #&mov (&DWP(16,"esp"),$E); - &mov (&DWP(20,"esp"),"edi"); - &mov ("edi",&DWP(96+4,"esp")); # inp - &mov (&DWP(24,"esp"),"ecx"); - &mov (&DWP(28,"esp"),"esi"); - &movdqa ($t3,&QWP(256,$K256)); - &jmp (&label("grand_ssse3")); - -&set_label("grand_ssse3",16); - # load input, reverse byte order, add K256[0..15], save to stack - &movdqu (@X[0],&QWP(0,"edi")); - &movdqu (@X[1],&QWP(16,"edi")); - &movdqu (@X[2],&QWP(32,"edi")); - &movdqu (@X[3],&QWP(48,"edi")); - &add ("edi",64); - &pshufb (@X[0],$t3); - &mov (&DWP(96+4,"esp"),"edi"); - &pshufb (@X[1],$t3); - &movdqa ($t0,&QWP(0,$K256)); - &pshufb (@X[2],$t3); - &movdqa ($t1,&QWP(16,$K256)); - &paddd ($t0,@X[0]); - &pshufb (@X[3],$t3); - &movdqa ($t2,&QWP(32,$K256)); - &paddd ($t1,@X[1]); - &movdqa ($t3,&QWP(48,$K256)); - &movdqa (&QWP(32+0,"esp"),$t0); - &paddd ($t2,@X[2]); - &movdqa (&QWP(32+16,"esp"),$t1); - &paddd ($t3,@X[3]); - &movdqa (&QWP(32+32,"esp"),$t2); - &movdqa (&QWP(32+48,"esp"),$t3); - &jmp (&label("ssse3_00_47")); - -&set_label("ssse3_00_47",16); - &add ($K256,64); - -sub SSSE3_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 120 instructions - - eval(shift(@insns)); - &movdqa ($t0,@X[1]); - eval(shift(@insns)); # @ - eval(shift(@insns)); - &movdqa ($t3,@X[3]); - eval(shift(@insns)); - eval(shift(@insns)); - &palignr ($t0,@X[0],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); # @ - eval(shift(@insns)); - &palignr ($t3,@X[2],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t1,$t0); - eval(shift(@insns)); # @ - eval(shift(@insns)); - &movdqa ($t2,$t0); - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t0,3); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &paddd (@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t2,7); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - eval(shift(@insns)); - &pshufd ($t3,@X[3],0b11111010); # X[14..15] - eval(shift(@insns)); - eval(shift(@insns)); - &pslld ($t1,32-18); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &pxor ($t0,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t2,18-7); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &pxor ($t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - &pslld ($t1,18-7); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &pxor ($t0,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t2,$t3); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &pxor ($t0,$t1); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t3,10); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &paddd (@X[0],$t0); # X[0..3] += 
sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &psrlq ($t2,17); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &pxor ($t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - &psrlq ($t2,19-17); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &pxor ($t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - &pshufd ($t3,$t3,0b10000000); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - eval(shift(@insns)); - &psrldq ($t3,8); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); # @ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - eval(shift(@insns)); - &pshufd ($t3,@X[0],0b01010000); # X[16..17] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t2,$t3); - eval(shift(@insns)); # @ - &psrld ($t3,10); - eval(shift(@insns)); - &psrlq ($t2,17); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &pxor ($t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - &psrlq ($t2,19-17); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &pxor ($t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pshufd ($t3,$t3,0b00001000); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &movdqa ($t2,&QWP(16*$j,$K256)); - eval(shift(@insns)); - eval(shift(@insns)); - &pslldq ($t3,8); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); # @ - &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd ($t2,@X[0]); - eval(shift(@insns)); # @ - - foreach (@insns) { eval; } # remaining instructions - - &movdqa (&QWP(32+16*$j,"esp"),$t2); -} - -sub body_00_15 () { - ( - '&mov ("ecx",$E);', - '&ror ($E,25-11);', - '&mov ("esi",&off($f));', - '&xor ($E,"ecx");', - '&mov ("edi",&off($g));', - '&xor ("esi","edi");', - '&ror ($E,11-6);', - '&and ("esi","ecx");', - '&mov (&off($e),"ecx");', # save $E, modulo-scheduled - '&xor ($E,"ecx");', - '&xor ("edi","esi");', # Ch(e,f,g) - '&ror ($E,6);', # T = Sigma1(e) - '&mov ("ecx",$AH[0]);', - '&add ($E,"edi");', # T += Ch(e,f,g) - '&mov ("edi",&off($b));', - '&mov ("esi",$AH[0]);', - - '&ror ("ecx",22-13);', - '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled - '&xor ("ecx",$AH[0]);', - '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round - '&add ($E,&off($h));', # T += h - '&ror ("ecx",13-2);', - '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b) - '&xor ("ecx","esi");', - '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i] - '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b) - '&ror ("ecx",2);', # Sigma0(a) - - '&add ($AH[1],$E);', # h += T - '&add ($E,&off($d));', # d += T - '&add ($AH[1],"ecx");'. 
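# [Editorial sketch -- not part of the original module.] SSE has no vector
# rotate, so SSSE3_00_47 above synthesizes each one from a shift pair,
# ror(x,n) == (x >> n) | (x << (32-n)), staging partial shifts (psrld 7
# then psrld 18-7, pslld 32-18 then pslld 18-7) so work is shared between
# the ror 7 and ror 18 terms.  Flattened scalar equivalent of the sigma0 it
# builds (illustrative name):
sub sigma0_via_shifts {
    my $x = shift;
    (  (($x >> 7)  | ($x << 25))
     ^ (($x >> 18) | ($x << 14))
     ^  ($x >> 3) ) & 0xffffffff;
}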
# h += Sigma0(a) - - '@AH = reverse(@AH); $i++;' # rotate(a,h) - ); -} - - for ($i=0,$j=0; $j<4; $j++) { - &SSSE3_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmp (&DWP(16*$j,$K256),0x00010203); - &jne (&label("ssse3_00_47")); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } - - &mov ("esi",&DWP(96,"esp")); #ctx - #&mov ($AH[0],&DWP(0,"esp")); - &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); - #&mov ("edi", &DWP(8,"esp")); - &mov ("ecx",&DWP(12,"esp")); - &add ($AH[0],&DWP(0,"esi")); - &add ($AH[1],&DWP(4,"esi")); - &add ("edi",&DWP(8,"esi")); - &add ("ecx",&DWP(12,"esi")); - &mov (&DWP(0,"esi"),$AH[0]); - &mov (&DWP(4,"esi"),$AH[1]); - &mov (&DWP(8,"esi"),"edi"); - &mov (&DWP(12,"esi"),"ecx"); - #&mov (&DWP(0,"esp"),$AH[0]); - &mov (&DWP(4,"esp"),$AH[1]); - &xor ($AH[1],"edi"); # magic - &mov (&DWP(8,"esp"),"edi"); - &mov (&DWP(12,"esp"),"ecx"); - #&mov ($E,&DWP(16,"esp")); - &mov ("edi",&DWP(20,"esp")); - &mov ("ecx",&DWP(24,"esp")); - &add ($E,&DWP(16,"esi")); - &add ("edi",&DWP(20,"esi")); - &add ("ecx",&DWP(24,"esi")); - &mov (&DWP(16,"esi"),$E); - &mov (&DWP(20,"esi"),"edi"); - &mov (&DWP(20,"esp"),"edi"); - &mov ("edi",&DWP(28,"esp")); - &mov (&DWP(24,"esi"),"ecx"); - #&mov (&DWP(16,"esp"),$E); - &add ("edi",&DWP(28,"esi")); - &mov (&DWP(24,"esp"),"ecx"); - &mov (&DWP(28,"esi"),"edi"); - &mov (&DWP(28,"esp"),"edi"); - &mov ("edi",&DWP(96+4,"esp")); # inp - - &movdqa ($t3,&QWP(64,$K256)); - &sub ($K256,3*64); # rewind K - &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? - &jb (&label("grand_ssse3")); - - &mov ("esp",&DWP(96+12,"esp")); # restore sp -&function_end_A(); - if ($avx) { -&set_label("AVX",32); - if ($avx>1) { - &mov ("edx",&DWP(8,"edx")); - &and ("edx",1<<8|1<<3); # check for BMI2+BMI1 - &cmp ("edx",1<<8|1<<3); - &je (&label("AVX_BMI")); - } - &lea ("esp",&DWP(-96,"esp")); - &vzeroall (); - # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack - &mov ($AH[0],&DWP(0,"esi")); - &mov ($AH[1],&DWP(4,"esi")); - &mov ("ecx",&DWP(8,"esi")); - &mov ("edi",&DWP(12,"esi")); - #&mov (&DWP(0,"esp"),$AH[0]); - &mov (&DWP(4,"esp"),$AH[1]); - &xor ($AH[1],"ecx"); # magic - &mov (&DWP(8,"esp"),"ecx"); - &mov (&DWP(12,"esp"),"edi"); - &mov ($E,&DWP(16,"esi")); - &mov ("edi",&DWP(20,"esi")); - &mov ("ecx",&DWP(24,"esi")); - &mov ("esi",&DWP(28,"esi")); - #&mov (&DWP(16,"esp"),$E); - &mov (&DWP(20,"esp"),"edi"); - &mov ("edi",&DWP(96+4,"esp")); # inp - &mov (&DWP(24,"esp"),"ecx"); - &mov (&DWP(28,"esp"),"esi"); - &vmovdqa ($t3,&QWP(256,$K256)); - &jmp (&label("grand_avx")); - -&set_label("grand_avx",32); - # load input, reverse byte order, add K256[0..15], save to stack - &vmovdqu (@X[0],&QWP(0,"edi")); - &vmovdqu (@X[1],&QWP(16,"edi")); - &vmovdqu (@X[2],&QWP(32,"edi")); - &vmovdqu (@X[3],&QWP(48,"edi")); - &add ("edi",64); - &vpshufb (@X[0],@X[0],$t3); - &mov (&DWP(96+4,"esp"),"edi"); - &vpshufb (@X[1],@X[1],$t3); - &vpshufb (@X[2],@X[2],$t3); - &vpaddd ($t0,@X[0],&QWP(0,$K256)); - &vpshufb (@X[3],@X[3],$t3); - &vpaddd ($t1,@X[1],&QWP(16,$K256)); - &vpaddd ($t2,@X[2],&QWP(32,$K256)); - &vpaddd ($t3,@X[3],&QWP(48,$K256)); - &vmovdqa (&QWP(32+0,"esp"),$t0); - &vmovdqa (&QWP(32+16,"esp"),$t1); - &vmovdqa (&QWP(32+32,"esp"),$t2); - &vmovdqa (&QWP(32+48,"esp"),$t3); - &jmp (&label("avx_00_47")); - -&set_label("avx_00_47",16); - &add ($K256,64); - -sub Xupdate_AVX () { - ( - '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4] - '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12] - '&vpsrld ($t2,$t0,7);', - '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..16] - '&vpsrld 
($t3,$t0,3);', - '&vpslld ($t1,$t0,14);', - '&vpxor ($t0,$t3,$t2);', - '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] - '&vpsrld ($t2,$t2,18-7);', - '&vpxor ($t0,$t0,$t1);', - '&vpslld ($t1,$t1,25-14);', - '&vpxor ($t0,$t0,$t2);', - '&vpsrld ($t2,$t3,10);', - '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4]) - '&vpsrlq ($t1,$t3,17);', - '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) - '&vpxor ($t2,$t2,$t1);', - '&vpsrlq ($t3,$t3,19);', - '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15]) - '&vpshufd ($t3,$t2,0b10000100);', - '&vpsrldq ($t3,$t3,8);', - '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15]) - '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] - '&vpsrld ($t2,$t3,10);', - '&vpsrlq ($t1,$t3,17);', - '&vpxor ($t2,$t2,$t1);', - '&vpsrlq ($t3,$t3,19);', - '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17]) - '&vpshufd ($t3,$t2,0b11101000);', - '&vpslldq ($t3,$t3,8);', - '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17]) - ); -} - -local *ror = sub { &shrd(@_[0],@_) }; -sub AVX_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 120 instructions -my $insn; - - foreach (Xupdate_AVX()) { # 31 instructions - eval; - eval(shift(@insns)); - eval(shift(@insns)); - eval($insn = shift(@insns)); - eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/); - } - &vpaddd ($t2,@X[0],&QWP(16*$j,$K256)); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (&QWP(32+16*$j,"esp"),$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &AVX_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmp (&DWP(16*$j,$K256),0x00010203); - &jne (&label("avx_00_47")); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } - - &mov ("esi",&DWP(96,"esp")); #ctx - #&mov ($AH[0],&DWP(0,"esp")); - &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); - #&mov ("edi", &DWP(8,"esp")); - &mov ("ecx",&DWP(12,"esp")); - &add ($AH[0],&DWP(0,"esi")); - &add ($AH[1],&DWP(4,"esi")); - &add ("edi",&DWP(8,"esi")); - &add ("ecx",&DWP(12,"esi")); - &mov (&DWP(0,"esi"),$AH[0]); - &mov (&DWP(4,"esi"),$AH[1]); - &mov (&DWP(8,"esi"),"edi"); - &mov (&DWP(12,"esi"),"ecx"); - #&mov (&DWP(0,"esp"),$AH[0]); - &mov (&DWP(4,"esp"),$AH[1]); - &xor ($AH[1],"edi"); # magic - &mov (&DWP(8,"esp"),"edi"); - &mov (&DWP(12,"esp"),"ecx"); - #&mov ($E,&DWP(16,"esp")); - &mov ("edi",&DWP(20,"esp")); - &mov ("ecx",&DWP(24,"esp")); - &add ($E,&DWP(16,"esi")); - &add ("edi",&DWP(20,"esi")); - &add ("ecx",&DWP(24,"esi")); - &mov (&DWP(16,"esi"),$E); - &mov (&DWP(20,"esi"),"edi"); - &mov (&DWP(20,"esp"),"edi"); - &mov ("edi",&DWP(28,"esp")); - &mov (&DWP(24,"esi"),"ecx"); - #&mov (&DWP(16,"esp"),$E); - &add ("edi",&DWP(28,"esi")); - &mov (&DWP(24,"esp"),"ecx"); - &mov (&DWP(28,"esi"),"edi"); - &mov (&DWP(28,"esp"),"edi"); - &mov ("edi",&DWP(96+4,"esp")); # inp - - &vmovdqa ($t3,&QWP(64,$K256)); - &sub ($K256,3*64); # rewind K - &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 
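Before moving on, it helps to see what Xupdate_AVX is actually computing. The vpsrld/vpslld/vpxor chains build the SHA-256 message-schedule sigmas out of plain shifts, because SSE/AVX has no packed 32-bit rotate, and sigma1 is evaluated two words at a time through the vpsrlq/vpshufd shuffles. A minimal C sketch of the scalar recurrence the vector code advances four words per pass (rotr32 and next_w are illustrative helper names, not part of this module):

    #include <stdint.h>

    /* Rotate right; compilers lower this idiom to a single ror. */
    static inline uint32_t rotr32(uint32_t x, unsigned n) {
        return (x >> n) | (x << (32 - n));
    }

    /* One step of the SHA-256 message schedule over a 16-word circular
     * buffer: W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]. */
    static uint32_t next_w(uint32_t w[16], unsigned t) {
        uint32_t x15 = w[(t + 1) & 15];            /* W[t-15] */
        uint32_t x2  = w[(t + 14) & 15];           /* W[t-2]  */
        uint32_t s0  = rotr32(x15, 7) ^ rotr32(x15, 18) ^ (x15 >> 3);
        uint32_t s1  = rotr32(x2, 17) ^ rotr32(x2, 19) ^ (x2 >> 10);
        return w[t & 15] += s0 + w[(t + 9) & 15] + s1;  /* w[(t+9)&15] is W[t-7] */
    }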
- &jb (&label("grand_avx")); - - &mov ("esp",&DWP(96+12,"esp")); # restore sp - &vzeroall (); -&function_end_A(); - if ($avx>1) { -sub bodyx_00_15 () { # +10% - ( - '&rorx ("ecx",$E,6)', - '&rorx ("esi",$E,11)', - '&mov (&off($e),$E)', # save $E, modulo-scheduled - '&rorx ("edi",$E,25)', - '&xor ("ecx","esi")', - '&andn ("esi",$E,&off($g))', - '&xor ("ecx","edi")', # Sigma1(e) - '&and ($E,&off($f))', - '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled - '&or ($E,"esi")', # T = Ch(e,f,g) - - '&rorx ("edi",$AH[0],2)', - '&rorx ("esi",$AH[0],13)', - '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e) - '&rorx ("ecx",$AH[0],22)', - '&xor ("esi","edi")', - '&mov ("edi",&off($b))', - '&xor ("ecx","esi")', # Sigma0(a) - - '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round - '&add ($E,&off($h))', # T += h - '&and ($AH[1],$AH[0])', # (b^c) &= (a^b) - '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i] - '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b) - - '&add ("ecx",$E)', # h += T - '&add ($E,&off($d))', # d += T - '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a) - - '@AH = reverse(@AH); $i++;' # rotate(a,h) - ); -} - -&set_label("AVX_BMI",32); - &lea ("esp",&DWP(-96,"esp")); - &vzeroall (); - # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack - &mov ($AH[0],&DWP(0,"esi")); - &mov ($AH[1],&DWP(4,"esi")); - &mov ("ecx",&DWP(8,"esi")); - &mov ("edi",&DWP(12,"esi")); - #&mov (&DWP(0,"esp"),$AH[0]); - &mov (&DWP(4,"esp"),$AH[1]); - &xor ($AH[1],"ecx"); # magic - &mov (&DWP(8,"esp"),"ecx"); - &mov (&DWP(12,"esp"),"edi"); - &mov ($E,&DWP(16,"esi")); - &mov ("edi",&DWP(20,"esi")); - &mov ("ecx",&DWP(24,"esi")); - &mov ("esi",&DWP(28,"esi")); - #&mov (&DWP(16,"esp"),$E); - &mov (&DWP(20,"esp"),"edi"); - &mov ("edi",&DWP(96+4,"esp")); # inp - &mov (&DWP(24,"esp"),"ecx"); - &mov (&DWP(28,"esp"),"esi"); - &vmovdqa ($t3,&QWP(256,$K256)); - &jmp (&label("grand_avx_bmi")); - -&set_label("grand_avx_bmi",32); - # load input, reverse byte order, add K256[0..15], save to stack - &vmovdqu (@X[0],&QWP(0,"edi")); - &vmovdqu (@X[1],&QWP(16,"edi")); - &vmovdqu (@X[2],&QWP(32,"edi")); - &vmovdqu (@X[3],&QWP(48,"edi")); - &add ("edi",64); - &vpshufb (@X[0],@X[0],$t3); - &mov (&DWP(96+4,"esp"),"edi"); - &vpshufb (@X[1],@X[1],$t3); - &vpshufb (@X[2],@X[2],$t3); - &vpaddd ($t0,@X[0],&QWP(0,$K256)); - &vpshufb (@X[3],@X[3],$t3); - &vpaddd ($t1,@X[1],&QWP(16,$K256)); - &vpaddd ($t2,@X[2],&QWP(32,$K256)); - &vpaddd ($t3,@X[3],&QWP(48,$K256)); - &vmovdqa (&QWP(32+0,"esp"),$t0); - &vmovdqa (&QWP(32+16,"esp"),$t1); - &vmovdqa (&QWP(32+32,"esp"),$t2); - &vmovdqa (&QWP(32+48,"esp"),$t3); - &jmp (&label("avx_bmi_00_47")); - -&set_label("avx_bmi_00_47",16); - &add ($K256,64); - - for ($i=0,$j=0; $j<4; $j++) { - &AVX_00_47($j,\&bodyx_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmp (&DWP(16*$j,$K256),0x00010203); - &jne (&label("avx_bmi_00_47")); - - for ($i=0; $i<16; ) { - foreach(bodyx_00_15()) { eval; } - } - - &mov ("esi",&DWP(96,"esp")); #ctx - #&mov ($AH[0],&DWP(0,"esp")); - &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); - #&mov ("edi", &DWP(8,"esp")); - &mov ("ecx",&DWP(12,"esp")); - &add ($AH[0],&DWP(0,"esi")); - &add ($AH[1],&DWP(4,"esi")); - &add ("edi",&DWP(8,"esi")); - &add ("ecx",&DWP(12,"esi")); - &mov (&DWP(0,"esi"),$AH[0]); - &mov (&DWP(4,"esi"),$AH[1]); - &mov (&DWP(8,"esi"),"edi"); - &mov (&DWP(12,"esi"),"ecx"); - #&mov (&DWP(0,"esp"),$AH[0]); - &mov (&DWP(4,"esp"),$AH[1]); - &xor ($AH[1],"edi"); # magic - &mov (&DWP(8,"esp"),"edi"); - &mov (&DWP(12,"esp"),"ecx"); - #&mov ($E,&DWP(16,"esp")); 
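The bodyx_00_15 round above is where BMI1/BMI2 pay off: rorx rotates into a third register without clobbering its source or the flags, and andn evaluates ~x & y in one instruction, so Ch(e,f,g) comes out as (e & f) | (~e & g) with no explicit complement. A scalar C sketch of that dataflow, assuming only that the boolean identity holds (t1_contrib is a hypothetical helper name):

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n) {
        return (x >> n) | (x << (32 - n));
    }

    /* Sigma1 and Ch the way the BMI path evaluates them: three
     * independent rorx results XORed together, and Ch assembled
     * from andn (~e & g), one and, and one or. */
    static uint32_t t1_contrib(uint32_t e, uint32_t f, uint32_t g) {
        uint32_t S1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
        uint32_t ch = (e & f) | (~e & g);   /* same value as ((f^g)&e)^g */
        return S1 + ch;                     /* both feed the T1 sum */
    }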
- &mov ("edi",&DWP(20,"esp")); - &mov ("ecx",&DWP(24,"esp")); - &add ($E,&DWP(16,"esi")); - &add ("edi",&DWP(20,"esi")); - &add ("ecx",&DWP(24,"esi")); - &mov (&DWP(16,"esi"),$E); - &mov (&DWP(20,"esi"),"edi"); - &mov (&DWP(20,"esp"),"edi"); - &mov ("edi",&DWP(28,"esp")); - &mov (&DWP(24,"esi"),"ecx"); - #&mov (&DWP(16,"esp"),$E); - &add ("edi",&DWP(28,"esi")); - &mov (&DWP(24,"esp"),"ecx"); - &mov (&DWP(28,"esi"),"edi"); - &mov (&DWP(28,"esp"),"edi"); - &mov ("edi",&DWP(96+4,"esp")); # inp - - &vmovdqa ($t3,&QWP(64,$K256)); - &sub ($K256,3*64); # rewind K - &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? - &jb (&label("grand_avx_bmi")); - - &mov ("esp",&DWP(96+12,"esp")); # restore sp - &vzeroall (); -&function_end_A(); - } - } - }}} -&function_end_B("sha256_block_data_order"); - -&asm_finish(); +../openssl/./crypto/sha/asm/sha256-586.pl
\ No newline at end of file diff --git a/devel/perlasm/sha512-ssse3-x86.pl b/devel/perlasm/sha512-ssse3-x86.pl index 9fc792964f..ef9a48b390 100644..120000 --- a/devel/perlasm/sha512-ssse3-x86.pl +++ b/devel/perlasm/sha512-ssse3-x86.pl @@ -1,910 +1 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# SHA512 block transform for x86. September 2007. -# -# May 2013. -# -# Add SSSE3 code path, 20-25% improvement [over original SSE2 code]. -# -# Performance in clock cycles per processed byte (less is better): -# -# gcc icc x86 asm SIMD(*) x86_64(**) -# Pentium 100 97 61 - - -# PIII 75 77 56 - - -# P4 116 95 82 34.6 30.8 -# AMD K8 54 55 36 20.7 9.57 -# Core2 66 57 40 15.9 9.97 -# Westmere 70 - 38 12.2 9.58 -# Sandy Bridge 58 - 35 11.9 11.2 -# Ivy Bridge 50 - 33 11.5 8.17 -# Haswell 46 - 29 11.3 7.66 -# Bulldozer 121 - 50 14.0 13.5 -# VIA Nano 91 - 52 33 14.7 -# Atom 126 - 68 48(***) 14.7 -# -# (*) whichever best applicable. -# (**) x86_64 assembler performance is presented for reference -# purposes, the results are for integer-only code. -# (***) paddq is incredibly slow on Atom. -# -# IALU code-path is optimized for older Pentiums. On vanilla Pentium -# the performance improvement over compiler-generated code reaches ~60%, -# while on PIII - ~35%. On newer µ-archs improvement varies from 15% -# to 50%, but it's less important as they are expected to execute the -# SSE2 code-path, which is commonly ~2-3x faster [than compiler-generated -# code]. The SSE2 code-path is as fast as the original sha512-sse2.pl, even -# though it does not use 128-bit operations. The latter means that an -# SSE2-aware kernel is no longer required to execute the code. Another -# difference is that the new code optimizes the amount of writes, but at -# the cost of a data cache "footprint" increased by 1/2KB. - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -push(@INC,"${dir}","${dir}../../perlasm"); -require "x86asm.pl"; - -&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386"); - -$sse2=0; -for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } - -&external_label("OPENSSL_ia32cap_P") if ($sse2); - -$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp"); -$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp"); -$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp"); -$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp"); -$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp"); -$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp"); -$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp"); -$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp"); -$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp"); -$K512="ebp"; - -$Asse2=&QWP(0,"esp"); -$Bsse2=&QWP(8,"esp"); -$Csse2=&QWP(16,"esp"); -$Dsse2=&QWP(24,"esp"); -$Esse2=&QWP(32,"esp"); -$Fsse2=&QWP(40,"esp"); -$Gsse2=&QWP(48,"esp"); -$Hsse2=&QWP(56,"esp"); - -$A="mm0"; # B-D and -$E="mm4"; # F-H are commonly loaded to mm1-mm3 and - # mm5-mm7 respectively, but it's done on an on-demand basis... -$BxC="mm2"; # ... 
except for B^C - -sub BODY_00_15_sse2 { - my $phase=shift; - - #&movq ("mm5",$Fsse2); # load f - #&movq ("mm6",$Gsse2); # load g - - &movq ("mm1",$E); # %mm1 is sliding right - &pxor ("mm5","mm6"); # f^=g - &psrlq ("mm1",14); - &movq ($Esse2,$E); # modulo-scheduled save e - &pand ("mm5",$E); # f&=e - &psllq ($E,23); # $E is sliding left - &movq ($A,"mm3") if ($phase<2); - &movq (&QWP(8*9,"esp"),"mm7"); # save X[i] - &movq ("mm3","mm1"); # %mm3 is T1 - &psrlq ("mm1",4); - &pxor ("mm5","mm6"); # Ch(e,f,g) - &pxor ("mm3",$E); - &psllq ($E,23); - &pxor ("mm3","mm1"); - &movq ($Asse2,$A); # modulo-scheduled save a - &paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g) - &pxor ("mm3",$E); - &psrlq ("mm1",23); - &paddq ("mm7",&QWP(0,$K512)); # X[i]+=K512[i] - &pxor ("mm3",$E); # T1=Sigma1_512(e) - - &movq ($E,$Dsse2); # e = load d, e in next round - &paddq ("mm3","mm7"); # T1+=X[i] - &movq ("mm5",$A); # %mm5 is sliding right - &psrlq ("mm5",28); - &paddq ($E,"mm3"); # d += T1 - &movq ("mm6",$A); # %mm6 is sliding left - &movq ("mm7","mm5"); - &psllq ("mm6",25); - &movq ("mm1",$Bsse2); # load b - &psrlq ("mm5",6); - &pxor ("mm7","mm6"); - &sub ("esp",8); - &psllq ("mm6",5); - &pxor ("mm7","mm5"); - &pxor ($A,"mm1"); # a^b, b^c in next round - &psrlq ("mm5",5); - &pxor ("mm7","mm6"); - &pand ($BxC,$A); # (b^c)&(a^b) - &psllq ("mm6",6); - &pxor ("mm7","mm5"); - &pxor ($BxC,"mm1"); # [h=]Maj(a,b,c) - &pxor ("mm6","mm7"); # Sigma0_512(a) - &movq ("mm7",&QWP(8*(9+16-1),"esp")) if ($phase!=0); # pre-fetch - &movq ("mm5",$Fsse2) if ($phase==0); # load f - - if ($phase>1) { - &paddq ($BxC,"mm6"); # h+=Sigma0(a) - &add ($K512,8); - #&paddq ($BxC,"mm3"); # h+=T1 - - ($A,$BxC) = ($BxC,$A); # rotate registers - } else { - &paddq ("mm3",$BxC); # T1+=Maj(a,b,c) - &movq ($BxC,$A); - &add ($K512,8); - &paddq ("mm3","mm6"); # T1+=Sigma0(a) - &movq ("mm6",$Gsse2) if ($phase==0); # load g - #&movq ($A,"mm3"); # h=T1 - } -} - -sub BODY_00_15_x86 { - #define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - # LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - # HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - &mov ("ecx",$Elo); - &mov ("edx",$Ehi); - &mov ("esi","ecx"); - - &shr ("ecx",9); # lo>>9 - &mov ("edi","edx"); - &shr ("edx",9); # hi>>9 - &mov ("ebx","ecx"); - &shl ("esi",14); # lo<<14 - &mov ("eax","edx"); - &shl ("edi",14); # hi<<14 - &xor ("ebx","esi"); - - &shr ("ecx",14-9); # lo>>14 - &xor ("eax","edi"); - &shr ("edx",14-9); # hi>>14 - &xor ("eax","ecx"); - &shl ("esi",18-14); # lo<<18 - &xor ("ebx","edx"); - &shl ("edi",18-14); # hi<<18 - &xor ("ebx","esi"); - - &shr ("ecx",18-14); # lo>>18 - &xor ("eax","edi"); - &shr ("edx",18-14); # hi>>18 - &xor ("eax","ecx"); - &shl ("esi",23-18); # lo<<23 - &xor ("ebx","edx"); - &shl ("edi",23-18); # hi<<23 - &xor ("eax","esi"); - &xor ("ebx","edi"); # T1 = Sigma1(e) - - &mov ("ecx",$Flo); - &mov ("edx",$Fhi); - &mov ("esi",$Glo); - &mov ("edi",$Ghi); - &add ("eax",$Hlo); - &adc ("ebx",$Hhi); # T1 += h - &xor ("ecx","esi"); - &xor ("edx","edi"); - &and ("ecx",$Elo); - &and ("edx",$Ehi); - &add ("eax",&DWP(8*(9+15)+0,"esp")); - &adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0] - &xor ("ecx","esi"); - &xor ("edx","edi"); # Ch(e,f,g) = ((f^g)&e)^g - - &mov ("esi",&DWP(0,$K512)); - &mov ("edi",&DWP(4,$K512)); # K[i] - &add ("eax","ecx"); - &adc ("ebx","edx"); # T1 += Ch(e,f,g) - &mov ("ecx",$Dlo); - &mov ("edx",$Dhi); - &add ("eax","esi"); - &adc ("ebx","edi"); # T1 += K[i] - &mov ($Tlo,"eax"); - &mov 
($Thi,"ebx"); # put T1 away - &add ("eax","ecx"); - &adc ("ebx","edx"); # d += T1 - - #define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - # LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - # HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - &mov ("ecx",$Alo); - &mov ("edx",$Ahi); - &mov ($Dlo,"eax"); - &mov ($Dhi,"ebx"); - &mov ("esi","ecx"); - - &shr ("ecx",2); # lo>>2 - &mov ("edi","edx"); - &shr ("edx",2); # hi>>2 - &mov ("ebx","ecx"); - &shl ("esi",4); # lo<<4 - &mov ("eax","edx"); - &shl ("edi",4); # hi<<4 - &xor ("ebx","esi"); - - &shr ("ecx",7-2); # lo>>7 - &xor ("eax","edi"); - &shr ("edx",7-2); # hi>>7 - &xor ("ebx","ecx"); - &shl ("esi",25-4); # lo<<25 - &xor ("eax","edx"); - &shl ("edi",25-4); # hi<<25 - &xor ("eax","esi"); - - &shr ("ecx",28-7); # lo>>28 - &xor ("ebx","edi"); - &shr ("edx",28-7); # hi>>28 - &xor ("eax","ecx"); - &shl ("esi",30-25); # lo<<30 - &xor ("ebx","edx"); - &shl ("edi",30-25); # hi<<30 - &xor ("eax","esi"); - &xor ("ebx","edi"); # Sigma0(a) - - &mov ("ecx",$Alo); - &mov ("edx",$Ahi); - &mov ("esi",$Blo); - &mov ("edi",$Bhi); - &add ("eax",$Tlo); - &adc ("ebx",$Thi); # T1 = Sigma0(a)+T1 - &or ("ecx","esi"); - &or ("edx","edi"); - &and ("ecx",$Clo); - &and ("edx",$Chi); - &and ("esi",$Alo); - &and ("edi",$Ahi); - &or ("ecx","esi"); - &or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b) - - &add ("eax","ecx"); - &adc ("ebx","edx"); # T1 += Maj(a,b,c) - &mov ($Tlo,"eax"); - &mov ($Thi,"ebx"); - - &mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K - &sub ("esp",8); - &lea ($K512,&DWP(8,$K512)); # K++ -} - - -&function_begin("sha512_block_data_order"); - &mov ("esi",wparam(0)); # ctx - &mov ("edi",wparam(1)); # inp - &mov ("eax",wparam(2)); # num - &mov ("ebx","esp"); # saved sp - - &call (&label("pic_point")); # make it PIC! 
-&set_label("pic_point"); - &blindpop($K512); - &lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512)); - - &sub ("esp",16); - &and ("esp",-64); - - &shl ("eax",7); - &add ("eax","edi"); - &mov (&DWP(0,"esp"),"esi"); # ctx - &mov (&DWP(4,"esp"),"edi"); # inp - &mov (&DWP(8,"esp"),"eax"); # inp+num*128 - &mov (&DWP(12,"esp"),"ebx"); # saved sp - -if ($sse2) { - &picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512")); - &mov ("ecx",&DWP(0,"edx")); - &test ("ecx",1<<26); - &jz (&label("loop_x86")); - - &mov ("edx",&DWP(4,"edx")); - - # load ctx->h[0-7] - &movq ($A,&QWP(0,"esi")); - &and ("ecx",1<<24); # XMM registers availability - &movq ("mm1",&QWP(8,"esi")); - &and ("edx",1<<9); # SSSE3 bit - &movq ($BxC,&QWP(16,"esi")); - &or ("ecx","edx"); - &movq ("mm3",&QWP(24,"esi")); - &movq ($E,&QWP(32,"esi")); - &movq ("mm5",&QWP(40,"esi")); - &movq ("mm6",&QWP(48,"esi")); - &movq ("mm7",&QWP(56,"esi")); - &cmp ("ecx",1<<24|1<<9); - &je (&label("SSSE3")); - &sub ("esp",8*10); - &jmp (&label("loop_sse2")); - -&set_label("loop_sse2",16); - #&movq ($Asse2,$A); - &movq ($Bsse2,"mm1"); - &movq ($Csse2,$BxC); - &movq ($Dsse2,"mm3"); - #&movq ($Esse2,$E); - &movq ($Fsse2,"mm5"); - &movq ($Gsse2,"mm6"); - &pxor ($BxC,"mm1"); # magic - &movq ($Hsse2,"mm7"); - &movq ("mm3",$A); # magic - - &mov ("eax",&DWP(0,"edi")); - &mov ("ebx",&DWP(4,"edi")); - &add ("edi",8); - &mov ("edx",15); # counter - &bswap ("eax"); - &bswap ("ebx"); - &jmp (&label("00_14_sse2")); - -&set_label("00_14_sse2",16); - &movd ("mm1","eax"); - &mov ("eax",&DWP(0,"edi")); - &movd ("mm7","ebx"); - &mov ("ebx",&DWP(4,"edi")); - &add ("edi",8); - &bswap ("eax"); - &bswap ("ebx"); - &punpckldq("mm7","mm1"); - - &BODY_00_15_sse2(); - - &dec ("edx"); - &jnz (&label("00_14_sse2")); - - &movd ("mm1","eax"); - &movd ("mm7","ebx"); - &punpckldq("mm7","mm1"); - - &BODY_00_15_sse2(1); - - &pxor ($A,$A); # A is in %mm3 - &mov ("edx",32); # counter - &jmp (&label("16_79_sse2")); - -&set_label("16_79_sse2",16); - for ($j=0;$j<2;$j++) { # 2x unroll - #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15 - &movq ("mm5",&QWP(8*(9+16-14),"esp")); - &movq ("mm1","mm7"); - &psrlq ("mm7",1); - &movq ("mm6","mm5"); - &psrlq ("mm5",6); - &psllq ("mm1",56); - &paddq ($A,"mm3"); # from BODY_00_15 - &movq ("mm3","mm7"); - &psrlq ("mm7",7-1); - &pxor ("mm3","mm1"); - &psllq ("mm1",63-56); - &pxor ("mm3","mm7"); - &psrlq ("mm7",8-7); - &pxor ("mm3","mm1"); - &movq ("mm1","mm5"); - &psrlq ("mm5",19-6); - &pxor ("mm7","mm3"); # sigma0 - - &psllq ("mm6",3); - &pxor ("mm1","mm5"); - &paddq ("mm7",&QWP(8*(9+16),"esp")); - &pxor ("mm1","mm6"); - &psrlq ("mm5",61-19); - &paddq ("mm7",&QWP(8*(9+16-9),"esp")); - &pxor ("mm1","mm5"); - &psllq ("mm6",45-3); - &movq ("mm5",$Fsse2); # load f - &pxor ("mm1","mm6"); # sigma1 - &movq ("mm6",$Gsse2); # load g - - &paddq ("mm7","mm1"); # X[i] - #&movq (&QWP(8*9,"esp"),"mm7"); # moved to BODY_00_15 - - &BODY_00_15_sse2(2); - } - &dec ("edx"); - &jnz (&label("16_79_sse2")); - - #&movq ($A,$Asse2); - &paddq ($A,"mm3"); # from BODY_00_15 - &movq ("mm1",$Bsse2); - #&movq ($BxC,$Csse2); - &movq ("mm3",$Dsse2); - #&movq ($E,$Esse2); - &movq ("mm5",$Fsse2); - &movq ("mm6",$Gsse2); - &movq ("mm7",$Hsse2); - - &pxor ($BxC,"mm1"); # de-magic - &paddq ($A,&QWP(0,"esi")); - &paddq ("mm1",&QWP(8,"esi")); - &paddq ($BxC,&QWP(16,"esi")); - &paddq ("mm3",&QWP(24,"esi")); - &paddq ($E,&QWP(32,"esi")); - &paddq ("mm5",&QWP(40,"esi")); - &paddq ("mm6",&QWP(48,"esi")); - &paddq ("mm7",&QWP(56,"esi")); - - &mov ("eax",8*80); - &movq 
(&QWP(0,"esi"),$A); - &movq (&QWP(8,"esi"),"mm1"); - &movq (&QWP(16,"esi"),$BxC); - &movq (&QWP(24,"esi"),"mm3"); - &movq (&QWP(32,"esi"),$E); - &movq (&QWP(40,"esi"),"mm5"); - &movq (&QWP(48,"esi"),"mm6"); - &movq (&QWP(56,"esi"),"mm7"); - - &lea ("esp",&DWP(0,"esp","eax")); # destroy frame - &sub ($K512,"eax"); # rewind K - - &cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet? - &jb (&label("loop_sse2")); - - &mov ("esp",&DWP(8*10+12,"esp")); # restore sp - &emms (); -&function_end_A(); - -&set_label("SSSE3",32); -{ my ($cnt,$frame)=("ecx","edx"); - my @X=map("xmm$_",(0..7)); - my $j; - my $i=0; - - &lea ($frame,&DWP(-64,"esp")); - &sub ("esp",256); - - # fixed stack frame layout - # - # +0 A B C D E F G H # backing store - # +64 X[0]+K[i] .. X[15]+K[i] # XMM->MM xfer area - # +192 # XMM off-load ring buffer - # +256 # saved parameters - - &movdqa (@X[1],&QWP(80*8,$K512)); # byte swap mask - &movdqu (@X[0],&QWP(0,"edi")); - &pshufb (@X[0],@X[1]); - for ($j=0;$j<8;$j++) { - &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load - &movdqa (@X[3],&QWP(16*($j%8),$K512)); - &movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask - &movdqu (@X[1],&QWP(16*($j+1),"edi")) if ($j<7); # next input - &movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0] - &paddq (@X[3],@X[0]); - &pshufb (@X[1],@X[2]) if ($j<7); - &movdqa (&QWP(16*($j%8)-128,$frame),@X[3]); # xfer X[i]+K[i] - - push(@X,shift(@X)); # rotate(@X) - } - #&jmp (&label("loop_ssse3")); - &nop (); - -&set_label("loop_ssse3",32); - &movdqa (@X[2],&QWP(16*(($j+1)%4),$frame)); # pre-restore @X[1] - &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]); # off-load @X[3] - &lea ($K512,&DWP(16*8,$K512)); - - #&movq ($Asse2,$A); # off-load A-H - &movq ($Bsse2,"mm1"); - &mov ("ebx","edi"); - &movq ($Csse2,$BxC); - &lea ("edi",&DWP(128,"edi")); # advance input - &movq ($Dsse2,"mm3"); - &cmp ("edi","eax"); - #&movq ($Esse2,$E); - &movq ($Fsse2,"mm5"); - &cmovb ("ebx","edi"); - &movq ($Gsse2,"mm6"); - &mov ("ecx",4); # loop counter - &pxor ($BxC,"mm1"); # magic - &movq ($Hsse2,"mm7"); - &pxor ("mm3","mm3"); # magic - - &jmp (&label("00_47_ssse3")); - -sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2 - ( - '&movq ("mm1",$E)', # %mm1 is sliding right - '&movq ("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i] - '&pxor ("mm5","mm6")', # f^=g - '&psrlq ("mm1",14)', - '&movq (&QWP(8*($i+4)%64,"esp"),$E)', # modulo-scheduled save e - '&pand ("mm5",$E)', # f&=e - '&psllq ($E,23)', # $E is sliding left - '&paddq ($A,"mm3")', # [h+=Maj(a,b,c)] - '&movq ("mm3","mm1")', # %mm3 is T1 - '&psrlq("mm1",4)', - '&pxor ("mm5","mm6")', # Ch(e,f,g) - '&pxor ("mm3",$E)', - '&psllq($E,23)', - '&pxor ("mm3","mm1")', - '&movq (&QWP(8*$i%64,"esp"),$A)', # modulo-scheduled save a - '&paddq("mm7","mm5")', # X[i]+=Ch(e,f,g) - '&pxor ("mm3",$E)', - '&psrlq("mm1",23)', - '&paddq("mm7",&QWP(8*($i+7)%64,"esp"))', # X[i]+=h - '&pxor ("mm3","mm1")', - '&psllq($E,4)', - '&pxor ("mm3",$E)', # T1=Sigma1_512(e) - - '&movq ($E,&QWP(8*($i+3)%64,"esp"))', # e = load d, e in next round - '&paddq ("mm3","mm7")', # T1+=X[i] - '&movq ("mm5",$A)', # %mm5 is sliding right - '&psrlq("mm5",28)', - '&paddq ($E,"mm3")', # d += T1 - '&movq ("mm6",$A)', # %mm6 is sliding left - '&movq ("mm7","mm5")', - '&psllq("mm6",25)', - '&movq ("mm1",&QWP(8*($i+1)%64,"esp"))', # load b - '&psrlq("mm5",6)', - '&pxor ("mm7","mm6")', - '&psllq("mm6",5)', - '&pxor ("mm7","mm5")', - '&pxor ($A,"mm1")', # a^b, b^c in next round - '&psrlq("mm5",5)', - '&pxor ("mm7","mm6")', - '&pand 
($BxC,$A)', # (b^c)&(a^b) - '&psllq("mm6",6)', - '&pxor ("mm7","mm5")', - '&pxor ($BxC,"mm1")', # [h=]Maj(a,b,c) - '&pxor ("mm6","mm7")', # Sigma0_512(a) - '&movq ("mm5",&QWP(8*($i+5-1)%64,"esp"))', # pre-load f - '&paddq ($BxC,"mm6")', # h+=Sigma0(a) - '&movq ("mm6",&QWP(8*($i+6-1)%64,"esp"))', # pre-load g - - '($A,$BxC) = ($BxC,$A); $i--;' - ); -} - -&set_label("00_47_ssse3",32); - - for(;$j<16;$j++) { - my ($t0,$t2,$t1)=@X[2..4]; - my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3()); - - &movdqa ($t2,@X[5]); - &movdqa (@X[1],$t0); # restore @X[1] - &palignr ($t0,@X[0],8); # X[1..2] - &movdqa (&QWP(16*($j%4),$frame),@X[4]); # off-load @X[4] - &palignr ($t2,@X[4],8); # X[9..10] - - &movdqa ($t1,$t0); - &psrlq ($t0,7); - &paddq (@X[0],$t2); # X[0..1] += X[9..10] - &movdqa ($t2,$t1); - &psrlq ($t1,1); - &psllq ($t2,64-8); - &pxor ($t0,$t1); - &psrlq ($t1,8-1); - &pxor ($t0,$t2); - &psllq ($t2,8-1); - &pxor ($t0,$t1); - &movdqa ($t1,@X[7]); - &pxor ($t0,$t2); # sigma0(X[1..2]) - &movdqa ($t2,@X[7]); - &psrlq ($t1,6); - &paddq (@X[0],$t0); # X[0..1] += sigma0(X[1..2]) - - &movdqa ($t0,@X[7]); - &psrlq ($t2,19); - &psllq ($t0,64-61); - &pxor ($t1,$t2); - &psrlq ($t2,61-19); - &pxor ($t1,$t0); - &psllq ($t0,61-19); - &pxor ($t1,$t2); - &movdqa ($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1] - &pxor ($t1,$t0); # sigma1(X[14..15]) - &movdqa ($t0,&QWP(16*($j%8),$K512)); - eval(shift(@insns)); - &paddq (@X[0],$t1); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddq ($t0,@X[0]); - foreach(@insns) { eval; } - &movdqa (&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i] - - push(@X,shift(@X)); # rotate(@X) - } - &lea ($K512,&DWP(16*8,$K512)); - &dec ("ecx"); - &jnz (&label("00_47_ssse3")); - - &movdqa (@X[1],&QWP(0,$K512)); # byte swap mask - &lea ($K512,&DWP(-80*8,$K512)); # rewind - &movdqu (@X[0],&QWP(0,"ebx")); - &pshufb (@X[0],@X[1]); - - for ($j=0;$j<8;$j++) { # load next or same block - my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3()); - - &movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load - &movdqa (@X[3],&QWP(16*($j%8),$K512)); - &movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask - &movdqu (@X[1],&QWP(16*($j+1),"ebx")) if ($j<7); # next input - &movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0] - &paddq (@X[3],@X[0]); - &pshufb (@X[1],@X[2]) if ($j<7); - foreach(@insns) { eval; } - &movdqa (&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i] - - push(@X,shift(@X)); # rotate(@X) - } - - #&movq ($A,$Asse2); # load A-H - &movq ("mm1",$Bsse2); - &paddq ($A,"mm3"); # from BODY_00_15 - #&movq ($BxC,$Csse2); - &movq ("mm3",$Dsse2); - #&movq ($E,$Esse2); - #&movq ("mm5",$Fsse2); - #&movq ("mm6",$Gsse2); - &movq ("mm7",$Hsse2); - - &pxor ($BxC,"mm1"); # de-magic - &paddq ($A,&QWP(0,"esi")); - &paddq ("mm1",&QWP(8,"esi")); - &paddq ($BxC,&QWP(16,"esi")); - &paddq ("mm3",&QWP(24,"esi")); - &paddq ($E,&QWP(32,"esi")); - &paddq ("mm5",&QWP(40,"esi")); - &paddq ("mm6",&QWP(48,"esi")); - &paddq ("mm7",&QWP(56,"esi")); - - &movq (&QWP(0,"esi"),$A); - &movq (&QWP(8,"esi"),"mm1"); - &movq (&QWP(16,"esi"),$BxC); - &movq (&QWP(24,"esi"),"mm3"); - &movq (&QWP(32,"esi"),$E); - &movq (&QWP(40,"esi"),"mm5"); - &movq (&QWP(48,"esi"),"mm6"); - &movq (&QWP(56,"esi"),"mm7"); - - &cmp ("edi","eax"); # are we done yet? 
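The psrlq/psllq/pxor chains in the schedule updates above have the same motivation: MMX and SSE2 offer 64-bit shifts but no 64-bit rotate, so each rotation costs a shift-right/shift-left pair plus a pxor. The scalar sigma functions being emulated match the #define comments in the x86 fallback below; as a C sketch (rotr64 is an illustrative helper):

    #include <stdint.h>

    static inline uint64_t rotr64(uint64_t x, unsigned n) {
        return (x >> n) | (x << (64 - n));
    }

    /* SHA-512 message-schedule sigmas; each rotr64 is one
     * psrlq + psllq + pxor in the SIMD loops above. */
    static uint64_t sigma0_512(uint64_t x) {
        return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7);
    }
    static uint64_t sigma1_512(uint64_t x) {
        return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6);
    }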
- &jb (&label("loop_ssse3")); - - &mov ("esp",&DWP(64+12,$frame)); # restore sp - &emms (); -} -&function_end_A(); -} -&set_label("loop_x86",16); - # copy input block to stack reversing byte and qword order - for ($i=0;$i<8;$i++) { - &mov ("eax",&DWP($i*16+0,"edi")); - &mov ("ebx",&DWP($i*16+4,"edi")); - &mov ("ecx",&DWP($i*16+8,"edi")); - &mov ("edx",&DWP($i*16+12,"edi")); - &bswap ("eax"); - &bswap ("ebx"); - &bswap ("ecx"); - &bswap ("edx"); - &push ("eax"); - &push ("ebx"); - &push ("ecx"); - &push ("edx"); - } - &add ("edi",128); - &sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H - &mov (&DWP(8*(9+16)+4,"esp"),"edi"); - - # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack - &lea ("edi",&DWP(8,"esp")); - &mov ("ecx",16); - &data_word(0xA5F3F689); # rep movsd - -&set_label("00_15_x86",16); - &BODY_00_15_x86(); - - &cmp (&LB("edx"),0x94); - &jne (&label("00_15_x86")); - -&set_label("16_79_x86",16); - #define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - # LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - # HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - &mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp")); - &mov ("edx",&DWP(8*(9+15+16-1)+4,"esp")); - &mov ("esi","ecx"); - - &shr ("ecx",1); # lo>>1 - &mov ("edi","edx"); - &shr ("edx",1); # hi>>1 - &mov ("eax","ecx"); - &shl ("esi",24); # lo<<24 - &mov ("ebx","edx"); - &shl ("edi",24); # hi<<24 - &xor ("ebx","esi"); - - &shr ("ecx",7-1); # lo>>7 - &xor ("eax","edi"); - &shr ("edx",7-1); # hi>>7 - &xor ("eax","ecx"); - &shl ("esi",31-24); # lo<<31 - &xor ("ebx","edx"); - &shl ("edi",25-24); # hi<<25 - &xor ("ebx","esi"); - - &shr ("ecx",8-7); # lo>>8 - &xor ("eax","edi"); - &shr ("edx",8-7); # hi>>8 - &xor ("eax","ecx"); - &shl ("edi",31-25); # hi<<31 - &xor ("ebx","edx"); - &xor ("eax","edi"); # T1 = sigma0(X[-15]) - - &mov (&DWP(0,"esp"),"eax"); - &mov (&DWP(4,"esp"),"ebx"); # put T1 away - - #define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - # LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - # HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - &mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp")); - &mov ("edx",&DWP(8*(9+15+16-14)+4,"esp")); - &mov ("esi","ecx"); - - &shr ("ecx",6); # lo>>6 - &mov ("edi","edx"); - &shr ("edx",6); # hi>>6 - &mov ("eax","ecx"); - &shl ("esi",3); # lo<<3 - &mov ("ebx","edx"); - &shl ("edi",3); # hi<<3 - &xor ("eax","esi"); - - &shr ("ecx",19-6); # lo>>19 - &xor ("ebx","edi"); - &shr ("edx",19-6); # hi>>19 - &xor ("eax","ecx"); - &shl ("esi",13-3); # lo<<13 - &xor ("ebx","edx"); - &shl ("edi",13-3); # hi<<13 - &xor ("ebx","esi"); - - &shr ("ecx",29-19); # lo>>29 - &xor ("eax","edi"); - &shr ("edx",29-19); # hi>>29 - &xor ("ebx","ecx"); - &shl ("edi",26-13); # hi<<26 - &xor ("eax","edx"); - &xor ("eax","edi"); # sigma1(X[-2]) - - &mov ("ecx",&DWP(8*(9+15+16)+0,"esp")); - &mov ("edx",&DWP(8*(9+15+16)+4,"esp")); - &add ("eax",&DWP(0,"esp")); - &adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1 - &mov ("esi",&DWP(8*(9+15+16-9)+0,"esp")); - &mov ("edi",&DWP(8*(9+15+16-9)+4,"esp")); - &add ("eax","ecx"); - &adc ("ebx","edx"); # T1 += X[-16] - &add ("eax","esi"); - &adc ("ebx","edi"); # T1 += X[-7] - &mov (&DWP(8*(9+15)+0,"esp"),"eax"); - &mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0] - - &BODY_00_15_x86(); - - &cmp (&LB("edx"),0x17); - &jne (&label("16_79_x86")); - - &mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx - &mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp - for($i=0;$i<4;$i++) { - &mov ("eax",&DWP($i*16+0,"esi")); - &mov ("ebx",&DWP($i*16+4,"esi")); - &mov ("ecx",&DWP($i*16+8,"esi")); - &mov ("edx",&DWP($i*16+12,"esi")); - &add 
("eax",&DWP(8+($i*16)+0,"esp")); - &adc ("ebx",&DWP(8+($i*16)+4,"esp")); - &mov (&DWP($i*16+0,"esi"),"eax"); - &mov (&DWP($i*16+4,"esi"),"ebx"); - &add ("ecx",&DWP(8+($i*16)+8,"esp")); - &adc ("edx",&DWP(8+($i*16)+12,"esp")); - &mov (&DWP($i*16+8,"esi"),"ecx"); - &mov (&DWP($i*16+12,"esi"),"edx"); - } - &add ("esp",8*(9+16+80)); # destroy frame - &sub ($K512,8*80); # rewind K - - &cmp ("edi",&DWP(8,"esp")); # are we done yet? - &jb (&label("loop_x86")); - - &mov ("esp",&DWP(12,"esp")); # restore sp -&function_end_A(); - -&set_label("K512",64); # Yes! I keep it in the code segment! - &data_word(0xd728ae22,0x428a2f98); # u64 - &data_word(0x23ef65cd,0x71374491); # u64 - &data_word(0xec4d3b2f,0xb5c0fbcf); # u64 - &data_word(0x8189dbbc,0xe9b5dba5); # u64 - &data_word(0xf348b538,0x3956c25b); # u64 - &data_word(0xb605d019,0x59f111f1); # u64 - &data_word(0xaf194f9b,0x923f82a4); # u64 - &data_word(0xda6d8118,0xab1c5ed5); # u64 - &data_word(0xa3030242,0xd807aa98); # u64 - &data_word(0x45706fbe,0x12835b01); # u64 - &data_word(0x4ee4b28c,0x243185be); # u64 - &data_word(0xd5ffb4e2,0x550c7dc3); # u64 - &data_word(0xf27b896f,0x72be5d74); # u64 - &data_word(0x3b1696b1,0x80deb1fe); # u64 - &data_word(0x25c71235,0x9bdc06a7); # u64 - &data_word(0xcf692694,0xc19bf174); # u64 - &data_word(0x9ef14ad2,0xe49b69c1); # u64 - &data_word(0x384f25e3,0xefbe4786); # u64 - &data_word(0x8b8cd5b5,0x0fc19dc6); # u64 - &data_word(0x77ac9c65,0x240ca1cc); # u64 - &data_word(0x592b0275,0x2de92c6f); # u64 - &data_word(0x6ea6e483,0x4a7484aa); # u64 - &data_word(0xbd41fbd4,0x5cb0a9dc); # u64 - &data_word(0x831153b5,0x76f988da); # u64 - &data_word(0xee66dfab,0x983e5152); # u64 - &data_word(0x2db43210,0xa831c66d); # u64 - &data_word(0x98fb213f,0xb00327c8); # u64 - &data_word(0xbeef0ee4,0xbf597fc7); # u64 - &data_word(0x3da88fc2,0xc6e00bf3); # u64 - &data_word(0x930aa725,0xd5a79147); # u64 - &data_word(0xe003826f,0x06ca6351); # u64 - &data_word(0x0a0e6e70,0x14292967); # u64 - &data_word(0x46d22ffc,0x27b70a85); # u64 - &data_word(0x5c26c926,0x2e1b2138); # u64 - &data_word(0x5ac42aed,0x4d2c6dfc); # u64 - &data_word(0x9d95b3df,0x53380d13); # u64 - &data_word(0x8baf63de,0x650a7354); # u64 - &data_word(0x3c77b2a8,0x766a0abb); # u64 - &data_word(0x47edaee6,0x81c2c92e); # u64 - &data_word(0x1482353b,0x92722c85); # u64 - &data_word(0x4cf10364,0xa2bfe8a1); # u64 - &data_word(0xbc423001,0xa81a664b); # u64 - &data_word(0xd0f89791,0xc24b8b70); # u64 - &data_word(0x0654be30,0xc76c51a3); # u64 - &data_word(0xd6ef5218,0xd192e819); # u64 - &data_word(0x5565a910,0xd6990624); # u64 - &data_word(0x5771202a,0xf40e3585); # u64 - &data_word(0x32bbd1b8,0x106aa070); # u64 - &data_word(0xb8d2d0c8,0x19a4c116); # u64 - &data_word(0x5141ab53,0x1e376c08); # u64 - &data_word(0xdf8eeb99,0x2748774c); # u64 - &data_word(0xe19b48a8,0x34b0bcb5); # u64 - &data_word(0xc5c95a63,0x391c0cb3); # u64 - &data_word(0xe3418acb,0x4ed8aa4a); # u64 - &data_word(0x7763e373,0x5b9cca4f); # u64 - &data_word(0xd6b2b8a3,0x682e6ff3); # u64 - &data_word(0x5defb2fc,0x748f82ee); # u64 - &data_word(0x43172f60,0x78a5636f); # u64 - &data_word(0xa1f0ab72,0x84c87814); # u64 - &data_word(0x1a6439ec,0x8cc70208); # u64 - &data_word(0x23631e28,0x90befffa); # u64 - &data_word(0xde82bde9,0xa4506ceb); # u64 - &data_word(0xb2c67915,0xbef9a3f7); # u64 - &data_word(0xe372532b,0xc67178f2); # u64 - &data_word(0xea26619c,0xca273ece); # u64 - &data_word(0x21c0c207,0xd186b8c7); # u64 - &data_word(0xcde0eb1e,0xeada7dd6); # u64 - &data_word(0xee6ed178,0xf57d4f7f); # u64 - &data_word(0x72176fba,0x06f067aa); # u64 
- &data_word(0xa2c898a6,0x0a637dc5); # u64 - &data_word(0xbef90dae,0x113f9804); # u64 - &data_word(0x131c471b,0x1b710b35); # u64 - &data_word(0x23047d84,0x28db77f5); # u64 - &data_word(0x40c72493,0x32caab7b); # u64 - &data_word(0x15c9bebc,0x3c9ebe0a); # u64 - &data_word(0x9c100d4c,0x431d67c4); # u64 - &data_word(0xcb3e42b6,0x4cc5d4be); # u64 - &data_word(0xfc657e2a,0x597f299c); # u64 - &data_word(0x3ad6faec,0x5fcb6fab); # u64 - &data_word(0x4a475817,0x6c44198c); # u64 - - &data_word(0x04050607,0x00010203); # byte swap - &data_word(0x0c0d0e0f,0x08090a0b); # mask -&function_end_B("sha512_block_data_order"); -&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); - -&asm_finish(); +../openssl/./crypto/sha/asm/sha512-586.pl
\ No newline at end of file diff --git a/devel/perlasm/sha512-ssse3-x86_64.pl b/devel/perlasm/sha512-ssse3-x86_64.pl index 8070d09c94..086d6dd7ca 100755..120000 --- a/devel/perlasm/sha512-ssse3-x86_64.pl +++ b/devel/perlasm/sha512-ssse3-x86_64.pl @@ -1,2152 +1 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. -# ==================================================================== -# -# sha256/512_block procedure for x86_64. -# -# 40% improvement over compiler-generated code on Opteron. On EM64T -# sha256 was observed to run >80% faster and sha512 - >40%. No magical -# tricks, just straight implementation... I really wonder why gcc -# [being armed with inline assembler] fails to generate as fast code. -# The only thing which is cool about this module is that it's the very -# same instruction sequence that is used for both SHA-256 and SHA-512. In -# the former case the instructions operate on 32-bit operands, while in -# the latter - on 64-bit ones. All I had to do was to get one flavor right, -# the other one passed the test right away:-) -# -# sha256_block runs in ~1005 cycles on Opteron, which gives you -# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock -# frequency in GHz. sha512_block runs in ~1275 cycles, which results -# in 128*1000/1275=100MBps per GHz. Is there room for improvement? -# Well, if you compare it to the IA-64 implementation, which maintains -# X[16] in register bank[!], tends to 4 instructions per CPU clock -# cycle and runs in 1003 cycles, 1275 is a very good result for the -# 3-way issue Opteron pipeline and X[16] maintained in memory. So that *if* -# there is a way to improve it, *then* the only way would be to try to -# offload X[16] updates to SSE unit, but that would require "deeper" -# loop unroll, which in turn would naturally cause size blow-up, not -# to mention increased complexity! And once again, only *if* it's -# actually possible to noticeably improve overall ILP, instruction -# level parallelism, on a given CPU implementation in this case. -# -# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect -# performance ratio of 1.5 between 64- and 32-bit flavors [see above], -# [currently available] EM64T CPUs apparently are far from it. On the -# contrary, the 64-bit version, sha512_block, is ~30% *slower* than 32-bit -# sha256_block:-( This is presumably because 64-bit shifts/rotates -# apparently are not atomic instructions, but implemented in microcode. -# -# May 2012. -# -# Optimization including one of Pavel Semjanov's ideas, alternative -# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and -# unfortunately -2% SHA512 on P4 [which nobody should care about -# that much]. -# -# June 2012. -# -# Add SIMD code paths, see below for improvement coefficients. SSSE3 -# code path was not attempted for SHA512, because improvement is not -# estimated to be high enough, noticeably less than 9%, to justify -# the effort, not on pre-AVX processors. [Obviously with exclusion -# for VIA Nano, but it has a SHA512 instruction that is faster and -# should be used instead.] For reference, corresponding estimated -# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that -# higher coefficients are observed on VIA Nano and Bulldozer has more -# to do with specifics of their architecture [which is a topic for -# separate discussion]. 
-# -# November 2012. -# -# Add AVX2 code path. Two consecutive input blocks are loaded to -# 256-bit %ymm registers, with data from first block to least -# significant 128-bit halves and data from second to most significant. -# The data is then processed with same SIMD instruction sequence as -# for AVX, but with %ymm as operands. Side effect is increased stack -# frame, 448 additional bytes in SHA256 and 1152 in SHA512. - -###################################################################### -# Current performance in cycles per processed byte (less is better): -# -# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) -# -# AMD K8 14.9 - - 9.57 - -# P4 17.3 - - 30.8 - -# Core 2 15.6 13.8(+13%) - 9.97 - -# Westmere 14.8 12.3(+19%) - 9.58 - -# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) -# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) -# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) -# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) -# VIA Nano 23.0 16.5(+39%) - 14.7 - -# Atom 23.0 18.9(+22%) - 14.7 - -# -# (*) whichever best applicable; -# (**) switch from ror to shrd stands for fair share of improvement; -# (***) execution time is fully determined by remaining integer-only -# part, body_00_15; reducing the amount of SIMD instructions -# below certain limit makes no difference/sense; to conserve -# space SHA256 XOP code path is therefore omitted; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); -} - -if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.09) + ($1>=2.10); -} - -if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); -} - -open OUT,"| \"$^X\" $xlate $flavour $output"; -*STDOUT=*OUT; - -if ($output =~ /512/) { - $func="sha512_block_data_order"; - $TABLE="K512"; - $SZ=8; - @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", - "%r8", "%r9", "%r10","%r11"); - ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); - @Sigma0=(28,34,39); - @Sigma1=(14,18,41); - @sigma0=(1, 8, 7); - @sigma1=(19,61, 6); - $rounds=80; -} else { - $func="sha256_block_data_order"; - $TABLE="K256"; - $SZ=4; - @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", - "%r8d","%r9d","%r10d","%r11d"); - ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); - @Sigma0=( 2,13,22); - @Sigma1=( 6,11,25); - @sigma0=( 7,18, 3); - @sigma1=(17,19,10); - $rounds=64; -} - -$ctx="%rdi"; # 1st arg, zapped by $a3 -$inp="%rsi"; # 2nd arg -$Tbl="%rbp"; - -$_ctx="16*$SZ+0*8(%rsp)"; -$_inp="16*$SZ+1*8(%rsp)"; -$_end="16*$SZ+2*8(%rsp)"; -$_rsp="16*$SZ+3*8(%rsp)"; -$framesz="16*$SZ+4*8"; - - -sub ROUND_00_15() -{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - my $STRIDE=$SZ; - $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); - -$code.=<<___; - ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 - mov $f,$a2 - - xor $e,$a0 - ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 - xor $g,$a2 # f^g - - mov $T1,`$SZ*($i&0xf)`(%rsp) - xor $a,$a1 - and $e,$a2 # (f^g)&e - - ror 
\$`$Sigma1[1]-$Sigma1[0]`,$a0 - add $h,$T1 # T1+=h - xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g - - ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 - xor $e,$a0 - add $a2,$T1 # T1+=Ch(e,f,g) - - mov $a,$a2 - add ($Tbl),$T1 # T1+=K[round] - xor $a,$a1 - - xor $b,$a2 # a^b, b^c in next round - ror \$$Sigma1[0],$a0 # Sigma1(e) - mov $b,$h - - and $a2,$a3 - ror \$$Sigma0[0],$a1 # Sigma0(a) - add $a0,$T1 # T1+=Sigma1(e) - - xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) - add $T1,$d # d+=T1 - add $T1,$h # h+=T1 - - lea $STRIDE($Tbl),$Tbl # round++ -___ -$code.=<<___ if ($i<15); - add $a1,$h # h+=Sigma0(a) -___ - ($a2,$a3) = ($a3,$a2); -} - -sub ROUND_16_XX() -{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - -$code.=<<___; - mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 - mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 - - mov $a0,$T1 - ror \$`$sigma0[1]-$sigma0[0]`,$a0 - add $a1,$a # modulo-scheduled h+=Sigma0(a) - mov $a2,$a1 - ror \$`$sigma1[1]-$sigma1[0]`,$a2 - - xor $T1,$a0 - shr \$$sigma0[2],$T1 - ror \$$sigma0[0],$a0 - xor $a1,$a2 - shr \$$sigma1[2],$a1 - - ror \$$sigma1[0],$a2 - xor $a0,$T1 # sigma0(X[(i+1)&0xf]) - xor $a1,$a2 # sigma1(X[(i+14)&0xf]) - add `$SZ*(($i+9)&0xf)`(%rsp),$T1 - - add `$SZ*($i&0xf)`(%rsp),$T1 - mov $e,$a0 - add $a2,$T1 - mov $a,$a1 -___ - &ROUND_00_15(@_); -} - -$code=<<___; -.text - -.extern OPENSSL_ia32cap_P -.globl $func -.type $func,\@function,3 -.align 16 -$func: -___ -$code.=<<___ if ($SZ==4 || $avx); - lea OPENSSL_ia32cap_P(%rip),%r11 - mov 0(%r11),%r9d - mov 4(%r11),%r10d - mov 8(%r11),%r11d -___ -$code.=<<___ if ($avx && $SZ==8); - test \$`1<<11`,%r10d # check for XOP - jnz .Lxop_shortcut -___ -$code.=<<___ if ($avx>1); - and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 - cmp \$`1<<8|1<<5|1<<3`,%r11d - je .Lavx2_shortcut -___ -$code.=<<___ if ($avx); - and \$`1<<30`,%r9d # mask "Intel CPU" bit - and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits - or %r9d,%r10d - cmp \$`1<<28|1<<9|1<<30`,%r10d - je .Lavx_shortcut -___ -$code.=<<___ if ($SZ==4); - test \$`1<<9`,%r10d - jnz .Lssse3_shortcut -___ -$code.=<<___; - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - mov %rsp,%r11 # copy %rsp - shl \$4,%rdx # num*16 - sub \$$framesz,%rsp - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - and \$-64,%rsp # align stack frame - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arg - mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp -.Lprologue: - - mov $SZ*0($ctx),$A - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H - jmp .Lloop - -.align 16 -.Lloop: - mov $B,$a3 - lea $TABLE(%rip),$Tbl - xor $C,$a3 # magic -___ - for($i=0;$i<16;$i++) { - $code.=" mov $SZ*$i($inp),$T1\n"; - $code.=" mov @ROT[4],$a0\n"; - $code.=" mov @ROT[0],$a1\n"; - $code.=" bswap $T1\n"; - &ROUND_00_15($i,@ROT); - unshift(@ROT,pop(@ROT)); - } -$code.=<<___; - jmp .Lrounds_16_xx -.align 16 -.Lrounds_16_xx: -___ - for(;$i<32;$i++) { - &ROUND_16_XX($i,@ROT); - unshift(@ROT,pop(@ROT)); - } - -$code.=<<___; - cmpb \$0,`$SZ-1`($Tbl) - jnz .Lrounds_16_xx - - mov $_ctx,$ctx - add $a1,$A # modulo-scheduled h+=Sigma0(a) - lea 16*$SZ($inp),$inp - - add $SZ*0($ctx),$A - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb 
.Lloop - - mov $_rsp,%rsi - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lepilogue: - ret -.size $func,.-$func -___ - -if ($SZ==4) { -$code.=<<___; -.align 64 -.type $TABLE,\@object -$TABLE: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f - .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f - .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff - .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff - .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 - .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 - .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -___ -} else { -$code.=<<___; -.align 64 -.type $TABLE,\@object -$TABLE: - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 
0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - - .quad 0x0001020304050607,0x08090a0b0c0d0e0f - .quad 0x0001020304050607,0x08090a0b0c0d0e0f - .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -___ -} - -###################################################################### -# SIMD code paths -# -{{{ - -my $a4=$T1; -my ($a,$b,$c,$d,$e,$f,$g,$h); - -sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; - my $arg = pop; - $arg = "\$$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 
- - '&ror ($a0,$Sigma1[2]-$Sigma1[1])', - '&mov ($a,$a1)', - '&mov ($a4,$f)', - - '&ror ($a1,$Sigma0[2]-$Sigma0[1])', - '&xor ($a0,$e)', - '&xor ($a4,$g)', # f^g - - '&ror ($a0,$Sigma1[1]-$Sigma1[0])', - '&xor ($a1,$a)', - '&and ($a4,$e)', # (f^g)&e - - '&xor ($a0,$e)', - '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] - '&mov ($a2,$a)', - - '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g - '&ror ($a1,$Sigma0[1]-$Sigma0[0])', - '&xor ($a2,$b)', # a^b, b^c in next round - - '&add ($h,$a4)', # h+=Ch(e,f,g) - '&ror ($a0,$Sigma1[0])', # Sigma1(e) - '&and ($a3,$a2)', # (b^c)&(a^b) - - '&xor ($a1,$a)', - '&add ($h,$a0)', # h+=Sigma1(e) - '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) - - '&ror ($a1,$Sigma0[0])', # Sigma0(a) - '&add ($d,$h)', # d+=h - '&add ($h,$a3)', # h+=Maj(a,b,c) - - '&mov ($a0,$d)', - '&add ($a1,$h);'. # h+=Sigma0(a) - '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' - ); -} - -###################################################################### -# SSSE3 code path -# -if ($SZ==4) { # SHA256 only -my @X = map("%xmm$_",(0..3)); -my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); - -$code.=<<___; -.type ${func}_ssse3,\@function,3 -.align 64 -${func}_ssse3: -.Lssse3_shortcut: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - mov %rsp,%r11 # copy %rsp - shl \$4,%rdx # num*16 - sub \$`$framesz+$win64*16*4`,%rsp - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - and \$-64,%rsp # align stack frame - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arg - mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,16*$SZ+32(%rsp) - movaps %xmm7,16*$SZ+48(%rsp) - movaps %xmm8,16*$SZ+64(%rsp) - movaps %xmm9,16*$SZ+80(%rsp) -___ -$code.=<<___; -.Lprologue_ssse3: - - mov $SZ*0($ctx),$A - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H -___ - -$code.=<<___; - #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 - #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 - jmp .Lloop_ssse3 -.align 16 -.Lloop_ssse3: - movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - movdqu 0x00($inp),@X[0] - movdqu 0x10($inp),@X[1] - movdqu 0x20($inp),@X[2] - movdqu 0x30($inp),@X[3] - pshufb $t3,@X[0] - lea $TABLE(%rip),$Tbl - pshufb $t3,@X[1] - movdqa 0x00($Tbl),$t0 - pshufb $t3,@X[2] - movdqa 0x20($Tbl),$t1 - paddd @X[0],$t0 - movdqa 0x40($Tbl),$t2 - pshufb $t3,@X[3] - movdqa 0x60($Tbl),$t3 - paddd @X[1],$t1 - paddd @X[2],$t2 - paddd @X[3],$t3 - movdqa $t0,0x00(%rsp) - mov $A,$a1 - movdqa $t1,0x10(%rsp) - mov $B,$a3 - movdqa $t2,0x20(%rsp) - xor $C,$a3 # magic - movdqa $t3,0x30(%rsp) - mov $E,$a0 - jmp .Lssse3_00_47 - -.align 16 -.Lssse3_00_47: - sub \$-16*2*$SZ,$Tbl # size optimization -___ -sub Xupdate_256_SSSE3 () { - ( - '&movdqa ($t0,@X[1]);', - '&movdqa ($t3,@X[3])', - '&palignr ($t0,@X[0],$SZ)', # X[1..4] - '&palignr ($t3,@X[2],$SZ);', # X[9..12] - '&movdqa ($t1,$t0)', - '&movdqa ($t2,$t0);', - '&psrld ($t0,$sigma0[2])', - '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] - '&psrld ($t2,$sigma0[0])', - '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] - '&pslld ($t1,8*$SZ-$sigma0[1]);'. - '&pxor ($t0,$t2)', - '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. - '&pxor ($t0,$t1)', - '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
- '&pxor ($t0,$t2);', - '&movdqa ($t2,$t3)', - '&pxor ($t0,$t1);', # sigma0(X[1..4]) - '&psrld ($t3,$sigma1[2])', - '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) - '&psrlq ($t2,$sigma1[0])', - '&pxor ($t3,$t2);', - '&psrlq ($t2,$sigma1[1]-$sigma1[0])', - '&pxor ($t3,$t2)', - '&pshufb ($t3,$t4)', # sigma1(X[14..15]) - '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) - '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] - '&movdqa ($t2,$t3);', - '&psrld ($t3,$sigma1[2])', - '&psrlq ($t2,$sigma1[0])', - '&pxor ($t3,$t2);', - '&psrlq ($t2,$sigma1[1]-$sigma1[0])', - '&pxor ($t3,$t2);', - '&movdqa ($t2,16*2*$j."($Tbl)")', - '&pshufb ($t3,$t5)', - '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) - ); -} - -sub SSSE3_256_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 104 instructions - - if (0) { - foreach (Xupdate_256_SSSE3()) { # 36 instructions - eval; - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - } - } else { # squeeze extra 4% on Westmere and 19% on Atom - eval(shift(@insns)); #@ - &movdqa ($t0,@X[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t3,@X[3]); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &palignr ($t0,@X[0],$SZ); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - &palignr ($t3,@X[2],$SZ); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &movdqa ($t1,$t0); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t2,$t0); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrld ($t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrld ($t2,$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pshufd ($t3,@X[3],0b11111010); # X[14..15] - eval(shift(@insns)); - eval(shift(@insns)); #@ - &pslld ($t1,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrld ($t2,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - &pxor ($t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - &pslld ($t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t2); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &movdqa ($t2,$t3); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t1); # sigma0(X[1..4]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t3,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrlq ($t2,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrlq ($t2,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - #&pshufb ($t3,$t4); # sigma1(X[14..15]) - &pshufd ($t3,$t3,0b10000000); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &psrldq ($t3,8); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - 
eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pshufd ($t3,@X[0],0b01010000); # X[16..17] - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &movdqa ($t2,$t3); - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t3,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrlq ($t2,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrlq ($t2,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - #&pshufb ($t3,$t5); - &pshufd ($t3,$t3,0b00001000); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t2,16*2*$j."($Tbl)"); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &pslldq ($t3,8); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - } - &paddd ($t2,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &movdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &SSSE3_256_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); - &jne (".Lssse3_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } -$code.=<<___; - mov $_ctx,$ctx - mov $a1,$A - - add $SZ*0($ctx),$A - lea 16*$SZ($inp),$inp - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop_ssse3 - - mov $_rsp,%rsi -___ -$code.=<<___ if ($win64); - movaps 16*$SZ+32(%rsp),%xmm6 - movaps 16*$SZ+48(%rsp),%xmm7 - movaps 16*$SZ+64(%rsp),%xmm8 - movaps 16*$SZ+80(%rsp),%xmm9 -___ -$code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lepilogue_ssse3: - ret -.size ${func}_ssse3,.-${func}_ssse3 -___ -} - -if ($avx) {{ -###################################################################### -# XOP code path -# -if ($SZ==8) { # SHA512 only -$code.=<<___; -.type ${func}_xop,\@function,3 -.align 64 -${func}_xop: -.Lxop_shortcut: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - mov %rsp,%r11 # copy %rsp - shl \$4,%rdx # num*16 - sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - and \$-64,%rsp # align stack frame - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arh - mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,16*$SZ+32(%rsp) - movaps %xmm7,16*$SZ+48(%rsp) - movaps %xmm8,16*$SZ+64(%rsp) - movaps %xmm9,16*$SZ+80(%rsp) -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps %xmm10,16*$SZ+96(%rsp) - movaps %xmm11,16*$SZ+112(%rsp) -___ -$code.=<<___; -.Lprologue_xop: - - vzeroupper - mov $SZ*0($ctx),$A - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H - jmp .Lloop_xop -___ - if ($SZ==4) { # SHA256 - my @X = map("%xmm$_",(0..3)); - my 
($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); - -$code.=<<___; -.align 16 -.Lloop_xop: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu 0x00($inp),@X[0] - vmovdqu 0x10($inp),@X[1] - vmovdqu 0x20($inp),@X[2] - vmovdqu 0x30($inp),@X[3] - vpshufb $t3,@X[0],@X[0] - lea $TABLE(%rip),$Tbl - vpshufb $t3,@X[1],@X[1] - vpshufb $t3,@X[2],@X[2] - vpaddd 0x00($Tbl),@X[0],$t0 - vpshufb $t3,@X[3],@X[3] - vpaddd 0x20($Tbl),@X[1],$t1 - vpaddd 0x40($Tbl),@X[2],$t2 - vpaddd 0x60($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - mov $A,$a1 - vmovdqa $t1,0x10(%rsp) - mov $B,$a3 - vmovdqa $t2,0x20(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x30(%rsp) - mov $E,$a0 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - sub \$-16*2*$SZ,$Tbl # size optimization -___ -sub XOP_256_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 104 instructions - - &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t0,$t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t2,@X[3],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrldq ($t3,$t3,8); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t2,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpslldq ($t3,$t3,8); # 22 instructions - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &XOP_256_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # 
rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); - &jne (".Lxop_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } - - } else { # SHA512 - my @X = map("%xmm$_",(0..7)); - my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); - -$code.=<<___; -.align 16 -.Lloop_xop: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu 0x00($inp),@X[0] - lea $TABLE+0x80(%rip),$Tbl # size optimization - vmovdqu 0x10($inp),@X[1] - vmovdqu 0x20($inp),@X[2] - vpshufb $t3,@X[0],@X[0] - vmovdqu 0x30($inp),@X[3] - vpshufb $t3,@X[1],@X[1] - vmovdqu 0x40($inp),@X[4] - vpshufb $t3,@X[2],@X[2] - vmovdqu 0x50($inp),@X[5] - vpshufb $t3,@X[3],@X[3] - vmovdqu 0x60($inp),@X[6] - vpshufb $t3,@X[4],@X[4] - vmovdqu 0x70($inp),@X[7] - vpshufb $t3,@X[5],@X[5] - vpaddq -0x80($Tbl),@X[0],$t0 - vpshufb $t3,@X[6],@X[6] - vpaddq -0x60($Tbl),@X[1],$t1 - vpshufb $t3,@X[7],@X[7] - vpaddq -0x40($Tbl),@X[2],$t2 - vpaddq -0x20($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - vpaddq 0x00($Tbl),@X[4],$t0 - vmovdqa $t1,0x10(%rsp) - vpaddq 0x20($Tbl),@X[5],$t1 - vmovdqa $t2,0x20(%rsp) - vpaddq 0x40($Tbl),@X[6],$t2 - vmovdqa $t3,0x30(%rsp) - vpaddq 0x60($Tbl),@X[7],$t3 - vmovdqa $t0,0x40(%rsp) - mov $A,$a1 - vmovdqa $t1,0x50(%rsp) - mov $B,$a3 - vmovdqa $t2,0x60(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x70(%rsp) - mov $E,$a0 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - add \$16*2*$SZ,$Tbl -___ -sub XOP_512_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body); # 52 instructions - - &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrlq ($t0,$t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrlq ($t2,@X[7],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<8; $j++) { - &XOP_512_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); - &jne (".Lxop_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } -} -$code.=<<___; - mov $_ctx,$ctx - mov $a1,$A - - add $SZ*0($ctx),$A - lea 16*$SZ($inp),$inp - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C 
- add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop_xop - - mov $_rsp,%rsi - vzeroupper -___ -$code.=<<___ if ($win64); - movaps 16*$SZ+32(%rsp),%xmm6 - movaps 16*$SZ+48(%rsp),%xmm7 - movaps 16*$SZ+64(%rsp),%xmm8 - movaps 16*$SZ+80(%rsp),%xmm9 -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps 16*$SZ+96(%rsp),%xmm10 - movaps 16*$SZ+112(%rsp),%xmm11 -___ -$code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lepilogue_xop: - ret -.size ${func}_xop,.-${func}_xop -___ -} -###################################################################### -# AVX+shrd code path -# -local *ror = sub { &shrd(@_[0],@_) }; - -$code.=<<___; -.type ${func}_avx,\@function,3 -.align 64 -${func}_avx: -.Lavx_shortcut: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - mov %rsp,%r11 # copy %rsp - shl \$4,%rdx # num*16 - sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - and \$-64,%rsp # align stack frame - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arh - mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,16*$SZ+32(%rsp) - movaps %xmm7,16*$SZ+48(%rsp) - movaps %xmm8,16*$SZ+64(%rsp) - movaps %xmm9,16*$SZ+80(%rsp) -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps %xmm10,16*$SZ+96(%rsp) - movaps %xmm11,16*$SZ+112(%rsp) -___ -$code.=<<___; -.Lprologue_avx: - - vzeroupper - mov $SZ*0($ctx),$A - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H -___ - if ($SZ==4) { # SHA256 - my @X = map("%xmm$_",(0..3)); - my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); - -$code.=<<___; - vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 - vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu 0x00($inp),@X[0] - vmovdqu 0x10($inp),@X[1] - vmovdqu 0x20($inp),@X[2] - vmovdqu 0x30($inp),@X[3] - vpshufb $t3,@X[0],@X[0] - lea $TABLE(%rip),$Tbl - vpshufb $t3,@X[1],@X[1] - vpshufb $t3,@X[2],@X[2] - vpaddd 0x00($Tbl),@X[0],$t0 - vpshufb $t3,@X[3],@X[3] - vpaddd 0x20($Tbl),@X[1],$t1 - vpaddd 0x40($Tbl),@X[2],$t2 - vpaddd 0x60($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - mov $A,$a1 - vmovdqa $t1,0x10(%rsp) - mov $B,$a3 - vmovdqa $t2,0x20(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x30(%rsp) - mov $E,$a0 - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - sub \$-16*2*$SZ,$Tbl # size optimization -___ -sub Xupdate_256_AVX () { - ( - '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] - '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] - '&vpsrld ($t2,$t0,$sigma0[0]);', - '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] - '&vpsrld ($t3,$t0,$sigma0[2])', - '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', - '&vpxor ($t0,$t3,$t2)', - '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] - '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', - '&vpxor ($t0,$t0,$t1)', - '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', - '&vpxor ($t0,$t0,$t2)', - '&vpsrld ($t2,$t3,$sigma1[2]);', - '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) - '&vpsrlq ($t3,$t3,$sigma1[0]);', - '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) - '&vpxor 
($t2,$t2,$t3);', - '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', - '&vpxor ($t2,$t2,$t3)', - '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) - '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) - '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] - '&vpsrld ($t2,$t3,$sigma1[2])', - '&vpsrlq ($t3,$t3,$sigma1[0])', - '&vpxor ($t2,$t2,$t3);', - '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', - '&vpxor ($t2,$t2,$t3)', - '&vpshufb ($t2,$t2,$t5)', - '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) - ); -} - -sub AVX_256_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 104 instructions - - foreach (Xupdate_256_AVX()) { # 29 instructions - eval; - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - } - &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &AVX_256_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); - &jne (".Lavx_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } - - } else { # SHA512 - my @X = map("%xmm$_",(0..7)); - my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); - -$code.=<<___; - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu 0x00($inp),@X[0] - lea $TABLE+0x80(%rip),$Tbl # size optimization - vmovdqu 0x10($inp),@X[1] - vmovdqu 0x20($inp),@X[2] - vpshufb $t3,@X[0],@X[0] - vmovdqu 0x30($inp),@X[3] - vpshufb $t3,@X[1],@X[1] - vmovdqu 0x40($inp),@X[4] - vpshufb $t3,@X[2],@X[2] - vmovdqu 0x50($inp),@X[5] - vpshufb $t3,@X[3],@X[3] - vmovdqu 0x60($inp),@X[6] - vpshufb $t3,@X[4],@X[4] - vmovdqu 0x70($inp),@X[7] - vpshufb $t3,@X[5],@X[5] - vpaddq -0x80($Tbl),@X[0],$t0 - vpshufb $t3,@X[6],@X[6] - vpaddq -0x60($Tbl),@X[1],$t1 - vpshufb $t3,@X[7],@X[7] - vpaddq -0x40($Tbl),@X[2],$t2 - vpaddq -0x20($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - vpaddq 0x00($Tbl),@X[4],$t0 - vmovdqa $t1,0x10(%rsp) - vpaddq 0x20($Tbl),@X[5],$t1 - vmovdqa $t2,0x20(%rsp) - vpaddq 0x40($Tbl),@X[6],$t2 - vmovdqa $t3,0x30(%rsp) - vpaddq 0x60($Tbl),@X[7],$t3 - vmovdqa $t0,0x40(%rsp) - mov $A,$a1 - vmovdqa $t1,0x50(%rsp) - mov $B,$a3 - vmovdqa $t2,0x60(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x70(%rsp) - mov $E,$a0 - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - add \$16*2*$SZ,$Tbl -___ -sub Xupdate_512_AVX () { - ( - '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] - '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] - '&vpsrlq ($t2,$t0,$sigma0[0])', - '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] - '&vpsrlq ($t3,$t0,$sigma0[2])', - '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', - '&vpxor ($t0,$t3,$t2)', - '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', - '&vpxor ($t0,$t0,$t1)', - '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', - '&vpxor ($t0,$t0,$t2)', - '&vpsrlq ($t3,@X[7],$sigma1[2]);', - '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) - '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', - '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) - '&vpsrlq ($t1,@X[7],$sigma1[0]);', - '&vpxor ($t3,$t3,$t2)', - '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', - '&vpxor ($t3,$t3,$t1)', - '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', - '&vpxor ($t3,$t3,$t2)', - '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) - '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) - ); -} - -sub AVX_512_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body); # 52 instructions - - foreach (Xupdate_512_AVX()) { # 23 instructions - eval; - 
eval(shift(@insns)); - eval(shift(@insns)); - } - &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<8; $j++) { - &AVX_512_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); - &jne (".Lavx_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } -} -$code.=<<___; - mov $_ctx,$ctx - mov $a1,$A - - add $SZ*0($ctx),$A - lea 16*$SZ($inp),$inp - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop_avx - - mov $_rsp,%rsi - vzeroupper -___ -$code.=<<___ if ($win64); - movaps 16*$SZ+32(%rsp),%xmm6 - movaps 16*$SZ+48(%rsp),%xmm7 - movaps 16*$SZ+64(%rsp),%xmm8 - movaps 16*$SZ+80(%rsp),%xmm9 -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps 16*$SZ+96(%rsp),%xmm10 - movaps 16*$SZ+112(%rsp),%xmm11 -___ -$code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp -.Lepilogue_avx: - ret -.size ${func}_avx,.-${func}_avx -___ - -if ($avx>1) {{ -###################################################################### -# AVX2+BMI code path -# -my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp -my $PUSH8=8*2*$SZ; -use integer; - -sub bodyx_00_15 () { - # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. - - '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] - '&and ($a4,$e)', # f&e - '&rorx ($a0,$e,$Sigma1[2])', - '&rorx ($a2,$e,$Sigma1[1])', - - '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past - '&lea ($h,"($h,$a4)")', - '&andn ($a4,$e,$g)', # ~e&g - '&xor ($a0,$a2)', - - '&rorx ($a1,$e,$Sigma1[0])', - '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) - '&xor ($a0,$a1)', # Sigma1(e) - '&mov ($a2,$a)', - - '&rorx ($a4,$a,$Sigma0[2])', - '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) - '&xor ($a2,$b)', # a^b, b^c in next round - '&rorx ($a1,$a,$Sigma0[1])', - - '&rorx ($a0,$a,$Sigma0[0])', - '&lea ($d,"($d,$h)")', # d+=h - '&and ($a3,$a2)', # (b^c)&(a^b) - '&xor ($a1,$a4)', - - '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) - '&xor ($a1,$a0)', # Sigma0(a) - '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) - '&mov ($a4,$e)', # copy of f in future - - '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' - ); - # and at the finish one has to $a+=$a1 -} - -$code.=<<___; -.type ${func}_avx2,\@function,3 -.align 64 -${func}_avx2: -.Lavx2_shortcut: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - mov %rsp,%r11 # copy %rsp - sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp - shl \$4,%rdx # num*16 - and \$-256*$SZ,%rsp # align stack frame - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - add \$`2*$SZ*($rounds-8)`,%rsp - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arh - mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp -___ -$code.=<<___ if ($win64); - movaps %xmm6,16*$SZ+32(%rsp) - movaps %xmm7,16*$SZ+48(%rsp) - movaps %xmm8,16*$SZ+64(%rsp) - movaps %xmm9,16*$SZ+80(%rsp) -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps %xmm10,16*$SZ+96(%rsp) - movaps %xmm11,16*$SZ+112(%rsp) -___ -$code.=<<___; -.Lprologue_avx2: - - vzeroupper - sub \$-16*$SZ,$inp # inp++, size optimization - mov $SZ*0($ctx),$A - mov $inp,%r12 # borrow $T1 - mov $SZ*1($ctx),$B - cmp %rdx,$inp # $_end - mov $SZ*2($ctx),$C - cmove %rsp,%r12 # next block or random data - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H -___ - if ($SZ==4) { # SHA256 - my @X = map("%ymm$_",(0..3)); - my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); - -$code.=<<___; - vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 - vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu -16*$SZ+0($inp),%xmm0 - vmovdqu -16*$SZ+16($inp),%xmm1 - vmovdqu -16*$SZ+32($inp),%xmm2 - vmovdqu -16*$SZ+48($inp),%xmm3 - #mov $inp,$_inp # offload $inp - vinserti128 \$1,(%r12),@X[0],@X[0] - vinserti128 \$1,16(%r12),@X[1],@X[1] - vpshufb $t3,@X[0],@X[0] - vinserti128 \$1,32(%r12),@X[2],@X[2] - vpshufb $t3,@X[1],@X[1] - vinserti128 \$1,48(%r12),@X[3],@X[3] - - lea $TABLE(%rip),$Tbl - vpshufb $t3,@X[2],@X[2] - vpaddd 0x00($Tbl),@X[0],$t0 - vpshufb $t3,@X[3],@X[3] - vpaddd 0x20($Tbl),@X[1],$t1 - vpaddd 0x40($Tbl),@X[2],$t2 - vpaddd 0x60($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - xor $a1,$a1 - vmovdqa $t1,0x20(%rsp) - lea -$PUSH8(%rsp),%rsp - mov $B,$a3 - vmovdqa $t2,0x00(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x20(%rsp) - mov $F,$a4 - sub \$-16*2*$SZ,$Tbl # size optimization - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: -___ - -sub AVX2_256_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 96 instructions -my $base = "+2*$PUSH8(%rsp)"; - - &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); - foreach (Xupdate_256_AVX()) { # 29 instructions - eval; - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - } - &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &AVX2_256_00_47($j,\&bodyx_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &lea ($Tbl,16*2*$SZ."($Tbl)"); - &cmpb (($SZ-1)."($Tbl)",0); - &jne (".Lavx2_00_47"); - - for ($i=0; $i<16; ) { - my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; - foreach(bodyx_00_15()) { eval; } - } - } else { # SHA512 - my @X = map("%ymm$_",(0..7)); - my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); - -$code.=<<___; - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqu -16*$SZ($inp),%xmm0 - vmovdqu -16*$SZ+16($inp),%xmm1 - vmovdqu 
-16*$SZ+32($inp),%xmm2 - lea $TABLE+0x80(%rip),$Tbl # size optimization - vmovdqu -16*$SZ+48($inp),%xmm3 - vmovdqu -16*$SZ+64($inp),%xmm4 - vmovdqu -16*$SZ+80($inp),%xmm5 - vmovdqu -16*$SZ+96($inp),%xmm6 - vmovdqu -16*$SZ+112($inp),%xmm7 - #mov $inp,$_inp # offload $inp - vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 - vinserti128 \$1,(%r12),@X[0],@X[0] - vinserti128 \$1,16(%r12),@X[1],@X[1] - vpshufb $t2,@X[0],@X[0] - vinserti128 \$1,32(%r12),@X[2],@X[2] - vpshufb $t2,@X[1],@X[1] - vinserti128 \$1,48(%r12),@X[3],@X[3] - vpshufb $t2,@X[2],@X[2] - vinserti128 \$1,64(%r12),@X[4],@X[4] - vpshufb $t2,@X[3],@X[3] - vinserti128 \$1,80(%r12),@X[5],@X[5] - vpshufb $t2,@X[4],@X[4] - vinserti128 \$1,96(%r12),@X[6],@X[6] - vpshufb $t2,@X[5],@X[5] - vinserti128 \$1,112(%r12),@X[7],@X[7] - - vpaddq -0x80($Tbl),@X[0],$t0 - vpshufb $t2,@X[6],@X[6] - vpaddq -0x60($Tbl),@X[1],$t1 - vpshufb $t2,@X[7],@X[7] - vpaddq -0x40($Tbl),@X[2],$t2 - vpaddq -0x20($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - vpaddq 0x00($Tbl),@X[4],$t0 - vmovdqa $t1,0x20(%rsp) - vpaddq 0x20($Tbl),@X[5],$t1 - vmovdqa $t2,0x40(%rsp) - vpaddq 0x40($Tbl),@X[6],$t2 - vmovdqa $t3,0x60(%rsp) - lea -$PUSH8(%rsp),%rsp - vpaddq 0x60($Tbl),@X[7],$t3 - vmovdqa $t0,0x00(%rsp) - xor $a1,$a1 - vmovdqa $t1,0x20(%rsp) - mov $B,$a3 - vmovdqa $t2,0x40(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x60(%rsp) - mov $F,$a4 - add \$16*2*$SZ,$Tbl - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: -___ - -sub AVX2_512_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body); # 48 instructions -my $base = "+2*$PUSH8(%rsp)"; - - &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); - foreach (Xupdate_512_AVX()) { # 23 instructions - eval; - if ($_ !~ /\;$/) { - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - } - } - &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<8; $j++) { - &AVX2_512_00_47($j,\&bodyx_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &lea ($Tbl,16*2*$SZ."($Tbl)"); - &cmpb (($SZ-1-0x80)."($Tbl)",0); - &jne (".Lavx2_00_47"); - - for ($i=0; $i<16; ) { - my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; - foreach(bodyx_00_15()) { eval; } - } -} -$code.=<<___; - mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx - add $a1,$A - #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp - lea `2*$SZ*($rounds-8)`(%rsp),$Tbl - - add $SZ*0($ctx),$A - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - - cmp `$PUSH8+2*8`($Tbl),$inp # $_end - je .Ldone_avx2 - - xor $a1,$a1 - mov $B,$a3 - xor $C,$a3 # magic - mov $F,$a4 - jmp .Lower_avx2 -.align 16 -.Lower_avx2: -___ - for ($i=0; $i<8; ) { - my $base="+16($Tbl)"; - foreach(bodyx_00_15()) { eval; } - } -$code.=<<___; - lea -$PUSH8($Tbl),$Tbl - cmp %rsp,$Tbl - jae .Lower_avx2 - - mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx - add $a1,$A - #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp - lea `2*$SZ*($rounds-8)`(%rsp),%rsp - - add $SZ*0($ctx),$A - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - lea `2*16*$SZ`($inp),$inp # inp+=2 - add $SZ*6($ctx),$G - mov $inp,%r12 - add $SZ*7($ctx),$H - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - cmove %rsp,%r12 # next block or stale data - mov $B,$SZ*1($ctx) - mov 
$C,$SZ*2($ctx)
-	mov	$D,$SZ*3($ctx)
-	mov	$E,$SZ*4($ctx)
-	mov	$F,$SZ*5($ctx)
-	mov	$G,$SZ*6($ctx)
-	mov	$H,$SZ*7($ctx)
-
-	jbe	.Loop_avx2
-	lea	(%rsp),$Tbl
-
-.Ldone_avx2:
-	lea	($Tbl),%rsp
-	mov	$_rsp,%rsi
-	vzeroupper
-___
-$code.=<<___ if ($win64);
-	movaps	16*$SZ+32(%rsp),%xmm6
-	movaps	16*$SZ+48(%rsp),%xmm7
-	movaps	16*$SZ+64(%rsp),%xmm8
-	movaps	16*$SZ+80(%rsp),%xmm9
-___
-$code.=<<___ if ($win64 && $SZ>4);
-	movaps	16*$SZ+96(%rsp),%xmm10
-	movaps	16*$SZ+112(%rsp),%xmm11
-___
-$code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
-.Lepilogue_avx2:
-	ret
-.size	${func}_avx2,.-${func}_avx2
-___
-}}
-}}}}}
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern	__imp_RtlVirtualUnwind
-.type	se_handler,\@abi-omnipotent
-.align	16
-se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lin_prologue
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lin_prologue
-___
-$code.=<<___ if ($avx>1);
-	lea	.Lavx2_shortcut(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
-	jb	.Lnot_in_avx2
-
-	and	\$-256*$SZ,%rax
-	add	\$`2*$SZ*($rounds-8)`,%rax
-.Lnot_in_avx2:
-___
-$code.=<<___;
-	mov	%rax,%rsi		# put aside Rsp
-	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
-	lea	48(%rax),%rax
-
-	mov	-8(%rax),%rbx
-	mov	-16(%rax),%rbp
-	mov	-24(%rax),%r12
-	mov	-32(%rax),%r13
-	mov	-40(%rax),%r14
-	mov	-48(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-	lea	.Lepilogue(%rip),%r10
-	cmp	%r10,%rbx
-	jb	.Lin_prologue		# non-AVX code
-
-	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
-	lea	512($context),%rdi	# &context.Xmm6
-	mov	\$`$SZ==4?8:12`,%ecx
-	.long	0xa548f3fc		# cld; rep movsq
-
-.Lin_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	mov	40($disp),%rdi		# disp->ContextRecord
-	mov	$context,%rsi		# context
-	mov	\$154,%ecx		# sizeof(CONTEXT)
-	.long	0xa548f3fc		# cld; rep movsq
-
-	mov	$disp,%rsi
-	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
-	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
-	mov	0(%rsi),%r8		# arg3, disp->ControlPc
-	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
-	mov	40(%rsi),%r10		# disp->ContextRecord
-	lea	56(%rsi),%r11		# &disp->HandlerData
-	lea	24(%rsi),%r12		# &disp->EstablisherFrame
-	mov	%r10,32(%rsp)		# arg5
-	mov	%r11,40(%rsp)		# arg6
-	mov	%r12,48(%rsp)		# arg7
-	mov	%rcx,56(%rsp)		# arg8, (NULL)
-	call	*__imp_RtlVirtualUnwind(%rip)
-
-	mov	\$1,%eax		# ExceptionContinueSearch
-	add	\$64,%rsp
-	popfq
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
- pop %rbx - pop %rdi - pop %rsi - ret -.size se_handler,.-se_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_$func - .rva .LSEH_end_$func - .rva .LSEH_info_$func -___ -$code.=<<___ if ($SZ==4); - .rva .LSEH_begin_${func}_ssse3 - .rva .LSEH_end_${func}_ssse3 - .rva .LSEH_info_${func}_ssse3 -___ -$code.=<<___ if ($avx && $SZ==8); - .rva .LSEH_begin_${func}_xop - .rva .LSEH_end_${func}_xop - .rva .LSEH_info_${func}_xop -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_${func}_avx - .rva .LSEH_end_${func}_avx - .rva .LSEH_info_${func}_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_${func}_avx2 - .rva .LSEH_end_${func}_avx2 - .rva .LSEH_info_${func}_avx2 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_$func: - .byte 9,0,0,0 - .rva se_handler - .rva .Lprologue,.Lepilogue # HandlerData[] -___ -$code.=<<___ if ($SZ==4); -.LSEH_info_${func}_ssse3: - .byte 9,0,0,0 - .rva se_handler - .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] -___ -$code.=<<___ if ($avx && $SZ==8); -.LSEH_info_${func}_xop: - .byte 9,0,0,0 - .rva se_handler - .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] -___ -$code.=<<___ if ($avx); -.LSEH_info_${func}_avx: - .byte 9,0,0,0 - .rva se_handler - .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] -___ -$code.=<<___ if ($avx>1); -.LSEH_info_${func}_avx2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] -___ -} - -$code =~ s/\`([^\`]*)\`/eval $1/gem; -print $code; -close STDOUT; +../openssl/./crypto/sha/asm/sha512-x86_64.pl
\ No newline at end of file diff --git a/devel/perlasm/x86_64-xlate.pl b/devel/perlasm/x86_64-xlate.pl index bd165b152b..7e63ce7074 100755..120000 --- a/devel/perlasm/x86_64-xlate.pl +++ b/devel/perlasm/x86_64-xlate.pl @@ -1,1126 +1 @@ -#!/usr/bin/env perl - -# Ascetic x86_64 AT&T to MASM/NASM assembler translator by <appro>. -# -# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T -# format is way easier to parse. Because it's simpler to "gear" from -# Unix ABI to Windows one [see cross-reference "card" at the end of -# file]. Because Linux targets were available first... -# -# In addition the script also "distills" code suitable for GNU -# assembler, so that it can be compiled with more rigid assemblers, -# such as Solaris /usr/ccs/bin/as. -# -# This translator is not designed to convert *arbitrary* assembler -# code from AT&T format to MASM one. It's designed to convert just -# enough to provide for dual-ABI OpenSSL modules development... -# There *are* limitations and you might have to modify your assembler -# code or this script to achieve the desired result... -# -# Currently recognized limitations: -# -# - can't use multiple ops per line; -# -# Dual-ABI styling rules. -# -# 1. Adhere to Unix register and stack layout [see cross-reference -# ABI "card" at the end for explanation]. -# 2. Forget about "red zone," stick to more traditional blended -# stack frame allocation. If volatile storage is actually required -# that is. If not, just leave the stack as is. -# 3. Functions tagged with ".type name,@function" get crafted with -# unified Win64 prologue and epilogue automatically. If you want -# to take care of ABI differences yourself, tag functions as -# ".type name,@abi-omnipotent" instead. -# 4. To optimize the Win64 prologue you can specify number of input -# arguments as ".type name,@function,N." Keep in mind that if N is -# larger than 6, then you *have to* write "abi-omnipotent" code, -# because >6 cases can't be addressed with unified prologue. -# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: -# (sorry about latter). -# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is -# required to identify the spots, where to inject Win64 epilogue! -# But on the pros, it's then prefixed with rep automatically:-) -# 7. Stick to explicit ip-relative addressing. If you have to use -# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. -# Both are recognized and translated to proper Win64 addressing -# modes. To support legacy code a synthetic directive, .picmeup, -# is implemented. It puts address of the *next* instruction into -# target register, e.g.: -# -# .picmeup %rax -# lea .Label-.(%rax),%rax -# -# 8. In order to provide for structured exception handling unified -# Win64 prologue copies %rsp value to %rax. For further details -# see SEH paragraph at the end. -# 9. .init segment is allowed to contain calls to functions only. -# a. If function accepts more than 4 arguments *and* >4th argument -# is declared as non 64-bit value, do clear its upper part. - -my $flavour = shift; -my $output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -open STDOUT,">$output" || die "can't open $output: $!" 
-	if (defined($output));
-
-my $gas=1;	$gas=0 if ($output =~ /\.asm$/);
-my $elf=1;	$elf=0 if (!$gas);
-my $win64=0;
-my $prefix="";
-my $decor=".L";
-
-my $masmref=8 + 50727*2**-32;	# 8.00.50727 shipped with VS2005
-my $masm=0;
-my $PTR=" PTR";
-
-my $nasmref=2.03;
-my $nasm=0;
-
-if    ($flavour eq "mingw64")	{ $gas=1; $elf=0; $win64=1;
-				  $prefix=`echo __USER_LABEL_PREFIX__ | $ENV{CC} -E -P -`;
-				  chomp($prefix);
-				}
-elsif ($flavour eq "macosx")	{ $gas=1; $elf=0; $prefix="_"; $decor="L\$"; }
-elsif ($flavour eq "masm")	{ $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; }
-elsif ($flavour eq "nasm")	{ $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; }
-elsif (!$gas)
-{   if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i)
-    {	$nasm = $1 + $2*0.01; $PTR="";  }
-    elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/)
-    {	$masm = $1 + $2*2**-16 + $4*2**-32;   }
-    die "no assembler found on %PATH" if (!($nasm || $masm));
-    $win64=1;
-    $elf=0;
-    $decor="\$L\$";
-}
-
-my $current_segment;
-my $current_function;
-my %globals;
-
-{ package opcode;	# pick up opcodes
-    sub re {
-	my	$self = shift;	# single instance is enough...
-	local	*line = shift;
-	undef	$ret;
-
-	if ($line =~ /^([a-z][a-z0-9]*)/i) {
-	    $self->{op} = $1;
-	    $ret = $self;
-	    $line = substr($line,@+[0]); $line =~ s/^\s+//;
-
-	    undef $self->{sz};
-	    if ($self->{op} =~ /^(movz)x?([bw]).*/) {	# movz is pain...
-		$self->{op} = $1;
-		$self->{sz} = $2;
-	    } elsif ($self->{op} =~ /call|jmp/) {
-		$self->{sz} = "";
-	    } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
-		$self->{sz} = "";
-	    } elsif ($self->{op} =~ /^v/) { # VEX
-		$self->{sz} = "";
-	    } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) {
-		$self->{sz} = "";
-	    } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
-		$self->{op} = $1;
-		$self->{sz} = $2;
-	    }
-	}
-	$ret;
-    }
-    sub size {
-	my $self = shift;
-	my $sz   = shift;
-	$self->{sz} = $sz if (defined($sz) && !defined($self->{sz}));
-	$self->{sz};
-    }
-    sub out {
-	my $self = shift;
-	if ($gas) {
-	    if ($self->{op} eq "movz") {	# movz is pain...
-		sprintf "%s%s%s",$self->{op},$self->{sz},shift;
-	    } elsif ($self->{op} =~ /^set/) {
-		"$self->{op}";
-	    } elsif ($self->{op} eq "ret") {
-		my $epilogue = "";
-		if ($win64 && $current_function->{abi} eq "svr4") {
-		    $epilogue = "movq	8(%rsp),%rdi\n\t" .
-				"movq	16(%rsp),%rsi\n\t";
-		}
-		$epilogue . ".byte	0xf3,0xc3";
-	    } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") {
-		".p2align\t3\n\t.quad";
-	    } else {
-		"$self->{op}$self->{sz}";
-	    }
-	} else {
-	    $self->{op} =~ s/^movz/movzx/;
-	    if ($self->{op} eq "ret") {
-		$self->{op} = "";
-		if ($win64 && $current_function->{abi} eq "svr4") {
-		    $self->{op} = "mov	rdi,QWORD${PTR}[8+rsp]\t;WIN64 epilogue\n\t".
-				  "mov	rsi,QWORD${PTR}[16+rsp]\n\t";
-		}
-		$self->{op} .= "DB\t0F3h,0C3h\t\t;repret";
-	    } elsif ($self->{op} =~ /^(pop|push)f/) {
-		$self->{op} .= $self->{sz};
-	    } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
-		$self->{op} = "\tDQ";
-	    }
-	    $self->{op};
-	}
-    }
-    sub mnemonic {
-	my $self=shift;
-	my $op=shift;
-	$self->{op}=$op if (defined($op));
-	$self->{op};
-    }
-}
-{ package const;	# pick up constants, which start with $
-    sub re {
-	my	$self = shift;	# single instance is enough...
-	local	*line = shift;
-	undef	$ret;
-
-	if ($line =~ /^\$([^,]+)/) {
-	    $self->{value} = $1;
-	    $ret = $self;
-	    $line = substr($line,@+[0]); $line =~ s/^\s+//;
-	}
-	$ret;
-    }
-    sub out {
-	my $self = shift;
-
-	if ($gas) {
-	    # Solaris /usr/ccs/bin/as can't handle multiplications
-	    # in $self->{value}
-	    $self->{value} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
-	    $self->{value} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
-	    sprintf "\$%s",$self->{value};
-	} else {
-	    $self->{value} =~ s/(0b[0-1]+)/oct($1)/eig;
-	    $self->{value} =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm);
-	    sprintf "%s",$self->{value};
-	}
-    }
-}
-{ package ea;		# pick up effective addresses: expr(%reg,%reg,scale)
-    sub re {
-	my	$self = shift;	# single instance is enough...
-	local	*line = shift;
-	undef	$ret;
-
-	# optional * ---vvv--- appears in indirect jmp/call
-	if ($line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)/) {
-	    $self->{asterisk} = $1;
-	    $self->{label} = $2;
-	    ($self->{base},$self->{index},$self->{scale})=split(/,/,$3);
-	    $self->{scale} = 1 if (!defined($self->{scale}));
-	    $ret = $self;
-	    $line = substr($line,@+[0]); $line =~ s/^\s+//;
-
-	    if ($win64 && $self->{label} =~ s/\@GOTPCREL//) {
-		die if (opcode->mnemonic() ne "mov");
-		opcode->mnemonic("lea");
-	    }
-	    $self->{base}  =~ s/^%//;
-	    $self->{index} =~ s/^%// if (defined($self->{index}));
-	}
-	$ret;
-    }
-    sub size {}
-    sub out {
-	my $self = shift;
-	my $sz = shift;
-
-	$self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei;
-	$self->{label} =~ s/\.L/$decor/g;
-
-	# Silently convert all EAs to 64-bit. This is required for
-	# older GNU assembler and results in more compact code,
-	# *but* most importantly AES module depends on this feature!
-	$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
-	$self->{base}  =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
-
-	# Solaris /usr/ccs/bin/as can't handle multiplications
-	# in $self->{label}, new gas requires sign extension...
-	use integer;
-	$self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
-	$self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg;
-	$self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg;
-
-	if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
-	    $self->{base} =~ /(rbp|r13)/) {
-		$self->{base} = $self->{index}; $self->{index} = $1;
-	}
-
-	if ($gas) {
-	    $self->{label} =~ s/^___imp_/__imp__/   if ($flavour eq "mingw64");
-
-	    if (defined($self->{index})) {
-		sprintf "%s%s(%s,%%%s,%d)",$self->{asterisk},
-					$self->{label},
-					$self->{base}?"%$self->{base}":"",
-					$self->{index},$self->{scale};
-	    } else {
-		sprintf "%s%s(%%%s)",	$self->{asterisk},$self->{label},$self->{base};
-	    }
-	} else {
-	    %szmap = (	b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
-			q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR",
-			y=>"" );
-
-	    $self->{label} =~ s/\./\$/g;
-	    $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
-	    $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
-	    $sz="q" if ($self->{asterisk} || opcode->mnemonic() =~ /^v?movq$/);
-	    $sz="l" if (opcode->mnemonic() =~ /^v?movd$/);
-
-	    if (defined($self->{index})) {
-		sprintf "%s[%s%s*%d%s]",$szmap{$sz},
-					$self->{label}?"$self->{label}+":"",
-					$self->{index},$self->{scale},
-					$self->{base}?"+$self->{base}":"";
-	    } elsif ($self->{base} eq "rip") {
-		sprintf "%s[%s]",$szmap{$sz},$self->{label};
-	    } else {
-		sprintf "%s[%s%s]",$szmap{$sz},
-					$self->{label}?"$self->{label}+":"",
-					$self->{base};
-	    }
-	}
-    }
-}
-{ package register;	# pick up registers, which start with %.
-    sub re {
-	my	$class = shift;	# multiple instances...
-	my	$self = {};
-	local	*line = shift;
-	undef	$ret;
-
-	# optional * ---vvv--- appears in indirect jmp/call
-	if ($line =~ /^(\*?)%(\w+)/) {
-	    bless $self,$class;
-	    $self->{asterisk} = $1;
-	    $self->{value} = $2;
-	    $ret = $self;
-	    $line = substr($line,@+[0]); $line =~ s/^\s+//;
-	}
-	$ret;
-    }
-    sub size {
-	my	$self = shift;
-	undef	$ret;
-
-	if    ($self->{value} =~ /^r[\d]+b$/i)	{ $ret="b"; }
-	elsif ($self->{value} =~ /^r[\d]+w$/i)	{ $ret="w"; }
-	elsif ($self->{value} =~ /^r[\d]+d$/i)	{ $ret="l"; }
-	elsif ($self->{value} =~ /^r[\w]+$/i)	{ $ret="q"; }
-	elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; }
-	elsif ($self->{value} =~ /^[\w]{2}l$/i)	{ $ret="b"; }
-	elsif ($self->{value} =~ /^[\w]{2}$/i)	{ $ret="w"; }
-	elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; }
-
-	$ret;
-    }
-    sub out {
-	my $self = shift;
-	if ($gas)	{ sprintf "%s%%%s",$self->{asterisk},$self->{value}; }
-	else		{ $self->{value}; }
-    }
-}
-{ package label;	# pick up labels, which end with :
-    sub re {
-	my	$self = shift;	# single instance is enough...
-	local	*line = shift;
-	undef	$ret;
-
-	if ($line =~ /(^[\.\w]+)\:/) {
-	    $self->{value} = $1;
-	    $ret = $self;
-	    $line = substr($line,@+[0]); $line =~ s/^\s+//;
-
-	    $self->{value} =~ s/^\.L/$decor/;
-	}
-	$ret;
-    }
-    sub out {
-	my $self = shift;
-
-	if ($gas) {
-	    my $func = ($globals{$self->{value}} or $self->{value}) . ":";
-	    if ($win64	&&
-			$current_function->{name} eq $self->{value} &&
-			$current_function->{abi} eq "svr4") {
-		$func .= "\n";
-		$func .= "	movq	%rdi,8(%rsp)\n";
-		$func .= "	movq	%rsi,16(%rsp)\n";
-		$func .= "	movq	%rsp,%rax\n";
-		$func .= "${decor}SEH_begin_$current_function->{name}:\n";
-		my $narg = $current_function->{narg};
-		$narg=6 if (!defined($narg));
-		$func .= "	movq	%rcx,%rdi\n" if ($narg>0);
-		$func .= "	movq	%rdx,%rsi\n" if ($narg>1);
-		$func .= "	movq	%r8,%rdx\n"  if ($narg>2);
-		$func .= "	movq	%r9,%rcx\n"  if ($narg>3);
-		$func .= "	movq	40(%rsp),%r8\n" if ($narg>4);
-		$func .= "	movq	48(%rsp),%r9\n" if ($narg>5);
-	    }
-	    $func;
-	} elsif ($self->{value} ne "$current_function->{name}") {
-	    $self->{value} .= ":" if ($masm && $ret!~m/^\$/);
-	    $self->{value} . ":";
-	} elsif ($win64 && $current_function->{abi} eq "svr4") {
-	    my $func =	"$current_function->{name}" .
-			($nasm ? ":" : "\tPROC $current_function->{scope}") .
-			"\n";
-	    $func .= "	mov	QWORD${PTR}[8+rsp],rdi\t;WIN64 prologue\n";
-	    $func .= "	mov	QWORD${PTR}[16+rsp],rsi\n";
-	    $func .= "	mov	rax,rsp\n";
-	    $func .= "${decor}SEH_begin_$current_function->{name}:";
-	    $func .= ":" if ($masm);
-	    $func .= "\n";
-	    my $narg = $current_function->{narg};
-	    $narg=6 if (!defined($narg));
-	    $func .= "	mov	rdi,rcx\n" if ($narg>0);
-	    $func .= "	mov	rsi,rdx\n" if ($narg>1);
-	    $func .= "	mov	rdx,r8\n"  if ($narg>2);
-	    $func .= "	mov	rcx,r9\n"  if ($narg>3);
-	    $func .= "	mov	r8,QWORD${PTR}[40+rsp]\n" if ($narg>4);
-	    $func .= "	mov	r9,QWORD${PTR}[48+rsp]\n" if ($narg>5);
-	    $func .= "\n";
-	} else {
-	   "$current_function->{name}".
-			($nasm ? ":" : "\tPROC $current_function->{scope}");
-	}
-    }
-}
-{ package expr;		# pick up expressions
-    sub re {
-	my	$self = shift;	# single instance is enough...
- local *line = shift; - undef $ret; - - if ($line =~ /(^[^,]+)/) { - $self->{value} = $1; - $ret = $self; - $line = substr($line,@+[0]); $line =~ s/^\s+//; - - $self->{value} =~ s/\@PLT// if (!$elf); - $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; - $self->{value} =~ s/\.L/$decor/g; - } - $ret; - } - sub out { - my $self = shift; - if ($nasm && opcode->mnemonic()=~m/^j(?![re]cxz)/) { - "NEAR ".$self->{value}; - } else { - $self->{value}; - } - } -} -{ package directive; # pick up directives, which start with . - sub re { - my $self = shift; # single instance is enough... - local *line = shift; - undef $ret; - my $dir; - my %opcode = # lea 2f-1f(%rip),%dst; 1: nop; 2: - ( "%rax"=>0x01058d48, "%rcx"=>0x010d8d48, - "%rdx"=>0x01158d48, "%rbx"=>0x011d8d48, - "%rsp"=>0x01258d48, "%rbp"=>0x012d8d48, - "%rsi"=>0x01358d48, "%rdi"=>0x013d8d48, - "%r8" =>0x01058d4c, "%r9" =>0x010d8d4c, - "%r10"=>0x01158d4c, "%r11"=>0x011d8d4c, - "%r12"=>0x01258d4c, "%r13"=>0x012d8d4c, - "%r14"=>0x01358d4c, "%r15"=>0x013d8d4c ); - - if ($line =~ /^\s*(\.\w+)/) { - $dir = $1; - $ret = $self; - undef $self->{value}; - $line = substr($line,@+[0]); $line =~ s/^\s+//; - - SWITCH: for ($dir) { - /\.picmeup/ && do { if ($line =~ /(%r[\w]+)/i) { - $dir="\t.long"; - $line=sprintf "0x%x,0x90000000",$opcode{$1}; - } - last; - }; - /\.global|\.globl|\.extern/ - && do { $globals{$line} = $prefix . $line; - $line = $globals{$line} if ($prefix); - last; - }; - /\.type/ && do { ($sym,$type,$narg) = split(',',$line); - if ($type eq "\@function") { - undef $current_function; - $current_function->{name} = $sym; - $current_function->{abi} = "svr4"; - $current_function->{narg} = $narg; - $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; - } elsif ($type eq "\@abi-omnipotent") { - undef $current_function; - $current_function->{name} = $sym; - $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; - } - $line =~ s/\@abi\-omnipotent/\@function/; - $line =~ s/\@function.*/\@function/; - last; - }; - /\.asciz/ && do { if ($line =~ /^"(.*)"$/) { - $dir = ".byte"; - $line = join(",",unpack("C*",$1),0); - } - last; - }; - /\.rva|\.long|\.quad/ - && do { $line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; - $line =~ s/\.L/$decor/g; - last; - }; - } - - if ($gas) { - $self->{value} = $dir . "\t" . $line; - - if ($dir =~ /\.extern/) { - $self->{value} = ""; # swallow extern - } elsif (!$elf && $dir =~ /\.type/) { - $self->{value} = ""; - $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . - (defined($globals{$1})?".scl 2;":".scl 3;") . - "\t.type 32;\t.endef" - if ($win64 && $line =~ /([^,]+),\@function/); - } elsif (!$elf && $dir =~ /\.size/) { - $self->{value} = ""; - if (defined($current_function)) { - $self->{value} .= "${decor}SEH_end_$current_function->{name}:" - if ($win64 && $current_function->{abi} eq "svr4"); - undef $current_function; - } - } elsif (!$elf && $dir =~ /\.align/) { - $self->{value} = ".p2align\t" . 
(log($line)/log(2)); - } elsif ($dir eq ".section") { - $current_segment=$line; - if (!$elf && $current_segment eq ".init") { - if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } - elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } - } - } elsif ($dir =~ /\.(text|data)/) { - $current_segment=".$1"; - } elsif ($dir =~ /\.hidden/) { - if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$line"; } - elsif ($flavour eq "mingw64") { $self->{value} = ""; } - } elsif ($dir =~ /\.comm/) { - $self->{value} = "$dir\t$prefix$line"; - $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); - } - $line = ""; - return $self; - } - - # non-gas case or nasm/masm - SWITCH: for ($dir) { - /\.text/ && do { my $v=undef; - if ($nasm) { - $v="section .text code align=64\n"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = ".text\$"; - $v.="$current_segment\tSEGMENT "; - $v.=$masm>=$masmref ? "ALIGN(64)" : "PAGE"; - $v.=" 'CODE'"; - } - $self->{value} = $v; - last; - }; - /\.data/ && do { my $v=undef; - if ($nasm) { - $v="section .data data align=8\n"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = "_DATA"; - $v.="$current_segment\tSEGMENT"; - } - $self->{value} = $v; - last; - }; - /\.section/ && do { my $v=undef; - $line =~ s/([^,]*).*/$1/; - $line = ".CRT\$XCU" if ($line eq ".init"); - if ($nasm) { - $v="section $line"; - if ($line=~/\.([px])data/) { - $v.=" rdata align="; - $v.=$1 eq "p"? 4 : 8; - } elsif ($line=~/\.CRT\$/i) { - $v.=" rdata align=8"; - } - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $v.="$line\tSEGMENT"; - if ($line=~/\.([px])data/) { - $v.=" READONLY"; - $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); - } elsif ($line=~/\.CRT\$/i) { - $v.=" READONLY "; - $v.=$masm>=$masmref ? 
"ALIGN(8)" : "DWORD"; - } - } - $current_segment = $line; - $self->{value} = $v; - last; - }; - /\.extern/ && do { $self->{value} = "EXTERN\t".$line; - $self->{value} .= ":NEAR" if ($masm); - last; - }; - /\.globl|.global/ - && do { $self->{value} = $masm?"PUBLIC":"global"; - $self->{value} .= "\t".$line; - last; - }; - /\.size/ && do { if (defined($current_function)) { - undef $self->{value}; - if ($current_function->{abi} eq "svr4") { - $self->{value}="${decor}SEH_end_$current_function->{name}:"; - $self->{value}.=":\n" if($masm); - } - $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); - undef $current_function; - } - last; - }; - /\.align/ && do { $self->{value} = "ALIGN\t".$line; last; }; - /\.(value|long|rva|quad)/ - && do { my $sz = substr($1,0,1); - my @arr = split(/,\s*/,$line); - my $last = pop(@arr); - my $conv = sub { my $var=shift; - $var=~s/^(0b[0-1]+)/oct($1)/eig; - $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); - if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) - { $var=~s/([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } - $var; - }; - - $sz =~ tr/bvlrq/BWDDQ/; - $self->{value} = "\tD$sz\t"; - for (@arr) { $self->{value} .= &$conv($_).","; } - $self->{value} .= &$conv($last); - last; - }; - /\.byte/ && do { my @str=split(/,\s*/,$line); - map(s/(0b[0-1]+)/oct($1)/eig,@str); - map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); - while ($#str>15) { - $self->{value}.="DB\t" - .join(",",@str[0..15])."\n"; - foreach (0..15) { shift @str; } - } - $self->{value}.="DB\t" - .join(",",@str) if (@str); - last; - }; - /\.comm/ && do { my @str=split(/,\s*/,$line); - my $v=undef; - if ($nasm) { - $v.="common $prefix@str[0] @str[1]"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = "_DATA"; - $v.="$current_segment\tSEGMENT\n"; - $v.="COMM @str[0]:DWORD:".@str[1]/4; - } - $self->{value} = $v; - last; - }; - } - $line = ""; - } - - $ret; - } - sub out { - my $self = shift; - $self->{value}; - } -} - -sub rex { - local *opcode=shift; - my ($dst,$src,$rex)=@_; - - $rex|=0x04 if($dst>=8); - $rex|=0x01 if($src>=8); - push @opcode,($rex|0x40) if ($rex); -} - -# older gas and ml64 don't handle SSE>2 instructions -my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, - "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); - -my $movq = sub { # elderly gas can't handle inter-register movq - my $arg = shift; - my @opcode=(0x66); - if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { - my ($src,$dst)=($1,$2); - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,$src,$dst,0x8); - push @opcode,0x0f,0x7e; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - @opcode; - } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { - my ($src,$dst)=($2,$1); - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,$src,$dst,0x8); - push @opcode,0x0f,0x6e; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - @opcode; - } else { - (); - } -}; - -my $pextrd = sub { - if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { - my @opcode=(0x66); - $imm=$1; - $src=$2; - $dst=$3; - if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } - elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } - rex(\@opcode,$src,$dst); - push @opcode,0x0f,0x3a,0x16; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - push @opcode,$imm; - @opcode; - } else { - (); - } -}; - -my $pinsrd = sub { - if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - $imm=$1; - $src=$2; - $dst=$3; - if ($src =~ /%r([0-9]+)/) { $src = $1; } 
- elsif ($src =~ /%e/) { $src = $regrm{$src}; } - rex(\@opcode,$dst,$src); - push @opcode,0x0f,0x3a,0x22; - push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M - push @opcode,$imm; - @opcode; - } else { - (); - } -}; - -my $pshufb = sub { - if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - rex(\@opcode,$2,$1); - push @opcode,0x0f,0x38,0x00; - push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M - @opcode; - } else { - (); - } -}; - -my $palignr = sub { - if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - rex(\@opcode,$3,$2); - push @opcode,0x0f,0x3a,0x0f; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - push @opcode,$1; - @opcode; - } else { - (); - } -}; - -my $pclmulqdq = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - rex(\@opcode,$3,$2); - push @opcode,0x0f,0x3a,0x44; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -my $rdrand = sub { - if (shift =~ /%[er](\w+)/) { - my @opcode=(); - my $dst=$1; - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,0,$1,8); - push @opcode,0x0f,0xc7,0xf0|($dst&7); - @opcode; - } else { - (); - } -}; - -sub rxb { - local *opcode=shift; - my ($dst,$src1,$src2,$rxb)=@_; - - $rxb|=0x7<<5; - $rxb&=~(0x04<<5) if($dst>=8); - $rxb&=~(0x01<<5) if($src1>=8); - $rxb&=~(0x02<<5) if($src2>=8); - push @opcode,$rxb; -} - -my $vprotd = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x8f); - rxb(\@opcode,$3,$2,-1,0x08); - push @opcode,0x78,0xc2; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -my $vprotq = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x8f); - rxb(\@opcode,$3,$2,-1,0x08); - push @opcode,0x78,0xc3; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -if ($nasm) { - print <<___; -default rel -%define XMMWORD -___ -} elsif ($masm) { - print <<___; -OPTION DOTNAME -___ -} -while($line=<>) { - - chomp($line); - - $line =~ s|[#!].*$||; # get rid of asm-style comments... - $line =~ s|/\*.*\*/||; # ... and C-style comments... - $line =~ s|^\s+||; # ... and skip white spaces in beginning - - undef $label; - undef $opcode; - undef @args; - - if ($label=label->re(\$line)) { print $label->out(); } - - if (directive->re(\$line)) { - printf "%s",directive->out(); - } elsif ($opcode=opcode->re(\$line)) { - my $asm = eval("\$".$opcode->mnemonic()); - undef @bytes; - - if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) { - print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; - next; - } - - ARGUMENT: while (1) { - my $arg; - - if ($arg=register->re(\$line)) { opcode->size($arg->size()); } - elsif ($arg=const->re(\$line)) { } - elsif ($arg=ea->re(\$line)) { } - elsif ($arg=expr->re(\$line)) { } - else { last ARGUMENT; } - - push @args,$arg; - - last ARGUMENT if ($line !~ /^,/); - - $line =~ s/^,\s*//; - } # ARGUMENT: - - if ($#args>=0) { - my $insn; - my $sz=opcode->size(); - - if ($gas) { - $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); - @args = map($_->out($sz),@args); - printf "\t%s\t%s",$insn,join(",",@args); - } else { - $insn = $opcode->out(); - foreach (@args) { - my $arg = $_->out(); - # $insn.=$sz compensates for movq, pinsrw, ... 
-		if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
-		if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; }
-		if ($arg =~ /^mm[0-9]+$/)  { $insn.=$sz; $sz="q" if(!$sz); last; }
-	    }
-	    @args = reverse(@args);
-	    undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
-	    printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
-	}
-    } else {
-	printf "\t%s",$opcode->out();
-    }
-
-    print $line,"\n";
-}
-
-print "\n$current_segment\tENDS\n"	if ($current_segment && $masm);
-print "END\n"				if ($masm);
-
-close STDOUT;
-
-#################################################
-# Cross-reference x86_64 ABI "card"
-#
-# 		Unix		Win64
-# %rax		*		*
-# %rbx		-		-
-# %rcx		#4		#1
-# %rdx		#3		#2
-# %rsi		#2		-
-# %rdi		#1		-
-# %rbp		-		-
-# %rsp		-		-
-# %r8		#5		#3
-# %r9		#6		#4
-# %r10		*		*
-# %r11		*		*
-# %r12		-		-
-# %r13		-		-
-# %r14		-		-
-# %r15		-		-
-#
-# (*)	volatile register
-# (-)	preserved by callee
-# (#)	Nth argument, volatile
-#
-# In Unix terms the top of the stack is the argument transfer area for
-# arguments which could not be accommodated in registers; in other
-# words, the 7th [integer] argument resides at 8(%rsp) upon function
-# entry. The 128 bytes above %rsp constitute a "red zone" which is not
-# touched by signal handlers and can therefore be used as temporary
-# storage without allocating a frame.
-#
-# In Win64 terms N*8 bytes on top of the stack is the argument transfer
-# area, which belongs to/can be overwritten by the callee. N is the
-# number of arguments passed to the callee, *but* not less than 4!
-# This means that upon function entry the 5th argument resides at
-# 40(%rsp), and that the 32 bytes from 8(%rsp) can always be used as
-# temporary storage [without allocating a frame]. One can actually
-# argue that a "red zone" above the stack pointer can be assumed under
-# Win64 as well: apparently on no occasion does the Windows kernel
-# alter the area above the user stack pointer in a truly asynchronous
-# manner...
-#
-# All the above means that if the assembler programmer adheres to the
-# Unix register and stack layout, but disregards the "red zone"
-# existence, it's possible to use the following prologue and epilogue
-# to "gear" from the Unix to the Win64 ABI in leaf functions with not
-# more than 6 arguments.
-#
-# omnipotent_function:
-# ifdef WIN64
-#	movq	%rdi,8(%rsp)
-#	movq	%rsi,16(%rsp)
-#	movq	%rcx,%rdi	; if 1st argument is actually present
-#	movq	%rdx,%rsi	; if 2nd argument is actually ...
-#	movq	%r8,%rdx	; if 3rd argument is ...
-#	movq	%r9,%rcx	; if 4th argument ...
-#	movq	40(%rsp),%r8	; if 5th ...
-#	movq	48(%rsp),%r9	; if 6th ...
-# endif
-#	...
-# ifdef WIN64
-#	movq	8(%rsp),%rdi
-#	movq	16(%rsp),%rsi
-# endif
-#	ret
-#
-#################################################
-# Win64 SEH, Structured Exception Handling.
-#
-# Unlike on Unix systems(*), the lack of Win64 stack unwinding
-# information has an undesired side effect at run time: if an exception
-# is raised in an assembler subroutine such as those in question
-# (basically we're referring to segmentation violations caused by
-# malformed input parameters), the application is briskly terminated
-# without invoking any exception handlers, most notably without
-# generating a memory dump or any user notification whatsoever. This
-# poses a problem. It's possible to address it by registering a custom
-# language-specific handler that would restore the processor context to
-# the state at the subroutine entry point and return an "exception is
-# not handled, keep unwinding" code. Writing such a handler can be a
-# challenge...
-# But it's doable, though it requires a certain coding convention.
-# Consider the following snippet:
-#
-# .type	function,@function
-# function:
-#	movq	%rsp,%rax	# copy rsp to volatile register
-#	pushq	%r15		# save non-volatile registers
-#	pushq	%rbx
-#	pushq	%rbp
-#	movq	%rsp,%r11
-#	subq	%rdi,%r11	# prepare [variable] stack frame
-#	andq	$-64,%r11
-#	movq	%rax,0(%r11)	# check for exceptions
-#	movq	%r11,%rsp	# allocate [variable] stack frame
-#	movq	%rax,0(%rsp)	# save original rsp value
-# magic_point:
-#	...
-#	movq	0(%rsp),%rcx	# pull original rsp value
-#	movq	-24(%rcx),%rbp	# restore non-volatile registers
-#	movq	-16(%rcx),%rbx
-#	movq	-8(%rcx),%r15
-#	movq	%rcx,%rsp	# restore original rsp
-#	ret
-# .size function,.-function
-#
-# The key is that up to magic_point a copy of the original rsp value
-# remains in the chosen volatile register and no non-volatile register,
-# except for rsp, is modified, while past magic_point rsp remains
-# constant till the very end of the function. In this case the custom
-# language-specific exception handler would look like this:
-#
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
-# {	ULONG64 *rsp = (ULONG64 *)context->Rax;
-#	if (context->Rip >= magic_point)
-#	{   rsp = ((ULONG64 **)context->Rsp)[0];
-#	    context->Rbp = rsp[-3];
-#	    context->Rbx = rsp[-2];
-#	    context->R15 = rsp[-1];
-#	}
-#	context->Rsp = (ULONG64)rsp;
-#	context->Rdi = rsp[1];
-#	context->Rsi = rsp[2];
-#
-#	memcpy (disp->ContextRecord,context,sizeof(CONTEXT));
-#	RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase,
-#		disp->ControlPc,disp->FunctionEntry,disp->ContextRecord,
-#		&disp->HandlerData,&disp->EstablisherFrame,NULL);
-#	return ExceptionContinueSearch;
-# }
-#
-# It's appropriate to implement this handler in assembler, directly in
-# the function's module. In order to do that one has to know the
-# members' offsets in the CONTEXT and DISPATCHER_CONTEXT structures and
-# some constant values. Here they are:
-#
-#	CONTEXT.Rax				120
-#	CONTEXT.Rcx				128
-#	CONTEXT.Rdx				136
-#	CONTEXT.Rbx				144
-#	CONTEXT.Rsp				152
-#	CONTEXT.Rbp				160
-#	CONTEXT.Rsi				168
-#	CONTEXT.Rdi				176
-#	CONTEXT.R8				184
-#	CONTEXT.R9				192
-#	CONTEXT.R10				200
-#	CONTEXT.R11				208
-#	CONTEXT.R12				216
-#	CONTEXT.R13				224
-#	CONTEXT.R14				232
-#	CONTEXT.R15				240
-#	CONTEXT.Rip				248
-#	CONTEXT.Xmm6				512
-#	sizeof(CONTEXT)				1232
-#	DISPATCHER_CONTEXT.ControlPc		0
-#	DISPATCHER_CONTEXT.ImageBase		8
-#	DISPATCHER_CONTEXT.FunctionEntry	16
-#	DISPATCHER_CONTEXT.EstablisherFrame	24
-#	DISPATCHER_CONTEXT.TargetIp		32
-#	DISPATCHER_CONTEXT.ContextRecord	40
-#	DISPATCHER_CONTEXT.LanguageHandler	48
-#	DISPATCHER_CONTEXT.HandlerData		56
-#	UNW_FLAG_NHANDLER			0
-#	ExceptionContinueSearch			1
-#
-# In order to tie the handler to the function one has to compose a
-# couple of structures: one for the .xdata segment and one for .pdata.
-#
-# The UNWIND_INFO structure for the .xdata segment would be
-#
-# function_unwind_info:
-#	.byte	9,0,0,0
-#	.rva	handler
-#
-# This structure designates an exception handler for a function with a
-# zero-length prologue and no stack frame or frame register.
-#
-# To facilitate composing of .pdata structures, the auto-generated
-# "gear" prologue copies the rsp value to rax and denotes the next
-# instruction with a .LSEH_begin_{function_name} label. This
-# essentially defines the SEH styling rule mentioned in the beginning.
-# The position of this label is chosen in such a manner that possible
-# exceptions raised in the "gear" prologue would be accounted to the
-# caller and unwound from the latter's frame.
-# The end of the function is marked with a respective
-# .LSEH_end_{function_name} label. To summarize, the .pdata segment
-# would contain
-#
-#	.rva	.LSEH_begin_function
-#	.rva	.LSEH_end_function
-#	.rva	function_unwind_info
-#
-# The reference to function_unwind_info from the .xdata segment is the
-# anchor. In case you wonder why the references are 32-bit .rvas and
-# not 64-bit .quads: references put into these two segments are
-# required to be *relative* to the base address of the current binary
-# module, a.k.a. the image base. No Win64 module, be it .exe or .dll,
-# can be larger than 2GB, and thus such relative references can be and
-# are accommodated in 32 bits.
-#
-# Having reviewed the example function code, one can argue that the
-# "movq %rsp,%rax" above is redundant. It is not! Keep in mind that on
-# Unix rax would contain an undefined value. If this "offends" you, use
-# another register and refrain from modifying rax till magic_point is
-# reached, i.e. treat it as if it were a non-volatile register. If more
-# registers are required before [variable] frame setup is completed,
-# note that nobody says that you can have only one "magic point." You
-# can "liberate" non-volatile registers by denoting the last stack
-# off-load instruction and reflecting it in finer-grained unwind logic
-# in the handler. After all, isn't that why it's called a
-# *language-specific* handler...
-#
-# An attentive reader may notice that exceptions would be mishandled in
-# the auto-generated "gear" epilogue. Well, an exception effectively
-# can't occur there, because if the memory area used by it were subject
-# to a segmentation violation, the violation would have been raised
-# upon the call to the function (and, as already mentioned, be
-# accounted to the caller, which is not a problem). If you're still not
-# comfortable, then define a tail "magic point" just prior to the ret
-# instruction and have the handler treat it...
-#
-# (*) Note that we're talking about run time, not debug time. Lack of
-#     unwind information makes debugging hard on both Windows and
-#     Unix. "Unlike" refers to the fact that on Unix a signal handler
-#     will always be invoked, a core dumped and an appropriate exit
-#     code returned to the parent (for user notification).
+.././openssl/crypto/perlasm/x86_64-xlate.pl
\ No newline at end of file
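The translator just removed is not normally run by hand: each x86_64 crypto module pipes its own generated code through it. A rough sketch of that driver idiom follows; the flavour strings, the side-by-side file layout and the "demo" function are assumptions for illustration, not taken from the deleted file.

#!/usr/bin/env perl
# Hypothetical driver for x86_64-xlate.pl: emit AT&T-style code on
# stdout and pipe it through the translator, which rewrites it for
# the requested target flavour.
my $flavour = shift || "elf";		# e.g. elf|macosx|mingw64|nasm|masm
my $output  = shift || "demo.s";
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir = $1 || "./";
my $xlate = "${dir}x86_64-xlate.pl";	# assumes the translator sits alongside
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT = *OUT;

print <<'___';
.text
.globl	demo
.type	demo,@function
demo:
	movq	%rdi,%rax	# return the 1st argument
	ret
.size	demo,.-demo
___

close STDOUT;	# flush the pipe before exiting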
diff --git a/devel/perlasm/x86asm.pl b/devel/perlasm/x86asm.pl
index 17abf92297..4c88d69117 100644..120000
--- a/devel/perlasm/x86asm.pl
+++ b/devel/perlasm/x86asm.pl
@@ -1,288 +1 @@
-#!/usr/bin/env perl
-
-# require 'x86asm.pl';
-# &asm_init(<flavor>,"des-586.pl"[,$i386only]);
-# &function_begin("foo");
-# ...
-# &function_end("foo");
-# &asm_finish
-
-$out=();
-$i386=0;
-
-# AUTOLOAD in this context has quite an unpleasant side effect, namely
-# that typos in function calls effectively go to assembler output, but
-# on the plus side we don't have to implement one subroutine per
-# opcode...
-sub ::AUTOLOAD
-{ my $opcode = $AUTOLOAD;
-
-    die "more than 4 arguments passed to $opcode" if ($#_>3);
-
-    $opcode =~ s/.*:://;
-    if    ($opcode =~ /^push/) { $stack+=4; }
-    elsif ($opcode =~ /^pop/)  { $stack-=4; }
-
-    &generic($opcode,@_) or die "undefined subroutine \&$AUTOLOAD";
-}
-
-sub ::emit
-{ my $opcode=shift;
-
-    if ($#_==-1) { push(@out,"\t$opcode\n"); }
-    else         { push(@out,"\t$opcode\t".join(',',@_)."\n"); }
-}
-
-sub ::LB
-{   $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'low byte'";
-    $1."l";
-}
-sub ::HB
-{   $_[0] =~ m/^e?([a-d])x$/o or die "$_[0] does not have a 'high byte'";
-    $1."h";
-}
-sub ::stack_push{ my $num=$_[0]*4; $stack+=$num; &sub("esp",$num); }
-sub ::stack_pop { my $num=$_[0]*4; $stack-=$num; &add("esp",$num); }
-sub ::blindpop  { &pop($_[0]); $stack+=4; }
-sub ::wparam    { &DWP($stack+4*$_[0],"esp"); }
-sub ::swtmp     { &DWP(4*$_[0],"esp"); }
-
-sub ::bswap
-{   if ($i386)	# emulate bswap for i386
-    {	&comment("bswap @_");
-	&xchg(&HB(@_),&LB(@_));
-	&ror (@_,16);
-	&xchg(&HB(@_),&LB(@_));
-    }
-    else
-    {	&generic("bswap",@_);	}
-}
-# These are made-up opcodes introduced over the years essentially
-# by ignorance, just alias them to real ones...
-sub ::movb	{ &mov(@_);	}
-sub ::xorb	{ &xor(@_);	}
-sub ::rotl	{ &rol(@_);	}
-sub ::rotr	{ &ror(@_);	}
-sub ::exch	{ &xchg(@_);	}
-sub ::halt	{ &hlt;		}
-sub ::movz	{ &movzx(@_);	}
-sub ::pushf	{ &pushfd;	}
-sub ::popf	{ &popfd;	}
-
-# 3 argument instructions
-sub ::movq
-{ my($p1,$p2,$optimize)=@_;
-
-  if ($optimize && $p1=~/^mm[0-7]$/ && $p2=~/^mm[0-7]$/)
-	# movq between mmx registers can sink Intel CPUs
-  {	&::pshufw($p1,$p2,0xe4);	}
-  else
-  {	&::generic("movq",@_);		}
-}
-
-# SSE>2 instructions
-my %regrm = (	"eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
-		"esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7	);
-sub ::pextrd
-{ my($dst,$src,$imm)=@_;
-  if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
-  {	&::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm);	}
-  else
-  {	&::generic("pextrd",@_);	}
-}
-
-sub ::pinsrd
-{ my($dst,$src,$imm)=@_;
-  if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
-  {	&::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm);	}
-  else
-  {	&::generic("pinsrd",@_);	}
-}
-
-sub ::pshufb
-{ my($dst,$src)=@_;
-  if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-  {	&data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2);	}
-  else
-  {	&::generic("pshufb",@_);	}
-}
-
-sub ::palignr
-{ my($dst,$src,$imm)=@_;
-  if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-  {	&::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm);	}
-  else
-  {	&::generic("palignr",@_);	}
-}
-
-sub ::pclmulqdq
-{ my($dst,$src,$imm)=@_;
-  if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
-  {	&::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm);	}
-  else
-  {	&::generic("pclmulqdq",@_);	}
-}
-
-sub ::rdrand
-{ my ($dst)=@_;
-  if ($dst =~ /(e[a-dsd][ixp])/)
-  {	&::data_byte(0x0f,0xc7,0xf0|$regrm{$dst});	}
-  else
-  {	&::generic("rdrand",@_);	}
-}
-
-sub rxb {
- local *opcode=shift;
- my ($dst,$src1,$src2,$rxb)=@_;
-
-   $rxb|=0x7<<5;
-   $rxb&=~(0x04<<5) if($dst>=8);
-   $rxb&=~(0x01<<5) if($src1>=8);
-   $rxb&=~(0x02<<5) if($src2>=8);
-   push @opcode,$rxb;
-}
-
-sub ::vprotd
-{ my $args=join(',',@_);
-  if ($args =~ /xmm([0-7]),xmm([0-7]),([x0-9a-f]+)/)
-  {	my @opcode=(0x8f);
-	rxb(\@opcode,$1,$2,-1,0x08);
-	push @opcode,0x78,0xc2;
-	push @opcode,0xc0|($2&7)|(($1&7)<<3);		# ModR/M
-	my $c=$3;
-	push @opcode,$c=~/^0/?oct($c):$c;
-	&::data_byte(@opcode);
-  }
-  else
-  {	&::generic("vprotd",@_);	}
-}
-
-# label management
-$lbdecor="L";		# local label decoration, set by package
-$label="000";
-
-sub ::islabel		# see if argument is a known label
-{ my $i;
-  foreach $i (values %label) { return $i if ($i eq $_[0]); }
-  $label{$_[0]};	# can be undef
-}
-
-sub ::label		# instantiate a function-scope label
-{ if (!defined($label{$_[0]}))
-  {	$label{$_[0]}="${lbdecor}${label}${_[0]}"; $label++;	}
-  $label{$_[0]};
-}
-
-sub ::LABEL		# instantiate a file-scope label
-{ $label{$_[0]}=$_[1] if (!defined($label{$_[0]}));
-  $label{$_[0]};
-}
-
-sub ::static_label	{ &::LABEL($_[0],$lbdecor.$_[0]); }
-
-sub ::set_label_B	{ push(@out,"@_:\n"); }
-sub ::set_label
-{ my $label=&::label($_[0]);
-  &::align($_[1]) if ($_[1]>1);
-  &::set_label_B($label);
-  $label;
-}
-
-sub ::wipe_labels	# wipes function-scope labels
-{ foreach $i (keys %label)
-  {	delete $label{$i} if ($label{$i} =~ /^\Q${lbdecor}\E[0-9]{3}/);	}
-}
-
-# subroutine management
-sub ::function_begin
-{ &function_begin_B(@_);
-  $stack=4;
-  &push("ebp");
-  &push("ebx");
-  &push("esi");
-  &push("edi");
-}
-
-sub ::function_end
-{ &pop("edi");
-  &pop("esi");
-  &pop("ebx");
-  &pop("ebp");
-  &ret();
-  &function_end_B(@_);
-  $stack=0;
-  &wipe_labels();
-}
-
-sub ::function_end_A
-{ &pop("edi");
-  &pop("esi");
&pop("ebx"); - &pop("ebp"); - &ret(); - $stack+=16; # readjust esp as if we didn't pop anything -} - -sub ::asciz -{ my @str=unpack("C*",shift); - push @str,0; - while ($#str>15) { - &data_byte(@str[0..15]); - foreach (0..15) { shift @str; } - } - &data_byte(@str) if (@str); -} - -sub ::asm_finish -{ &file_end(); - print @out; -} - -sub ::asm_init -{ my ($type,$fn,$cpu)=@_; - - $filename=$fn; - $i386=$cpu; - - $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=$android=0; - if (($type eq "elf")) - { $elf=1; require "x86gas.pl"; } - elsif (($type eq "a\.out")) - { $aout=1; require "x86gas.pl"; } - elsif (($type eq "coff" or $type eq "gaswin")) - { $coff=1; require "x86gas.pl"; } - elsif (($type eq "win32n")) - { $win32=1; require "x86nasm.pl"; } - elsif (($type eq "nw-nasm")) - { $netware=1; require "x86nasm.pl"; } - #elsif (($type eq "nw-mwasm")) - #{ $netware=1; $mwerks=1; require "x86nasm.pl"; } - elsif (($type eq "win32")) - { $win32=1; require "x86masm.pl"; } - elsif (($type eq "macosx")) - { $aout=1; $macosx=1; require "x86gas.pl"; } - elsif (($type eq "android")) - { $elf=1; $android=1; require "x86gas.pl"; } - else - { print STDERR <<"EOF"; -Pick one target type from - elf - Linux, FreeBSD, Solaris x86, etc. - a.out - DJGPP, elder OpenBSD, etc. - coff - GAS/COFF such as Win32 targets - win32n - Windows 95/Windows NT NASM format - nw-nasm - NetWare NASM format - macosx - Mac OS X -EOF - exit(1); - } - - $pic=0; - for (@ARGV) { $pic=1 if (/\-[fK]PIC/i); } - - $filename =~ s/\.pl$//; - &file($filename); -} - -sub ::hidden {} - -1; +.././openssl/crypto/perlasm/x86asm.pl
\ No newline at end of file diff --git a/devel/perlasm/x86gas.pl b/devel/perlasm/x86gas.pl index 5c2498118f..8bbca21685 100644..120000 --- a/devel/perlasm/x86gas.pl +++ b/devel/perlasm/x86gas.pl @@ -1,258 +1 @@ -#!/usr/bin/env perl - -package x86gas; - -*out=\@::out; - -$::lbdecor=$::aout?"L":".L"; # local label decoration -$nmdecor=($::aout or $::coff)?"_":""; # external name decoration - -$initseg=""; - -$align=16; -$align=log($align)/log(2) if ($::aout); -$com_start="#" if ($::aout or $::coff); - -sub opsize() -{ my $reg=shift; - if ($reg =~ m/^%e/o) { "l"; } - elsif ($reg =~ m/^%[a-d][hl]$/o) { "b"; } - elsif ($reg =~ m/^%[xm]/o) { undef; } - else { "w"; } -} - -# swap arguments; -# expand opcode with size suffix; -# prefix numeric constants with $; -sub ::generic -{ my($opcode,@arg)=@_; - my($suffix,$dst,$src); - - @arg=reverse(@arg); - - for (@arg) - { s/^(\*?)(e?[a-dsixphl]{2})$/$1%$2/o; # gp registers - s/^([xy]?mm[0-7])$/%$1/o; # xmm/mmx registers - s/^(\-?[0-9]+)$/\$$1/o; # constants - s/^(\-?0x[0-9a-f]+)$/\$$1/o; # constants - } - - $dst = $arg[$#arg] if ($#arg>=0); - $src = $arg[$#arg-1] if ($#arg>=1); - if ($dst =~ m/^%/o) { $suffix=&opsize($dst); } - elsif ($src =~ m/^%/o) { $suffix=&opsize($src); } - else { $suffix="l"; } - undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o); - - if ($#_==0) { &::emit($opcode); } - elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o) - { &::emit($opcode,@arg); } - else { &::emit($opcode.$suffix,@arg);} - - 1; -} -# -# opcodes not covered by ::generic above, mostly inconsistent namings... -# -sub ::movzx { &::movzb(@_); } -sub ::pushfd { &::pushfl; } -sub ::popfd { &::popfl; } -sub ::cpuid { &::emit(".byte\t0x0f,0xa2"); } -sub ::rdtsc { &::emit(".byte\t0x0f,0x31"); } - -sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } -sub ::call_ptr { &::generic("call","*$_[0]"); } -sub ::jmp_ptr { &::generic("jmp","*$_[0]"); } - -*::bswap = sub { &::emit("bswap","%$_[0]"); } if (!$::i386); - -sub ::DWP -{ my($addr,$reg1,$reg2,$idx)=@_; - my $ret=""; - - if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } - - $addr =~ s/^\s+//; - # prepend global references with optional underscore - $addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige; - - $reg1 = "%$reg1" if ($reg1); - $reg2 = "%$reg2" if ($reg2); - - $ret .= $addr if (($addr ne "") && ($addr ne 0)); - - if ($reg2) - { $idx!= 0 or $idx=1; - $ret .= "($reg1,$reg2,$idx)"; - } - elsif ($reg1) - { $ret .= "($reg1)"; } - - $ret; -} -sub ::QWP { &::DWP(@_); } -sub ::BP { &::DWP(@_); } -sub ::WP { &::DWP(@_); } -sub ::BC { @_; } -sub ::DWC { @_; } - -sub ::file -{ push(@out,".file\t\"$_[0].s\"\n.text\n"); } - -sub ::function_begin_B -{ my $func=shift; - my $global=($func !~ /^_/); - my $begin="${::lbdecor}_${func}_begin"; - - &::LABEL($func,$global?"$begin":"$nmdecor$func"); - $func=$nmdecor.$func; - - push(@out,".globl\t$func\n") if ($global); - if ($::coff) - { push(@out,".def\t$func;\t.scl\t".(3-$global).";\t.type\t32;\t.endef\n"); } - elsif (($::aout and !$::pic) or $::macosx) - { } - else - { push(@out,".type $func,\@function\n"); } - push(@out,".align\t$align\n"); - push(@out,"$func:\n"); - push(@out,"$begin:\n") if ($global); - $::stack=4; -} - -sub ::function_end_B -{ my $func=shift; - push(@out,".size\t$nmdecor$func,.-".&::LABEL($func)."\n") if ($::elf); - $::stack=0; - &::wipe_labels(); -} - -sub ::comment - { - if (!defined($com_start) or $::elf) - { # Regarding $::elf above... 
- # GNU and SVR4 as'es use different comment delimiters, - push(@out,"\n"); # so we just skip ELF comments... - return; - } - foreach (@_) - { - if (/^\s*$/) - { push(@out,"\n"); } - else - { push(@out,"\t$com_start $_ $com_end\n"); } - } - } - -sub ::external_label -{ foreach(@_) { &::LABEL($_,$nmdecor.$_); } } - -sub ::public_label -{ push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } - -sub ::file_end -{ if ($::macosx) - { if (%non_lazy_ptr) - { push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n"); - foreach $i (keys %non_lazy_ptr) - { push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); } - } - } - if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { - my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,16"; - if ($::macosx) { push (@out,"$tmp,2\n"); } - elsif ($::elf) { push (@out,"$tmp,4\n"); } - else { push (@out,"$tmp\n"); } - } - push(@out,$initseg) if ($initseg); -} - -sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); } -sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); } -sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); } - -sub ::align -{ my $val=$_[0]; - if ($::aout) - { $val=int(log($val)/log(2)); - $val.=",0x90"; - } - push(@out,".align\t$val\n"); -} - -sub ::picmeup -{ my($dst,$sym,$base,$reflabel)=@_; - - if (defined($base) && $sym eq "OPENSSL_ia32cap_P" && !$::macosx) - { &::lea($dst,&::DWP("$sym-$reflabel",$base)); } - elsif (($::pic && ($::elf || $::aout)) || $::macosx) - { if (!defined($base)) - { &::call(&::label("PIC_me_up")); - &::set_label("PIC_me_up"); - &::blindpop($dst); - $base=$dst; - $reflabel=&::label("PIC_me_up"); - } - if ($::macosx) - { my $indirect=&::static_label("$nmdecor$sym\$non_lazy_ptr"); - &::mov($dst,&::DWP("$indirect-$reflabel",$base)); - $non_lazy_ptr{"$nmdecor$sym"}=$indirect; - } - else - { &::lea($dst,&::DWP("_GLOBAL_OFFSET_TABLE_+[.-$reflabel]", - $base)); - &::mov($dst,&::DWP("$sym\@GOT",$dst)); - } - } - else - { &::lea($dst,&::DWP($sym)); } -} - -sub ::initseg -{ my $f=$nmdecor.shift; - - if ($::android) - { $initseg.=<<___; -.section .init_array -.align 4 -.long $f -___ - } - elsif ($::elf) - { $initseg.=<<___; -.section .init - call $f -___ - } - elsif ($::coff) - { $initseg.=<<___; # applies to both Cygwin and Mingw -.section .ctors -.long $f -___ - } - elsif ($::macosx) - { $initseg.=<<___; -.mod_init_func -.align 2 -.long $f -___ - } - elsif ($::aout) - { my $ctor="${nmdecor}_GLOBAL_\$I\$$f"; - $initseg.=".text\n"; - $initseg.=".type $ctor,\@function\n" if ($::pic); - $initseg.=<<___; # OpenBSD way... -.globl $ctor -.align 2 -$ctor: - jmp $f -___ - } -} - -sub ::dataseg -{ push(@out,".data\n"); } - -*::hidden = sub { push(@out,".hidden\t$nmdecor$_[0]\n"); } if ($::elf); - -1; +.././openssl/crypto/perlasm/x86gas.pl
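Worked examples of the rewriting rules in the ::generic just removed (operand reversal, "%" and "$" prefixes, size suffix taken from the destination register); the mappings below are derived from that code for illustration, not part of the original file.

# &mov("eax","ebx");		->	movl	%ebx,%eax	# operands reversed, "%" added
# &mov("eax",1);		->	movl	$1,%eax		# constants get "$", "l" from %eax
# &mov("al",&BP(0,"esi"));	->	movb	(%esi),%al	# "b" suffix from the byte register
# &movq("mm0",&QWP(0,"esi"));	->	movq	(%esi),%mm0	# no suffix for mmx/xmm operands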
\ No newline at end of file diff --git a/devel/perlasm/x86masm.pl b/devel/perlasm/x86masm.pl index 1741342c3a..278f81702a 100644..120000 --- a/devel/perlasm/x86masm.pl +++ b/devel/perlasm/x86masm.pl @@ -1,200 +1 @@ -#!/usr/bin/env perl - -package x86masm; - -*out=\@::out; - -$::lbdecor="\$L"; # local label decoration -$nmdecor="_"; # external name decoration - -$initseg=""; -$segment=""; - -sub ::generic -{ my ($opcode,@arg)=@_; - - # fix hexadecimal constants - for (@arg) { s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/oi; } - - if ($opcode =~ /lea/ && @arg[1] =~ s/.*PTR\s+(\(.*\))$/OFFSET $1/) # no [] - { $opcode="mov"; } - elsif ($opcode !~ /movq/) - { # fix xmm references - $arg[0] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[1]=~/\bxmm[0-7]\b/i); - $arg[1] =~ s/\b[A-Z]+WORD\s+PTR/XMMWORD PTR/i if ($arg[0]=~/\bxmm[0-7]\b/i); - } - - &::emit($opcode,@arg); - 1; -} -# -# opcodes not covered by ::generic above, mostly inconsistent namings... -# -sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } -sub ::call_ptr { &::emit("call",@_); } -sub ::jmp_ptr { &::emit("jmp",@_); } -sub ::lock { &::data_byte(0xf0); } - -sub get_mem -{ my($size,$addr,$reg1,$reg2,$idx)=@_; - my($post,$ret); - - if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } - - $ret .= "$size PTR " if ($size ne ""); - - $addr =~ s/^\s+//; - # prepend global references with optional underscore - $addr =~ s/^([^\+\-0-9][^\+\-]*)/&::islabel($1) or "$nmdecor$1"/ige; - # put address arithmetic expression in parenthesis - $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/); - - if (($addr ne "") && ($addr ne 0)) - { if ($addr !~ /^-/) { $ret .= "$addr"; } - else { $post=$addr; } - } - $ret .= "["; - - if ($reg2 ne "") - { $idx!=0 or $idx=1; - $ret .= "$reg2*$idx"; - $ret .= "+$reg1" if ($reg1 ne ""); - } - else - { $ret .= "$reg1"; } - - $ret .= "$post]"; - $ret =~ s/\+\]/]/; # in case $addr was the only argument - $ret =~ s/\[\s*\]//; - - $ret; -} -sub ::BP { &get_mem("BYTE",@_); } -sub ::WP { &get_mem("WORD",@_); } -sub ::DWP { &get_mem("DWORD",@_); } -sub ::QWP { &get_mem("QWORD",@_); } -sub ::BC { "@_"; } -sub ::DWC { "@_"; } - -sub ::file -{ my $tmp=<<___; -TITLE $_[0].asm -IF \@Version LT 800 -ECHO MASM version 8.00 or later is strongly recommended. -ENDIF -.486 -.MODEL FLAT -OPTION DOTNAME -IF \@Version LT 800 -.text\$ SEGMENT PAGE 'CODE' -ELSE -.text\$ SEGMENT ALIGN(64) 'CODE' -ENDIF -___ - push(@out,$tmp); - $segment = ".text\$"; -} - -sub ::function_begin_B -{ my $func=shift; - my $global=($func !~ /^_/); - my $begin="${::lbdecor}_${func}_begin"; - - &::LABEL($func,$global?"$begin":"$nmdecor$func"); - $func="ALIGN\t16\n".$nmdecor.$func."\tPROC"; - - if ($global) { $func.=" PUBLIC\n${begin}::\n"; } - else { $func.=" PRIVATE\n"; } - push(@out,$func); - $::stack=4; -} -sub ::function_end_B -{ my $func=shift; - - push(@out,"$nmdecor$func ENDP\n"); - $::stack=0; - &::wipe_labels(); -} - -sub ::file_end -{ my $xmmheader=<<___; -.686 -.XMM -IF \@Version LT 800 -XMMWORD STRUCT 16 -DQ 2 dup (?) 
-XMMWORD ENDS -ENDIF -___ - if (grep {/\b[x]?mm[0-7]\b/i} @out) { - grep {s/\.[3-7]86/$xmmheader/} @out; - } - - push(@out,"$segment ENDS\n"); - - if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) - { my $comm=<<___; -.bss SEGMENT 'BSS' -COMM ${nmdecor}OPENSSL_ia32cap_P:DWORD:4 -.bss ENDS -___ - # comment out OPENSSL_ia32cap_P declarations - grep {s/(^EXTERN\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out; - push (@out,$comm); - } - push (@out,$initseg) if ($initseg); - push (@out,"END\n"); -} - -sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } } - -*::set_label_B = sub -{ my $l=shift; push(@out,$l.($l=~/^\Q${::lbdecor}\E[0-9]{3}/?":\n":"::\n")); }; - -sub ::external_label -{ foreach(@_) - { push(@out, "EXTERN\t".&::LABEL($_,$nmdecor.$_).":NEAR\n"); } -} - -sub ::public_label -{ push(@out,"PUBLIC\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } - -sub ::data_byte -{ push(@out,("DB\t").join(',',@_)."\n"); } - -sub ::data_short -{ push(@out,("DW\t").join(',',@_)."\n"); } - -sub ::data_word -{ push(@out,("DD\t").join(',',@_)."\n"); } - -sub ::align -{ push(@out,"ALIGN\t$_[0]\n"); } - -sub ::picmeup -{ my($dst,$sym)=@_; - &::lea($dst,&::DWP($sym)); -} - -sub ::initseg -{ my $f=$nmdecor.shift; - - $initseg.=<<___; -.CRT\$XCU SEGMENT DWORD PUBLIC 'DATA' -EXTERN $f:NEAR -DD $f -.CRT\$XCU ENDS -___ -} - -sub ::dataseg -{ push(@out,"$segment\tENDS\n_DATA\tSEGMENT\n"); $segment="_DATA"; } - -sub ::safeseh -{ my $nm=shift; - push(@out,"IF \@Version GE 710\n"); - push(@out,".SAFESEH ".&::LABEL($nm,$nmdecor.$nm)."\n"); - push(@out,"ENDIF\n"); -} - -1; +.././openssl/crypto/perlasm/x86masm.pl
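Likewise, a few illustrative mappings for the MASM backend's get_mem/DWP family shown above (derived from the deleted code, not part of it); note MASM puts the displacement before the bracket.

# &DWP(4,"esp")			->	DWORD PTR 4[esp]
# &DWP(0,"esi","ecx",4)		->	DWORD PTR [ecx*4+esi]	# zero displacement dropped
# &QWP(8,"esp")			->	QWORD PTR 8[esp]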
\ No newline at end of file diff --git a/devel/perlasm/x86nasm.pl b/devel/perlasm/x86nasm.pl index 5d92f6092a..99a0df0ea8 100644..120000 --- a/devel/perlasm/x86nasm.pl +++ b/devel/perlasm/x86nasm.pl @@ -1,179 +1 @@ -#!/usr/bin/env perl - -package x86nasm; - -*out=\@::out; - -$::lbdecor="L\$"; # local label decoration -$nmdecor=$::netware?"":"_"; # external name decoration -$drdecor=$::mwerks?".":""; # directive decoration - -$initseg=""; - -sub ::generic -{ my $opcode=shift; - my $tmp; - - if (!$::mwerks) - { if ($opcode =~ m/^j/o && $#_==0) # optimize jumps - { $_[0] = "NEAR $_[0]"; } - elsif ($opcode eq "lea" && $#_==1) # wipe storage qualifier from lea - { $_[1] =~ s/^[^\[]*\[/\[/o; } - elsif ($opcode eq "clflush" && $#_==0) - { $_[0] =~ s/^[^\[]*\[/\[/o; } - } - &::emit($opcode,@_); - 1; -} -# -# opcodes not covered by ::generic above, mostly inconsistent namings... -# -sub ::call { &::emit("call",(&::islabel($_[0]) or "$nmdecor$_[0]")); } -sub ::call_ptr { &::emit("call",@_); } -sub ::jmp_ptr { &::emit("jmp",@_); } - -sub get_mem -{ my($size,$addr,$reg1,$reg2,$idx)=@_; - my($post,$ret); - - if (!defined($idx) && 1*$reg2) { $idx=$reg2; $reg2=$reg1; undef $reg1; } - - if ($size ne "") - { $ret .= "$size"; - $ret .= " PTR" if ($::mwerks); - $ret .= " "; - } - $ret .= "["; - - $addr =~ s/^\s+//; - # prepend global references with optional underscore - $addr =~ s/^([^\+\-0-9][^\+\-]*)/::islabel($1) or "$nmdecor$1"/ige; - # put address arithmetic expression in parenthesis - $addr="($addr)" if ($addr =~ /^.+[\-\+].+$/); - - if (($addr ne "") && ($addr ne 0)) - { if ($addr !~ /^-/) { $ret .= "$addr+"; } - else { $post=$addr; } - } - - if ($reg2 ne "") - { $idx!=0 or $idx=1; - $ret .= "$reg2*$idx"; - $ret .= "+$reg1" if ($reg1 ne ""); - } - else - { $ret .= "$reg1"; } - - $ret .= "$post]"; - $ret =~ s/\+\]/]/; # in case $addr was the only argument - - $ret; -} -sub ::BP { &get_mem("BYTE",@_); } -sub ::DWP { &get_mem("DWORD",@_); } -sub ::WP { &get_mem("WORD",@_); } -sub ::QWP { &get_mem("",@_); } -sub ::BC { (($::mwerks)?"":"BYTE ")."@_"; } -sub ::DWC { (($::mwerks)?"":"DWORD ")."@_"; } - -sub ::file -{ if ($::mwerks) { push(@out,".section\t.text,64\n"); } - else - { my $tmp=<<___; -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -\$\@feat.00 equ 1 -section .text code align=64 -%else -section .text code -%endif -___ - push(@out,$tmp); - } -} - -sub ::function_begin_B -{ my $func=shift; - my $global=($func !~ /^_/); - my $begin="${::lbdecor}_${func}_begin"; - - $begin =~ s/^\@/./ if ($::mwerks); # the torture never stops - - &::LABEL($func,$global?"$begin":"$nmdecor$func"); - $func=$nmdecor.$func; - - push(@out,"${drdecor}global $func\n") if ($global); - push(@out,"${drdecor}align 16\n"); - push(@out,"$func:\n"); - push(@out,"$begin:\n") if ($global); - $::stack=4; -} - -sub ::function_end_B -{ $::stack=0; - &::wipe_labels(); -} - -sub ::file_end -{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) - { my $comm=<<___; -${drdecor}segment .bss -${drdecor}common ${nmdecor}OPENSSL_ia32cap_P 16 -___ - # comment out OPENSSL_ia32cap_P declarations - grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out; - push (@out,$comm) - } - push (@out,$initseg) if ($initseg); -} - -sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } } - -sub ::external_label -{ foreach(@_) - { push(@out,"${drdecor}extern\t".&::LABEL($_,$nmdecor.$_)."\n"); } -} - -sub ::public_label -{ push(@out,"${drdecor}global\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } - -sub 
::data_byte -{ push(@out,(($::mwerks)?".byte\t":"db\t").join(',',@_)."\n"); } -sub ::data_short -{ push(@out,(($::mwerks)?".word\t":"dw\t").join(',',@_)."\n"); } -sub ::data_word -{ push(@out,(($::mwerks)?".long\t":"dd\t").join(',',@_)."\n"); } - -sub ::align -{ push(@out,"${drdecor}align\t$_[0]\n"); } - -sub ::picmeup -{ my($dst,$sym)=@_; - &::lea($dst,&::DWP($sym)); -} - -sub ::initseg -{ my $f=$nmdecor.shift; - if ($::win32) - { $initseg=<<___; -segment .CRT\$XCU data align=4 -extern $f -dd $f -___ - } -} - -sub ::dataseg -{ if ($mwerks) { push(@out,".section\t.data,4\n"); } - else { push(@out,"section\t.data align=4\n"); } -} - -sub ::safeseh -{ my $nm=shift; - push(@out,"%if __NASM_VERSION_ID__ >= 0x02030000\n"); - push(@out,"safeseh ".&::LABEL($nm,$nmdecor.$nm)."\n"); - push(@out,"%endif\n"); -} - -1; +.././openssl/crypto/perlasm/x86nasm.pl
\ No newline at end of file
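To close, the NASM backend's equivalents for comparison (again derived from the deleted get_mem code, not part of the original file); here the displacement goes inside the bracket, and QWP deliberately carries no size tag.

# &DWP(4,"esp")			->	DWORD [4+esp]
# &DWP(0,"esi","ecx",4)		->	DWORD [ecx*4+esi]
# &QWP(8,"esp")			->	[8+esp]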