Diffstat (limited to 'crypto/sm4/asm/vpsm4-armv8.pl')
-rwxr-xr-x  crypto/sm4/asm/vpsm4-armv8.pl | 458
1 files changed, 458 insertions, 0 deletions
diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
index ac979c074e..cb1be122e4 100755
--- a/crypto/sm4/asm/vpsm4-armv8.pl
+++ b/crypto/sm4/asm/vpsm4-armv8.pl
@@ -28,6 +28,7 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\""
$prefix="vpsm4";
my @vtmp=map("v$_",(0..3));
+my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
@@ -36,6 +37,7 @@ my @vtmpx=map("v$_",(12..15));
my @sbox=map("v$_",(16..31));
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
+my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
@@ -60,6 +62,51 @@ ___
}
}
+sub rev32_armeb() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+$code.=<<___;
+#ifdef __AARCH64EB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+ } else {
+$code.=<<___;
+#ifdef __AARCH64EB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+ }
+}
+
+sub rbit() {
+ my $dst = shift;
+ my $src = shift;
+ my $std = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+ if ($std eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ } else {
+$code.=<<___;
+ mov $dst.16b,$src.16b
+___
+ }
+ } else {
+ if ($std eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ }
+ }
+}
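+
+# The $std argument threads the XTS flavour through these helpers: for the
+# "_gb" suffix (the GB-standard variant of SM4-XTS) the tweak is kept with
+# the bits of each byte reversed, so rbit performs a real bit reversal,
+# while for the default flavour it degenerates to a plain mov (or to
+# nothing when source and destination coincide).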
+
sub transpose() {
my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
@@ -435,6 +482,58 @@ $code.=<<___;
___
}
+
+sub mov_reg_to_vec() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $desv = shift;
+$code.=<<___;
+ mov $desv.d[0],$src0
+ mov $desv.d[1],$src1
+___
+ &rev32_armeb($desv,$desv);
+}
+
+sub mov_vec_to_reg() {
+ my $srcv = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+$code.=<<___;
+ mov $des0,$srcv.d[0]
+ mov $des1,$srcv.d[1]
+___
+}
+
+sub compute_tweak() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+$code.=<<___;
+ mov $wtmp0,0x87
+ extr $xtmp2,$src1,$src1,#32
+ extr $des1,$src1,$src0,#63
+ and $wtmp1,$wtmp0,$wtmp2,asr#31
+ eor $des0,$xtmp1,$src0,lsl#1
+___
+}
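+
+# As a reference model (a hypothetical helper, not part of the generated
+# code): the five instructions above compute T' = T*x over GF(2^128) with
+# the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1, the tweak held
+# as two 64-bit halves just like ($src0,$src1):
+#
+#   sub xts_mul_x {
+#       my ($lo, $hi) = @_;
+#       my $red = ($hi >> 63) ? 0x87 : 0;  # bit 127 falls out -> reduce
+#       my $new_hi = (($hi << 1) | ($lo >> 63)) & 0xFFFFFFFFFFFFFFFF;
+#       my $new_lo = (($lo << 1) ^ $red) & 0xFFFFFFFFFFFFFFFF;
+#       return ($new_lo, $new_hi);
+#   }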
+
+sub compute_tweak_vec() {
+ my $src = shift;
+ my $des = shift;
+ my $std = shift;
+ &rbit(@vtmp[2],$src,$std);
+$code.=<<___;
+ ldr @qtmp[0], =0x01010101010101010101010101010187
+ shl $des.16b, @vtmp[2].16b, #1
+ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
+ ushr @vtmp[1].16b, @vtmp[1].16b, #7
+ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
+ eor $des.16b, $des.16b, @vtmp[1].16b
+___
+ &rbit($des,$des,$std);
+}
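+
+# Byte-wise sketch of the NEON sequence above (hypothetical helper, for
+# illustration only): bytes are little-endian, so byte 15 is the most
+# significant. shl doubles every byte, ext/ushr fetch the top bit lost by
+# each byte's lower neighbour, and the 0x0101...0187 constant maps the bit
+# falling off byte 15 to the 0x87 reduction on byte 0:
+#
+#   sub xts_mul_x_bytes {            # @b = 16 tweak bytes, $b[0] lowest
+#       my @b = @_;
+#       my @dbl = map { ($_ << 1) & 0xFF } @b;               # shl  #1
+#       my @car = map { $b[($_ + 15) % 16] >> 7 } 0 .. 15;   # ext #15, ushr #7
+#       my @cst = (0x87, (0x01) x 15);                       # ldr q,=0x01..87
+#       return map { $dbl[$_] ^ $car[$_] * $cst[$_] } 0 .. 15;  # mul, eor
+#   }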
+
$code=<<___;
#include "arm_arch.h"
.arch armv8-a
@@ -1101,6 +1200,365 @@ $code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
+
+{{{
+my ($blocks,$len)=("x2","x2");
+my $ivp=("x5");
+my @twx=map("x$_",(12..27));
+my ($rks1,$rks2)=("x26","x27");
+my $lastBlk=("x26");
+my $enc=("w28");
+my $remain=("x29");
+
+my @tweak=@datax;
+
+sub gen_xts_cipher() {
+ my $std = shift;
+$code.=<<___;
+.globl ${prefix}_xts_encrypt${std}
+.type ${prefix}_xts_encrypt${std},%function
+.align 5
+${prefix}_xts_encrypt${std}:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov $rks1,x3
+ mov $rks2,x4
+ mov $enc,w6
+ ld1 {@tweak[0].4s}, [$ivp]
+ mov $rks,$rks2
+___
+ &load_sbox();
+ &rev32(@tweak[0],@tweak[0]);
+ &encrypt_1blk(@tweak[0]);
+$code.=<<___;
+ mov $rks,$rks1
+ and $remain,$len,#0x0F
+ // convert length into blocks
+ lsr $blocks,$len,4
+ cmp $blocks,#1
+ b.lt .return${std}
+
+ cmp $remain,0
+	// If the encryption/decryption length is a multiple of 16,
+	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
+ b.eq .xts_encrypt_blocks${std}
+
+	// If the encryption/decryption length is not a multiple of 16,
+	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std},
+	// and the remaining blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
+ subs $blocks,$blocks,#1
+ b.eq .only_2blks_tweak${std}
+.xts_encrypt_blocks${std}:
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
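+# The tweaks for the first eight blocks now sit in @twx as 64-bit register
+# pairs; each pass of the loop below moves them into vector registers for
+# the XTS XOR and refills @twx with the next eight tweaks.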
+$code.=<<___;
+.Lxts_8_blocks_process${std}:
+ cmp $blocks,#8
+ b.lt .Lxts_4_blocks_process${std}
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
+ &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
+ &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
+ &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
+ &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
+ &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
+ &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
+ &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
+$code.=<<___;
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rbit(@vtmp[0],@vtmp[0],$std);
+ &rbit(@vtmp[1],@vtmp[1],$std);
+ &rbit(@vtmp[2],@vtmp[2],$std);
+ &rbit(@vtmp[3],@vtmp[3],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @vtmp[0].16b
+ eor @data[1].16b, @data[1].16b, @vtmp[1].16b
+ eor @data[2].16b, @data[2].16b, @vtmp[2].16b
+ eor @data[3].16b, @data[3].16b, @vtmp[3].16b
+	ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+___
+ &rbit(@vtmpx[0],@vtmpx[0],$std);
+ &rbit(@vtmpx[1],@vtmpx[1],$std);
+ &rbit(@vtmpx[2],@vtmpx[2],$std);
+ &rbit(@vtmpx[3],@vtmpx[3],$std);
+$code.=<<___;
+ eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
+ eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
+ eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
+ eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],@datax[3]);
+ &transpose(@data,@vtmp);
+ &transpose(@datax,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_8blks
+___
+ &transpose(@vtmp,@datax);
+ &transpose(@data,@datax);
+
+ &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
+ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
+ &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
+ &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
+ &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
+ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
+	# for the "_gb" flavour the tweaks rebuilt above are still bit-reversed;
+	# reverse them back before the output XOR so they match the tweaks used
+	# on the input side (emits nothing for the default flavour)
+	&rbit(@vtmpx[0],@vtmpx[0],$std);
+	&rbit(@vtmpx[1],@vtmpx[1],$std);
+	&rbit(@vtmpx[2],@vtmpx[2],$std);
+	&rbit(@vtmpx[3],@vtmpx[3],$std);
+	&rbit(@tweak[0],@tweak[0],$std);
+	&rbit(@tweak[1],@tweak[1],$std);
+	&rbit(@tweak[2],@tweak[2],$std);
+	&rbit(@tweak[3],@tweak[3],$std);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
+ eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+ eor @data[3].16b, @data[3].16b, @tweak[3].16b
+
+ // save the last tweak
+ st1 {@tweak[3].4s},[$ivp]
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lxts_8_blocks_process${std}
+ b 100f
+.Lxts_4_blocks_process${std}:
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
+ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
+ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
+ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
+$code.=<<___;
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+ &rbit(@tweak[2],@tweak[2],$std);
+ &rbit(@tweak[3],@tweak[3],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+ eor @data[3].16b, @data[3].16b, @tweak[3].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
+ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ sub $blocks,$blocks,#4
+___
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
+$code.=<<___;
+ // save the last tweak
+ st1 {@tweak[3].4s},[$ivp]
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp],#16
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ st1 {@data[0].4s},[$outp],#16
+ // save the last tweak
+ st1 {@tweak[0].4s},[$ivp]
+ b 100f
+1: // process last 2 blocks
+ cmp $blocks,#2
+ b.gt 1f
+ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
+ // save the last tweak
+ st1 {@tweak[1].4s},[$ivp]
+ b 100f
+1: // process last 3 blocks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+ &rbit(@tweak[2],@tweak[2],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
+ // save the last tweak
+ st1 {@tweak[2].4s},[$ivp]
+100:
+ cmp $remain,0
+ b.eq .return${std}
+
+// This branch calculates the last two tweaks, used when the
+// encryption/decryption length is larger than 32
+.last_2blks_tweak${std}:
+ ld1 {@tweak[0].4s},[$ivp]
+___
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &compute_tweak_vec(@tweak[0],@tweak[1],$std);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$std);
+$code.=<<___;
+ b .check_dec${std}
+
+
+// This branch calculates the last two tweaks, used when the
+// encryption/decryption length is exactly 32 and only two tweaks are needed
+.only_2blks_tweak${std}:
+ mov @tweak[1].16b,@tweak[0].16b
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
+$code.=<<___;
+ b .check_dec${std}
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec${std}:
+ // encryption:1 decryption:0
+ cmp $enc,1
+	b.eq .process_last_2blks${std}
+	mov @vtmp[0].16b,@tweak[1].16b
+	mov @tweak[1].16b,@tweak[2].16b
+	mov @tweak[2].16b,@vtmp[0].16b
+
+.process_last_2blks${std}:
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &rev32_armeb(@tweak[2],@tweak[2]);
+$code.=<<___;
+ ld1 {@data[0].4s},[$inp],#16
+ eor @data[0].16b, @data[0].16b, @tweak[1].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[1].16b
+ st1 {@data[0].4s},[$outp],#16
+
+ sub $lastBlk,$outp,16
+ .loop${std}:
+ subs $remain,$remain,1
+ ldrb $wtmp0,[$lastBlk,$remain]
+ ldrb $wtmp1,[$inp,$remain]
+ strb $wtmp1,[$lastBlk,$remain]
+ strb $wtmp0,[$outp,$remain]
+ b.gt .loop${std}
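+	// Ciphertext stealing, in effect:
+	//   C(m-1) = Enc(tweak1, P(m-1))           (full block, stored above)
+	//   C(m)   = leading bytes of C(m-1)       (swapped out by the loop)
+	//   C(m-1) = Enc(tweak2, P(m) || tail of C(m-1))  (re-encrypted below)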
+ ld1 {@data[0].4s}, [$lastBlk]
+ eor @data[0].16b, @data[0].16b, @tweak[2].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[2].16b
+ st1 {@data[0].4s}, [$lastBlk]
+.return${std}:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
+___
+} # end of gen_xts_cipher
+&gen_xts_cipher("_gb");
+&gen_xts_cipher("");
+}}}
########################################
open SELF,$0;
while(<SELF>) {