author     Daniel Hu <Daniel.Hu@arm.com>    2022-05-25 10:23:40 +0100
committer  Pauli <pauli@openssl.org>        2022-06-22 17:07:17 +1000
commit     bcb52bcc9f9c36a85d037976676fd5ca52f307cd (patch)
tree       f4aa6709074310d800f0ae5f06d38d29fee22b61 /crypto/chacha
parent     b147b9daf17744d529f23b5da40397a6071a88aa (diff)
download   openssl-new-bcb52bcc9f9c36a85d037976676fd5ca52f307cd.tar.gz
Optimize chacha20 on aarch64 by SVE2
This patch improves the existing chacha20 SVE patch by using SVE2, an
optional aarch64 architecture feature whose XAR instruction can improve
the performance of chacha20.

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18522)
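For context, the operations XAR targets are the xor-then-rotate pairs of the
ChaCha20 quarter round (RFC 8439). The scalar C sketch below is illustrative
only and not part of the patch (helper names are made up for the example): on
plain SVE each rotate needs its own instruction after the EOR (e.g. the revh
and .Lrot8 tbl paths visible in the diff), whereas SVE2's XAR performs the XOR
and a rotate right by (32 - n) in a single instruction, which is the saving
the commit message refers to.

#include <stdint.h>
#include <stdio.h>

/* Rotate a 32-bit word left by n bits (0 < n < 32). */
static uint32_t rotl32(uint32_t x, int n)
{
    return (x << n) | (x >> (32 - n));
}

/* One ChaCha20 quarter round.  Each rotl32(x ^ y, n) is an EOR plus a
 * separate rotate on SVE, or a single xar (XOR and rotate right) on SVE2. */
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *a += *b; *d = rotl32(*d ^ *a, 16); /* SVE: eor + revh;         SVE2: xar #16 */
    *c += *d; *b = rotl32(*b ^ *c, 12); /* xor+rotate, fusable as        xar #20 */
    *a += *b; *d = rotl32(*d ^ *a, 8);  /* SVE: eor + tbl (.Lrot8); SVE2: xar #24 */
    *c += *d; *b = rotl32(*b ^ *c, 7);  /* xor+rotate, fusable as        xar #25 */
}

int main(void)
{
    /* Quarter-round test vector from RFC 8439, section 2.1.1. */
    uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;
    quarter_round(&a, &b, &c, &d);
    printf("%08x %08x %08x %08x\n", a, b, c, d);
    /* expected: ea2a92f4 cb1cf8ce 4581472e 5881c4bb */
    return 0;
}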
Diffstat (limited to 'crypto/chacha')
-rwxr-xr-x  crypto/chacha/asm/chacha-armv8-sve.pl  397
1 file changed, 235 insertions, 162 deletions
diff --git a/crypto/chacha/asm/chacha-armv8-sve.pl b/crypto/chacha/asm/chacha-armv8-sve.pl
index 6080414e0d..dfc4548a4f 100755
--- a/crypto/chacha/asm/chacha-armv8-sve.pl
+++ b/crypto/chacha/asm/chacha-armv8-sve.pl
@@ -31,17 +31,25 @@ sub AUTOLOAD() # thunk [simplified] x86-style perlasm
}
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
-my ($state) = ("x5");
-my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
-my ($saved_outp) = ("x8");
-my ($wctr, $xctr) = ("w9", "x9");
-my @mx=map("z$_",(0..7,16..23));
+my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
+my ($sve2flag) = ("x7");
+my ($wctr, $xctr) = ("w8", "x8");
+my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
+my ($tmp,$tmpw) = ("x10", "w10");
+my ($counter) = ("x11");
+my @K=map("x$_",(12..15,19..22));
+my @KL=map("w$_",(12..15,19..22));
+my @mx=map("z$_",(0..15));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
-my @xt=map("z$_",(24..31,8..11));
-my ($rot8) = ("z12");
-my ($zctr) = ("z13");
-my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
+my ($zctr) = ("z16");
+my @xt=map("z$_",(17..24));
+my @perm=map("z$_",(25..30));
+my ($rot8) = ("z31");
+my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
+# in SVE mode we can only use bak0 ~ bak9 (the rest are used as scratch registers)
+# in SVE2 mode we use all 15 backup registers
+my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
my $debug_encoder=0;
sub SVE_ADD() {
@@ -148,8 +156,12 @@ sub SVE_QR_GROUP() {
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;
&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
- &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
- &SVE_REV16($d0,$d1,$d2,$d3);
+ if ($have_sve2 == 0) {
+ &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+ &SVE_REV16($d0,$d1,$d2,$d3);
+ } else {
+ &SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+ }
&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
if ($have_sve2 == 0) {
@@ -162,8 +174,12 @@ sub SVE_QR_GROUP() {
}
&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
- &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
- &SVE_ROT8($d0,$d1,$d2,$d3);
+ if ($have_sve2 == 0) {
+ &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+ &SVE_ROT8($d0,$d1,$d2,$d3);
+ } else {
+ &SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+ }
&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
if ($have_sve2 == 0) {
@@ -178,27 +194,32 @@ sub SVE_QR_GROUP() {
sub SVE_INNER_BLOCK() {
$code.=<<___;
- //cbnz $sve2flag, 10f
+ mov $counter,#10
+1:
+.align 5
___
&SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
&SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
- // SVE 2 not enabled until hardware available
-#if 0
- b 11f
-10:
+ subs $counter,$counter,1
+ b.ne 1b
___
-# &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
-# &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+}
+
+sub SVE2_INNER_BLOCK() {
$code.=<<___;
-11:
-#endif
+ mov $counter,#10
+1:
+.align 5
+___
+ &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+ &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+$code.=<<___;
+ subs $counter,$counter,1
+ b.ne 1b
___
}
-{{{
-my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
-
sub load() {
my $x0 = shift;
my $x1 = shift;
@@ -252,72 +273,75 @@ sub transpose() {
my $xd = shift;
$code.=<<___;
- zip1 $xt8.s,$xa.s,$xb.s
- zip2 $xt9.s,$xa.s,$xb.s
- zip1 $xt10.s,$xc.s,$xd.s
- zip2 $xt11.s,$xc.s,$xd.s
- zip1 $xa.d,$xt8.d,$xt10.d
- zip2 $xb.d,$xt8.d,$xt10.d
- zip1 $xc.d,$xt9.d,$xt11.d
- zip2 $xd.d,$xt9.d,$xt11.d
+ zip1 $xt0.s,$xa.s,$xb.s
+ zip2 $xt1.s,$xa.s,$xb.s
+ zip1 $xt2.s,$xc.s,$xd.s
+ zip2 $xt3.s,$xc.s,$xd.s
+ zip1 $xa.d,$xt0.d,$xt2.d
+ zip2 $xb.d,$xt0.d,$xt2.d
+ zip1 $xc.d,$xt1.d,$xt3.d
+ zip2 $xd.d,$xt1.d,$xt3.d
___
}
-sub add_states() {
- my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
-
+sub SVE_ADD_STATES() {
$code.=<<___;
- ldp $tmpw0,$tmpw1,[$state]
- ldp $tmpw2,$tmpw3,[$state,#8]
- dup $xt0.s,$tmpw0
+ lsr $tmp1,@K[5],#32
+ dup $xt0.s,@KL[5]
dup $xt1.s,$tmpw1
- dup $xt2.s,$tmpw2
- dup $xt3.s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#16]
- ldp $tmpw2,$tmpw3,[$state,#24]
- add @mx[0].s,@mx[0].s,$xt0.s
- add @mx[1].s,@mx[1].s,$xt1.s
- add @mx[2].s,@mx[2].s,$xt2.s
- add @mx[3].s,@mx[3].s,$xt3.s
+ add @mx[0].s,@mx[0].s,$bak0.s
+ add @mx[1].s,@mx[1].s,$bak1.s
+ add @mx[2].s,@mx[2].s,$bak2.s
+ add @mx[3].s,@mx[3].s,$bak3.s
+ add @mx[4].s,@mx[4].s,$bak4.s
+ add @mx[5].s,@mx[5].s,$bak5.s
+ add @mx[6].s,@mx[6].s,$bak6.s
+ add @mx[7].s,@mx[7].s,$bak7.s
+ add @mx[8].s,@mx[8].s,$bak8.s
+ add @mx[9].s,@mx[9].s,$bak9.s
+ lsr $tmp0,@K[6],#32
dup $xt4.s,$tmpw0
- dup $xt5.s,$tmpw1
- dup $xt6.s,$tmpw2
- dup $xt7.s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#32]
- ldp $tmpw2,$tmpw3,[$state,#40]
- add @mx[4].s,@mx[4].s,$xt4.s
- add @mx[5].s,@mx[5].s,$xt5.s
- add @mx[6].s,@mx[6].s,$xt6.s
- add @mx[7].s,@mx[7].s,$xt7.s
- dup $xt0.s,$tmpw0
- dup $xt1.s,$tmpw1
- dup $xt2.s,$tmpw2
- dup $xt3.s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#48]
- ldp $tmpw2,$tmpw3,[$state,#56]
- add @mx[8].s,@mx[8].s,$xt0.s
- add @mx[9].s,@mx[9].s,$xt1.s
- add @mx[10].s,@mx[10].s,$xt2.s
- add @mx[11].s,@mx[11].s,$xt3.s
- dup $xt5.s,$tmpw1
- dup $xt6.s,$tmpw2
- dup $xt7.s,$tmpw3
+ lsr $tmp1,@K[7],#32
+ dup $xt5.s,@KL[7]
+ dup $xt6.s,$tmpw1
+ add @mx[10].s,@mx[10].s,$xt0.s
+ add @mx[11].s,@mx[11].s,$xt1.s
+ add @mx[12].s,@mx[12].s,$zctr.s
+ add @mx[13].s,@mx[13].s,$xt4.s
+ add @mx[14].s,@mx[14].s,$xt5.s
+ add @mx[15].s,@mx[15].s,$xt6.s
+___
+}
+
+sub SVE2_ADD_STATES() {
+$code.=<<___;
+ add @mx[0].s,@mx[0].s,$bak0.s
+ add @mx[1].s,@mx[1].s,$bak1.s
+ add @mx[2].s,@mx[2].s,$bak2.s
+ add @mx[3].s,@mx[3].s,$bak3.s
+ add @mx[4].s,@mx[4].s,$bak4.s
+ add @mx[5].s,@mx[5].s,$bak5.s
+ add @mx[6].s,@mx[6].s,$bak6.s
+ add @mx[7].s,@mx[7].s,$bak7.s
+ add @mx[8].s,@mx[8].s,$bak8.s
+ add @mx[9].s,@mx[9].s,$bak9.s
+ add @mx[10].s,@mx[10].s,$bak10.s
+ add @mx[11].s,@mx[11].s,$bak11.s
add @mx[12].s,@mx[12].s,$zctr.s
- add @mx[13].s,@mx[13].s,$xt5.s
- add @mx[14].s,@mx[14].s,$xt6.s
- add @mx[15].s,@mx[15].s,$xt7.s
+ add @mx[13].s,@mx[13].s,$bak13.s
+ add @mx[14].s,@mx[14].s,$bak14.s
+ add @mx[15].s,@mx[15].s,$bak15.s
___
}
sub SVE_TRANSFORMS() {
- &add_states();
&transpose($xa0,$xb0,$xc0,$xd0);
&transpose($xa1,$xb1,$xc1,$xd1);
&transpose($xa2,$xb2,$xc2,$xd2);
&transpose($xa3,$xb3,$xc3,$xd3);
- &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
&transpose($xa0,$xa1,$xa2,$xa3);
&transpose($xb0,$xb1,$xb2,$xb3);
+ &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
eor $xa0.d,$xa0.d,$xt0.d
eor $xa1.d,$xa1.d,$xt1.d
@@ -330,8 +354,8 @@ $code.=<<___;
___
&transpose($xc0,$xc1,$xc2,$xc3);
&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
- &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
&transpose($xd0,$xd1,$xd2,$xd3);
+ &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
eor $xc0.d,$xc0.d,$xt0.d
eor $xc1.d,$xc1.d,$xt1.d
@@ -348,73 +372,111 @@ $code.=<<___;
incw $zctr.s, ALL, MUL #1
___
}
-}}}
sub SVE_LOAD_STATES() {
- my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
-
$code.=<<___;
- // FIXME following code are not functionally necessary
- // but appear to enhance performance
-#if 1
- ptrues p2.s,ALL
- ptrues p2.s,ALL
- ptrues p2.s,ALL
- ptrues p2.s,ALL
- ptrues p2.s,ALL
- ptrues p2.s,ALL
-#endif
+ lsr $tmp0,@K[0],#32
+ dup @mx[0].s,@KL[0]
+ dup $bak0.s,@KL[0]
+ dup @mx[1].s,$tmpw0
+ dup $bak1.s,$tmpw0
+ lsr $tmp1,@K[1],#32
+ dup @mx[2].s,@KL[1]
+ dup $bak2.s,@KL[1]
+ dup @mx[3].s,$tmpw1
+ dup $bak3.s,$tmpw1
+ lsr $tmp0,@K[2],#32
+ dup @mx[4].s,@KL[2]
+ dup $bak4.s,@KL[2]
+ dup @mx[5].s,$tmpw0
+ dup $bak5.s,$tmpw0
+ lsr $tmp1,@K[3],#32
+ dup @mx[6].s,@KL[3]
+ dup $bak6.s,@KL[3]
+ dup @mx[7].s,$tmpw1
+ dup $bak7.s,$tmpw1
+ lsr $tmp0,@K[4],#32
+ dup @mx[8].s,@KL[4]
+ dup $bak8.s,@KL[4]
+ dup @mx[9].s,$tmpw0
+ dup $bak9.s,$tmpw0
+ lsr $tmp1,@K[5],#32
+ dup @mx[10].s,@KL[5]
+ dup @mx[11].s,$tmpw1
+ orr @mx[12].d,$zctr.d,$zctr.d
+ lsr $tmp0,@K[6],#32
+ dup @mx[13].s,$tmpw0
+ lsr $tmp1,@K[7],#32
+ dup @mx[14].s,@KL[7]
+ dup @mx[15].s,$tmpw1
___
+}
+
+sub SVE2_LOAD_STATES() {
$code.=<<___;
- ldp $tmpw0,$tmpw1,[$state]
- ldp $tmpw2,$tmpw3,[$state,#8]
- dup @mx[0].s,$tmpw0
- dup @mx[1].s,$tmpw1
- dup @mx[2].s,$tmpw2
- dup @mx[3].s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#16]
- ldp $tmpw2,$tmpw3,[$state,#24]
- dup @mx[4].s,$tmpw0
- dup @mx[5].s,$tmpw1
- dup @mx[6].s,$tmpw2
- dup @mx[7].s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#32]
- ldp $tmpw2,$tmpw3,[$state,#40]
- dup @mx[8].s,$tmpw0
- dup @mx[9].s,$tmpw1
- dup @mx[10].s,$tmpw2
- dup @mx[11].s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state, #48]
- ldp $tmpw2,$tmpw3,[$state,#56]
- mov @mx[12].s,p0/m,$zctr.s
- dup @mx[13].s,$tmpw1
- dup @mx[14].s,$tmpw2
- dup @mx[15].s,$tmpw3
+ lsr $tmp0,@K[0],#32
+ dup @mx[0].s,@KL[0]
+ dup $bak0.s,@KL[0]
+ dup @mx[1].s,$tmpw0
+ dup $bak1.s,$tmpw0
+ lsr $tmp1,@K[1],#32
+ dup @mx[2].s,@KL[1]
+ dup $bak2.s,@KL[1]
+ dup @mx[3].s,$tmpw1
+ dup $bak3.s,$tmpw1
+ lsr $tmp0,@K[2],#32
+ dup @mx[4].s,@KL[2]
+ dup $bak4.s,@KL[2]
+ dup @mx[5].s,$tmpw0
+ dup $bak5.s,$tmpw0
+ lsr $tmp1,@K[3],#32
+ dup @mx[6].s,@KL[3]
+ dup $bak6.s,@KL[3]
+ dup @mx[7].s,$tmpw1
+ dup $bak7.s,$tmpw1
+ lsr $tmp0,@K[4],#32
+ dup @mx[8].s,@KL[4]
+ dup $bak8.s,@KL[4]
+ dup @mx[9].s,$tmpw0
+ dup $bak9.s,$tmpw0
+ lsr $tmp1,@K[5],#32
+ dup @mx[10].s,@KL[5]
+ dup $bak10.s,@KL[5]
+ dup @mx[11].s,$tmpw1
+ dup $bak11.s,$tmpw1
+ orr @mx[12].d,$zctr.d,$zctr.d
+ lsr $tmp0,@K[6],#32
+ dup @mx[13].s,$tmpw0
+ dup $bak13.s,$tmpw0
+ lsr $tmp1,@K[7],#32
+ dup @mx[14].s,@KL[7]
+ dup $bak14.s,@KL[7]
+ dup @mx[15].s,$tmpw1
+ dup $bak15.s,$tmpw1
___
}
sub sve_handle_blocks() {
- my ($counter) = ("x10");
-
- &SVE_LOAD_STATES();
$code.=<<___;
- mov $counter,#10
-.align 5
-1:
+ cbz $sve2flag,.sve_inner
___
-
+ &SVE2_LOAD_STATES();
+ &SVE2_INNER_BLOCK();
+ &SVE2_ADD_STATES();
+$code.=<<___;
+ b .fini_inner
+.sve_inner:
+___
+ &SVE_LOAD_STATES();
&SVE_INNER_BLOCK();
+ &SVE_ADD_STATES();
$code.=<<___;
- subs $counter,$counter,1
- b.ne 1b
+.fini_inner:
___
&SVE_TRANSFORMS();
}
sub chacha20_process() {
- my ($counter) = ("x10");
- my ($tmpw) = ("w11");
-
$code.=<<___;
.align 5
.Loop:
@@ -430,27 +492,18 @@ ___
}
{{{
-my ($tmp,$tmpw) = ("x10", "w10");
-my ($tmpw0,$tmpw1) = ("w11", "w12");
-my ($ptr) = ("x13");
-
$code.=<<___;
#include "arm_arch.h"
.arch armv8-a
-#if 0
.extern OPENSSL_armcap_P
.hidden OPENSSL_armcap_P
-#endif
.text
.align 5
.Lchacha20_consts:
- .word 0x61707865
- .word 0x3320646e
- .word 0x79622d32
- .word 0x6b206574
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lrot8:
.word 0x02010003,0x04040404,0x02010003,0x04040404
.globl ChaCha20_ctr32_sve
@@ -458,49 +511,55 @@ $code.=<<___;
.align 5
ChaCha20_ctr32_sve:
AARCH64_VALID_CALL_TARGET
- mov $tmp, #64
- whilelo p0.s,xzr,$tmp
- cntp $veclen,p0,p0.s
- // run Neon if we only have 128-bit SVE
- // in the future, we need to check SVE2
- cmp $veclen,4
- b.le .Lreturn
+ cntw $veclen, ALL, MUL #1
lsr $blocks,$len,#6
cmp $blocks,$veclen
b.lt .Lreturn
- stp d8,d9,[sp,-48]!
- stp d10,d11,[sp,16]
- stp d12,d13,[sp,32]
- sub sp,sp,#64
- adr $tmp,.Lchacha20_consts
- ld1 {v0.4s},[$tmp]
- adr $tmp,.Lrot8
- ldp $tmpw0,$tmpw1,[$tmp]
- ld1 {v1.4s,v2.4s},[$key]
- ld1 {v3.4s},[$ctr]
- ldr $wctr,[$ctr]
- index $zctr.s,$wctr,1
- index $rot8.s,$tmpw0,$tmpw1
- st1 {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
- mov $state,sp
-#if 0
- // SVE2 code not enabled until we have hardware
- // for verification
mov $sve2flag,0
adrp $tmp,OPENSSL_armcap_P
ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
tst $tmpw,#ARMV8_SVE2
b.eq 1f
mov $sve2flag,1
+ b 2f
1:
+ cmp $veclen,4
+ b.le .Lreturn
+ adr $tmp,.Lrot8
+ ldp $tmpw0,$tmpw1,[$tmp]
+ index $rot8.s,$tmpw0,$tmpw1
+2:
+ stp d8,d9,[sp,-96]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ stp x19,x20,[sp,64]
+ stp x21,x22,[sp,80]
+ adr $tmp,.Lchacha20_consts
+ ldp @K[0],@K[1],[$tmp]
+ ldp @K[2],@K[3],[$key]
+ ldp @K[4],@K[5],[$key, 16]
+ ldp @K[6],@K[7],[$ctr]
+ ldr $wctr,[$ctr]
+ index $zctr.s,$wctr,1
+ ptrues p0.s,ALL
+#ifdef __AARCH64EB__
+ ror @K[2],@K[2],#32
+ ror @K[3],@K[3],#32
+ ror @K[4],@K[4],#32
+ ror @K[5],@K[5],#32
+ ror @K[6],@K[6],#32
+ ror @K[7],@K[7],#32
#endif
___
&chacha20_process();
$code.=<<___;
- add sp,sp,#64
ldp d10,d11,[sp,16]
ldp d12,d13,[sp,32]
- ldp d8,d9,[sp],48
+ ldp d14,d15,[sp,48]
+ ldp x19,x20,[sp,64]
+ ldp x21,x22,[sp,80]
+ ldp d8,d9,[sp],96
str $wctr,[$ctr]
and $len,$len,#63
add $len,$len,$blocks,lsl #6
@@ -514,6 +573,7 @@ ___
########################################
{
my %opcode_unpred = (
+ "movprfx" => 0x0420BC00,
"eor" => 0x04a03000,
"add" => 0x04200000,
"orr" => 0x04603000,
@@ -528,6 +588,7 @@ my %opcode_unpred = (
"index" => 0x04204C00,
"mov" => 0x05203800,
"dup" => 0x05203800,
+ "cntw" => 0x04A0E000,
"tbl" => 0x05203000);
my %opcode_imm_unpred = (
@@ -564,6 +625,7 @@ my %opcode_pred = (
"st4w" => 0xE570E000,
"st1w" => 0xE500E000,
"ld1w" => 0xA540A000,
+ "ld1rw" => 0x8540C000,
"revh" => 0x05258000);
my %tsize = (
@@ -740,6 +802,10 @@ sub sve_pred {
if ($addr =~ m/x([0-9]+)\s*/o) {
$xn = $1;
}
+
+ if ($mnemonic =~m/ld1r[bhwd]/o) {
+ $size = 0;
+ }
if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
@@ -810,8 +876,14 @@ sub sve_other {
} elsif ($arg =~ m/x([0-9]+)/o) {
return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
}
+ } elsif ($mnemonic =~ /cnt[bhdw]/) {
+ if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+ return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
+ }
} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
+ } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
+ return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
}
sprintf "%s // fail to parse", $inst;
}
@@ -834,9 +906,10 @@ foreach(split("\n",$code)) {
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
+ s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
- s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
+ s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
print $_,"\n";
}