Diffstat (limited to 'crypto/chacha/asm/chacha-armv8-sve.pl')
-rwxr-xr-x crypto/chacha/asm/chacha-armv8-sve.pl | 397
1 file changed, 235 insertions(+), 162 deletions(-)
diff --git a/crypto/chacha/asm/chacha-armv8-sve.pl b/crypto/chacha/asm/chacha-armv8-sve.pl
index 6080414e0d..dfc4548a4f 100755
--- a/crypto/chacha/asm/chacha-armv8-sve.pl
+++ b/crypto/chacha/asm/chacha-armv8-sve.pl
@@ -31,17 +31,25 @@ sub AUTOLOAD() # thunk [simplified] x86-style perlasm
}
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
-my ($state) = ("x5");
-my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
-my ($saved_outp) = ("x8");
-my ($wctr, $xctr) = ("w9", "x9");
-my @mx=map("z$_",(0..7,16..23));
+my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
+my ($sve2flag) = ("x7");
+my ($wctr, $xctr) = ("w8", "x8");
+my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
+my ($tmp,$tmpw) = ("x10", "w10");
+my ($counter) = ("x11");
+my @K=map("x$_",(12..15,19..22));
+my @KL=map("w$_",(12..15,19..22));
+my @mx=map("z$_",(0..15));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
-my @xt=map("z$_",(24..31,8..11));
-my ($rot8) = ("z12");
-my ($zctr) = ("z13");
-my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
+my ($zctr) = ("z16");
+my @xt=map("z$_",(17..24));
+my @perm=map("z$_",(25..30));
+my ($rot8) = ("z31");
+my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
+# in SVE mode we can use only bak0 ~ bak9 (the rest are used as scratch registers)
+# in SVE2 we use all 15 backup registers
+my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
my $debug_encoder=0;
sub SVE_ADD() {
@@ -148,8 +156,12 @@ sub SVE_QR_GROUP() {
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;
&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
- &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
- &SVE_REV16($d0,$d1,$d2,$d3);
+ if ($have_sve2 == 0) {
+ &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+ &SVE_REV16($d0,$d1,$d2,$d3);
+ } else {
+ &SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+ }
&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
if ($have_sve2 == 0) {
@@ -162,8 +174,12 @@ sub SVE_QR_GROUP() {
}
&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
- &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
- &SVE_ROT8($d0,$d1,$d2,$d3);
+ if ($have_sve2 == 0) {
+ &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+ &SVE_ROT8($d0,$d1,$d2,$d3);
+ } else {
+ &SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+ }
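# Annotation: SVE2's XAR instruction fuses the eor+rotate pair of the
# plain-SVE path into one op; per 32-bit lane it computes
# d = ROTR32(d ^ a, imm), so ChaCha's rotate-left by n corresponds to a
# rotate-right by 32-n (the &SVE2_XAR emitter, outside this diff,
# presumably handles that immediate mapping). A minimal scalar model of
# the lane semantics (hypothetical helper, illustration only):
sub xar32_model {
    my ($d, $a, $imm) = @_;                      # 1 <= $imm <= 31
    my $x = ($d ^ $a) & 0xffffffff;
    return (($x >> $imm) | ($x << (32 - $imm))) & 0xffffffff;
}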
&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
if ($have_sve2 == 0) {
@@ -178,27 +194,32 @@ sub SVE_QR_GROUP() {
sub SVE_INNER_BLOCK() {
$code.=<<___;
- //cbnz $sve2flag, 10f
+ mov $counter,#10
+1:
+.align 5
___
&SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
&SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
- // SVE 2 not enabled until hardware available
-#if 0
- b 11f
-10:
+ subs $counter,$counter,1
+ b.ne 1b
___
-# &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
-# &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+}
+
+sub SVE2_INNER_BLOCK() {
$code.=<<___;
-11:
-#endif
+ mov $counter,#10
+1:
+.align 5
+___
+ &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+ &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+$code.=<<___;
+ subs $counter,$counter,1
+ b.ne 1b
___
}
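# Annotation: both inner-block variants run ten iterations of one column
# round plus one diagonal round, i.e. ChaCha20's 20 rounds; the index
# quadruples above ((0,4,8,12)... and (0,5,10,15)...) are exactly the
# column and diagonal selections. A scalar reference model of one
# quarter round (sketch; $s is a ref to 16 u32 state words):
sub quarter_round_model {
    my ($s, $a, $b, $c, $d) = @_;
    my $rotl = sub { (($_[0] << $_[1]) | ($_[0] >> (32 - $_[1]))) & 0xffffffff };
    $s->[$a] = ($s->[$a] + $s->[$b]) & 0xffffffff; $s->[$d] = $rotl->($s->[$d] ^ $s->[$a], 16);
    $s->[$c] = ($s->[$c] + $s->[$d]) & 0xffffffff; $s->[$b] = $rotl->($s->[$b] ^ $s->[$c], 12);
    $s->[$a] = ($s->[$a] + $s->[$b]) & 0xffffffff; $s->[$d] = $rotl->($s->[$d] ^ $s->[$a], 8);
    $s->[$c] = ($s->[$c] + $s->[$d]) & 0xffffffff; $s->[$b] = $rotl->($s->[$b] ^ $s->[$c], 7);
}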
-{{{
-my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
-
sub load() {
my $x0 = shift;
my $x1 = shift;
@@ -252,72 +273,75 @@ sub transpose() {
my $xd = shift;
$code.=<<___;
- zip1 $xt8.s,$xa.s,$xb.s
- zip2 $xt9.s,$xa.s,$xb.s
- zip1 $xt10.s,$xc.s,$xd.s
- zip2 $xt11.s,$xc.s,$xd.s
- zip1 $xa.d,$xt8.d,$xt10.d
- zip2 $xb.d,$xt8.d,$xt10.d
- zip1 $xc.d,$xt9.d,$xt11.d
- zip2 $xd.d,$xt9.d,$xt11.d
+ zip1 $xt0.s,$xa.s,$xb.s
+ zip2 $xt1.s,$xa.s,$xb.s
+ zip1 $xt2.s,$xc.s,$xd.s
+ zip2 $xt3.s,$xc.s,$xd.s
+ zip1 $xa.d,$xt0.d,$xt2.d
+ zip2 $xb.d,$xt0.d,$xt2.d
+ zip1 $xc.d,$xt1.d,$xt3.d
+ zip2 $xd.d,$xt1.d,$xt3.d
___
}
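# Annotation: the four zip1/zip2 pairs above transpose a 4x4 tile of
# 32-bit lanes in two stages, interleaving first at .s and then at .d
# granularity. A model for a single tile (sketch; the real code
# transposes every 128-bit granule of the SVE vector at once):
sub zip_transpose_model {
    my ($ra, $rb, $rc, $rd) = @_;                        # arrayrefs, 4 lanes each
    my @t0 = ($ra->[0], $rb->[0], $ra->[1], $rb->[1]);   # zip1 .s
    my @t1 = ($ra->[2], $rb->[2], $ra->[3], $rb->[3]);   # zip2 .s
    my @t2 = ($rc->[0], $rd->[0], $rc->[1], $rd->[1]);
    my @t3 = ($rc->[2], $rd->[2], $rc->[3], $rd->[3]);
    # zip1/zip2 .d interleave adjacent lane pairs as 64-bit elements
    @$ra = ($t0[0], $t0[1], $t2[0], $t2[1]);             # row 0: a0 b0 c0 d0
    @$rb = ($t0[2], $t0[3], $t2[2], $t2[3]);             # row 1: a1 b1 c1 d1
    @$rc = ($t1[0], $t1[1], $t3[0], $t3[1]);             # row 2: a2 b2 c2 d2
    @$rd = ($t1[2], $t1[3], $t3[2], $t3[3]);             # row 3: a3 b3 c3 d3
}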
-sub add_states() {
- my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
-
+sub SVE_ADD_STATES() {
$code.=<<___;
- ldp $tmpw0,$tmpw1,[$state]
- ldp $tmpw2,$tmpw3,[$state,#8]
- dup $xt0.s,$tmpw0
+ lsr $tmp1,@K[5],#32
+ dup $xt0.s,@KL[5]
dup $xt1.s,$tmpw1
- dup $xt2.s,$tmpw2
- dup $xt3.s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#16]
- ldp $tmpw2,$tmpw3,[$state,#24]
- add @mx[0].s,@mx[0].s,$xt0.s
- add @mx[1].s,@mx[1].s,$xt1.s
- add @mx[2].s,@mx[2].s,$xt2.s
- add @mx[3].s,@mx[3].s,$xt3.s
+ add @mx[0].s,@mx[0].s,$bak0.s
+ add @mx[1].s,@mx[1].s,$bak1.s
+ add @mx[2].s,@mx[2].s,$bak2.s
+ add @mx[3].s,@mx[3].s,$bak3.s
+ add @mx[4].s,@mx[4].s,$bak4.s
+ add @mx[5].s,@mx[5].s,$bak5.s
+ add @mx[6].s,@mx[6].s,$bak6.s
+ add @mx[7].s,@mx[7].s,$bak7.s
+ add @mx[8].s,@mx[8].s,$bak8.s
+ add @mx[9].s,@mx[9].s,$bak9.s
+ lsr $tmp0,@K[6],#32
dup $xt4.s,$tmpw0
- dup $xt5.s,$tmpw1
- dup $xt6.s,$tmpw2
- dup $xt7.s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#32]
- ldp $tmpw2,$tmpw3,[$state,#40]
- add @mx[4].s,@mx[4].s,$xt4.s
- add @mx[5].s,@mx[5].s,$xt5.s
- add @mx[6].s,@mx[6].s,$xt6.s
- add @mx[7].s,@mx[7].s,$xt7.s
- dup $xt0.s,$tmpw0
- dup $xt1.s,$tmpw1
- dup $xt2.s,$tmpw2
- dup $xt3.s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#48]
- ldp $tmpw2,$tmpw3,[$state,#56]
- add @mx[8].s,@mx[8].s,$xt0.s
- add @mx[9].s,@mx[9].s,$xt1.s
- add @mx[10].s,@mx[10].s,$xt2.s
- add @mx[11].s,@mx[11].s,$xt3.s
- dup $xt5.s,$tmpw1
- dup $xt6.s,$tmpw2
- dup $xt7.s,$tmpw3
+ lsr $tmp1,@K[7],#32
+ dup $xt5.s,@KL[7]
+ dup $xt6.s,$tmpw1
+ add @mx[10].s,@mx[10].s,$xt0.s
+ add @mx[11].s,@mx[11].s,$xt1.s
+ add @mx[12].s,@mx[12].s,$zctr.s
+ add @mx[13].s,@mx[13].s,$xt4.s
+ add @mx[14].s,@mx[14].s,$xt5.s
+ add @mx[15].s,@mx[15].s,$xt6.s
+___
+}
+
+sub SVE2_ADD_STATES() {
+$code.=<<___;
+ add @mx[0].s,@mx[0].s,$bak0.s
+ add @mx[1].s,@mx[1].s,$bak1.s
+ add @mx[2].s,@mx[2].s,$bak2.s
+ add @mx[3].s,@mx[3].s,$bak3.s
+ add @mx[4].s,@mx[4].s,$bak4.s
+ add @mx[5].s,@mx[5].s,$bak5.s
+ add @mx[6].s,@mx[6].s,$bak6.s
+ add @mx[7].s,@mx[7].s,$bak7.s
+ add @mx[8].s,@mx[8].s,$bak8.s
+ add @mx[9].s,@mx[9].s,$bak9.s
+ add @mx[10].s,@mx[10].s,$bak10.s
+ add @mx[11].s,@mx[11].s,$bak11.s
add @mx[12].s,@mx[12].s,$zctr.s
- add @mx[13].s,@mx[13].s,$xt5.s
- add @mx[14].s,@mx[14].s,$xt6.s
- add @mx[15].s,@mx[15].s,$xt7.s
+ add @mx[13].s,@mx[13].s,$bak13.s
+ add @mx[14].s,@mx[14].s,$bak14.s
+ add @mx[15].s,@mx[15].s,$bak15.s
___
}
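# Annotation: both ADD_STATES variants implement ChaCha's feed-forward,
# adding the saved input state back into the working state (with the
# per-lane counter vector standing in for word 12). Plain SVE keeps only
# bak0-bak9 live, so words 10, 11, 13, 14 and 15 are re-broadcast from
# @K first; SVE2 has all of them banked. Scalar model of the step
# (sketch):
sub feed_forward_model {
    my ($mx, $init) = @_;    # refs to 16 u32 words each
    $mx->[$_] = ($mx->[$_] + $init->[$_]) & 0xffffffff for 0 .. 15;
}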
sub SVE_TRANSFORMS() {
- &add_states();
&transpose($xa0,$xb0,$xc0,$xd0);
&transpose($xa1,$xb1,$xc1,$xd1);
&transpose($xa2,$xb2,$xc2,$xd2);
&transpose($xa3,$xb3,$xc3,$xd3);
- &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
&transpose($xa0,$xa1,$xa2,$xa3);
&transpose($xb0,$xb1,$xb2,$xb3);
+ &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
eor $xa0.d,$xa0.d,$xt0.d
eor $xa1.d,$xa1.d,$xt1.d
@@ -330,8 +354,8 @@ $code.=<<___;
___
&transpose($xc0,$xc1,$xc2,$xc3);
&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
- &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
&transpose($xd0,$xd1,$xd2,$xd3);
+ &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
eor $xc0.d,$xc0.d,$xt0.d
eor $xc1.d,$xc1.d,$xt1.d
@@ -348,73 +372,111 @@ $code.=<<___;
incw $zctr.s, ALL, MUL #1
___
}
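# Annotation: zctr holds per-lane block counters. "index" seeds it with
# [ctr, ctr+1, ..., ctr+veclen-1] and the incw above then steps every
# lane by the 32-bit lane count, ready for the next batch of blocks.
# Scalar model (sketch):
sub next_counter_batch_model {
    my ($ctr, $veclen) = @_;
    my @zctr = map { ($ctr + $_) & 0xffffffff } 0 .. $veclen - 1;
    return (\@zctr, ($ctr + $veclen) & 0xffffffff);  # lane counters, next seed
}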
-}}}
sub SVE_LOAD_STATES() {
- my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
-
$code.=<<___;
- // FIXME following code are not functionally necessary
- // but appear to enhance performance
-#if 1
- ptrues p2.s,ALL
- ptrues p2.s,ALL
- ptrues p2.s,ALL
- ptrues p2.s,ALL
- ptrues p2.s,ALL
- ptrues p2.s,ALL
-#endif
+ lsr $tmp0,@K[0],#32
+ dup @mx[0].s,@KL[0]
+ dup $bak0.s,@KL[0]
+ dup @mx[1].s,$tmpw0
+ dup $bak1.s,$tmpw0
+ lsr $tmp1,@K[1],#32
+ dup @mx[2].s,@KL[1]
+ dup $bak2.s,@KL[1]
+ dup @mx[3].s,$tmpw1
+ dup $bak3.s,$tmpw1
+ lsr $tmp0,@K[2],#32
+ dup @mx[4].s,@KL[2]
+ dup $bak4.s,@KL[2]
+ dup @mx[5].s,$tmpw0
+ dup $bak5.s,$tmpw0
+ lsr $tmp1,@K[3],#32
+ dup @mx[6].s,@KL[3]
+ dup $bak6.s,@KL[3]
+ dup @mx[7].s,$tmpw1
+ dup $bak7.s,$tmpw1
+ lsr $tmp0,@K[4],#32
+ dup @mx[8].s,@KL[4]
+ dup $bak8.s,@KL[4]
+ dup @mx[9].s,$tmpw0
+ dup $bak9.s,$tmpw0
+ lsr $tmp1,@K[5],#32
+ dup @mx[10].s,@KL[5]
+ dup @mx[11].s,$tmpw1
+ orr @mx[12].d,$zctr.d,$zctr.d
+ lsr $tmp0,@K[6],#32
+ dup @mx[13].s,$tmpw0
+ lsr $tmp1,@K[7],#32
+ dup @mx[14].s,@KL[7]
+ dup @mx[15].s,$tmpw1
___
+}
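# Annotation: each 64-bit @K register holds two consecutive 32-bit state
# words, so one lsr #32 plus two dups broadcasts a whole pair: the low
# word via its w-register alias (@KL) and the high word via the shifted
# copy. Scalar model of the unpacking (sketch):
sub split_k_model {
    my ($k64) = @_;
    my $lo = $k64 & 0xffffffff;           # dup z, @KL[i]  (w alias)
    my $hi = ($k64 >> 32) & 0xffffffff;   # lsr tmp,@K[i],#32; dup z,tmpw
    return ($lo, $hi);
}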
+
+sub SVE2_LOAD_STATES() {
$code.=<<___;
- ldp $tmpw0,$tmpw1,[$state]
- ldp $tmpw2,$tmpw3,[$state,#8]
- dup @mx[0].s,$tmpw0
- dup @mx[1].s,$tmpw1
- dup @mx[2].s,$tmpw2
- dup @mx[3].s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#16]
- ldp $tmpw2,$tmpw3,[$state,#24]
- dup @mx[4].s,$tmpw0
- dup @mx[5].s,$tmpw1
- dup @mx[6].s,$tmpw2
- dup @mx[7].s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state,#32]
- ldp $tmpw2,$tmpw3,[$state,#40]
- dup @mx[8].s,$tmpw0
- dup @mx[9].s,$tmpw1
- dup @mx[10].s,$tmpw2
- dup @mx[11].s,$tmpw3
- ldp $tmpw0,$tmpw1,[$state, #48]
- ldp $tmpw2,$tmpw3,[$state,#56]
- mov @mx[12].s,p0/m,$zctr.s
- dup @mx[13].s,$tmpw1
- dup @mx[14].s,$tmpw2
- dup @mx[15].s,$tmpw3
+ lsr $tmp0,@K[0],#32
+ dup @mx[0].s,@KL[0]
+ dup $bak0.s,@KL[0]
+ dup @mx[1].s,$tmpw0
+ dup $bak1.s,$tmpw0
+ lsr $tmp1,@K[1],#32
+ dup @mx[2].s,@KL[1]
+ dup $bak2.s,@KL[1]
+ dup @mx[3].s,$tmpw1
+ dup $bak3.s,$tmpw1
+ lsr $tmp0,@K[2],#32
+ dup @mx[4].s,@KL[2]
+ dup $bak4.s,@KL[2]
+ dup @mx[5].s,$tmpw0
+ dup $bak5.s,$tmpw0
+ lsr $tmp1,@K[3],#32
+ dup @mx[6].s,@KL[3]
+ dup $bak6.s,@KL[3]
+ dup @mx[7].s,$tmpw1
+ dup $bak7.s,$tmpw1
+ lsr $tmp0,@K[4],#32
+ dup @mx[8].s,@KL[4]
+ dup $bak8.s,@KL[4]
+ dup @mx[9].s,$tmpw0
+ dup $bak9.s,$tmpw0
+ lsr $tmp1,@K[5],#32
+ dup @mx[10].s,@KL[5]
+ dup $bak10.s,@KL[5]
+ dup @mx[11].s,$tmpw1
+ dup $bak11.s,$tmpw1
+ orr @mx[12].d,$zctr.d,$zctr.d
+ lsr $tmp0,@K[6],#32
+ dup @mx[13].s,$tmpw0
+ dup $bak13.s,$tmpw0
+ lsr $tmp1,@K[7],#32
+ dup @mx[14].s,@KL[7]
+ dup $bak14.s,@KL[7]
+ dup @mx[15].s,$tmpw1
+ dup $bak15.s,$tmpw1
___
}
sub sve_handle_blocks() {
- my ($counter) = ("x10");
-
- &SVE_LOAD_STATES();
$code.=<<___;
- mov $counter,#10
-.align 5
-1:
+ cbz $sve2flag,.sve_inner
___
-
+ &SVE2_LOAD_STATES();
+ &SVE2_INNER_BLOCK();
+ &SVE2_ADD_STATES();
+$code.=<<___;
+ b .fini_inner
+.sve_inner:
+___
+ &SVE_LOAD_STATES();
&SVE_INNER_BLOCK();
+ &SVE_ADD_STATES();
$code.=<<___;
- subs $counter,$counter,1
- b.ne 1b
+.fini_inner:
___
&SVE_TRANSFORMS();
}
sub chacha20_process() {
- my ($counter) = ("x10");
- my ($tmpw) = ("w11");
-
$code.=<<___;
.align 5
.Loop:
@@ -430,27 +492,18 @@ ___
}
{{{
-my ($tmp,$tmpw) = ("x10", "w10");
-my ($tmpw0,$tmpw1) = ("w11", "w12");
-my ($ptr) = ("x13");
-
$code.=<<___;
#include "arm_arch.h"
.arch armv8-a
-#if 0
.extern OPENSSL_armcap_P
.hidden OPENSSL_armcap_P
-#endif
.text
.align 5
.Lchacha20_consts:
- .word 0x61707865
- .word 0x3320646e
- .word 0x79622d32
- .word 0x6b206574
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lrot8:
.word 0x02010003,0x04040404,0x02010003,0x04040404
.globl ChaCha20_ctr32_sve
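# Annotation: .Lchacha20_consts just above packs the four
# "expand 32-byte k" words as two little-endian quads, and .Lrot8 feeds
# "index" with base 0x02010003 / step 0x04040404, producing per-lane tbl
# byte indices (3,0,1,2, 7,4,5,6, ...) that rotate every 32-bit lane
# left by 8. A per-lane check (sketch):
sub rot8_via_tbl_model {
    my ($w) = @_;
    my @b   = map { ($w >> (8 * $_)) & 0xff } 0 .. 3;   # LE source bytes
    my @idx = (3, 0, 1, 2);                             # from 0x02010003
    my $out = 0;
    $out |= $b[ $idx[$_] ] << (8 * $_) for 0 .. 3;
    return $out;    # == ((w << 8) | (w >> 24)) & 0xffffffff
}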
@@ -458,49 +511,55 @@ $code.=<<___;
.align 5
ChaCha20_ctr32_sve:
AARCH64_VALID_CALL_TARGET
- mov $tmp, #64
- whilelo p0.s,xzr,$tmp
- cntp $veclen,p0,p0.s
- // run Neon if we only have 128-bit SVE
- // in the future, we need to check SVE2
- cmp $veclen,4
- b.le .Lreturn
+ cntw $veclen, ALL, MUL #1
lsr $blocks,$len,#6
cmp $blocks,$veclen
b.lt .Lreturn
- stp d8,d9,[sp,-48]!
- stp d10,d11,[sp,16]
- stp d12,d13,[sp,32]
- sub sp,sp,#64
- adr $tmp,.Lchacha20_consts
- ld1 {v0.4s},[$tmp]
- adr $tmp,.Lrot8
- ldp $tmpw0,$tmpw1,[$tmp]
- ld1 {v1.4s,v2.4s},[$key]
- ld1 {v3.4s},[$ctr]
- ldr $wctr,[$ctr]
- index $zctr.s,$wctr,1
- index $rot8.s,$tmpw0,$tmpw1
- st1 {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
- mov $state,sp
-#if 0
- // SVE2 code not enabled until we have hardware
- // for verification
mov $sve2flag,0
adrp $tmp,OPENSSL_armcap_P
ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
tst $tmpw,#ARMV8_SVE2
b.eq 1f
mov $sve2flag,1
+ b 2f
1:
+ cmp $veclen,4
+ b.le .Lreturn
+ adr $tmp,.Lrot8
+ ldp $tmpw0,$tmpw1,[$tmp]
+ index $rot8.s,$tmpw0,$tmpw1
+2:
+ stp d8,d9,[sp,-96]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ stp x19,x20,[sp,64]
+ stp x21,x22,[sp,80]
+ adr $tmp,.Lchacha20_consts
+ ldp @K[0],@K[1],[$tmp]
+ ldp @K[2],@K[3],[$key]
+ ldp @K[4],@K[5],[$key, 16]
+ ldp @K[6],@K[7],[$ctr]
+ ldr $wctr,[$ctr]
+ index $zctr.s,$wctr,1
+ ptrues p0.s,ALL
+#ifdef __AARCH64EB__
+ ror @K[2],@K[2],#32
+ ror @K[3],@K[3],#32
+ ror @K[4],@K[4],#32
+ ror @K[5],@K[5],#32
+ ror @K[6],@K[6],#32
+ ror @K[7],@K[7],#32
#endif
___
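# Annotation: on big-endian targets the ldp above lands each 32-bit word
# pair in the opposite halves of its 64-bit register, and ror #32 swaps
# the halves back so the @KL/lsr unpacking stays uniform across
# endiannesses. Model of the rotation itself (sketch):
sub ror64_32_model {
    my ($x) = @_;
    return (($x >> 32) | (($x & 0xffffffff) << 32));
}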
&chacha20_process();
$code.=<<___;
- add sp,sp,#64
ldp d10,d11,[sp,16]
ldp d12,d13,[sp,32]
- ldp d8,d9,[sp],48
+ ldp d14,d15,[sp,48]
+ ldp x19,x20,[sp,64]
+ ldp x21,x22,[sp,80]
+ ldp d8,d9,[sp],96
str $wctr,[$ctr]
and $len,$len,#63
add $len,$len,$blocks,lsl #6
@@ -514,6 +573,7 @@ ___
########################################
{
my %opcode_unpred = (
+ "movprfx" => 0x0420BC00,
"eor" => 0x04a03000,
"add" => 0x04200000,
"orr" => 0x04603000,
@@ -528,6 +588,7 @@ my %opcode_unpred = (
"index" => 0x04204C00,
"mov" => 0x05203800,
"dup" => 0x05203800,
+ "cntw" => 0x04A0E000,
"tbl" => 0x05203000);
my %opcode_imm_unpred = (
@@ -564,6 +625,7 @@ my %opcode_pred = (
"st4w" => 0xE570E000,
"st1w" => 0xE500E000,
"ld1w" => 0xA540A000,
+ "ld1rw" => 0x8540C000,
"revh" => 0x05258000);
my %tsize = (
@@ -740,6 +802,10 @@ sub sve_pred {
if ($addr =~ m/x([0-9]+)\s*/o) {
$xn = $1;
}
+
+ if ($mnemonic =~m/ld1r[bhwd]/o) {
+ $size = 0;
+ }
if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
@@ -810,8 +876,14 @@ sub sve_other {
} elsif ($arg =~ m/x([0-9]+)/o) {
return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
}
+ } elsif ($mnemonic =~ /cnt[bhdw]/) {
+ if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+ return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
+ }
} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
+ } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
+ return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
}
sprintf "%s // fail to parse", $inst;
}
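# Annotation: worked example for the new cnt[bhdw] branch above
# (assuming %pattern maps ALL to 0b11111, the architectural SVE pattern
# value): Rd sits in bits 0-4, the pattern in bits 5-9, and MUL-1 in
# bits 16-19.
#   cntw x5, ALL, MUL #1
my $cntw_x5 = 0x04A0E000 | 5 | (31 << 5) | ((1 - 1) << 16);
printf "cntw x5, ALL, MUL #1 -> .inst 0x%08x\n", $cntw_x5;    # 0x04a0e3e5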
@@ -834,9 +906,10 @@ foreach(split("\n",$code)) {
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
+ s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
- s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
+ s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
print $_,"\n";
}