author     Niels Möller <nisse@lysator.liu.se>  2020-12-12 16:46:51 +0100
committer  Niels Möller <nisse@lysator.liu.se>  2020-12-12 16:46:51 +0100
commit     539efc9b997dfba757cacbbc512bc0a216ffe743 (patch)
tree       a2e5e824e21da8558200a1694b7d949f6dc3cb8c
parent     9bdadc1317ffe0bf0ac5bfa7842275c6386ae7f6 (diff)
download   nettle-539efc9b997dfba757cacbbc512bc0a216ffe743.tar.gz
ppc: More interleaving of chacha_4core.
-rw-r--r--  ChangeLog                       5
-rw-r--r--  powerpc64/p7/chacha-4core.asm  32
2 files changed, 21 insertions, 16 deletions
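
For context on the patch below: the QR macro applies the ChaCha quarter-round to four independent (a, b, c, d) register sets at once. A minimal C sketch of a single quarter-round (an illustration only, not the Nettle sources; the rotl32 and qr helper names are made up here) shows why one quarter-round is latency-bound: every operation consumes the result of the operation just before it.

#include <stdint.h>

/* Illustrative 32-bit left rotate, mirroring what vrlw does per lane. */
static inline uint32_t rotl32(uint32_t x, unsigned n)
{
  return (x << n) | (x >> (32 - n));
}

/* One ChaCha quarter-round: a strict chain of add, xor, rotate steps,
   corresponding to the vadduwm/vxor/vrlw triples in the QR macro. */
static void qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
  *a += *b;  *d ^= *a;  *d = rotl32(*d, 16);
  *c += *d;  *b ^= *c;  *b = rotl32(*b, 12);
  *a += *b;  *d ^= *a;  *d = rotl32(*d, 8);
  *c += *d;  *b ^= *c;  *b = rotl32(*b, 7);
}
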
diff --git a/ChangeLog b/ChangeLog
index 1f2e2d40..21eecdea 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2020-12-12 Niels Möller <nisse@lysator.liu.se>
+
+ * powerpc64/p7/chacha-4core.asm: More interleaving of independent
+ instructions, gives slight speedup on Power9.
+
 2020-12-01 Niels Möller <nisse@lysator.liu.se>

 * powerpc64/p7/chacha-4core.asm: Use protected zone below stack
diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm
index b2330247..ed1445dd 100644
--- a/powerpc64/p7/chacha-4core.asm
+++ b/powerpc64/p7/chacha-4core.asm
@@ -57,53 +57,53 @@ C Main loop for round
define(`QR',`
vadduwm $1, $1, $2
vadduwm $5, $5, $6
- vxor $4, $4, $1
- vxor $8, $8, $5
- vrlw $4, $4, ROT16
- vrlw $8, $8, ROT16
vadduwm $9, $9, $10
vadduwm $13, $13, $14
+ vxor $4, $4, $1
+ vxor $8, $8, $5
vxor $12, $12, $9
vxor $16, $16, $13
+ vrlw $4, $4, ROT16
+ vrlw $8, $8, ROT16
vrlw $12, $12, ROT16
vrlw $16, $16, ROT16
vadduwm $3, $3, $4
vadduwm $7, $7, $8
- vxor $2, $2, $3
- vxor $6, $6, $7
- vrlw $2, $2, ROT12
- vrlw $6, $6, ROT12
vadduwm $11, $11, $12
vadduwm $15, $15, $16
+ vxor $2, $2, $3
+ vxor $6, $6, $7
vxor $10, $10, $11
vxor $14, $14, $15
+ vrlw $2, $2, ROT12
+ vrlw $6, $6, ROT12
vrlw $10, $10, ROT12
vrlw $14, $14, ROT12
vadduwm $1, $1, $2
vadduwm $5, $5, $6
- vxor $4, $4, $1
- vxor $8, $8, $5
- vrlw $4, $4, ROT8
- vrlw $8, $8, ROT8
vadduwm $9, $9, $10
vadduwm $13, $13, $14
+ vxor $4, $4, $1
+ vxor $8, $8, $5
vxor $12, $12, $9
vxor $16, $16, $13
+ vrlw $4, $4, ROT8
+ vrlw $8, $8, ROT8
vrlw $12, $12, ROT8
vrlw $16, $16, ROT8
vadduwm $3, $3, $4
vadduwm $7, $7, $8
- vxor $2, $2, $3
- vxor $6, $6, $7
- vrlw $2, $2, ROT7
- vrlw $6, $6, ROT7
vadduwm $11, $11, $12
vadduwm $15, $15, $16
+ vxor $2, $2, $3
+ vxor $6, $6, $7
vxor $10, $10, $11
vxor $14, $14, $15
+ vrlw $2, $2, ROT7
+ vrlw $6, $6, ROT7
vrlw $10, $10, ROT7
vrlw $14, $14, ROT7
')
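
The reordering in this hunk groups the same operation across all four independent register sets (all vadduwm, then all vxor, then all vrlw) instead of finishing one pair of sets before starting the next. Each instruction then reads values produced several instructions earlier, not by the instruction immediately before it, which lets the Power9 vector pipelines overlap the dependency chains and gives the slight speedup noted in the ChangeLog. A hedged C analogue of that scheduling, under the assumption of four side-by-side quarter-round states (the qr4_add_xor_rot16 name is made up for the sketch):

#include <stdint.h>

/* First quarter-round step for four independent states, interleaved:
   all adds, then all xors, then all 16-bit rotates, so no instruction
   depends on its immediate predecessor. */
static void qr4_add_xor_rot16(uint32_t a[4], const uint32_t b[4], uint32_t d[4])
{
  int i;
  for (i = 0; i < 4; i++) a[i] += b[i];                       /* four independent adds */
  for (i = 0; i < 4; i++) d[i] ^= a[i];                       /* four independent xors */
  for (i = 0; i < 4; i++) d[i] = (d[i] << 16) | (d[i] >> 16); /* rotate each lane by 16 */
}
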