summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Niels Möller <nisse@lysator.liu.se> 2020-06-25 23:26:56 +0200
committer: Niels Möller <nisse@lysator.liu.se> 2020-06-25 23:26:56 +0200
commit: 8c4ef180abef60d65d94382055e2fa9c558202a9 (patch)
tree: e107b496be332b36907bee0b0053bfa8c3e96a03
parent: db9b66e047a47d5070d99de8a6ccb6ddd79efce6 (diff)
download: nettle-arm-salsa20-chacha-vsra.tar.gz
arm: Micro optimize neon implementation of salsa20 and chacha [arm-salsa20-chacha-vsra]
-rw-r--r-- ChangeLog 5
-rw-r--r-- arm/neon/chacha-core-internal.asm 28
-rw-r--r-- arm/neon/salsa20-core-internal.asm 12
3 files changed, 21 insertions, 24 deletions
diff --git a/ChangeLog b/ChangeLog
index c3d1b4fa..ef037fc0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2020-06-25 Niels Möller <nisse@lysator.liu.se>
+ * arm/neon/chacha-core-internal.asm (QROUND): Micro optimize
+ rotations, using the vsra.u32 instruction. Gives 10% speedup,
+ benchmarked on Cortex-A5. Suggested by Torbjörn Granlund.
+ * arm/neon/salsa20-core-internal.asm (QROUND): Likewise.
+
* x86_64/chacha-core-internal.asm (QROUND): Fix use of macro
arguments. Spotted by Torbjörn Granlund.
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index 22f843e8..799376f2 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -54,28 +54,24 @@ define(<QROUND>, <
C x2 += x3, x1 ^= x2, x1 lrot 7
vadd.i32 $1, $1, $2
- veor $4, $4, $1
- vshl.i32 T0, $4, #16
- vshr.u32 $4, $4, #16
- veor $4, $4, T0
+ veor T0, $4, $1
+ vshl.i32 $4, T0, #16
+ vsra.u32 $4, T0, #16
vadd.i32 $3, $3, $4
- veor $2, $2, $3
- vshl.i32 T0, $2, #12
- vshr.u32 $2, $2, #20
- veor $2, $2, T0
+ veor T0, $2, $3
+ vshl.i32 $2, T0, #12
+ vsra.u32 $2, T0, #20
vadd.i32 $1, $1, $2
- veor $4, $4, $1
- vshl.i32 T0, $4, #8
- vshr.u32 $4, $4, #24
- veor $4, $4, T0
+ veor T0, $4, $1
+ vshl.i32 $4, T0, #8
+ vsra.u32 $4, T0, #24
vadd.i32 $3, $3, $4
- veor $2, $2, $3
- vshl.i32 T0, $2, #7
- vshr.u32 $2, $2, #25
- veor $2, $2, T0
+ veor T0, $2, $3
+ vshl.i32 $2, T0, #7
+ vsra.u32 $2, T0, #25
>)
.text
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index 20710499..590c3bb1 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -53,26 +53,22 @@ define(<S3>, <q15>)
define(<QROUND>, <
vadd.i32 T0, $1, $4
vshl.i32 T1, T0, #7
- vshr.u32 T0, T0, #25
- veor $2, $2, T0
+ vsra.u32 T1, T0, #25
veor $2, $2, T1
vadd.i32 T0, $1, $2
vshl.i32 T1, T0, #9
- vshr.u32 T0, T0, #23
- veor $3, $3, T0
+ vsra.u32 T1, T0, #23
veor $3, $3, T1
vadd.i32 T0, $2, $3
vshl.i32 T1, T0, #13
- vshr.u32 T0, T0, #19
- veor $4, $4, T0
+ vsra.u32 T1, T0, #19
veor $4, $4, T1
vadd.i32 T0, $3, $4
vshl.i32 T1, T0, #18
- vshr.u32 T0, T0, #14
- veor $1, $1, T0
+ vsra.u32 T1, T0, #14
veor $1, $1, T1
>)