diff options
author | Niels Möller <nisse@lysator.liu.se> | 2020-06-25 23:26:56 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2020-06-25 23:26:56 +0200 |
commit | 8c4ef180abef60d65d94382055e2fa9c558202a9 (patch) | |
tree | e107b496be332b36907bee0b0053bfa8c3e96a03 | |
parent | db9b66e047a47d5070d99de8a6ccb6ddd79efce6 (diff) | |
download | nettle-arm-salsa20-chacha-vsra.tar.gz |
arm: Micro optimize neon implementation of salsa20 and chacha [ref: arm-salsa20-chacha-vsra]
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | arm/neon/chacha-core-internal.asm | 28 | ||||
-rw-r--r-- | arm/neon/salsa20-core-internal.asm | 12 |
3 files changed, 21 insertions, 24 deletions
@@ -1,5 +1,10 @@
 2020-06-25 Niels Möller <nisse@lysator.liu.se>
 
+	* arm/neon/chacha-core-internal.asm (QROUND): Micro optimize
+	rotations, using the vsra.u32 instruction. Gives 10% speedup,
+	benchmarked on Cortex-A5. Suggested by Torbjörn Granlund.
+	* arm/neon/salsa20-core-internal.asm (QROUND): Likewise.
+
 	* x86_64/chacha-core-internal.asm (QROUND): Fix use of macro
 	arguments. Spotted by Torbjörn Granlund.
 
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index 22f843e8..799376f2 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -54,28 +54,24 @@ define(<QROUND>, <
 	C x2 += x3, x1 ^= x2, x1 lrot 7
 
 	vadd.i32 $1, $1, $2
-	veor $4, $4, $1
-	vshl.i32 T0, $4, #16
-	vshr.u32 $4, $4, #16
-	veor $4, $4, T0
+	veor T0, $4, $1
+	vshl.i32 $4, T0, #16
+	vsra.u32 $4, T0, #16
 
 	vadd.i32 $3, $3, $4
-	veor $2, $2, $3
-	vshl.i32 T0, $2, #12
-	vshr.u32 $2, $2, #20
-	veor $2, $2, T0
+	veor T0, $2, $3
+	vshl.i32 $2, T0, #12
+	vsra.u32 $2, T0, #20
 
 	vadd.i32 $1, $1, $2
-	veor $4, $4, $1
-	vshl.i32 T0, $4, #8
-	vshr.u32 $4, $4, #24
-	veor $4, $4, T0
+	veor T0, $4, $1
+	vshl.i32 $4, T0, #8
+	vsra.u32 $4, T0, #24
 
 	vadd.i32 $3, $3, $4
-	veor $2, $2, $3
-	vshl.i32 T0, $2, #7
-	vshr.u32 $2, $2, #25
-	veor $2, $2, T0
+	veor T0, $2, $3
+	vshl.i32 $2, T0, #7
+	vsra.u32 $2, T0, #25
 >)
 
 	.text
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index 20710499..590c3bb1 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -53,26 +53,22 @@ define(<S3>, <q15>)
 define(<QROUND>, <
 	vadd.i32 T0, $1, $4
 	vshl.i32 T1, T0, #7
-	vshr.u32 T0, T0, #25
-	veor $2, $2, T0
+	vsra.u32 T1, T0, #25
 	veor $2, $2, T1
 
 	vadd.i32 T0, $1, $2
 	vshl.i32 T1, T0, #9
-	vshr.u32 T0, T0, #23
-	veor $3, $3, T0
+	vsra.u32 T1, T0, #23
 	veor $3, $3, T1
 
 	vadd.i32 T0, $2, $3
 	vshl.i32 T1, T0, #13
-	vshr.u32 T0, T0, #19
-	veor $4, $4, T0
+	vsra.u32 T1, T0, #19
 	veor $4, $4, T1
 
 	vadd.i32 T0, $3, $4
 	vshl.i32 T1, T0, #18
-	vshr.u32 T0, T0, #14
-	veor $1, $1, T0
+	vsra.u32 T1, T0, #14
 	veor $1, $1, T1
 >)
 