summary | refs | log | tree | commit | diff
path: root/cipher/keccak_permute_32.h
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2015-11-18 09:44:18 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2015-11-18 09:44:18 +0200
commit: 6571a64331839d7d952292163afbf34c8bef62e0 (patch)
tree: dd3931f6151152724461e571493fced8cc06cd1c /cipher/keccak_permute_32.h
parent: 15ea0acf8bb0aa307eccc23024a0bd7878fb8080 (diff)
download: libgcrypt-6571a64331839d7d952292163afbf34c8bef62e0.tar.gz
Tweak Keccak for small speed-up
* cipher/keccak_permute_32.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Track
rounds with round constant pointer instead of separate round counter.
* cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Ditto.
(KECCAK_F1600_ABSORB_FUNC_NAME): Tweak lanes pointer increment for bulk
absorb loops.
--

Patch makes small tweaks to improve performance.

Benchmark on Intel Haswell @ 3.2 GHz:

Before:
          |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128 |      2.27 ns/B     420.5 MiB/s      7.26 c/B
 SHAKE256 |      2.79 ns/B     341.4 MiB/s      8.94 c/B
 SHA3-224 |      2.64 ns/B     361.7 MiB/s      8.44 c/B
 SHA3-256 |      2.79 ns/B     341.4 MiB/s      8.94 c/B
 SHA3-384 |      3.65 ns/B     261.3 MiB/s     11.68 c/B
 SHA3-512 |      5.27 ns/B     181.0 MiB/s     16.86 c/B

After:
          |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128 |      2.25 ns/B     423.5 MiB/s      7.21 c/B
 SHAKE256 |      2.77 ns/B     343.9 MiB/s      8.88 c/B
 SHA3-224 |      2.62 ns/B     364.1 MiB/s      8.38 c/B
 SHA3-256 |      2.77 ns/B     343.8 MiB/s      8.88 c/B
 SHA3-384 |      3.63 ns/B     262.6 MiB/s     11.63 c/B
 SHA3-512 |      5.23 ns/B     182.3 MiB/s     16.75 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak_permute_32.h')
-rw-r--r-- cipher/keccak_permute_32.h | 13
1 files changed, 7 insertions, 6 deletions
diff --git a/cipher/keccak_permute_32.h b/cipher/keccak_permute_32.h
index fed93831..1ce42a42 100644
--- a/cipher/keccak_permute_32.h
+++ b/cipher/keccak_permute_32.h
@@ -27,6 +27,7 @@ static unsigned int
KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
{
const u32 *round_consts = round_consts_32bit;
+ const u32 *round_consts_end = round_consts_32bit + 2 * 24;
u32 Aba0, Abe0, Abi0, Abo0, Abu0;
u32 Aba1, Abe1, Abi1, Abo1, Abu1;
u32 Aga0, Age0, Agi0, Ago0, Agu0;
@@ -52,7 +53,6 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
u32 Esa0, Ese0, Esi0, Eso0, Esu0;
u32 Esa1, Ese1, Esi1, Eso1, Esu1;
u32 *state = hd->u.state32bi;
- unsigned int round;
Aba0 = state[0];
Aba1 = state[1];
@@ -105,7 +105,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
Asu0 = state[48];
Asu1 = state[49];
- for (round = 0; round < 24; round += 2)
+ do
{
/* prepareTheta */
BCa0 = Aba0 ^ Aga0 ^ Aka0 ^ Ama0 ^ Asa0;
@@ -142,7 +142,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
Asu0 ^= Du0;
BCu0 = ROL32(Asu0, 7);
Eba0 = BCa0 ^ ANDN32(BCe0, BCi0);
- Eba0 ^= round_consts[round * 2 + 0];
+ Eba0 ^= *(round_consts++);
Ebe0 = BCe0 ^ ANDN32(BCi0, BCo0);
Ebi0 = BCi0 ^ ANDN32(BCo0, BCu0);
Ebo0 = BCo0 ^ ANDN32(BCu0, BCa0);
@@ -159,7 +159,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
Asu1 ^= Du1;
BCu1 = ROL32(Asu1, 7);
Eba1 = BCa1 ^ ANDN32(BCe1, BCi1);
- Eba1 ^= round_consts[round * 2 + 1];
+ Eba1 ^= *(round_consts++);
Ebe1 = BCe1 ^ ANDN32(BCi1, BCo1);
Ebi1 = BCi1 ^ ANDN32(BCo1, BCu1);
Ebo1 = BCo1 ^ ANDN32(BCu1, BCa1);
@@ -328,7 +328,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
Esu0 ^= Du0;
BCu0 = ROL32(Esu0, 7);
Aba0 = BCa0 ^ ANDN32(BCe0, BCi0);
- Aba0 ^= round_consts[round * 2 + 2];
+ Aba0 ^= *(round_consts++);
Abe0 = BCe0 ^ ANDN32(BCi0, BCo0);
Abi0 = BCi0 ^ ANDN32(BCo0, BCu0);
Abo0 = BCo0 ^ ANDN32(BCu0, BCa0);
@@ -345,7 +345,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
Esu1 ^= Du1;
BCu1 = ROL32(Esu1, 7);
Aba1 = BCa1 ^ ANDN32(BCe1, BCi1);
- Aba1 ^= round_consts[round * 2 + 3];
+ Aba1 ^= *(round_consts++);
Abe1 = BCe1 ^ ANDN32(BCi1, BCo1);
Abi1 = BCi1 ^ ANDN32(BCo1, BCu1);
Abo1 = BCo1 ^ ANDN32(BCu1, BCa1);
@@ -479,6 +479,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
Aso1 = BCo1 ^ ANDN32(BCu1, BCa1);
Asu1 = BCu1 ^ ANDN32(BCa1, BCe1);
}
+ while (round_consts < round_consts_end);
state[0] = Aba0;
state[1] = Aba1;