summary | refs | log | tree | commit | diff
path: root/chacha_avx.cpp
diff options
context:
space:
mode:
author: Jeffrey Walton <noloader@gmail.com> 2021-09-20 12:31:32 -0400
committer: Jeffrey Walton <noloader@gmail.com> 2021-09-20 12:31:32 -0400
commit: 20962baf4440df538a7438e80e45e808a76fb04e (patch)
tree: 1d85f60668a131a40bfcb27dd61373e75a818656 /chacha_avx.cpp
parent: e154280d310c137148d321828a2b98c0c1e4158f (diff)
download: cryptopp-git-20962baf4440df538a7438e80e45e808a76fb04e.tar.gz
Fix ChaCha AVX2 implementation (GH #1069)
Many thanks to Jack Lloyd
Diffstat (limited to 'chacha_avx.cpp')
-rw-r--r-- chacha_avx.cpp 28
1 file changed, 13 insertions, 15 deletions
diff --git a/chacha_avx.cpp b/chacha_avx.cpp
index 23f10511..8bd8ddc2 100644
--- a/chacha_avx.cpp
+++ b/chacha_avx.cpp
@@ -20,9 +20,6 @@
#include "chacha.h"
#include "misc.h"
-// https://github.com/weidai11/cryptopp/issues/1069
-#undef CRYPTOPP_AVX2_AVAILABLE
-
#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
@@ -103,30 +100,31 @@ void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *
const __m256i state3 = _mm256_broadcastsi128_si256(
_mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));
- const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
- const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
- const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
- const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);
+ const word32 C = 0xFFFFFFFFu - state[12];
+ const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4);
+ const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);
+ const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);
+ const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);
__m256i X0_0 = state0;
__m256i X0_1 = state1;
__m256i X0_2 = state2;
- __m256i X0_3 = _mm256_add_epi64(state3, CTR0);
+ __m256i X0_3 = _mm256_add_epi32(state3, CTR0);
__m256i X1_0 = state0;
__m256i X1_1 = state1;
__m256i X1_2 = state2;
- __m256i X1_3 = _mm256_add_epi64(state3, CTR1);
+ __m256i X1_3 = _mm256_add_epi32(state3, CTR1);
__m256i X2_0 = state0;
__m256i X2_1 = state1;
__m256i X2_2 = state2;
- __m256i X2_3 = _mm256_add_epi64(state3, CTR2);
+ __m256i X2_3 = _mm256_add_epi32(state3, CTR2);
__m256i X3_0 = state0;
__m256i X3_1 = state1;
__m256i X3_2 = state2;
- __m256i X3_3 = _mm256_add_epi64(state3, CTR3);
+ __m256i X3_3 = _mm256_add_epi32(state3, CTR3);
for (int i = static_cast<int>(rounds); i > 0; i -= 2)
{
@@ -287,25 +285,25 @@ void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *
X0_1 = _mm256_add_epi32(X0_1, state1);
X0_2 = _mm256_add_epi32(X0_2, state2);
X0_3 = _mm256_add_epi32(X0_3, state3);
- X0_3 = _mm256_add_epi64(X0_3, CTR0);
+ X0_3 = _mm256_add_epi32(X0_3, CTR0);
X1_0 = _mm256_add_epi32(X1_0, state0);
X1_1 = _mm256_add_epi32(X1_1, state1);
X1_2 = _mm256_add_epi32(X1_2, state2);
X1_3 = _mm256_add_epi32(X1_3, state3);
- X1_3 = _mm256_add_epi64(X1_3, CTR1);
+ X1_3 = _mm256_add_epi32(X1_3, CTR1);
X2_0 = _mm256_add_epi32(X2_0, state0);
X2_1 = _mm256_add_epi32(X2_1, state1);
X2_2 = _mm256_add_epi32(X2_2, state2);
X2_3 = _mm256_add_epi32(X2_3, state3);
- X2_3 = _mm256_add_epi64(X2_3, CTR2);
+ X2_3 = _mm256_add_epi32(X2_3, CTR2);
X3_0 = _mm256_add_epi32(X3_0, state0);
X3_1 = _mm256_add_epi32(X3_1, state1);
X3_2 = _mm256_add_epi32(X3_2, state2);
X3_3 = _mm256_add_epi32(X3_3, state3);
- X3_3 = _mm256_add_epi64(X3_3, CTR3);
+ X3_3 = _mm256_add_epi32(X3_3, CTR3);
if (input)
{