author    Jeffrey Walton <noloader@gmail.com>    2019-06-09 04:29:40 -0400
committer Jeffrey Walton <noloader@gmail.com>    2019-06-09 04:29:40 -0400
commit    955ac6fe2419b8956adb7402234580dc5e954d49 (patch)
tree      932912c332bbea5313ace3067c7c7f862ea6d1ce /chacha_simd.cpp
parent    8c78985de2362fd9387ce8a602d6f3a16982c2a5 (diff)
download  cryptopp-git-955ac6fe2419b8956adb7402234580dc5e954d49.tar.gz
Rework SSE2 and AVX2 loads and stores
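
The commit drops the precomputed __m128i* pointers (state_mm, input_mm, output_mm) and instead casts the original word32* and byte* arguments at each load and store, indexing by element offsets (4 words or 16 bytes per register). Below is a minimal standalone sketch of the reworked pattern, not part of the commit; the function name xor_block_sse2, the typedefs, and the 64-byte buffer assumption are illustrative only.

    // Illustrative sketch of the load/store pattern after the rework (not from the commit).
    // Assumes a 16-byte aligned word32 state[16] and input/output buffers of at least 64 bytes.
    #include <emmintrin.h>   // SSE2 intrinsics
    #include <stdint.h>

    typedef uint32_t word32;
    typedef unsigned char byte;

    void xor_block_sse2(const word32* state, const byte* input, byte* output)
    {
        // Aligned loads of the state: cast at the call site, stepping by 4 words (16 bytes).
        const __m128i s0 = _mm_load_si128(reinterpret_cast<const __m128i*>(state + 0*4));
        const __m128i s1 = _mm_load_si128(reinterpret_cast<const __m128i*>(state + 1*4));
        const __m128i s2 = _mm_load_si128(reinterpret_cast<const __m128i*>(state + 2*4));
        const __m128i s3 = _mm_load_si128(reinterpret_cast<const __m128i*>(state + 3*4));

        __m128i r0 = s0, r1 = s1, r2 = s2, r3 = s3;

        // Unaligned loads of the message, stepping the byte pointer by 16 bytes per register.
        if (input)
        {
            r0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input + 0*16)), r0);
            r1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input + 1*16)), r1);
            r2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input + 2*16)), r2);
            r3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input + 3*16)), r3);
        }

        // Unaligned stores to the output buffer, again indexed in 16-byte steps.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + 0*16), r0);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + 1*16), r1);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + 2*16), r2);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(output + 3*16), r3);
    }

The aligned/unaligned split mirrors the diff: _mm_load_si128 for the state, which the caller appears to keep 16-byte aligned, and _mm_loadu_si128/_mm_storeu_si128 for the input and output buffers, which carry no alignment guarantee.
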
Diffstat (limited to 'chacha_simd.cpp')
-rw-r--r--  chacha_simd.cpp  |  84
1 files changed, 40 insertions, 44 deletions
diff --git a/chacha_simd.cpp b/chacha_simd.cpp
index 9fd6b0f1..a983ab69 100644
--- a/chacha_simd.cpp
+++ b/chacha_simd.cpp
@@ -565,14 +565,10 @@ void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte *
void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
- const __m128i* state_mm = reinterpret_cast<const __m128i*>(state);
- const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
- __m128i* output_mm = reinterpret_cast<__m128i*>(output);
-
- const __m128i state0 = _mm_load_si128(state_mm + 0);
- const __m128i state1 = _mm_load_si128(state_mm + 1);
- const __m128i state2 = _mm_load_si128(state_mm + 2);
- const __m128i state3 = _mm_load_si128(state_mm + 3);
+ const __m128i state0 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+0*4));
+ const __m128i state1 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+1*4));
+ const __m128i state2 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+2*4));
+ const __m128i state3 = _mm_load_si128(reinterpret_cast<const __m128i*>(state+3*4));
__m128i r0_0 = state0;
__m128i r0_1 = state1;
@@ -772,57 +768,57 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
r3_3 = _mm_add_epi32(r3_3, state3);
r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
- if (input_mm)
+ if (input)
{
- r0_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 0), r0_0);
- r0_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 1), r0_1);
- r0_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 2), r0_2);
- r0_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 3), r0_3);
+ r0_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+0*16)), r0_0);
+ r0_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+1*16)), r0_1);
+ r0_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+2*16)), r0_2);
+ r0_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+3*16)), r0_3);
}
- _mm_storeu_si128(output_mm + 0, r0_0);
- _mm_storeu_si128(output_mm + 1, r0_1);
- _mm_storeu_si128(output_mm + 2, r0_2);
- _mm_storeu_si128(output_mm + 3, r0_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+0*16), r0_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+1*16), r0_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+2*16), r0_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+3*16), r0_3);
- if (input_mm)
+ if (input)
{
- r1_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 4), r1_0);
- r1_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 5), r1_1);
- r1_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 6), r1_2);
- r1_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 7), r1_3);
+ r1_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+4*16)), r1_0);
+ r1_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+5*16)), r1_1);
+ r1_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+6*16)), r1_2);
+ r1_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+7*16)), r1_3);
}
- _mm_storeu_si128(output_mm + 4, r1_0);
- _mm_storeu_si128(output_mm + 5, r1_1);
- _mm_storeu_si128(output_mm + 6, r1_2);
- _mm_storeu_si128(output_mm + 7, r1_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+4*16), r1_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+5*16), r1_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+6*16), r1_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+7*16), r1_3);
- if (input_mm)
+ if (input)
{
- r2_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 8), r2_0);
- r2_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 9), r2_1);
- r2_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 10), r2_2);
- r2_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 11), r2_3);
+ r2_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+ 8*16)), r2_0);
+ r2_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+ 9*16)), r2_1);
+ r2_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+10*16)), r2_2);
+ r2_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+11*16)), r2_3);
}
- _mm_storeu_si128(output_mm + 8, r2_0);
- _mm_storeu_si128(output_mm + 9, r2_1);
- _mm_storeu_si128(output_mm + 10, r2_2);
- _mm_storeu_si128(output_mm + 11, r2_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+ 8*16), r2_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+ 9*16), r2_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+10*16), r2_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+11*16), r2_3);
- if (input_mm)
+ if (input)
{
- r3_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 12), r3_0);
- r3_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 13), r3_1);
- r3_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 14), r3_2);
- r3_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 15), r3_3);
+ r3_0 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+12*16)), r3_0);
+ r3_1 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+13*16)), r3_1);
+ r3_2 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+14*16)), r3_2);
+ r3_3 = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i*>(input+15*16)), r3_3);
}
- _mm_storeu_si128(output_mm + 12, r3_0);
- _mm_storeu_si128(output_mm + 13, r3_1);
- _mm_storeu_si128(output_mm + 14, r3_2);
- _mm_storeu_si128(output_mm + 15, r3_3);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+12*16), r3_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+13*16), r3_1);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+14*16), r3_2);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(output+15*16), r3_3);
}
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE