summary | refs | log | tree | commit | diff
path: root/keccak_simd.cpp
diff options
context:
space:
mode:
author Jeffrey Walton <noloader@gmail.com> 2019-05-17 15:18:17 -0400
committer GitHub <noreply@github.com> 2019-05-17 15:18:17 -0400
commit c51f0ecbfd21b1eb29cfd26ea870b175d92db6c6 (patch)
tree 87f0f2dbe92e25d2816ab6077e93714b2e69d863 /keccak_simd.cpp
parent b25b6f0892ebab5cc8a5e8c494c0db69057e1ba8 (diff)
download cryptopp-git-c51f0ecbfd21b1eb29cfd26ea870b175d92db6c6.tar.gz
Make config.h more Autoconf friendly (GH #835, PR #836)
Diffstat (limited to 'keccak_simd.cpp')
-rw-r--r-- keccak_simd.cpp 158
1 files changed, 79 insertions, 79 deletions
diff --git a/keccak_simd.cpp b/keccak_simd.cpp
index 71389cbd..45674ac2 100644
--- a/keccak_simd.cpp
+++ b/keccak_simd.cpp
@@ -39,16 +39,17 @@ extern void KeccakF1600x2_SSE(word64 *state);
// The F1600 round constants
extern const word64 KeccakF1600Constants[24];
-const word64 rho8[2] = {W64LIT(0x0605040302010007), W64LIT(0x0E0D0C0B0A09080F)};
-const word64 rho56[2] = {W64LIT(0x0007060504030201), W64LIT(0x080F0E0D0C0B0A09)};
+CRYPTOPP_ALIGN_DATA(16)
+const word64
+rho8[2] = {W64LIT(0x0605040302010007), W64LIT(0x0E0D0C0B0A09080F)};
-#define V128 __m128i
-#define CV128 const __m128i
+CRYPTOPP_ALIGN_DATA(16)
+const word64
+rho56[2] = {W64LIT(0x0007060504030201), W64LIT(0x080F0E0D0C0B0A09)};
-#define CONST128(a) _mm_load_si128((CV128 *)&(a))
-#define XOREQ128(a, b) a = _mm_xor_si128((a), (b))
-#define UNPACKL(a, b) _mm_unpacklo_epi64((a), (b))
-#define UNPACKH(a, b) _mm_unpackhi_epi64((a), (b))
+// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
+#define M128_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#if defined(__XOP__)
# define ROL64in128(a, o) _mm_roti_epi64((a), (o))
@@ -56,8 +57,8 @@ const word64 rho56[2] = {W64LIT(0x0007060504030201), W64LIT(0x080F0E0D0C0B0A09)}
# define ROL64in128_56(a) ROL64in128((a), 56)
#else
# define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64((a), (o)), _mm_srli_epi64(a, 64-(o)))
-# define ROL64in128_8(a) _mm_shuffle_epi8((a), CONST128(rho8))
-# define ROL64in128_56(a) _mm_shuffle_epi8((a), CONST128(rho56))
+# define ROL64in128_8(a) _mm_shuffle_epi8((a), _mm_load_si128(CONST_M128_CAST(rho8)))
+# define ROL64in128_56(a) _mm_shuffle_epi8((a), _mm_load_si128(CONST_M128_CAST(rho56)))
#endif
// Damn Visual Studio is missing too many intrinsics...
@@ -74,51 +75,50 @@ inline __m128i SPLAT64(const word64 a)
// The Keccak ParallelHash128 core function
void KeccakF1600x2_SSE(word64 *state)
{
- V128 *statesAsLanes = (V128 *)state;
+ __m128i Aba, Abe, Abi, Abo, Abu;
+ __m128i Aga, Age, Agi, Ago, Agu;
+ __m128i Aka, Ake, Aki, Ako, Aku;
+ __m128i Ama, Ame, Ami, Amo, Amu;
+ __m128i Asa, Ase, Asi, Aso, Asu;
+ __m128i Bba, Bbe, Bbi, Bbo, Bbu;
+ __m128i Bga, Bge, Bgi, Bgo, Bgu;
+ __m128i Bka, Bke, Bki, Bko, Bku;
+ __m128i Bma, Bme, Bmi, Bmo, Bmu;
+ __m128i Bsa, Bse, Bsi, Bso, Bsu;
+ __m128i Ca, Ce, Ci, Co, Cu;
+ __m128i Da, De, Di, Do, Du;
+ __m128i Eba, Ebe, Ebi, Ebo, Ebu;
+ __m128i Ega, Ege, Egi, Ego, Egu;
+ __m128i Eka, Eke, Eki, Eko, Eku;
+ __m128i Ema, Eme, Emi, Emo, Emu;
+ __m128i Esa, Ese, Esi, Eso, Esu;
- V128 Aba, Abe, Abi, Abo, Abu;
- V128 Aga, Age, Agi, Ago, Agu;
- V128 Aka, Ake, Aki, Ako, Aku;
- V128 Ama, Ame, Ami, Amo, Amu;
- V128 Asa, Ase, Asi, Aso, Asu;
- V128 Bba, Bbe, Bbi, Bbo, Bbu;
- V128 Bga, Bge, Bgi, Bgo, Bgu;
- V128 Bka, Bke, Bki, Bko, Bku;
- V128 Bma, Bme, Bmi, Bmo, Bmu;
- V128 Bsa, Bse, Bsi, Bso, Bsu;
- V128 Ca, Ce, Ci, Co, Cu;
- V128 Da, De, Di, Do, Du;
- V128 Eba, Ebe, Ebi, Ebo, Ebu;
- V128 Ega, Ege, Egi, Ego, Egu;
- V128 Eka, Eke, Eki, Eko, Eku;
- V128 Ema, Eme, Emi, Emo, Emu;
- V128 Esa, Ese, Esi, Eso, Esu;
-
- Aba = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 0]));
- Abe = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 1]));
- Abi = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 2]));
- Abo = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 3]));
- Abu = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 4]));
- Aga = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 5]));
- Age = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 6]));
- Agi = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 7]));
- Ago = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 8]));
- Agu = _mm_loadu_si128((CV128 *)&(statesAsLanes[ 9]));
- Aka = _mm_loadu_si128((CV128 *)&(statesAsLanes[10]));
- Ake = _mm_loadu_si128((CV128 *)&(statesAsLanes[11]));
- Aki = _mm_loadu_si128((CV128 *)&(statesAsLanes[12]));
- Ako = _mm_loadu_si128((CV128 *)&(statesAsLanes[13]));
- Aku = _mm_loadu_si128((CV128 *)&(statesAsLanes[14]));
- Ama = _mm_loadu_si128((CV128 *)&(statesAsLanes[15]));
- Ame = _mm_loadu_si128((CV128 *)&(statesAsLanes[16]));
- Ami = _mm_loadu_si128((CV128 *)&(statesAsLanes[17]));
- Amo = _mm_loadu_si128((CV128 *)&(statesAsLanes[18]));
- Amu = _mm_loadu_si128((CV128 *)&(statesAsLanes[19]));
- Asa = _mm_loadu_si128((CV128 *)&(statesAsLanes[20]));
- Ase = _mm_loadu_si128((CV128 *)&(statesAsLanes[21]));
- Asi = _mm_loadu_si128((CV128 *)&(statesAsLanes[22]));
- Aso = _mm_loadu_si128((CV128 *)&(statesAsLanes[23]));
- Asu = _mm_loadu_si128((CV128 *)&(statesAsLanes[24]));
+ __m128i* lanes = reinterpret_cast<__m128i*>(state);
+ Aba = _mm_loadu_si128(CONST_M128_CAST(lanes+ 0));
+ Abe = _mm_loadu_si128(CONST_M128_CAST(lanes+ 1));
+ Abi = _mm_loadu_si128(CONST_M128_CAST(lanes+ 2));
+ Abo = _mm_loadu_si128(CONST_M128_CAST(lanes+ 3));
+ Abu = _mm_loadu_si128(CONST_M128_CAST(lanes+ 4));
+ Aga = _mm_loadu_si128(CONST_M128_CAST(lanes+ 5));
+ Age = _mm_loadu_si128(CONST_M128_CAST(lanes+ 6));
+ Agi = _mm_loadu_si128(CONST_M128_CAST(lanes+ 7));
+ Ago = _mm_loadu_si128(CONST_M128_CAST(lanes+ 8));
+ Agu = _mm_loadu_si128(CONST_M128_CAST(lanes+ 9));
+ Aka = _mm_loadu_si128(CONST_M128_CAST(lanes+10));
+ Ake = _mm_loadu_si128(CONST_M128_CAST(lanes+11));
+ Aki = _mm_loadu_si128(CONST_M128_CAST(lanes+12));
+ Ako = _mm_loadu_si128(CONST_M128_CAST(lanes+13));
+ Aku = _mm_loadu_si128(CONST_M128_CAST(lanes+14));
+ Ama = _mm_loadu_si128(CONST_M128_CAST(lanes+15));
+ Ame = _mm_loadu_si128(CONST_M128_CAST(lanes+16));
+ Ami = _mm_loadu_si128(CONST_M128_CAST(lanes+17));
+ Amo = _mm_loadu_si128(CONST_M128_CAST(lanes+18));
+ Amu = _mm_loadu_si128(CONST_M128_CAST(lanes+19));
+ Asa = _mm_loadu_si128(CONST_M128_CAST(lanes+20));
+ Ase = _mm_loadu_si128(CONST_M128_CAST(lanes+21));
+ Asi = _mm_loadu_si128(CONST_M128_CAST(lanes+22));
+ Aso = _mm_loadu_si128(CONST_M128_CAST(lanes+23));
+ Asu = _mm_loadu_si128(CONST_M128_CAST(lanes+24));
Ca = _mm_xor_si128(Aba, _mm_xor_si128(Aga, _mm_xor_si128(Aka, _mm_xor_si128(Ama, Asa))));
Ce = _mm_xor_si128(Abe, _mm_xor_si128(Age, _mm_xor_si128(Ake, _mm_xor_si128(Ame, Ase))));
@@ -2646,31 +2646,31 @@ void KeccakF1600x2_SSE(word64 *state)
Aso = _mm_xor_si128(Bso, _mm_andnot_si128(Bsu, Bsa));
Asu = _mm_xor_si128(Bsu, _mm_andnot_si128(Bsa, Bse));
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 0]), Aba);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 1]), Abe);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 2]), Abi);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 3]), Abo);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 4]), Abu);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 5]), Aga);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 6]), Age);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 7]), Agi);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 8]), Ago);
- _mm_storeu_si128((V128 *)&(statesAsLanes[ 9]), Agu);
- _mm_storeu_si128((V128 *)&(statesAsLanes[10]), Aka);
- _mm_storeu_si128((V128 *)&(statesAsLanes[11]), Ake);
- _mm_storeu_si128((V128 *)&(statesAsLanes[12]), Aki);
- _mm_storeu_si128((V128 *)&(statesAsLanes[13]), Ako);
- _mm_storeu_si128((V128 *)&(statesAsLanes[14]), Aku);
- _mm_storeu_si128((V128 *)&(statesAsLanes[15]), Ama);
- _mm_storeu_si128((V128 *)&(statesAsLanes[16]), Ame);
- _mm_storeu_si128((V128 *)&(statesAsLanes[17]), Ami);
- _mm_storeu_si128((V128 *)&(statesAsLanes[18]), Amo);
- _mm_storeu_si128((V128 *)&(statesAsLanes[19]), Amu);
- _mm_storeu_si128((V128 *)&(statesAsLanes[20]), Asa);
- _mm_storeu_si128((V128 *)&(statesAsLanes[21]), Ase);
- _mm_storeu_si128((V128 *)&(statesAsLanes[22]), Asi);
- _mm_storeu_si128((V128 *)&(statesAsLanes[23]), Aso);
- _mm_storeu_si128((V128 *)&(statesAsLanes[24]), Asu);
+ _mm_storeu_si128(M128_CAST(lanes+ 0), Aba);
+ _mm_storeu_si128(M128_CAST(lanes+ 1), Abe);
+ _mm_storeu_si128(M128_CAST(lanes+ 2), Abi);
+ _mm_storeu_si128(M128_CAST(lanes+ 3), Abo);
+ _mm_storeu_si128(M128_CAST(lanes+ 4), Abu);
+ _mm_storeu_si128(M128_CAST(lanes+ 5), Aga);
+ _mm_storeu_si128(M128_CAST(lanes+ 6), Age);
+ _mm_storeu_si128(M128_CAST(lanes+ 7), Agi);
+ _mm_storeu_si128(M128_CAST(lanes+ 8), Ago);
+ _mm_storeu_si128(M128_CAST(lanes+ 9), Agu);
+ _mm_storeu_si128(M128_CAST(lanes+10), Aka);
+ _mm_storeu_si128(M128_CAST(lanes+11), Ake);
+ _mm_storeu_si128(M128_CAST(lanes+12), Aki);
+ _mm_storeu_si128(M128_CAST(lanes+13), Ako);
+ _mm_storeu_si128(M128_CAST(lanes+14), Aku);
+ _mm_storeu_si128(M128_CAST(lanes+15), Ama);
+ _mm_storeu_si128(M128_CAST(lanes+16), Ame);
+ _mm_storeu_si128(M128_CAST(lanes+17), Ami);
+ _mm_storeu_si128(M128_CAST(lanes+18), Amo);
+ _mm_storeu_si128(M128_CAST(lanes+19), Amu);
+ _mm_storeu_si128(M128_CAST(lanes+20), Asa);
+ _mm_storeu_si128(M128_CAST(lanes+21), Ase);
+ _mm_storeu_si128(M128_CAST(lanes+22), Asi);
+ _mm_storeu_si128(M128_CAST(lanes+23), Aso);
+ _mm_storeu_si128(M128_CAST(lanes+24), Asu);
}
#endif