author     Jeffrey Walton <noloader@gmail.com>  2018-11-10 08:00:14 -0500
committer  Jeffrey Walton <noloader@gmail.com>  2018-11-10 08:00:14 -0500
commit     896225069db7f34e752dd7b7bb401052c6b7cb17 (patch)
tree       8d4eabb2bd304b6fe9168aac8fdc28618f3212f8 /shacal2_simd.cpp
parent     776a2195bd78c80130b1809b22a5e4d3aecb5b95 (diff)
download   cryptopp-git-896225069db7f34e752dd7b7bb401052c6b7cb17.tar.gz
Rename files with dashes to underscores (GH #736)
Also see https://groups.google.com/forum/#!topic/cryptopp-users/HBz-6gZZFOA on the mailing list
Diffstat (limited to 'shacal2_simd.cpp')
-rw-r--r--  shacal2_simd.cpp  98
1 file changed, 98 insertions, 0 deletions
diff --git a/shacal2_simd.cpp b/shacal2_simd.cpp
new file mode 100644
index 00000000..dd64089c
--- /dev/null
+++ b/shacal2_simd.cpp
@@ -0,0 +1,98 @@
+// shacal2_simd.cpp - written and placed in the public domain by
+// Jeffrey Walton and Jack Lloyd
+//
+// Jack Lloyd and the Botan team allowed Crypto++ to use parts of
+// Botan's implementation under the same license as Crypto++
+// is released. The code for SHACAL2_Enc_ProcessAndXorBlock_SHANI
+// below is Botan's x86_encrypt_blocks with minor tweaks. Many thanks
+// to the Botan team. Also see http://github.com/randombit/botan/.
+//
+// This source file uses intrinsics to gain access to SHA-NI and
+// ARMv8a SHA instructions. A separate source file is needed because
+// additional CXXFLAGS are required to enable the appropriate instruction
+// sets in some build configurations.
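+//
+// For example (an illustrative sketch; the exact flags come from the
+// build system and differ per compiler), a GCC or Clang build might
+// compile this file with:
+//
+//   g++ -msse4.2 -msha -c shacal2_simd.cpp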
+
+#include "pch.h"
+#include "config.h"
+#include "sha.h"
+#include "misc.h"
+
+#if (CRYPTOPP_SHANI_AVAILABLE)
+# include <nmmintrin.h>
+# include <immintrin.h>
+#endif
+
+// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
+#define M128_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
+
+// Squash MS LNK4221 and libtool warnings
+extern const char SHACAL2_SIMD_FNAME[] = __FILE__;
+
+NAMESPACE_BEGIN(CryptoPP)
+
+#if CRYPTOPP_SHANI_AVAILABLE
+void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys, const byte *inBlock, const byte *xorBlock, byte *outBlock)
+{
+ CRYPTOPP_ASSERT(subKeys);
+ CRYPTOPP_ASSERT(inBlock);
+ CRYPTOPP_ASSERT(outBlock);
+
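+ // The shuffle masks byte-swap the little-endian loads into the
+ // big-endian words SHACAL-2 operates on, and pre-arrange the word
+ // order for the two-register repack below.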
+ const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
+ const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
+
+ __m128i B0 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 0)), MASK1);
+ __m128i B1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 16)), MASK2);
+
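+ // Repack the eight state words into the two-register layout that
+ // _mm_sha256rnds2_epu32 consumes (the ABEF/CDGH-style split familiar
+ // from SHA-NI SHA-256 implementations).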
+ __m128i TMP = _mm_alignr_epi8(B0, B1, 8);
+ B1 = _mm_blend_epi16(B1, B0, 0xF0);
+ B0 = TMP;
+
+#if 0
+ // SSE2 + SSSE3, but 0.2 cpb slower on a Celeron J3455
+ const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
+ const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
+
+ __m128i B0 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 0));
+ __m128i B1 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 16));
+
+ __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2);
+ B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2);
+ B0 = TMP;
+#endif
+
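+ // 64 rounds total: each iteration loads 8 subkeys (32 bytes) and
+ // performs 8 rounds. Each _mm_sha256rnds2_epu32 call executes two
+ // rounds keyed by the two 32-bit subkeys in the low 64 bits of its
+ // third argument; _mm_srli_si128 shifts the next key pair down.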
+ const byte* keys = reinterpret_cast<const byte*>(subKeys);
+ for (size_t i = 0; i != 8; ++i)
+ {
+ const __m128i RK0 = _mm_load_si128(CONST_M128_CAST(keys + 32*i));
+ const __m128i RK2 = _mm_load_si128(CONST_M128_CAST(keys + 32*i+16));
+ const __m128i RK1 = _mm_srli_si128(RK0, 8);
+ const __m128i RK3 = _mm_srli_si128(RK2, 8);
+
+ B1 = _mm_sha256rnds2_epu32(B1, B0, RK0);
+ B0 = _mm_sha256rnds2_epu32(B0, B1, RK1);
+ B1 = _mm_sha256rnds2_epu32(B1, B0, RK2);
+ B0 = _mm_sha256rnds2_epu32(B0, B1, RK3);
+ }
+
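+ // Undo the repack and restore big-endian byte order for output.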
+ TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1);
+ B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1);
+ B0 = TMP;
+
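+ // Per ProcessAndXorBlock semantics, an optional xorBlock is XORed
+ // into the ciphertext (e.g., for CBC-style chaining modes).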
+ if (xorBlock)
+ {
+ _mm_storeu_si128(M128_CAST(outBlock + 0),
+ _mm_xor_si128(B0, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 0))));
+
+ _mm_storeu_si128(M128_CAST(outBlock + 16),
+ _mm_xor_si128(B1, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 16))));
+ }
+ else
+ {
+ _mm_storeu_si128(M128_CAST(outBlock + 0), B0);
+ _mm_storeu_si128(M128_CAST(outBlock + 16), B1);
+ }
+}
+#endif
+
+NAMESPACE_END