summary | refs | log | tree | commit | diff
path: root/threefish.cpp
diff options
context:
space:
mode:
author Jeffrey Walton <noloader@gmail.com> 2017-05-14 16:59:04 -0400
committer Jeffrey Walton <noloader@gmail.com> 2017-05-14 16:59:04 -0400
commit b3399b4f86918efee66e0831cc88da542c71aa10 (patch)
tree e08983df17abd466f0cb54ecc21ad07a5364b8d6 /threefish.cpp
parent 54ca8b3a16c727f48a82398dbcd4113459d89f27 (diff)
download cryptopp-git-b3399b4f86918efee66e0831cc88da542c71aa10.tar.gz
Remove SSE2 intrinsics from Threefish
Benchmarks showed the code ran slower with the SSE2 intrinsics under GCC and MSVC.
Diffstat (limited to 'threefish.cpp')
-rw-r--r-- threefish.cpp 98
1 files changed, 9 insertions, 89 deletions
diff --git a/threefish.cpp b/threefish.cpp
index 95160b1f..8ec51762 100644
--- a/threefish.cpp
+++ b/threefish.cpp
@@ -347,42 +347,10 @@ void Threefish::Base::ProcessAndXorBlock_512(const byte *inBlock, const byte *xo
if (IsForwardTransformation())
{
-#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && 0
- const bool s_sse2 = HasSSE2();
- if (s_sse2)
- {
- const word64 *ky = m_rkey.begin(), *tw = m_tweak.begin();
- word64 *ws = m_wspace.begin();
-
- // 15 SSE instructions
- _mm_store_si128((__m128i*)ws,
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)ws),
- _mm_load_si128((const __m128i*)ky)));
- _mm_store_si128((__m128i*)(ws+2),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+2)),
- _mm_load_si128((const __m128i*)(ky+2))));
- _mm_store_si128((__m128i*)(ws+4),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+4)),
- _mm_load_si128((const __m128i*)(ky+4))));
- _mm_store_si128((__m128i*)(ws+6),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+6)),
- _mm_load_si128((const __m128i*)(ky+6))));
- _mm_storeu_si128((__m128i*)(ws+5),
- _mm_add_epi64(
- _mm_loadu_si128((const __m128i*)(ws+5)),
- _mm_load_si128((const __m128i*)(tw))));
- }
-#endif
- {
- // 34 integer instructions total
- G0 += m_rkey[0]; G1 += m_rkey[1]; G2 += m_rkey[2]; G3 += m_rkey[3];
- G4 += m_rkey[4]; G5 += m_rkey[5]; G6 += m_rkey[6]; G7 += m_rkey[7];
- G5 += m_tweak[0]; G6 += m_tweak[1];
- }
+ // 34 integer instructions total
+ G0 += m_rkey[0]; G1 += m_rkey[1]; G2 += m_rkey[2]; G3 += m_rkey[3];
+ G4 += m_rkey[4]; G5 += m_rkey[5]; G6 += m_rkey[6]; G7 += m_rkey[7];
+ G5 += m_tweak[0]; G6 += m_tweak[1];
G8512(0); G8512(2); G8512(4); G8512(6); G8512(8);
G8512(10); G8512(12); G8512(14); G8512(16);
@@ -417,59 +385,11 @@ void Threefish::Base::ProcessAndXorBlock_1024(const byte *inBlock, const byte *x
if (IsForwardTransformation())
{
-#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && 0
- const bool s_sse2 = HasSSE2();
- if (s_sse2)
- {
- const word64 *ky = m_rkey.begin(), *tw = m_tweak.begin();
- word64 *ws = m_wspace.begin();
-
- _mm_store_si128((__m128i*)ws,
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)ws),
- _mm_load_si128((const __m128i*)ky)));
- _mm_store_si128((__m128i*)(ws+2),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+2)),
- _mm_load_si128((const __m128i*)(ky+2))));
- _mm_store_si128((__m128i*)(ws+4),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+4)),
- _mm_load_si128((const __m128i*)(ky+4))));
- _mm_store_si128((__m128i*)(ws+6),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+6)),
- _mm_load_si128((const __m128i*)(ky+6))));
- _mm_store_si128((__m128i*)(ws+8),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+8)),
- _mm_load_si128((const __m128i*)(ky+8))));
- _mm_store_si128((__m128i*)(ws+10),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+10)),
- _mm_load_si128((const __m128i*)(ky+10))));
- _mm_store_si128((__m128i*)(ws+12),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+12)),
- _mm_load_si128((const __m128i*)(ky+12))));
- _mm_store_si128((__m128i*)(ws+14),
- _mm_add_epi64(
- _mm_load_si128((const __m128i*)(ws+14)),
- _mm_load_si128((const __m128i*)(ky+14))));
- _mm_storeu_si128((__m128i*)(ws+13),
- _mm_add_epi64(
- _mm_loadu_si128((const __m128i*)(ws+13)),
- _mm_load_si128((const __m128i*)(tw))));
- }
- else
-#endif
- {
- G0 += m_rkey[0]; G1 += m_rkey[1]; G2 += m_rkey[2]; G3 += m_rkey[3];
- G4 += m_rkey[4]; G5 += m_rkey[5]; G6 += m_rkey[6]; G7 += m_rkey[7];
- G8 += m_rkey[8]; G9 += m_rkey[9]; G10 += m_rkey[10]; G11 += m_rkey[11];
- G12 += m_rkey[12]; G13 += m_rkey[13]; G14 += m_rkey[14]; G15 += m_rkey[15];
- G13 += m_tweak[0]; G14 += m_tweak[1];
- }
+ G0 += m_rkey[0]; G1 += m_rkey[1]; G2 += m_rkey[2]; G3 += m_rkey[3];
+ G4 += m_rkey[4]; G5 += m_rkey[5]; G6 += m_rkey[6]; G7 += m_rkey[7];
+ G8 += m_rkey[8]; G9 += m_rkey[9]; G10 += m_rkey[10]; G11 += m_rkey[11];
+ G12 += m_rkey[12]; G13 += m_rkey[13]; G14 += m_rkey[14]; G15 += m_rkey[15];
+ G13 += m_tweak[0]; G14 += m_tweak[1];
G81024(0); G81024(2); G81024(4); G81024(6); G81024(8);
G81024(10); G81024(12); G81024(14); G81024(16); G81024(18);