summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--config.compat7
-rw-r--r--config.h7
-rw-r--r--cpu.h9
-rw-r--r--sha.cpp237
-rw-r--r--sha.h3
5 files changed, 233 insertions, 30 deletions
diff --git a/config.compat b/config.compat
index 42d57c5f..bc39b199 100644
--- a/config.compat
+++ b/config.compat
@@ -502,11 +502,10 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
#endif
-// AVX2 in MSC 18.00
-#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_AVX) && !defined(_M_ARM) && ((_MSC_VER >= 1600) || (defined(__RDRND__) || defined(__RDSEED__) || defined(__AVX__)))
- #define CRYPTOPP_BOOL_AVX_AVAILABLE 1
+#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && !defined(_M_ARM) && ((_MSC_VER >= 1900) || (CRYPTOPP_GCC_VERSION >= 50000) || defined(__SHA__))
+ #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 1
#else
- #define CRYPTOPP_BOOL_AVX_AVAILABLE 0
+ #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 0
#endif
// Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
diff --git a/config.h b/config.h
index c0383fb6..3b507617 100644
--- a/config.h
+++ b/config.h
@@ -502,11 +502,10 @@ NAMESPACE_END
#define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
#endif
-// AVX2 in MSC 18.00
-#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_AVX) && !defined(_M_ARM) && ((_MSC_VER >= 1600) || (defined(__RDRND__) || defined(__RDSEED__) || defined(__AVX__)))
- #define CRYPTOPP_BOOL_AVX_AVAILABLE 1
+#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && !defined(_M_ARM) && ((_MSC_VER >= 1900) || (CRYPTOPP_GCC_VERSION >= 50000) || defined(__SHA__))
+ #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 1
#else
- #define CRYPTOPP_BOOL_AVX_AVAILABLE 0
+ #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 0
#endif
// Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
diff --git a/cpu.h b/cpu.h
index 6a7e6173..e42792ba 100644
--- a/cpu.h
+++ b/cpu.h
@@ -47,12 +47,9 @@
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
# include <wmmintrin.h> // aesenc, aesdec, etc
#endif // wmmintrin.h
-#if CRYPTOPP_BOOL_AVX_INTRINSICS_AVAILABLE
-# include <immintrin.h> // RDRAND, RDSEED and AVX
-#endif
-#if CRYPTOPP_BOOL_AVX2_INTRINSICS_AVAILABLE
-# include <zmmintrin.h> // AVX 512-bit extensions
-#endif
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+# include <immintrin.h> // RDRAND, RDSEED, AVX, SHA
+#endif // immintrin.h
#endif // X86/X64/X32 Headers
// Applies to both X86/X32/X64 and ARM32/ARM64. And we've got MIPS devices on the way.
diff --git a/sha.cpp b/sha.cpp
index 441a5c65..3499699f 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -1,7 +1,7 @@
// sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
-// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2.
-// Both are in the public domain.
+// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
+// implemented Intel SHA extensions. All are in the public domain.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
@@ -29,20 +29,13 @@
NAMESPACE_BEGIN(CryptoPP)
-// start of Steve Reid's code
+////////////////////////////////
+// start of Steve Reid's code //
+////////////////////////////////
#define blk0(i) (W[i] = data[i])
#define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1))
-void SHA1::InitState(HashWordType *state)
-{
- state[0] = 0x67452301L;
- state[1] = 0xEFCDAB89L;
- state[2] = 0x98BADCFEL;
- state[3] = 0x10325476L;
- state[4] = 0xC3D2E1F0L;
-}
-
#define f1(x,y,z) (z^(x&(y^z)))
#define f2(x,y,z) (x^y^z)
#define f3(x,y,z) ((x&y)|(z&(x|y)))
@@ -55,7 +48,7 @@ void SHA1::InitState(HashWordType *state)
#define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30);
#define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30);
-void SHA1::Transform(word32 *state, const word32 *data)
+static void SHA1_CXX_Transform(word32 *state, const word32 *data)
{
word32 W[16];
/* Copy context->state[] to working vars */
@@ -93,7 +86,223 @@ void SHA1::Transform(word32 *state, const word32 *data)
state[4] += e;
}
-// end of Steve Reid's code
+//////////////////////////////
+// end of Steve Reid's code //
+//////////////////////////////
+
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
+{
+ __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
+ __m128i MASK, MSG0, MSG1, MSG2, MSG3;
+
+ word32 T[16];
+ ByteReverse(T, data, 64);
+
+ // Load initial values
+ ABCD = _mm_loadu_si128((__m128i*) state);
+ E0 = _mm_set_epi32(state[4], 0, 0, 0);
+ ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
+ MASK = _mm_set_epi64x(W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f));
+
+ // Save current hash
+ ABCD_SAVE = ABCD;
+ E0_SAVE = E0;
+
+ // Rounds 0-3
+ MSG0 = _mm_loadu_si128((__m128i*) T+0);
+ MSG0 = _mm_shuffle_epi8(MSG0, MASK);
+ E0 = _mm_add_epi32(E0, MSG0);
+ E1 = ABCD;
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+
+ // Rounds 4-7
+ MSG1 = _mm_loadu_si128((__m128i*) (T+4));
+ MSG1 = _mm_shuffle_epi8(MSG1, MASK);
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
+ MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+
+ // Rounds 8-11
+ MSG2 = _mm_loadu_si128((__m128i*) (T+8));
+ MSG2 = _mm_shuffle_epi8(MSG2, MASK);
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+ MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+ MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+ // Rounds 12-15
+ MSG3 = _mm_loadu_si128((__m128i*) (T+12));
+ MSG3 = _mm_shuffle_epi8(MSG3, MASK);
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
+ MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+ MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+ // Rounds 16-19
+ E0 = _mm_sha1nexte_epu32(E0, MSG0);
+ E1 = ABCD;
+ MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+ MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+ MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+ // Rounds 20-23
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+ MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+ MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+ // Rounds 24-27
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+ MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+ MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+ // Rounds 28-31
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+ MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+ MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+ // Rounds 32-35
+ E0 = _mm_sha1nexte_epu32(E0, MSG0);
+ E1 = ABCD;
+ MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+ MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+ MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+ // Rounds 36-39
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+ MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+ MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+ // Rounds 40-43
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+ MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+ MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+ // Rounds 44-47
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+ MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+ MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+ // Rounds 48-51
+ E0 = _mm_sha1nexte_epu32(E0, MSG0);
+ E1 = ABCD;
+ MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+ MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+ MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+ // Rounds 52-55
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+ MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+ MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+ // Rounds 56-59
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+ MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+ MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+ // Rounds 60-63
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+ MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+ MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+ // Rounds 64-67
+ E0 = _mm_sha1nexte_epu32(E0, MSG0);
+ E1 = ABCD;
+ MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+ MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+ MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+ // Rounds 68-71
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+ MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+ // Rounds 72-75
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+
+ // Rounds 76-79
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+
+ // Add values back to state
+ E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
+ ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
+
+ // Save state
+ ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
+ _mm_storeu_si128((__m128i*) state, ABCD);
+ *(state+4) = _mm_extract_epi32(E0, 3);
+}
+#endif
+
+typedef void (*pfnSHA1Transform)(word32 *state, const word32 *data);
+
+pfnSHA1Transform InitializeSHA1Transform()
+{
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+ if (HasSHA())
+ return &SHA1_SHAEXT_Transform;
+ else
+#endif
+
+ return &SHA1_CXX_Transform;
+}
+
+void SHA1::InitState(HashWordType *state)
+{
+ state[0] = 0x67452301L;
+ state[1] = 0xEFCDAB89L;
+ state[2] = 0x98BADCFEL;
+ state[3] = 0x10325476L;
+ state[4] = 0xC3D2E1F0L;
+}
+
+void SHA1::Transform(word32 *state, const word32 *data)
+{
+ static const pfnSHA1Transform s_pfn = InitializeSHA1Transform();
+ s_pfn(state, data);
+}
// *************************************************************
diff --git a/sha.h b/sha.h
index 008c9b37..dc5e8b64 100644
--- a/sha.h
+++ b/sha.h
@@ -1,7 +1,6 @@
// sha.h - written and placed in the public domain by Wei Dai
-//! \file
-//! \headerfile sha.h
+//! \file sha.h
//! \brief Classes for SHA-1 and SHA-2 family of message digests
#ifndef CRYPTOPP_SHA_H