path: root/sha.cpp
author    Jeffrey Walton <noloader@gmail.com>  2016-12-01 15:05:41 -0500
committer Jeffrey Walton <noloader@gmail.com>  2016-12-01 15:05:41 -0500
commit    cce56d3f79f739339e17746c3c4cdcba7297483b (patch)
tree      e18059f5b500ea7e1e023f71bd1d0c49d2e70fe4 /sha.cpp
parent    315b4b0b3e7b50afa1184a7c490a284abf2c39c1 (diff)
download  cryptopp-git-cce56d3f79f739339e17746c3c4cdcba7297483b.tar.gz
Add Intel SHA1 extension support (Issue 139)
Diffstat (limited to 'sha.cpp')
-rw-r--r--  sha.cpp | 275
1 file changed, 259 insertions(+), 16 deletions(-)
diff --git a/sha.cpp b/sha.cpp
index 3499699f..ce4be602 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -1,7 +1,8 @@
// sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
-// implemented Intel SHA extensions. All are in the public domain.
+// implemented Intel SHA extensions based on Intel articles and code by
+// Sean Gulley. All code is in the public domain.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
@@ -29,6 +30,10 @@
NAMESPACE_BEGIN(CryptoPP)
+// Function pointers for the specific SHA1 and SHA256 Transform functions
+typedef void (*pfnSHATransform)(word32 *state, const word32 *data);
+typedef void (*pfnSHAHashBlocks)(word32 *state, const word32 *data, size_t length);
+
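The typedefs just added support one-time runtime dispatch: a CPU probe runs once, and the chosen implementation is cached in a function pointer. A minimal sketch of the pattern with hypothetical names (PickTransform, CpuHasSHA, Transform_CXX, Transform_SHANI), not the library's actual code:

typedef void (*pfnTransform)(unsigned int *state, const unsigned int *data);

static bool CpuHasSHA() { return false; }   // stand-in for a CPUID feature probe

static void Transform_CXX(unsigned int *, const unsigned int *)   { /* portable C++ rounds */ }
static void Transform_SHANI(unsigned int *, const unsigned int *) { /* SHA-NI rounds */ }

static pfnTransform PickTransform()
{
    // Probe once, pick the fastest implementation the CPU supports
    return CpuHasSHA() ? &Transform_SHANI : &Transform_CXX;
}

void Transform(unsigned int *state, const unsigned int *data)
{
    // The initializer runs on the first call; afterwards each call is
    // a single indirect jump through the cached pointer.
    static const pfnTransform s_pfn = PickTransform();
    s_pfn(state, data);
}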
////////////////////////////////
// start of Steve Reid's code //
////////////////////////////////
@@ -91,7 +96,8 @@ static void SHA1_CXX_Transform(word32 *state, const word32 *data)
//////////////////////////////
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
-static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
+// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
+static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
{
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
__m128i MASK, MSG0, MSG1, MSG2, MSG3;
@@ -272,17 +278,15 @@ static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
// Save state
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
_mm_storeu_si128((__m128i*) state, ABCD);
- *(state+4) = _mm_extract_epi32(E0, 3);
+ state[4] = _mm_extract_epi32(E0, 3);
}
#endif
-typedef void (*pfnSHA1Transform)(word32 *state, const word32 *data);
-
-pfnSHA1Transform InitializeSHA1Transform()
+pfnSHATransform InitializeSHA1Transform()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
- return &SHA1_SHAEXT_Transform;
+ return &SHA1_SSE_SHA_Transform;
else
#endif
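HasSHA() above is the library's CPU-feature probe (defined elsewhere, in cpu.cpp). For reference, Intel SHA extensions are advertised in CPUID leaf 7, subleaf 0, EBX bit 29; a sketch of such a probe for GCC/Clang, assuming <cpuid.h> provides __get_cpuid_count (true for recent compilers), and not Crypto++'s actual implementation:

#include <cpuid.h>

static bool CpuHasSHAExtensions()
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return false;               // CPUID leaf 7 not available
    return (ebx & (1u << 29)) != 0; // EBX bit 29 == SHA
}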
@@ -300,7 +304,7 @@ void SHA1::InitState(HashWordType *state)
void SHA1::Transform(word32 *state, const word32 *data)
{
- static const pfnSHA1Transform s_pfn = InitializeSHA1Transform();
+ static const pfnSHATransform s_pfn = InitializeSHA1Transform();
s_pfn(state, data);
}
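Whichever transform the dispatcher picks, the output must match the FIPS 180 test vectors. A quick user-level check through the public API (assumes the Crypto++ headers are on the include path):

#include "sha.h"    // Crypto++ headers; <cryptopp/sha.h> if installed system-wide
#include <cstdio>

int main()
{
    unsigned char digest[CryptoPP::SHA1::DIGESTSIZE];
    CryptoPP::SHA1().CalculateDigest(digest,
        reinterpret_cast<const unsigned char*>("abc"), 3);
    // FIPS 180 test vector: a9993e364706816aba3e25717850c26c9cd0d89d
    for (unsigned int i = 0; i < sizeof(digest); i++)
        std::printf("%02x", digest[i]);
    std::printf("\n");
    return 0;
}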
@@ -683,18 +687,25 @@ void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data,
}
#endif
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+static void SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t blocks);
+#endif
+
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
+pfnSHAHashBlocks InitializeSHA256HashBlocks();
size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
{
- X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
- return length % BLOCKSIZE;
+ static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
+ s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
+ return length % BLOCKSIZE;
}
size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
{
- X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
- return length % BLOCKSIZE;
+ static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
+ s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
+ return length % BLOCKSIZE;
}
#endif
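The length expression above is terse; a worked example, assuming BLOCKSIZE == 64:

// size_t(0) - 64        == ~size_t(63)   (all ones except the low 6 bits)
// length & ~size_t(63)  rounds length down to a whole number of blocks,
// e.g. length == 200 -> 200 & ~63 == 192 (3 blocks); 200 % 64 == 8 remains.
// The trailing "- !HasSSE2()" subtracts 1 when SSE2 is absent. The masked
// length is otherwise always a multiple of 64, so the odd value apparently
// serves as an in-band "no SSE2" flag for the assembly in
// X86_SHA256_HashBlocks.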
@@ -724,7 +735,7 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
// Smaller but slower
#if defined(__OPTIMIZE_SIZE__)
-void SHA256::Transform(word32 *state, const word32 *data)
+void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
word32 T[20];
word32 W[32];
@@ -796,13 +807,13 @@ void SHA256::Transform(word32 *state, const word32 *data)
state[7] += t[7];
}
#else
-void SHA256::Transform(word32 *state, const word32 *data)
+void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
word32 W[16];
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
// this byte reverse is a waste of time, but this function is only called by MDC
- ByteReverse(W, data, BLOCKSIZE);
- X86_SHA256_HashBlocks(state, W, BLOCKSIZE - !HasSSE2());
+ ByteReverse(W, data, SHA256::BLOCKSIZE);
+ X86_SHA256_HashBlocks(state, W, SHA256::BLOCKSIZE - !HasSSE2());
#else
word32 T[8];
/* Copy context->state[] to working vars */
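The ByteReverse in the preceding hunk exists because SHA-256 consumes big-endian 32-bit words while x86 stores them little-endian. Per word it amounts to this portable sketch; the SHA-NI code below performs the same swap four words at a time via _mm_shuffle_epi8 with the MASK constant, whose byte indices are {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}:

static unsigned int ByteReverse32(unsigned int x)
{
    // Swap the four bytes within one 32-bit word
    return (x << 24) | ((x & 0xFF00u) << 8) | ((x >> 8) & 0xFF00u) | (x >> 24);
}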
@@ -834,6 +845,238 @@ void SHA256::Transform(word32 *state, const word32 *data)
#undef s1
#undef R
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+static void SHA256_SSE_SHA_Transform(word32 *state, const word32 *data)
+{
+ return SHA256_SSE_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
+}
+
+// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
+static void SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
+{
+ CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
+ CRYPTOPP_ASSERT(length % SHA256::BLOCKSIZE == 0);
+
+ __m128i STATE0, STATE1;
+ __m128i MSG, TMP, MASK;
+ __m128i TMSG0, TMSG1, TMSG2, TMSG3;
+ __m128i ABEF_SAVE, CDGH_SAVE;
+
+ // Load initial hash values
+ TMP = _mm_loadu_si128((__m128i*) &state[0]);
+ STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
+ MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203));
+
+ TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
+ STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
+ STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
+ STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
+
+ while (length)
+ {
+ // Save hash values for addition after rounds
+ ABEF_SAVE = STATE0;
+ CDGH_SAVE = STATE1;
+
+ // Rounds 0-3
+ MSG = _mm_loadu_si128((__m128i*) (data+0));
+ TMSG0 = _mm_shuffle_epi8(MSG, MASK);
+ MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+ // Rounds 4-7
+ TMSG1 = _mm_loadu_si128((__m128i*) (data+4));
+ TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
+ MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+ // Rounds 8-11
+ TMSG2 = _mm_loadu_si128((__m128i*) (data+8));
+ TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
+ MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+ // Rounds 12-15
+ TMSG3 = _mm_loadu_si128((__m128i*) (data+12));
+ TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
+ MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+ TMSG0 = _mm_add_epi32(TMSG0, TMP);
+ TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+ // Rounds 16-19
+ MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+ TMSG1 = _mm_add_epi32(TMSG1, TMP);
+ TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+ // Rounds 20-23
+ MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+ TMSG2 = _mm_add_epi32(TMSG2, TMP);
+ TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+ // Rounds 24-27
+ MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+ TMSG3 = _mm_add_epi32(TMSG3, TMP);
+ TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+ // Rounds 28-31
+ MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+ TMSG0 = _mm_add_epi32(TMSG0, TMP);
+ TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+ // Rounds 32-35
+ MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+ TMSG1 = _mm_add_epi32(TMSG1, TMP);
+ TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+ // Rounds 36-39
+ MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+ TMSG2 = _mm_add_epi32(TMSG2, TMP);
+ TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+ // Rounds 40-43
+ MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+ TMSG3 = _mm_add_epi32(TMSG3, TMP);
+ TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+ // Rounds 44-47
+ MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+ TMSG0 = _mm_add_epi32(TMSG0, TMP);
+ TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+ // Rounds 48-51
+ MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+ TMSG1 = _mm_add_epi32(TMSG1, TMP);
+ TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+ // Rounds 52-55
+ MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+ TMSG2 = _mm_add_epi32(TMSG2, TMP);
+ TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+ // Rounds 56-59
+ MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+ TMSG3 = _mm_add_epi32(TMSG3, TMP);
+ TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+ // Rounds 60-63
+ MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+ // Add current hash values with previously saved
+ STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
+ STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
+
+ data += 16;
+ length -= SHA256::BLOCKSIZE;
+ }
+
+ // Write hash values back in the correct order
+ TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
+ STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
+ STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
+ STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
+
+ _mm_storeu_si128((__m128i*) &state[0], STATE0);
+ _mm_storeu_si128((__m128i*) &state[4], STATE1);
+}
+#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
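Roughly, every four-round group in the loop above has the same shape: _mm_sha256rnds2_epu32 executes two rounds and reads its W+K inputs from the low two dwords of its third operand, so the shuffle by 0x0E moves the upper pair down for the second issue, while _mm_sha256msg1_epu32/_mm_sha256msg2_epu32 carry the message schedule forward:

// Shape of each four-round group (annotation only, no new semantics):
//   MSG    = W[t..t+3] + K[t..t+3]              // four packed dwords
//   STATE1 = SHA256RNDS2(STATE1, STATE0, MSG)   // rounds t, t+1 use MSG[0..1]
//   MSG    = PSHUFD(MSG, 0x0E)                  // move MSG[2..3] into the low lanes
//   STATE0 = SHA256RNDS2(STATE0, STATE1, MSG)   // rounds t+2, t+3
// Meanwhile SHA256MSG1/SHA256MSG2 run the sigma0/sigma1 message-schedule
// recurrence, producing W[t+16..t+19] for later groups.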
+
+pfnSHATransform InitializeSHA256Transform()
+{
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+ if (HasSHA())
+ return &SHA256_SSE_SHA_Transform;
+ else
+#endif
+
+ return &SHA256_CXX_Transform;
+}
+
+pfnSHAHashBlocks InitializeSHA256HashBlocks()
+{
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+ if (HasSHA())
+ return &SHA256_SSE_SHA_HashBlocks;
+ else
+#endif
+
+ return &X86_SHA256_HashBlocks;
+}
+
+void SHA256::Transform(word32 *state, const word32 *data)
+{
+ static const pfnSHATransform s_pfn = InitializeSHA256Transform();
+ s_pfn(state, data);
+}
+
// *************************************************************
void SHA384::InitState(HashWordType *state)
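An end-to-end check of the SHA-256 paths added by this commit, again through the public API (FIPS 180-4 vector for "abc"):

#include "sha.h"
#include <cstdio>

int main()
{
    CryptoPP::SHA256 hash;
    unsigned char digest[CryptoPP::SHA256::DIGESTSIZE];
    hash.Update(reinterpret_cast<const unsigned char*>("abc"), 3);
    hash.Final(digest);
    // Expected: ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
    for (unsigned int i = 0; i < sizeof(digest); i++)
        std::printf("%02x", digest[i]);
    std::printf("\n");
    return 0;
}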