path: root/sha.cpp
author    Jeffrey Walton <noloader@gmail.com>  2016-12-01 15:05:41 -0500
committer Jeffrey Walton <noloader@gmail.com>  2016-12-01 15:05:41 -0500
commit    cce56d3f79f739339e17746c3c4cdcba7297483b (patch)
tree      e18059f5b500ea7e1e023f71bd1d0c49d2e70fe4 /sha.cpp
parent    315b4b0b3e7b50afa1184a7c490a284abf2c39c1 (diff)
download  cryptopp-git-cce56d3f79f739339e17746c3c4cdcba7297483b.tar.gz
Add Intel SHA1 extension support (Issue 139)
Diffstat (limited to 'sha.cpp')
-rw-r--r--  sha.cpp | 275
1 file changed, 259 insertions(+), 16 deletions(-)
diff --git a/sha.cpp b/sha.cpp
index 3499699f..ce4be602 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -1,7 +1,8 @@
// sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
-// implemented Intel SHA extensions. All are in the public domain.
+// implemented Intel SHA extensions based on Intel articles and code by
+// Sean Gulley. All code is in the public domain.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
@@ -29,6 +30,10 @@
NAMESPACE_BEGIN(CryptoPP)
+// Function pointers for the specific SHA1 and SHA256 Transform functions
+typedef void (*pfnSHATransform)(word32 *state, const word32 *data);
+typedef void (*pfnSHAHashBlocks)(word32 *state, const word32 *data, size_t length);
+
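The typedefs just added support one-time runtime dispatch: a CPU probe runs once, and the chosen implementation is cached in a function pointer. A minimal sketch of the pattern with hypothetical names (PickTransform, CpuHasSHA, Transform_CXX, Transform_SHANI), not the library's actual code:

typedef void (*pfnTransform)(unsigned int *state, const unsigned int *data);

static bool CpuHasSHA() { return false; }   // stand-in for a CPUID feature probe

static void Transform_CXX(unsigned int *, const unsigned int *)   { /* portable C++ rounds */ }
static void Transform_SHANI(unsigned int *, const unsigned int *) { /* SHA-NI rounds */ }

static pfnTransform PickTransform()
{
    // Probe once, pick the fastest implementation the CPU supports
    return CpuHasSHA() ? &Transform_SHANI : &Transform_CXX;
}

void Transform(unsigned int *state, const unsigned int *data)
{
    // The initializer runs on the first call; afterwards each call is
    // a single indirect jump through the cached pointer.
    static const pfnTransform s_pfn = PickTransform();
    s_pfn(state, data);
}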
////////////////////////////////
// start of Steve Reid's code //
////////////////////////////////
@@ -91,7 +96,8 @@ static void SHA1_CXX_Transform(word32 *state, const word32 *data)
//////////////////////////////
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
-static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
+// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
+static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
{
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
__m128i MASK, MSG0, MSG1, MSG2, MSG3;
@@ -272,17 +278,15 @@ static void SHA1_SHAEXT_Transform(word32 *state, const word32 *data)
// Save state
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
_mm_storeu_si128((__m128i*) state, ABCD);
- *(state+4) = _mm_extract_epi32(E0, 3);
+ state[4] = _mm_extract_epi32(E0, 3);
}
#endif
-typedef void (*pfnSHA1Transform)(word32 *state, const word32 *data);
-
-pfnSHA1Transform InitializeSHA1Transform()
+pfnSHATransform InitializeSHA1Transform()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
- return &SHA1_SHAEXT_Transform;
+ return &SHA1_SSE_SHA_Transform;
else
#endif
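HasSHA() above is the library's CPU-feature probe (defined elsewhere, in cpu.cpp). For reference, Intel SHA extensions are advertised in CPUID leaf 7, subleaf 0, EBX bit 29; a sketch of such a probe for GCC/Clang, assuming <cpuid.h> provides __get_cpuid_count (true for recent compilers), and not Crypto++'s actual implementation:

#include <cpuid.h>

static bool CpuHasSHAExtensions()
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return false;               // CPUID leaf 7 not available
    return (ebx & (1u << 29)) != 0; // EBX bit 29 == SHA
}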
@@ -300,7 +304,7 @@ void SHA1::InitState(HashWordType *state)
void SHA1::Transform(word32 *state, const word32 *data)
{
- static const pfnSHA1Transform s_pfn = InitializeSHA1Transform();
+ static const pfnSHATransform s_pfn = InitializeSHA1Transform();
s_pfn(state, data);
}
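Whichever transform the dispatcher picks, the output must match the FIPS 180 test vectors. A quick user-level check through the public API (assumes the Crypto++ headers are on the include path):

#include "sha.h"    // Crypto++ headers; <cryptopp/sha.h> if installed system-wide
#include <cstdio>

int main()
{
    unsigned char digest[CryptoPP::SHA1::DIGESTSIZE];
    CryptoPP::SHA1().CalculateDigest(digest,
        reinterpret_cast<const unsigned char*>("abc"), 3);
    // FIPS 180 test vector: a9993e364706816aba3e25717850c26c9cd0d89d
    for (unsigned int i = 0; i < sizeof(digest); i++)
        std::printf("%02x", digest[i]);
    std::printf("\n");
    return 0;
}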
@@ -683,18 +687,25 @@ void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data,
}
#endif
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+static void SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t blocks);
+#endif
+
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
+pfnSHAHashBlocks InitializeSHA256HashBlocks();
size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
{
- X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
- return length % BLOCKSIZE;
+ static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
+ s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
+ return length % BLOCKSIZE;
}
size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
{
- X86_SHA256_HashBlocks(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
- return length % BLOCKSIZE;
+ static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
+ s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
+ return length % BLOCKSIZE;
}
#endif
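The length expression above is terse; a worked example, assuming BLOCKSIZE == 64:

// size_t(0) - 64        == ~size_t(63)   (all ones except the low 6 bits)
// length & ~size_t(63)  rounds length down to a whole number of blocks,
// e.g. length == 200 -> 200 & ~63 == 192 (3 blocks); 200 % 64 == 8 remains.
// The trailing "- !HasSSE2()" subtracts 1 when SSE2 is absent. The masked
// length is otherwise always a multiple of 64, so the odd value apparently
// serves as an in-band "no SSE2" flag for the assembly in
// X86_SHA256_HashBlocks.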
@@ -724,7 +735,7 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
// Smaller but slower
#if defined(__OPTIMIZE_SIZE__)
-void SHA256::Transform(word32 *state, const word32 *data)
+void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
word32 T[20];
word32 W[32];
@@ -796,13 +807,13 @@ void SHA256::Transform(word32 *state, const word32 *data)
state[7] += t[7];
}
#else
-void SHA256::Transform(word32 *state, const word32 *data)
+void SHA256_CXX_Transform(word32 *state, const word32 *data)
{
word32 W[16];
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
// this byte reverse is a waste of time, but this function is only called by MDC
- ByteReverse(W, data, BLOCKSIZE);
- X86_SHA256_HashBlocks(state, W, BLOCKSIZE - !HasSSE2());
+ ByteReverse(W, data, SHA256::BLOCKSIZE);
+ X86_SHA256_HashBlocks(state, W, SHA256::BLOCKSIZE - !HasSSE2());
#else
word32 T[8];
/* Copy context->state[] to working vars */
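The ByteReverse in the preceding hunk exists because SHA-256 consumes big-endian 32-bit words while x86 stores them little-endian. Per word it amounts to this portable sketch; the SHA-NI code below performs the same swap four words at a time via _mm_shuffle_epi8 with the MASK constant, whose byte indices are {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}:

static unsigned int ByteReverse32(unsigned int x)
{
    // Swap the four bytes within one 32-bit word
    return (x << 24) | ((x & 0xFF00u) << 8) | ((x >> 8) & 0xFF00u) | (x >> 24);
}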
@@ -834,6 +845,238 @@ void SHA256::Transform(word32 *state, const word32 *data)
#undef s1
#undef R
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+static void SHA256_SSE_SHA_Transform(word32 *state, const word32 *data)
+{
+ return SHA256_SSE_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
+}
+
+// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
+static void SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
+{
+ CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
+ CRYPTOPP_ASSERT(length % SHA256::BLOCKSIZE == 0);
+
+ __m128i STATE0, STATE1;
+ __m128i MSG, TMP, MASK;
+ __m128i TMSG0, TMSG1, TMSG2, TMSG3;
+ __m128i ABEF_SAVE, CDGH_SAVE;
+
+ // Load initial hash values
+ TMP = _mm_loadu_si128((__m128i*) &state[0]);
+ STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
+ MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203));
+
+ TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
+ STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
+ STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
+ STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
+
+ while (length)
+ {
+ // Save hash values for addition after rounds
+ ABEF_SAVE = STATE0;
+ CDGH_SAVE = STATE1;
+
+ // Rounds 0-3
+ MSG = _mm_loadu_si128((__m128i*) (data+0));
+ TMSG0 = _mm_shuffle_epi8(MSG, MASK);
+ MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+ // Rounds 4-7
+ TMSG1 = _mm_loadu_si128((__m128i*) (data+4));
+ TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
+ MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+ // Rounds 8-11
+ TMSG2 = _mm_loadu_si128((__m128i*) (data+8));
+ TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
+ MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+ // Rounds 12-15
+ TMSG3 = _mm_loadu_si128((__m128i*) (data+12));
+ TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
+ MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+ TMSG0 = _mm_add_epi32(TMSG0, TMP);
+ TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+ // Rounds 16-19
+ MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+ TMSG1 = _mm_add_epi32(TMSG1, TMP);
+ TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+ // Rounds 20-23
+ MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+ TMSG2 = _mm_add_epi32(TMSG2, TMP);
+ TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+ // Rounds 24-27
+ MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+ TMSG3 = _mm_add_epi32(TMSG3, TMP);
+ TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+ // Rounds 28-31
+ MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+ TMSG0 = _mm_add_epi32(TMSG0, TMP);
+ TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+ // Rounds 32-35
+ MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+ TMSG1 = _mm_add_epi32(TMSG1, TMP);
+ TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+ // Rounds 36-39
+ MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+ TMSG2 = _mm_add_epi32(TMSG2, TMP);
+ TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
+
+ // Rounds 40-43
+ MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+ TMSG3 = _mm_add_epi32(TMSG3, TMP);
+ TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
+
+ // Rounds 44-47
+ MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
+ TMSG0 = _mm_add_epi32(TMSG0, TMP);
+ TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
+
+ // Rounds 48-51
+ MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
+ TMSG1 = _mm_add_epi32(TMSG1, TMP);
+ TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+ TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
+
+ // Rounds 52-55
+ MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
+ TMSG2 = _mm_add_epi32(TMSG2, TMP);
+ TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+ // Rounds 56-59
+ MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
+ TMSG3 = _mm_add_epi32(TMSG3, TMP);
+ TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+ // Rounds 60-63
+ MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+ // Add current hash values with previously saved
+ STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
+ STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
+
+ data += 16;
+ length -= SHA256::BLOCKSIZE;
+ }
+
+ // Write hash values back in the correct order
+ TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
+ STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
+ STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
+ STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
+
+ _mm_storeu_si128((__m128i*) &state[0], STATE0);
+ _mm_storeu_si128((__m128i*) &state[4], STATE1);
+}
+#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
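Roughly, every four-round group in the loop above has the same shape: _mm_sha256rnds2_epu32 executes two rounds and reads its W+K inputs from the low two dwords of its third operand, so the shuffle by 0x0E moves the upper pair down for the second issue, while _mm_sha256msg1_epu32/_mm_sha256msg2_epu32 carry the message schedule forward:

// Shape of each four-round group (annotation only, no new semantics):
//   MSG    = W[t..t+3] + K[t..t+3]              // four packed dwords
//   STATE1 = SHA256RNDS2(STATE1, STATE0, MSG)   // rounds t, t+1 use MSG[0..1]
//   MSG    = PSHUFD(MSG, 0x0E)                  // move MSG[2..3] into the low lanes
//   STATE0 = SHA256RNDS2(STATE0, STATE1, MSG)   // rounds t+2, t+3
// Meanwhile SHA256MSG1/SHA256MSG2 run the sigma0/sigma1 message-schedule
// recurrence, producing W[t+16..t+19] for later groups.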
+
+pfnSHATransform InitializeSHA256Transform()
+{
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+ if (HasSHA())
+ return &SHA256_SSE_SHA_Transform;
+ else
+#endif
+
+ return &SHA256_CXX_Transform;
+}
+
+pfnSHAHashBlocks InitializeSHA256HashBlocks()
+{
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+ if (HasSHA())
+ return &SHA256_SSE_SHA_HashBlocks;
+ else
+#endif
+
+ return &X86_SHA256_HashBlocks;
+}
+
+void SHA256::Transform(word32 *state, const word32 *data)
+{
+ static const pfnSHATransform s_pfn = InitializeSHA256Transform();
+ s_pfn(state, data);
+}
+
// *************************************************************
void SHA384::InitState(HashWordType *state)
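An end-to-end check of the SHA-256 paths added by this commit, again through the public API (FIPS 180-4 vector for "abc"):

#include "sha.h"
#include <cstdio>

int main()
{
    CryptoPP::SHA256 hash;
    unsigned char digest[CryptoPP::SHA256::DIGESTSIZE];
    hash.Update(reinterpret_cast<const unsigned char*>("abc"), 3);
    hash.Final(digest);
    // Expected: ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
    for (unsigned int i = 0; i < sizeof(digest); i++)
        std::printf("%02x", digest[i]);
    std::printf("\n");
    return 0;
}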