path: root/sha.cpp
author     Jeffrey Walton <noloader@gmail.com>    2017-08-13 16:05:39 -0400
committer  Jeffrey Walton <noloader@gmail.com>    2017-08-13 16:05:39 -0400
commit     2aff92ddb6e679fca04432f01a1a16a035e33008 (patch)
tree       87673ebbd4e2aee79d52717cafd6bf67fe1629df /sha.cpp
parent     863bf9133c81933f4417fa9de49850a7c181158c (diff)
download   cryptopp-git-2aff92ddb6e679fca04432f01a1a16a035e33008.tar.gz
Fix bad SHA::Transform calculation (Issue 455)
Reworked the SHA class internals to align all the implementations. Formerly all hashes were software-based: IterHashBase handled the endian conversions and repeatedly called the single-block SHA{N}::Transform. The rework added SHA{N}::HashMultipleBlocks, and the SHA classes attempt to always use it. Now SHA{N}::Transform calls into SHA{N}_HashMultipleBlocks, which is a free-standing function. An added wrinkle is that hardware wants little-endian data while software presents big-endian data, so HashMultipleBlocks accepts a ByteOrder for the incoming data. A hardware-based SHA{N}_HashMultipleBlocks can often perform the endian swap much more cheaply by setting an EPI mask, so it was profitable to defer to hardware when available. The rework also removed the hacked-in pointers to implementations. The class now looks more like AES, GCM, etc.
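
To make the new shape concrete, here is a minimal, self-contained C++ sketch of the dispatch pattern the message describes. The names mirror the patch below, but everything here is a stand-in: HasSHA() is a stubbed CPU-feature probe, the block bodies are empty, and the member functions are written as free functions so the sketch compiles on its own.

    #include <cstddef>
    #include <cstdint>

    typedef uint32_t word32;
    enum ByteOrder { LITTLE_ENDIAN_ORDER = 0, BIG_ENDIAN_ORDER = 1 };
    static const size_t BLOCKSIZE = 64;            // SHA-1/SHA-256 block size in bytes

    static bool HasSHA() { return false; }         // stand-in for the CPU feature probe

    // Free-standing hardware routine: hashes one or more 64-byte blocks. The
    // ByteOrder argument says how the incoming words are arranged so any byte
    // swap can be folded into the message load (body omitted in this sketch).
    static void SHA1_SHANI_HashMultipleBlocks(word32 *state, const word32 *data,
                                              size_t length, ByteOrder order)
    {
        (void)state; (void)data; (void)length; (void)order;
    }

    // Portable single-block compression function (body omitted in this sketch).
    static void SHA1_CXX_HashBlock(word32 *state, const word32 *data)
    {
        (void)state; (void)data;
    }

    // SHA1::Transform: one block, already endian-corrected by IterHashBase, so the
    // hardware path is told not to swap again (LITTLE_ENDIAN_ORDER, as in the patch).
    void SHA1_Transform(word32 *state, const word32 *data)
    {
        if (HasSHA())
        {
            SHA1_SHANI_HashMultipleBlocks(state, data, BLOCKSIZE, LITTLE_ENDIAN_ORDER);
            return;
        }
        SHA1_CXX_HashBlock(state, data);
    }

    // SHA1::HashMultipleBlocks: raw big-endian message stream, so the hardware path
    // performs the swap itself; the return value is the unprocessed tail in bytes.
    size_t SHA1_HashMultipleBlocks(word32 *state, const word32 *input, size_t length)
    {
        if (HasSHA())
        {
            SHA1_SHANI_HashMultipleBlocks(state, input, length, BIG_ENDIAN_ORDER);
            return length & (BLOCKSIZE - 1);
        }
        while (length >= BLOCKSIZE)                // software fallback, one block at a time
        {
            SHA1_CXX_HashBlock(state, input);      // real code byte-reverses first if needed
            input  += BLOCKSIZE / sizeof(word32);
            length -= BLOCKSIZE;
        }
        return length;
    }

In the patch the same structure is repeated for SHA-1, SHA-224 and SHA-256, with the SSE2 assembly path appearing as an extra branch in SHA256::HashMultipleBlocks.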
Diffstat (limited to 'sha.cpp')
-rw-r--r--  sha.cpp | 1161
1 file changed, 598 insertions(+), 563 deletions(-)
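
The "EPI mask" mentioned in the message is a byte-shuffle constant chosen from the ByteOrder argument, which folds the big-endian-to-little-endian swap into the message load instead of doing a separate reversal pass. A small illustrative sketch follows, assuming SSSE3 is available; the two mask constants are the SHA-256 ones from the diff below, while the helper name is invented for the example.

    #include <emmintrin.h>   // SSE2: _mm_loadu_si128, _mm_set_epi8
    #include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

    enum ByteOrder { LITTLE_ENDIAN_ORDER = 0, BIG_ENDIAN_ORDER = 1 };

    // Load 16 message bytes and present them as four 32-bit words ready for the
    // SHA intrinsics. When the caller says the buffer is a big-endian byte stream
    // (the usual SHA input), the shuffle byte-swaps each word during the load;
    // otherwise the mask is the identity, so no extra reversal pass happens.
    static inline __m128i LoadMessageWords(const void *p, ByteOrder order)
    {
        const __m128i MASK = (order == BIG_ENDIAN_ORDER) ?
            _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :   // per-word byte swap
            _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0);    // identity
        const __m128i msg = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
        return _mm_shuffle_epi8(msg, MASK);
    }

The ARM path in the diff achieves the same effect with a conditional vrev32q_u8 on the loaded vectors.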
diff --git a/sha.cpp b/sha.cpp
index 935adc38..72a82e61 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -6,6 +6,16 @@
// code from Johannes Schneiders, Skip Hovsmith and Barry O'Rourke.
// All code is in the public domain.
+// In August 2017 Walton reworked the internals to align all the implementations.
+// Formerly all hashes were software based, IterHashBase handled endian conversions,
+// IterHashBase repeatedly called the single block SHA{N}::Transform. The rework
+// added SHA{N}::HashMultipleBlocks, and the SHA classes attempt to always use it.
+// Now SHA{N}::Transform calls into SHA{N}::HashMultipleBlocks. An added wrinkle is
+// hardware wants little-endian data and software presents big-endian data, so
+// HashMultipleBlocks accepts a ByteOrder for the incoming data. Hardware-based
+// SHA{N}::HashMultipleBlocks can often perform the endian swap much more cheaply
+// by setting an EPI mask. The rework also removed the hacked-in pointers to implementations.
+
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
#include "pch.h"
@@ -30,11 +40,11 @@
# undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#endif
-NAMESPACE_BEGIN(CryptoPP)
+// Clang __m128i casts
+#define M128_CAST(x) ((__m128i *)(void *)(x))
+#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
-// Function pointer for specific SHA1 or SHA256 Transform function
-typedef void (*pfnSHATransform)(word32 *state, const word32 *data);
-typedef void (CRYPTOPP_FASTCALL *pfnSHAHashBlocks)(word32 *state, const word32 *data, size_t length);
+NAMESPACE_BEGIN(CryptoPP)
////////////////////////////////
// start of Steve Reid's code //
@@ -55,8 +65,11 @@ typedef void (CRYPTOPP_FASTCALL *pfnSHAHashBlocks)(word32 *state, const word32 *
#define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30);
#define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30);
-static void SHA1_CXX_Transform(word32 *state, const word32 *data)
+static void SHA1_CXX_HashBlock(word32 *state, const word32 *data)
{
+ CRYPTOPP_ASSERT(state);
+ CRYPTOPP_ASSERT(data);
+
word32 W[16];
/* Copy context->state[] to working vars */
word32 a = state[0];
@@ -103,184 +116,200 @@ static void SHA1_CXX_Transform(word32 *state, const word32 *data)
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
-static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
+static void SHA1_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
+ CRYPTOPP_ASSERT(state);
+ CRYPTOPP_ASSERT(data);
+ CRYPTOPP_ASSERT(length >= 64);
+
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
__m128i MASK, MSG0, MSG1, MSG2, MSG3;
// Load initial values
- ABCD = _mm_loadu_si128((__m128i*) state);
+ ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
E0 = _mm_set_epi32(state[4], 0, 0, 0);
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
- MASK = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
-
- // Save current hash
- ABCD_SAVE = ABCD;
- E0_SAVE = E0;
-
- // Rounds 0-3
- MSG0 = _mm_loadu_si128((__m128i*) data+0);
- MSG0 = _mm_shuffle_epi8(MSG0, MASK);
- E0 = _mm_add_epi32(E0, MSG0);
- E1 = ABCD;
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
-
- // Rounds 4-7
- MSG1 = _mm_loadu_si128((__m128i*) (data+4));
- MSG1 = _mm_shuffle_epi8(MSG1, MASK);
- E1 = _mm_sha1nexte_epu32(E1, MSG1);
- E0 = ABCD;
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
- MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-
- // Rounds 8-11
- MSG2 = _mm_loadu_si128((__m128i*) (data+8));
- MSG2 = _mm_shuffle_epi8(MSG2, MASK);
- E0 = _mm_sha1nexte_epu32(E0, MSG2);
- E1 = ABCD;
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
- MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
- MSG0 = _mm_xor_si128(MSG0, MSG2);
-
- // Rounds 12-15
- MSG3 = _mm_loadu_si128((__m128i*) (data+12));
- MSG3 = _mm_shuffle_epi8(MSG3, MASK);
- E1 = _mm_sha1nexte_epu32(E1, MSG3);
- E0 = ABCD;
- MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
- MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
- MSG1 = _mm_xor_si128(MSG1, MSG3);
-
- // Rounds 16-19
- E0 = _mm_sha1nexte_epu32(E0, MSG0);
- E1 = ABCD;
- MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
- MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
- MSG2 = _mm_xor_si128(MSG2, MSG0);
-
- // Rounds 20-23
- E1 = _mm_sha1nexte_epu32(E1, MSG1);
- E0 = ABCD;
- MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
- MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
- MSG3 = _mm_xor_si128(MSG3, MSG1);
-
- // Rounds 24-27
- E0 = _mm_sha1nexte_epu32(E0, MSG2);
- E1 = ABCD;
- MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
- MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
- MSG0 = _mm_xor_si128(MSG0, MSG2);
-
- // Rounds 28-31
- E1 = _mm_sha1nexte_epu32(E1, MSG3);
- E0 = ABCD;
- MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
- MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
- MSG1 = _mm_xor_si128(MSG1, MSG3);
-
- // Rounds 32-35
- E0 = _mm_sha1nexte_epu32(E0, MSG0);
- E1 = ABCD;
- MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
- MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
- MSG2 = _mm_xor_si128(MSG2, MSG0);
-
- // Rounds 36-39
- E1 = _mm_sha1nexte_epu32(E1, MSG1);
- E0 = ABCD;
- MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
- MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
- MSG3 = _mm_xor_si128(MSG3, MSG1);
-
- // Rounds 40-43
- E0 = _mm_sha1nexte_epu32(E0, MSG2);
- E1 = ABCD;
- MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
- MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
- MSG0 = _mm_xor_si128(MSG0, MSG2);
-
- // Rounds 44-47
- E1 = _mm_sha1nexte_epu32(E1, MSG3);
- E0 = ABCD;
- MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
- MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
- MSG1 = _mm_xor_si128(MSG1, MSG3);
-
- // Rounds 48-51
- E0 = _mm_sha1nexte_epu32(E0, MSG0);
- E1 = ABCD;
- MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
- MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
- MSG2 = _mm_xor_si128(MSG2, MSG0);
-
- // Rounds 52-55
- E1 = _mm_sha1nexte_epu32(E1, MSG1);
- E0 = ABCD;
- MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
- MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
- MSG3 = _mm_xor_si128(MSG3, MSG1);
-
- // Rounds 56-59
- E0 = _mm_sha1nexte_epu32(E0, MSG2);
- E1 = ABCD;
- MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
- MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
- MSG0 = _mm_xor_si128(MSG0, MSG2);
-
- // Rounds 60-63
- E1 = _mm_sha1nexte_epu32(E1, MSG3);
- E0 = ABCD;
- MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
- MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
- MSG1 = _mm_xor_si128(MSG1, MSG3);
-
- // Rounds 64-67
- E0 = _mm_sha1nexte_epu32(E0, MSG0);
- E1 = ABCD;
- MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
- MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
- MSG2 = _mm_xor_si128(MSG2, MSG0);
-
- // Rounds 68-71
- E1 = _mm_sha1nexte_epu32(E1, MSG1);
- E0 = ABCD;
- MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
- MSG3 = _mm_xor_si128(MSG3, MSG1);
-
- // Rounds 72-75
- E0 = _mm_sha1nexte_epu32(E0, MSG2);
- E1 = ABCD;
- MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
- ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
-
- // Rounds 76-79
- E1 = _mm_sha1nexte_epu32(E1, MSG3);
- E0 = ABCD;
- ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-
- // Add values back to state
- E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
- ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
+
+ // IA-32 SHA is little endian, SHA::Transform is big endian,
+ // and SHA::HashMultipleBlocks can be either. ByteOrder
+ // allows us to avoid extra endian reversals. It saves 1.0 cpb.
+ MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
+ _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
+ _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
+
+ while (length >= 64)
+ {
+ // Save current hash
+ ABCD_SAVE = ABCD;
+ E0_SAVE = E0;
+
+ // Rounds 0-3
+ MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
+ MSG0 = _mm_shuffle_epi8(MSG0, MASK);
+ E0 = _mm_add_epi32(E0, MSG0);
+ E1 = ABCD;
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+
+ // Rounds 4-7
+ MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
+ MSG1 = _mm_shuffle_epi8(MSG1, MASK);
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
+ MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+
+ // Rounds 8-11
+ MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
+ MSG2 = _mm_shuffle_epi8(MSG2, MASK);
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+ MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+ MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+ // Rounds 12-15
+ MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
+ MSG3 = _mm_shuffle_epi8(MSG3, MASK);
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
+ MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+ MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+ // Rounds 16-19
+ E0 = _mm_sha1nexte_epu32(E0, MSG0);
+ E1 = ABCD;
+ MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+ MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+ MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+ // Rounds 20-23
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+ MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+ MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+ // Rounds 24-27
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+ MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+ MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+ // Rounds 28-31
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+ MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+ MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+ // Rounds 32-35
+ E0 = _mm_sha1nexte_epu32(E0, MSG0);
+ E1 = ABCD;
+ MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+ MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+ MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+ // Rounds 36-39
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+ MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+ MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+ // Rounds 40-43
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+ MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+ MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+ // Rounds 44-47
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+ MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+ MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+ // Rounds 48-51
+ E0 = _mm_sha1nexte_epu32(E0, MSG0);
+ E1 = ABCD;
+ MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+ MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+ MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+ // Rounds 52-55
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+ MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+ MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+ // Rounds 56-59
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+ MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+ MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+ // Rounds 60-63
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+ MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+ MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+ // Rounds 64-67
+ E0 = _mm_sha1nexte_epu32(E0, MSG0);
+ E1 = ABCD;
+ MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+ MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+ MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+ // Rounds 68-71
+ E1 = _mm_sha1nexte_epu32(E1, MSG1);
+ E0 = ABCD;
+ MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+ MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+ // Rounds 72-75
+ E0 = _mm_sha1nexte_epu32(E0, MSG2);
+ E1 = ABCD;
+ MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+
+ // Rounds 76-79
+ E1 = _mm_sha1nexte_epu32(E1, MSG3);
+ E0 = ABCD;
+ ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+
+ // Add values back to state
+ E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
+ ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
+
+ data += 16;
+ length -= 64;
+ }
// Save state
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
- _mm_storeu_si128((__m128i*) state, ABCD);
+ _mm_storeu_si128(M128_CAST(state), ABCD);
state[4] = _mm_extract_epi32(E0, 3);
}
#endif
@@ -294,8 +323,12 @@ static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
//////////////////////////////////////////////////////////////
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
-static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
+static void SHA1_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
+ CRYPTOPP_ASSERT(state);
+ CRYPTOPP_ASSERT(data);
+ CRYPTOPP_ASSERT(length >= 64);
+
uint32x4_t C0, C1, C2, C3;
uint32x4_t ABCD, ABCD_SAVED;
uint32x4_t MSG0, MSG1, MSG2, MSG3;
@@ -311,152 +344,166 @@ static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
ABCD = vld1q_u32(&state[0]);
E0 = state[4];
- // Save current hash
- ABCD_SAVED = ABCD;
- E0_SAVED = E0;
-
- MSG0 = vld1q_u32(data + 0);
- MSG1 = vld1q_u32(data + 4);
- MSG2 = vld1q_u32(data + 8);
- MSG3 = vld1q_u32(data + 12);
-
- TMP0 = vaddq_u32(MSG0, C0);
- TMP1 = vaddq_u32(MSG1, C0);
-
- // Rounds 0-3
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1cq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG2, C0);
- MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
- // Rounds 4-7
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1cq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG3, C0);
- MSG0 = vsha1su1q_u32(MSG0, MSG3);
- MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
-
- // Rounds 8-11
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1cq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG0, C0);
- MSG1 = vsha1su1q_u32(MSG1, MSG0);
- MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
-
- // Rounds 12-15
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1cq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG1, C1);
- MSG2 = vsha1su1q_u32(MSG2, MSG1);
- MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
-
- // Rounds 16-19
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1cq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG2, C1);
- MSG3 = vsha1su1q_u32(MSG3, MSG2);
- MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
- // Rounds 20-23
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG3, C1);
- MSG0 = vsha1su1q_u32(MSG0, MSG3);
- MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
-
- // Rounds 24-27
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG0, C1);
- MSG1 = vsha1su1q_u32(MSG1, MSG0);
- MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
-
- // Rounds 28-31
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG1, C1);
- MSG2 = vsha1su1q_u32(MSG2, MSG1);
- MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
-
- // Rounds 32-35
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG2, C2);
- MSG3 = vsha1su1q_u32(MSG3, MSG2);
- MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
- // Rounds 36-39
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG3, C2);
- MSG0 = vsha1su1q_u32(MSG0, MSG3);
- MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
-
- // Rounds 40-43
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1mq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG0, C2);
- MSG1 = vsha1su1q_u32(MSG1, MSG0);
- MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
-
- // Rounds 44-47
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1mq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG1, C2);
- MSG2 = vsha1su1q_u32(MSG2, MSG1);
- MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
-
- // Rounds 48-51
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1mq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG2, C2);
- MSG3 = vsha1su1q_u32(MSG3, MSG2);
- MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
- // Rounds 52-55
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1mq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG3, C3);
- MSG0 = vsha1su1q_u32(MSG0, MSG3);
- MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
-
- // Rounds 56-59
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1mq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG0, C3);
- MSG1 = vsha1su1q_u32(MSG1, MSG0);
- MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
-
- // Rounds 60-63
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG1, C3);
- MSG2 = vsha1su1q_u32(MSG2, MSG1);
- MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
-
- // Rounds 64-67
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E0, TMP0);
- TMP0 = vaddq_u32(MSG2, C3);
- MSG3 = vsha1su1q_u32(MSG3, MSG2);
- MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
- // Rounds 68-71
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E1, TMP1);
- TMP1 = vaddq_u32(MSG3, C3);
- MSG0 = vsha1su1q_u32(MSG0, MSG3);
-
- // Rounds 72-75
- E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E0, TMP0);
-
- // Rounds 76-79
- E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
- ABCD = vsha1pq_u32(ABCD, E1, TMP1);
-
- E0 += E0_SAVED;
- ABCD = vaddq_u32(ABCD_SAVED, ABCD);
+ while (length >= 64)
+ {
+ // Save current hash
+ ABCD_SAVED = ABCD;
+ E0_SAVED = E0;
+
+ MSG0 = vld1q_u32(data + 0);
+ MSG1 = vld1q_u32(data + 4);
+ MSG2 = vld1q_u32(data + 8);
+ MSG3 = vld1q_u32(data + 12);
+
+ if (order == BIG_ENDIAN_ORDER) // Data arrangement
+ {
+ MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+ MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+ MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+ MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+ }
+
+ TMP0 = vaddq_u32(MSG0, C0);
+ TMP1 = vaddq_u32(MSG1, C0);
+
+ // Rounds 0-3
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C0);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 4-7
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C0);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 8-11
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C0);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 12-15
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C1);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 16-19
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C1);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 20-23
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C1);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 24-27
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C1);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 28-31
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C1);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 32-35
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C2);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 36-39
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C2);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 40-43
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C2);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 44-47
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C2);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 48-51
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C2);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 52-55
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C3);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 56-59
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C3);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 60-63
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C3);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 64-67
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C3);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 68-71
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C3);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+
+ // Rounds 72-75
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+
+ // Rounds 76-79
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+
+ E0 += E0_SAVED;
+ ABCD = vaddq_u32(ABCD_SAVED, ABCD);
+
+ data += 16;
+ length -= 64;
+ }
// Save state
vst1q_u32(&state[0], ABCD);
@@ -468,21 +515,6 @@ static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
// end of Walton/Schneiders/O'Rourke/Hovsmith's code //
///////////////////////////////////////////////////////
-pfnSHATransform InitializeSHA1Transform()
-{
-#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
- if (HasSHA())
- return &SHA1_SSE_SHA_Transform;
- else
-#endif
-#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
- if (HasSHA1())
- return &SHA1_ARM_SHA_Transform;
- else
-#endif
- return &SHA1_CXX_Transform;
-}
-
void SHA1::InitState(HashWordType *state)
{
state[0] = 0x67452301L;
@@ -494,53 +526,75 @@ void SHA1::InitState(HashWordType *state)
void SHA1::Transform(word32 *state, const word32 *data)
{
- static const pfnSHATransform s_pfn = InitializeSHA1Transform();
- s_pfn(state, data);
-}
+ CRYPTOPP_ASSERT(state);
+ CRYPTOPP_ASSERT(data);
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+ if (HasSHA())
+ {
+ SHA1_SHANI_HashMultipleBlocks(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
+ return;
+ }
+#endif
+#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
+ if (HasSHA1())
+ {
+ SHA1_ARM_SHA_HashMultipleBlocks(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
+ return;
+ }
+#endif
+
+ SHA1_CXX_HashBlock(state, data);
+}
+
size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length)
{
- static const bool noReverse = HasSHA() || NativeByteOrderIs(this->GetByteOrder());
- const unsigned int blockSize = this->BlockSize();
- word32* dataBuf = this->DataBuf();
+ CRYPTOPP_ASSERT(input);
+ CRYPTOPP_ASSERT(length >= 64);
+
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+ if (HasSHA())
+ {
+ SHA1_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+ return length & (SHA1::BLOCKSIZE - 1);
+ }
+#endif
+#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
+ if (HasSHA1())
+ {
+ SHA1_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+ return length & (SHA1::BLOCKSIZE - 1);
+ }
+#endif
+
+ const bool noReverse = NativeByteOrderIs(this->GetByteOrder());
+ word32 *dataBuf = this->DataBuf();
do
{
if (noReverse)
- this->HashEndianCorrectedBlock(input);
+ {
+ // this->HashEndianCorrectedBlock(input);
+ SHA1_CXX_HashBlock(m_state, input);
+ }
else
{
- ByteReverse(dataBuf, input, this->BlockSize());
- this->HashEndianCorrectedBlock(dataBuf);
+ ByteReverse(dataBuf, input, 64);
+ // this->HashEndianCorrectedBlock(dataBuf);
+ SHA1_CXX_HashBlock(m_state, dataBuf);
}
- input += blockSize/sizeof(word32);
- length -= blockSize;
+ input += 16;
+ length -= 64;
}
- while (length >= blockSize);
+ while (length >= 64);
return length;
}
-#endif
// *************************************************************
-void SHA224::InitState(HashWordType *state)
-{
- static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
- memcpy(state, s, sizeof(s));
-}
+CRYPTOPP_ALIGN_DATA(16)
+extern const word32 SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
-void SHA256::InitState(HashWordType *state)
-{
- static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
- memcpy(state, s, sizeof(s));
-}
-
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
-CRYPTOPP_ALIGN_DATA(16) extern const word32 SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
-#else
-extern const word32 SHA256_K[64] = {
-#endif
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -559,11 +613,75 @@ extern const word32 SHA256_K[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
+#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
+
+#define Ch(x,y,z) (z^(x&(y^z)))
+#define Maj(x,y,z) (y^((x^y)&(y^z)))
+
+#define a(i) T[(0-i)&7]
+#define b(i) T[(1-i)&7]
+#define c(i) T[(2-i)&7]
+#define d(i) T[(3-i)&7]
+#define e(i) T[(4-i)&7]
+#define f(i) T[(5-i)&7]
+#define g(i) T[(6-i)&7]
+#define h(i) T[(7-i)&7]
+
+#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
+ d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
+
+// for SHA256
+#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
+#define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25))
+#define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3))
+#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10))
+
+static void SHA256_CXX_HashBlock(word32 *state, const word32 *data)
+{
+ word32 W[16], T[8];
+ /* Copy context->state[] to working vars */
+ memcpy(T, state, sizeof(T));
+ /* 64 operations, partially loop unrolled */
+ for (unsigned int j=0; j<64; j+=16)
+ {
+ R( 0); R( 1); R( 2); R( 3);
+ R( 4); R( 5); R( 6); R( 7);
+ R( 8); R( 9); R(10); R(11);
+ R(12); R(13); R(14); R(15);
+ }
+ /* Add the working vars back into context.state[] */
+ state[0] += a(0);
+ state[1] += b(0);
+ state[2] += c(0);
+ state[3] += d(0);
+ state[4] += e(0);
+ state[5] += f(0);
+ state[6] += g(0);
+ state[7] += h(0);
+}
+
+#undef S0
+#undef S1
+#undef s0
+#undef s1
+#undef R
+
+void SHA224::InitState(HashWordType *state)
+{
+ static const word32 s[8] = {0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4};
+ memcpy(state, s, sizeof(s));
+}
+
+void SHA256::InitState(HashWordType *state)
+{
+ static const word32 s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
+ memcpy(state, s, sizeof(s));
+}
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_GENERATE_X64_MASM))
-static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len)
+static void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 *data, size_t len)
{
#define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ
#define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4]
@@ -685,7 +803,7 @@ static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32
INTEL_NOPREFIX
#elif defined(CRYPTOPP_GENERATE_X64_MASM)
ALIGN 8
- X86_SHA256_HashBlocks PROC FRAME
+ SHA256_SSE_HashMultipleBlocks PROC FRAME
rex_push_reg rsi
push_reg rdi
push_reg rbx
@@ -864,7 +982,7 @@ INTEL_NOPREFIX
pop rdi
pop rsi
ret
- X86_SHA256_HashBlocks ENDP
+ SHA256_SSE_HashMultipleBlocks ENDP
#endif
#ifdef __GNUC__
@@ -888,200 +1006,109 @@ INTEL_NOPREFIX
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
extern "C" {
-void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len);
+void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 *data, size_t len);
}
#endif
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
-static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
+static void SHA256_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order);
#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
-static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
+static void SHA256_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order);
#endif
-#if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
-
-pfnSHAHashBlocks InitializeSHA256HashBlocks()
+size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
{
+ CRYPTOPP_ASSERT(input);
+ CRYPTOPP_ASSERT(length >= 64);
+
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
- return &SHA256_SSE_SHA_HashBlocks;
- else
+ {
+ SHA256_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+ return length & (SHA256::BLOCKSIZE - 1);
+ }
+#endif
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+ if (HasSSE2())
+ {
+ const size_t res = length & (SHA256::BLOCKSIZE - 1);
+ SHA256_SSE_HashMultipleBlocks(m_state, input, length-res);
+ return res;
+ }
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA2())
- return &SHA256_ARM_SHA_HashBlocks;
- else
+ {
+ SHA256_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+ return length & (SHA256::BLOCKSIZE - 1);
+ }
#endif
- return &X86_SHA256_HashBlocks;
-}
+ const bool noReverse = NativeByteOrderIs(this->GetByteOrder());
+ word32 *dataBuf = this->DataBuf();
+ do
+ {
+ if (noReverse)
+ {
+ // this->HashEndianCorrectedBlock(input);
+ SHA256_CXX_HashBlock(m_state, input);
+ }
+ else
+ {
+ ByteReverse(dataBuf, input, SHA256::BLOCKSIZE);
+ // this->HashEndianCorrectedBlock(dataBuf);
+ SHA256_CXX_HashBlock(m_state, dataBuf);
+ }
-size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length)
-{
- static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
- s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
- return length % BLOCKSIZE;
+ input += SHA256::BLOCKSIZE/sizeof(word32);
+ length -= SHA256::BLOCKSIZE;
+ }
+ while (length >= SHA256::BLOCKSIZE);
+ return length;
}
size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length)
{
- static const pfnSHAHashBlocks s_pfn = InitializeSHA256HashBlocks();
- s_pfn(m_state, input, (length&(size_t(0)-BLOCKSIZE)) - !HasSSE2());
- return length % BLOCKSIZE;
-}
-#endif
-
-#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
-
-#define Ch(x,y,z) (z^(x&(y^z)))
-#define Maj(x,y,z) (y^((x^y)&(y^z)))
-
-#define a(i) T[(0-i)&7]
-#define b(i) T[(1-i)&7]
-#define c(i) T[(2-i)&7]
-#define d(i) T[(3-i)&7]
-#define e(i) T[(4-i)&7]
-#define f(i) T[(5-i)&7]
-#define g(i) T[(6-i)&7]
-#define h(i) T[(7-i)&7]
-
-#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\
- d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
-
-// for SHA256
-#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
-#define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25))
-#define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3))
-#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10))
-
-#if defined(__OPTIMIZE_SIZE__)
-// Smaller but slower
-void SHA256_CXX_Transform(word32 *state, const word32 *data)
-{
- word32 W[32], T[20];
- unsigned int i = 0, j = 0;
- word32 *t = T+8;
+ CRYPTOPP_ASSERT(input);
+ CRYPTOPP_ASSERT(length >= 64);
- memcpy(t, state, 8*4);
- word32 e = t[4], a = t[0];
-
- do
+#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+ if (HasSHA())
+ {
+ SHA256_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+ return length & (SHA256::BLOCKSIZE - 1);
+ }
+#endif
+#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
+ if (HasSHA2())
{
- word32 w = data[j];
- W[j] = w;
- w += SHA256_K[j];
- w += t[7];
- w += S1(e);
- w += Ch(e, t[5], t[6]);
- e = t[3] + w;
- t[3] = t[3+8] = e;
- w += S0(t[0]);
- a = w + Maj(a, t[1], t[2]);
- t[-1] = t[7] = a;
- --t;
- ++j;
- if (j%8 == 0)
- t += 8;
- } while (j<16);
+ SHA256_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER);
+ return length & (SHA256::BLOCKSIZE - 1);
+ }
+#endif
+ const bool noReverse = NativeByteOrderIs(this->GetByteOrder());
+ word32 *dataBuf = this->DataBuf();
do
{
- i = j&0xf;
- word32 w = s1(W[i+16-2]) + s0(W[i+16-15]) + W[i] + W[i+16-7];
- W[i+16] = W[i] = w;
- w += SHA256_K[j];
- w += t[7];
- w += S1(e);
- w += Ch(e, t[5], t[6]);
- e = t[3] + w;
- t[3] = t[3+8] = e;
- w += S0(t[0]);
- a = w + Maj(a, t[1], t[2]);
- t[-1] = t[7] = a;
-
- w = s1(W[(i+1)+16-2]) + s0(W[(i+1)+16-15]) + W[(i+1)] + W[(i+1)+16-7];
- W[(i+1)+16] = W[(i+1)] = w;
- w += SHA256_K[j+1];
- w += (t-1)[7];
- w += S1(e);
- w += Ch(e, (t-1)[5], (t-1)[6]);
- e = (t-1)[3] + w;
- (t-1)[3] = (t-1)[3+8] = e;
- w += S0((t-1)[0]);
- a = w + Maj(a, (t-1)[1], (t-1)[2]);
- (t-1)[-1] = (t-1)[7] = a;
-
- t-=2;
- j+=2;
- if (j%8 == 0)
- t += 8;
- } while (j<64);
+ if (noReverse)
+ {
+ // this->HashEndianCorrectedBlock(input);
+ SHA256_CXX_HashBlock(m_state, input);
+ }
+ else
+ {
+ ByteReverse(dataBuf, input, SHA256::BLOCKSIZE);
+ // this->HashEndianCorrectedBlock(dataBuf);
+ SHA256_CXX_HashBlock(m_state, dataBuf);
+ }
- state[0] += a;
- state[1] += t[1];
- state[2] += t[2];
- state[3] += t[3];
- state[4] += e;
- state[5] += t[5];
- state[6] += t[6];
- state[7] += t[7];
-}
-#else
-// Bigger but faster
-void SHA256_CXX_Transform(word32 *state, const word32 *data)
-{
- word32 W[16], T[8];
- /* Copy context->state[] to working vars */
- memcpy(T, state, sizeof(T));
- /* 64 operations, partially loop unrolled */
- for (unsigned int j=0; j<64; j+=16)
- {
- R( 0); R( 1); R( 2); R( 3);
- R( 4); R( 5); R( 6); R( 7);
- R( 8); R( 9); R(10); R(11);
- R(12); R(13); R(14); R(15);
+ input += SHA256::BLOCKSIZE/sizeof(word32);
+ length -= SHA256::BLOCKSIZE;
}
- /* Add the working vars back into context.state[] */
- state[0] += a(0);
- state[1] += b(0);
- state[2] += c(0);
- state[3] += d(0);
- state[4] += e(0);
- state[5] += f(0);
- state[6] += g(0);
- state[7] += h(0);
-}
-#endif // __OPTIMIZE_SIZE__
-
-#undef S0
-#undef S1
-#undef s0
-#undef s1
-#undef R
-
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
-static void SHA256_SSE2_Transform(word32 *state, const word32 *data)
-{
- // this byte reverse is a waste of time, but this function is only called by MDC
- word32 W[16];
- ByteReverse(W, data, SHA256::BLOCKSIZE);
- X86_SHA256_HashBlocks(state, W, SHA256::BLOCKSIZE - !HasSSE2());
-}
-#endif // CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
-
-#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
-static void SHA256_SSE_SHA_Transform(word32 *state, const word32 *data)
-{
- return SHA256_SSE_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
-}
-#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
-
-#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
-static void SHA256_ARM_SHA_Transform(word32 *state, const word32 *data)
-{
- return SHA256_ARM_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
+ while (length >= SHA256::BLOCKSIZE);
+ return length;
}
-#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
///////////////////////////////////
// start of Walton/Gulley's code //
@@ -1089,10 +1116,11 @@ static void SHA256_ARM_SHA_Transform(word32 *state, const word32 *data)
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
-static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
+static void SHA256_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
- CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
- CRYPTOPP_ASSERT(length % SHA256::BLOCKSIZE == 0);
+ CRYPTOPP_ASSERT(state);
+ CRYPTOPP_ASSERT(data);
+ CRYPTOPP_ASSERT(length >= 64);
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
@@ -1100,9 +1128,15 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
- TMP = _mm_loadu_si128((__m128i*) &state[0]);
- STATE1 = _mm_loadu_si128((__m128i*) &state[4]);
- MASK = _mm_set_epi64x(W64LIT(0x0c0d0e0f08090a0b), W64LIT(0x0405060700010203));
+ TMP = _mm_loadu_si128(M128_CAST(&state[0]));
+ STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
+
+ // IA-32 SHA is little endian, SHA::Transform is big endian,
+ // and SHA::HashMultipleBlocks can be either. ByteOrder
+ // allows us to avoid extra endian reversals. It saves 1.0 cpb.
+ MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
+ _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
+ _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
@@ -1116,7 +1150,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
CDGH_SAVE = STATE1;
// Rounds 0-3
- MSG = _mm_loadu_si128((__m128i*) data+0);
+ MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
@@ -1124,7 +1158,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
- TMSG1 = _mm_loadu_si128((__m128i*) (data+4));
+ TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
@@ -1133,7 +1167,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
- TMSG2 = _mm_loadu_si128((__m128i*) (data+8));
+ TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
@@ -1142,7 +1176,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
- TMSG3 = _mm_loadu_si128((__m128i*) (data+12));
+ TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
@@ -1281,8 +1315,8 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
- _mm_storeu_si128((__m128i*) &state[0], STATE0);
- _mm_storeu_si128((__m128i*) &state[4], STATE1);
+ _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
+ _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
}
#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
@@ -1295,7 +1329,7 @@ static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const wor
/////////////////////////////////////////////////////////
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
-static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
+static void SHA256_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order)
{
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
uint32x4_t MSG0, MSG1, MSG2, MSG3;
@@ -1317,6 +1351,14 @@ static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const wor
MSG2 = vld1q_u32(data + 8);
MSG3 = vld1q_u32(data + 12);
+ if (order == BIG_ENDIAN_ORDER) // Data arrangement
+ {
+ MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+ MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+ MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+ MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+ }
+
TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
// Rounds 0-3
@@ -1456,31 +1498,24 @@ static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const wor
// end of Walton/Schneiders/O'Rourke/Hovsmith's code //
///////////////////////////////////////////////////////
-pfnSHATransform InitializeSHA256Transform()
+void SHA256::Transform(word32 *state, const word32 *data)
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
if (HasSHA())
- return &SHA256_SSE_SHA_Transform;
- else
-#endif
-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
- if (HasSSE2())
- return &SHA256_SSE2_Transform;
- else
+ {
+ SHA256_SHANI_HashMultipleBlocks(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
+ return;
+ }
#endif
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
if (HasSHA2())
- return &SHA256_ARM_SHA_Transform;
- else
+ {
+ SHA256_ARM_SHA_HashMultipleBlocks(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER);
+ return;
+ }
#endif
- return &SHA256_CXX_Transform;
-}
-
-void SHA256::Transform(word32 *state, const word32 *data)
-{
- static const pfnSHATransform s_pfn = InitializeSHA256Transform();
- s_pfn(state, data);
+ SHA256_CXX_HashBlock(state, data);
}
// *************************************************************