summaryrefslogtreecommitdiff
path: root/sha.cpp
diff options
context:
space:
mode:
authorJeffrey Walton <noloader@gmail.com>2017-01-13 05:44:19 -0500
committerJeffrey Walton <noloader@gmail.com>2017-01-13 05:44:19 -0500
commitd3bb0e13def80a4bf59ea324eed80becdf0045ce (patch)
treec043bc9ed60bfaa959a12b5c88d83a2415e123c5 /sha.cpp
parent52f13eca9b9a3fd086dda30c3b5b78b3505e6cd1 (diff)
downloadcryptopp-git-d3bb0e13def80a4bf59ea324eed80becdf0045ce.tar.gz
Add ARM SHA extensions for SHA1
Benchmarking on ARMv8/Aarch64 dev-board shows SHA-1 speeds up by 2.5x
Diffstat (limited to 'sha.cpp')
-rw-r--r--sha.cpp190
1 files changed, 187 insertions, 3 deletions
diff --git a/sha.cpp b/sha.cpp
index 83e8945e..9b23b9fb 100644
--- a/sha.cpp
+++ b/sha.cpp
@@ -1,8 +1,9 @@
// sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c
// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton
-// implemented Intel SHA extensions based on Intel articles and code by
-// Sean Gulley. All code is in the public domain.
+// implemented Intel SHA extensions based on Intel articles and code by
+// Sean Gulley. Jeffrey Walton implemented ARM SHA based on ARM ARM and
+// code from Johannes Schneiders. All code is in the public domain.
// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sha.cpp" to generate MASM code
@@ -293,6 +294,185 @@ static void SHA1_SSE_SHA_Transform(word32 *state, const word32 *data)
// end of Walton/Gulley's code //
/////////////////////////////////
+//////////////////////////////////////
+// start of Walton/Schneiders' code //
+//////////////////////////////////////
+
+#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
+static void SHA1_ARM_SHA_Transform(word32 *state, const word32 *data)
+{
+ uint32x4_t C0, C1, C2, C3;
+ uint32x4_t ABCD, ABCD_SAVED;
+ uint32x4_t MSG0, MSG1, MSG2, MSG3;
+ uint32x4_t TMP0, TMP1;
+ uint32_t E0, E0_SAVED, E1;
+
+ // Load initial values
+ C0 = vdupq_n_u32(0x5A827999);
+ C1 = vdupq_n_u32(0x6ED9EBA1);
+ C2 = vdupq_n_u32(0x8F1BBCDC);
+ C3 = vdupq_n_u32(0xCA62C1D6);
+
+ ABCD = vld1q_u32(&state[0]);
+ E0 = state[4];
+
+ // Save current hash
+ ABCD_SAVED = ABCD;
+ E0_SAVED = E0;
+
+ MSG0 = vld1q_u32(data + 0);
+ MSG1 = vld1q_u32(data + 4);
+ MSG2 = vld1q_u32(data + 8);
+ MSG3 = vld1q_u32(data + 12);
+
+ TMP0 = vaddq_u32(MSG0, C0);
+ TMP1 = vaddq_u32(MSG1, C0);
+
+ // Rounds 0-3
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C0);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 4-7
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C0);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 8-11
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0); /* 2 */
+ TMP0 = vaddq_u32(MSG0, C0);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 12-15
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C1);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 16-19
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C1);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 20-23
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C1);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 24-27
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C1);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 28-31
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C1);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 32-35
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C2);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 36-39
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C2);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 40-43
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C2);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 44-47
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C2);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 48-51
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C2);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 52-55
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C3);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 56-59
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C3);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 60-63
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C3);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 64-67
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C3);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 68-71
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C3);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+
+ // Rounds 72-75
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+
+ // Rounds 76-79
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+
+ E0 += E0_SAVED;
+ ABCD = vaddq_u32(ABCD_SAVED, ABCD);
+
+ // Save state
+ vst1q_u32(&state[0], ABCD);
+ state[4] = E0;
+}
+#endif
+
+////////////////////////////////////
+// end of Walton/Schneiders' code //
+////////////////////////////////////
+
pfnSHATransform InitializeSHA1Transform()
{
#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
@@ -300,7 +480,11 @@ pfnSHATransform InitializeSHA1Transform()
return &SHA1_SSE_SHA_Transform;
else
#endif
-
+#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
+ if (HasSHA1())
+ return &SHA1_ARM_SHA_Transform;
+ else
+#endif
return &SHA1_CXX_Transform;
}