summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeffrey Walton <noloader@gmail.com>2020-06-29 04:44:58 -0400
committerJeffrey Walton <noloader@gmail.com>2020-06-29 04:44:58 -0400
commit97695f87a9d2d467712ef8bdd7aabd537716f345 (patch)
treeea21b92c5d9cc3e95e53d140869dca6f4a26a90c
parent4f54885d868dcca51046e1a86cd27cf889e0c5b6 (diff)
downloadcryptopp-git-97695f87a9d2d467712ef8bdd7aabd537716f345.tar.gz
Speedup BLAKE2 message loading on PowerPC
-rw-r--r--blake2b_simd.cpp44
1 file changed, 32 insertions, 12 deletions
diff --git a/blake2b_simd.cpp b/blake2b_simd.cpp
index 10bd0a1d..8d62d828 100644
--- a/blake2b_simd.cpp
+++ b/blake2b_simd.cpp
@@ -1170,6 +1170,17 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
m5 = (uint64x2_p) vec_xl( 80, CONST_V8_CAST( input ));
m6 = (uint64x2_p) vec_xl( 96, CONST_V8_CAST( input ));
m7 = (uint64x2_p) vec_xl(112, CONST_V8_CAST( input ));
+
+# if defined(CRYPTOPP_BIG_ENDIAN)
+ m0 = vec_perm(m0, m0, le_mask);
+ m1 = vec_perm(m1, m1, le_mask);
+ m2 = vec_perm(m2, m2, le_mask);
+ m3 = vec_perm(m3, m3, le_mask);
+ m4 = vec_perm(m4, m4, le_mask);
+ m5 = vec_perm(m5, m5, le_mask);
+ m6 = vec_perm(m6, m6, le_mask);
+ m7 = vec_perm(m7, m7, le_mask);
+# endif
#else
/* Altivec only provides 16-byte aligned loads */
/* http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf, Section 3.16 */
@@ -1184,14 +1195,34 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
/* Alignment check for load of the message buffer */
const uintptr_t addr = (uintptr_t)input;
- if (addr%16 != 0 /*not aligned*/)
+ if (addr%16 == 0)
+ {
+ /* Already aligned. Perform a little-endian swap as required */
+# if defined(CRYPTOPP_BIG_ENDIAN)
+ m0 = vec_perm(m0, m0, le_mask);
+ m1 = vec_perm(m1, m1, le_mask);
+ m2 = vec_perm(m2, m2, le_mask);
+ m3 = vec_perm(m3, m3, le_mask);
+ m4 = vec_perm(m4, m4, le_mask);
+ m5 = vec_perm(m5, m5, le_mask);
+ m6 = vec_perm(m6, m6, le_mask);
+ m7 = vec_perm(m7, m7, le_mask);
+# endif
+ }
+ else
{
+ /* Not aligned. Fix vectors and perform a little-endian swap as required */
// http://mirror.informatimago.com/next/developer.apple.com/
// hardwaredrivers/ve/code_optimization.html
uint64x2_p ex; uint8x16_p perm;
ex = (uint64x2_p) vec_ld(112+15, CONST_V8_CAST( input ));
perm = vec_lvsl(0, CONST_V8_CAST( addr ));
+# if defined(CRYPTOPP_BIG_ENDIAN)
+ /* Combine the vector permute with the little-endian swap */
+ perm = vec_perm(perm, perm, le_mask);
+# endif
+
m0 = vec_perm(m0, m1, perm);
m1 = vec_perm(m1, m2, perm);
m2 = vec_perm(m2, m3, perm);
@@ -1203,17 +1234,6 @@ void BLAKE2_Compress64_POWER8(const byte* input, BLAKE2b_State& state)
}
#endif
-#if defined(CRYPTOPP_BIG_ENDIAN)
- m0 = vec_perm(m0, m0, le_mask);
- m1 = vec_perm(m1, m1, le_mask);
- m2 = vec_perm(m2, m2, le_mask);
- m3 = vec_perm(m3, m3, le_mask);
- m4 = vec_perm(m4, m4, le_mask);
- m5 = vec_perm(m5, m5, le_mask);
- m6 = vec_perm(m6, m6, le_mask);
- m7 = vec_perm(m7, m7, le_mask);
-#endif
-
uint64x2_p row1l, row1h, row2l, row2h;
uint64x2_p row3l, row3h, row4l, row4h;