summaryrefslogtreecommitdiff
path: root/chacha_simd.cpp
diff options
context:
space:
mode:
authorJeffrey Walton <noloader@gmail.com>2018-11-14 08:19:13 -0500
committerJeffrey Walton <noloader@gmail.com>2018-11-14 08:19:13 -0500
commit225ab6cb7be252135d43b275fb0b2447036985ea (patch)
treeff0000fc8bc10081c132ef1655696170dcc4d462 /chacha_simd.cpp
parentb3941a433db3f6b26b66b7db67580a0ecd97ae09 (diff)
downloadcryptopp-git-225ab6cb7be252135d43b275fb0b2447036985ea.tar.gz
Drop ChaCha requirements to POWER7
This costs about 0.6 cpb (700 MB/s on GCC112), but it makes the faster algorithm available to more machines. In the future we may want to provide both POWER7 and POWER8
Diffstat (limited to 'chacha_simd.cpp')
-rw-r--r--chacha_simd.cpp37
1 files changed, 29 insertions, 8 deletions
diff --git a/chacha_simd.cpp b/chacha_simd.cpp
index 6e619281..5190f161 100644
--- a/chacha_simd.cpp
+++ b/chacha_simd.cpp
@@ -54,7 +54,7 @@
# include <arm_acle.h>
#endif
-#if defined(CRYPTOPP_POWER8_AVAILABLE)
+#if defined(CRYPTOPP_POWER7_AVAILABLE)
# include "ppc_simd.h"
#endif
@@ -201,16 +201,25 @@ inline __m128i RotateLeft<16>(const __m128i val)
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
-// **************************** POWER8 **************************** //
+// **************************** POWER7 **************************** //
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
+
+// POWER8 is optional and runs about 0.6 cpb faster because
+// of the native 64-bit vector add. That's about 700 MB/s on
+// GCC112 from the compile farm. Use -mcpu=power8 to engage
+// POWER8. POWER7 lacks 64-bit element support, so code built
+// with -mcpu=power8 will SIGILL on POWER7 machines.
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
-using CryptoPP::uint64x2_p;
using CryptoPP::VectorLoad;
using CryptoPP::VectorStore;
+#if (_ARCH_PWR8 || _ARCH_PWR9)
+using CryptoPP::uint64x2_p;
+#endif
+
// Permutes bytes in packed 32-bit words to little endian.
// State is already in proper endian order. Input and
// output must be permuted during load and save.
@@ -290,10 +299,22 @@ inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
// Helper to perform 64-bit addition across two elements of 32-bit vectors
inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b)
{
+#if (_ARCH_PWR8 || _ARCH_PWR9)
return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
+#else
+ // The carry mask selects carries from elements 1 and 3 and sets remaining
+ // elements to 0. The mask also shifts the carried values left by 4 bytes
+ // so the carries are added to elements 0 and 2.
+ const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
+ const uint32x4_p zero = {0, 0, 0, 0};
+
+ uint32x4_p cy = vec_addc(a, b);
+ cy = vec_perm(cy, zero, cmask);
+ return vec_add(vec_add(a, b), cy);
+#endif
}
-#endif // CRYPTOPP_POWER8_AVAILABLE
+#endif // CRYPTOPP_POWER7_AVAILABLE
ANONYMOUS_NAMESPACE_END
@@ -835,9 +856,9 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
-#if (CRYPTOPP_POWER8_AVAILABLE)
+#if (CRYPTOPP_POWER7_AVAILABLE)
-void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
+void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
const uint32x4_p state0 = VectorLoad(state + 0*4);
const uint32x4_p state1 = VectorLoad(state + 1*4);
@@ -1099,6 +1120,6 @@ void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte
VectorStore32LE(output + 15*16, r3_3);
}
-#endif // CRYPTOPP_POWER8_AVAILABLE
+#endif // CRYPTOPP_POWER7_AVAILABLE
NAMESPACE_END