author     Jeffrey Walton <noloader@gmail.com>   2018-08-09 23:28:49 -0400
committer  Jeffrey Walton <noloader@gmail.com>   2018-08-09 23:28:49 -0400
commit     9f2d65409a70c8bb2168537d37715ef1e16fda26 (patch)
tree       2092bbefdf873a84960c9db29ef3fbe7540cf83b
parent     989c3bfbf26d62630058aa26c2318634470f0c1b (diff)
Add POWER8 GCM mode (GH #698)
Commit 3ed38e42f619 added the POWER8 infrastructure for GCM mode. It also added GCM_SetKeyWithoutResync_VMULL, GCM_Multiply_VMULL and GCM_Reduce_VMULL. This commit adds the remainder, including GCM_AuthenticateBlocks_VMULL. GCC is OK on Linux (ppc64-le) and AIX (ppc64-be). We may need some touchups for the XLC compiler.
-rw-r--r--  bench2.cpp      4
-rw-r--r--  config.h        2
-rw-r--r--  gcm-simd.cpp  107
3 files changed, 60 insertions, 53 deletions
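For context before the diffs: the VMULL_* helpers named in the message wrap POWER8's vpmsumd instruction, which XORs the carry-less products of both 64-bit lanes of its operands. A minimal sketch of how a single 64x64 -> 128-bit product can be isolated from it (illustrative only, assuming GCC with -mcpu=power8; VMULL_sketch is not a library function, and lane order depends on endianness):

    #include <altivec.h>
    typedef __vector unsigned long long uint64x2_p;

    // vpmsumd computes clmul(a[0],b[0]) ^ clmul(a[1],b[1]); zeroing one
    // lane of each operand leaves the other product by itself.
    inline uint64x2_p VMULL_sketch(uint64x2_p a, uint64x2_p b)
    {
        uint64x2_p a0 = {0, 0}, b0 = {0, 0};
        a0[0] = a[0];   // GCC vector subscripting
        b0[0] = b[0];
        return __builtin_crypto_vpmsumd(a0, b0);
    }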
diff --git a/bench2.cpp b/bench2.cpp
index 2470399c..df1a813c 100644
--- a/bench2.cpp
+++ b/bench2.cpp
@@ -107,6 +107,10 @@ void Benchmark2(double t, double hertz)
if (HasPMULL())
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES)");
else
+#elif CRYPTOPP_POWER8_VMULL_AVAILABLE
+ if (HasPMULL())
+ BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES)");
+ else
#endif
{
BenchMarkByName2<AuthenticatedSymmetricCipher, MessageAuthenticationCode>("AES/GCM", 0, "GMAC(AES) (2K tables)", MakeParameters(Name::TableSize(), 2048));
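The hunk above is the benchmark's dispatch point. Distilled, Crypto++ double-gates vector code: compile-time on the feature macro, run-time on the CPU probe, so one binary serves both POWER8 and older machines. A hedged sketch of the idiom (AuthenticateBlocks_Dispatch and the _Generic fallback are illustrative names, not library code; byte and HasPMULL are the library's):

    size_t AuthenticateBlocks_Dispatch(const byte *data, size_t len,
                                       const byte *mtable, byte *hbuffer)
    {
    #if CRYPTOPP_POWER8_VMULL_AVAILABLE
        if (HasPMULL())  // run-time probe; see CPU_ProbePMULL below
            return GCM_AuthenticateBlocks_VMULL(data, len, mtable, hbuffer);
    #endif
        return GCM_AuthenticateBlocks_Generic(data, len, mtable, hbuffer);
    }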
diff --git a/config.h b/config.h
index 4a520282..57f1515d 100644
--- a/config.h
+++ b/config.h
@@ -787,7 +787,7 @@ NAMESPACE_END
# if defined(__CRYPTO__) || defined(_ARCH_PWR8) || (CRYPTOPP_XLC_VERSION >= 130000) || (CRYPTOPP_GCC_VERSION >= 40800)
//# define CRYPTOPP_POWER8_CRC_AVAILABLE 1
# define CRYPTOPP_POWER8_AES_AVAILABLE 1
-//# define CRYPTOPP_POWER8_VMULL_AVAILABLE 1
+# define CRYPTOPP_POWER8_VMULL_AVAILABLE 1
# define CRYPTOPP_POWER8_SHA_AVAILABLE 1
# endif
#endif
diff --git a/gcm-simd.cpp b/gcm-simd.cpp
index 31a0245e..6f78c727 100644
--- a/gcm-simd.cpp
+++ b/gcm-simd.cpp
@@ -137,38 +137,44 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
#if defined(_MSC_VER)
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
- return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
- vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
+ return (uint64x2_t)(vmull_p64(
+ vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
+ vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
}
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
- return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
- vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
+ return (uint64x2_t)(vmull_p64(
+ vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
+ vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
}
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
- return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
- vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
+ return (uint64x2_t)(vmull_p64(
+ vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
+ vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
}
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
- return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
- vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
+ return (uint64x2_t)(vmull_p64(
+ vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
+ vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
}
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
- return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
+ return (uint64x2_t)vextq_u8(
+ vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
}
// https://github.com/weidai11/cryptopp/issues/366
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
- return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
+ return (uint64x2_t)vextq_u8(
+ vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
}
#endif // Microsoft and compatibles
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
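The four lane-selected wrappers above are the building blocks of a full 128x128-bit carry-less multiply. A hedged sketch of the composition (CLMUL128 is an illustrative name and assumes the PMULL_* wrappers from this hunk; the library's actual GCM multiply differs in detail):

    #include <arm_neon.h>

    // 256-bit product of two 128-bit values as (hi:lo). The middle
    // terms a0*b1 ^ a1*b0 straddle the 64-bit boundary of lo and hi.
    inline void CLMUL128(uint64x2_t a, uint64x2_t b,
                         uint64x2_t& lo, uint64x2_t& hi)
    {
        const uint64x2_t zero = vdupq_n_u64(0);
        lo = PMULL_00(a, b);   // a0*b0 -> bits 0..127
        hi = PMULL_11(a, b);   // a1*b1 -> bits 128..255
        const uint64x2_t mid = veorq_u64(PMULL_01(a, b), PMULL_10(a, b));
        lo = veorq_u64(lo, vextq_u64(zero, mid, 1));  // mid<<64, bits 64..127
        hi = veorq_u64(hi, vextq_u64(mid, zero, 1));  // mid<<64, bits 128..191
    }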
@@ -374,24 +380,12 @@ bool CPU_ProbePMULL()
b={0x0f,0xc0,0xc0,0xc0, 0x0c,0x0c,0x0c,0x0c,
0x00,0xe0,0xe0,0xe0, 0x0e,0x0e,0x0e,0x0e};
-#if 0
- const uint64x2_p x = VectorGetHigh((uint64x2_p)a);
- const uint64x2_p y = VectorGetLow((uint64x2_p)a);
-#endif
-
const uint64x2_p r1 = VMULL_00((uint64x2_p)(a), (uint64x2_p)(b));
const uint64x2_p r2 = VMULL_01((uint64x2_p)(a), (uint64x2_p)(b));
const uint64x2_p r3 = VMULL_10((uint64x2_p)(a), (uint64x2_p)(b));
const uint64x2_p r4 = VMULL_11((uint64x2_p)(a), (uint64x2_p)(b));
- word64 w1[2], w2[2], w3[2], w4[2];
- VectorStore(r1, (byte*)w1); VectorStore(r2, (byte*)w2);
- VectorStore(r3, (byte*)w3); VectorStore(r4, (byte*)w4);
- result = !!(w1[0] == 0xa5a3a5c03a3c3855ull && w1[1] == 0x0600060066606607ull &&
- w2[0] == 0x199e19e061e66600ull && w2[1] == 0x078007807ff87f86ull &&
- w3[0] == 0x2d2a2d5fa2a5a000ull && w3[1] == 0x0700070077707700ull &&
- w4[0] == 0x6aac6ac006c00000ull && w4[1] == 0x06c006c06aac6ac0ull);
- result = true;
+ result = VectorNotEqual(r1, r2) && VectorNotEqual(r3, r4);
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
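The probe above runs inside a SIGILL guard (the sigprocmask restore is visible at the hunk's end). A self-contained sketch of the pattern, simplified from what CPU_ProbePMULL actually does (illustrative names, assuming GCC targeting POWER8; the real probe also saves and restores the signal mask):

    #include <setjmp.h>
    #include <signal.h>

    static sigjmp_buf s_jmpNoVMULL;
    extern "C" void SigIllHandler(int) { siglongjmp(s_jmpNoVMULL, 1); }

    bool ProbeVMULL_sketch()
    {
        volatile bool result = false;
        void (*oldHandler)(int) = signal(SIGILL, SigIllHandler);
        if (sigsetjmp(s_jmpNoVMULL, 1) == 0)
        {
            typedef __vector unsigned long long uint64x2_p;
            uint64x2_p a = {1, 2}, b = {3, 4};
            // vpmsumd raises SIGILL on pre-POWER8 hardware
            const uint64x2_p r = __builtin_crypto_vpmsumd(a, b);
            result = (r[0] != 0 || r[1] != 0);
        }
        signal(SIGILL, oldHandler);
        return result;
    }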
@@ -832,19 +826,43 @@ void GCM_SetKeyWithoutResync_VMULL(const byte *hashKey, byte *mulTable, unsigned
std::memcpy(mulTable+i+8, temp+0, 8);
}
+INLINE uint64x2_p LoadBuffer1(const byte *dataBuffer)
+{
+#if CRYPTOPP_BIG_ENDIAN
+ return (uint64x2_p)VectorLoad(dataBuffer);
+#else
+ const uint64x2_p data = (uint64x2_p)VectorLoad(dataBuffer);
+ const uint8x16_p mask = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
+ return vec_perm(data, data, mask);
+#endif
+}
+
+INLINE uint64x2_p LoadBuffer2(const byte *dataBuffer)
+{
+#if CRYPTOPP_BIG_ENDIAN
+ return (uint64x2_p)VectorRotateLeft<8>(VectorLoad(dataBuffer));
+#else
+ const uint64x2_p data = (uint64x2_p)VectorLoad(dataBuffer);
+ const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+ return (uint64x2_p)vec_perm(data, data, mask);
+#endif
+}
+
+// Swaps high and low 64-bit words
+INLINE uint64x2_p SwapWords(const uint64x2_p& data)
+{
+ return VectorRotateLeft<8>(data);
+}
+
size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
{
const uint64x2_p r = {0xe100000000000000ull, 0xc200000000000000ull};
- const uint64x2_p m1 = {0x08090a0b0c0d0e0full, 0x0001020304050607ull};
- const uint64x2_p m2 = {0x0001020304050607ull, 0x08090a0b0c0d0e0full};
uint64x2_p x = (uint64x2_p)VectorLoad(hbuffer);
while (len >= 16)
{
size_t i=0, s = UnsignedMin(len/16, 8U);
- uint64x2_p d1 = (uint64x2_p)VectorLoad(data+(s-1)*16);
- // uint64x2_p d2 = _mm_shuffle_epi8(d1, m2);
- uint64x2_p d2 = (uint64x2_p)VectorPermute(d1, d1, m2);
+ uint64x2_p d1, d2 = LoadBuffer1(data+(s-1)*16);
uint64x2_p c0 = {0}, c1 = {0}, c2 = {0};
while (true)
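An aside on the LoadBuffer helpers added in the hunk above: vec_perm selects bytes by index, so the little-endian mask {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8} byte-swaps each 64-bit doubleword, producing the lane values a big-endian load would see. A standalone demo, mirroring the committed code (assuming GCC on little-endian POWER8; vec_vsx_ld stands in for the library's VectorLoad):

    #include <altivec.h>
    #include <cstdio>
    typedef __vector unsigned char uint8x16_p;
    typedef __vector unsigned long long uint64x2_p;

    int main()
    {
        unsigned char buf[16];
        for (int i = 0; i < 16; ++i) buf[i] = (unsigned char)i;
        uint64x2_p v = (uint64x2_p)vec_vsx_ld(0, buf);
        const uint8x16_p m = {7,6,5,4, 3,2,1,0, 15,14,13,12, 11,10,9,8};
        v = (uint64x2_p)vec_perm(v, v, m);
        // prints 0001020304050607 08090a0b0c0d0e0f on little-endian
        std::printf("%016llx %016llx\n",
                    (unsigned long long)v[0], (unsigned long long)v[1]);
        return 0;
    }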
@@ -855,43 +873,33 @@ size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mt
if (++i == s)
{
- // d1 = _mm_shuffle_epi8(VectorLoad(data), m1);
- d1 = (uint64x2_p)VectorLoad(data);
- d1 = VectorPermute(d1, d1, m1);
+ d1 = LoadBuffer2(data);
d1 = VectorXor(d1, x);
c0 = VectorXor(c0, VMULL_00(d1, h0));
c2 = VectorXor(c2, VMULL_01(d1, h1));
- // d1 = VectorXor(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
- d1 = VectorXor(d1, VectorPermute(d1, d1, m1));
+ d1 = VectorXor(d1, SwapWords(d1));
c1 = VectorXor(c1, VMULL_00(d1, h2));
break;
}
- // d1 = _mm_shuffle_epi8(VectorLoad(data+(s-i)*16-8), m2);
- d1 = (uint64x2_p)VectorLoad(data+(s-i)*16-8);
- d1 = VectorPermute(d1, d1, m2);
+ d1 = LoadBuffer1(data+(s-i)*16-8);
c0 = VectorXor(c0, VMULL_01(d2, h0));
- c2 = VectorXor(c2, VMULL_00(d1, h1));
+ c2 = VectorXor(c2, VMULL_01(d1, h1));
d2 = VectorXor(d2, d1);
- c1 = VectorXor(c1, VMULL_00(d2, h2));
+ c1 = VectorXor(c1, VMULL_01(d2, h2));
if (++i == s)
{
- // d1 = _mm_shuffle_epi8(VectorLoad(data), m1);
- d1 = (uint64x2_p)VectorLoad(data);
- d1 = VectorPermute(d1, d1, m1);
+ d1 = LoadBuffer2(data);
d1 = VectorXor(d1, x);
c0 = VectorXor(c0, VMULL_10(d1, h0));
c2 = VectorXor(c2, VMULL_11(d1, h1));
- // d1 = VectorXor(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2)));
- d1 = VectorXor(d1, VectorPermute(d1, d1, m1));
+ d1 = VectorXor(d1, SwapWords(d1));
c1 = VectorXor(c1, VMULL_10(d1, h2));
break;
}
- // d2 = _mm_shuffle_epi8(VectorLoad(data+(s-i)*16-8), m1);
- d2 = (uint64x2_p)VectorLoad(data+(s-i)*16-8);
- d2 = VectorPermute(d2, d2, m1);
+ d2 = LoadBuffer2(data+(s-i)*16-8);
c0 = VectorXor(c0, VMULL_10(d1, h0));
c2 = VectorXor(c2, VMULL_10(d2, h1));
d1 = VectorXor(d1, d2);
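For reference, the arithmetic this kernel vectorizes is the GHASH recurrence X <- (X xor C_i) * H in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. A plain scalar multiply in that field, following the shift-and-xor description in NIST SP 800-38D (reference code only, nothing like the table-driven vector path above):

    #include <cstdint>
    #include <cstring>

    // z = x * y in GF(2^128); all three are 16-byte big-endian arrays.
    void GF128Mul(const uint8_t x[16], const uint8_t y[16], uint8_t z[16])
    {
        uint8_t v[16], r[16] = {0};
        std::memcpy(v, y, 16);
        for (int i = 0; i < 128; ++i)
        {
            if ((x[i/8] >> (7 - i%8)) & 1)   // bit i of x, MSB first
                for (int j = 0; j < 16; ++j) r[j] ^= v[j];
            const bool carry = v[15] & 1;    // bit shifted out of v
            for (int j = 15; j > 0; --j)
                v[j] = (uint8_t)((v[j] >> 1) | (v[j-1] << 7));
            v[0] >>= 1;
            if (carry) v[0] ^= 0xE1;         // x^128 = x^7 + x^2 + x + 1
        }
        std::memcpy(z, r, 16);
    }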
@@ -910,13 +918,8 @@ size_t GCM_AuthenticateBlocks_VMULL(const byte *data, size_t len, const byte *mt
void GCM_ReverseHashBufferIfNeeded_VMULL(byte *hashBuffer)
{
- // SSSE3 instruction, but only used with CLMUL
- uint64x2_p val = (uint64x2_p)VectorLoad(hashBuffer);
- // const uint64x2_p mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
const uint64x2_p mask = {0x08090a0b0c0d0e0full, 0x0001020304050607ull};
- // val = _mm_shuffle_epi8(val, mask);
- val = VectorPermute(val, val, mask);
- VectorStore(val, hashBuffer);
+ VectorStore(VectorPermute(VectorLoad(hashBuffer), mask), hashBuffer);
}
#endif // CRYPTOPP_POWER8_VMULL_AVAILABLE