path: root/rijndael-simd.cpp
author    Jeffrey Walton <noloader@gmail.com>    2017-09-12 18:15:55 -0400
committer Jeffrey Walton <noloader@gmail.com>    2017-09-12 18:15:55 -0400
commit    6899d3f8bb06499557c7d0672e371c6ee06a87e9 (patch)
tree      6493734715e3bfbadbf8836c713544942ac6a837 /rijndael-simd.cpp
parent    2ebd30d43cfd60c2213d7e74ee5f97eff915d8ec (diff)
download  cryptopp-git-6899d3f8bb06499557c7d0672e371c6ee06a87e9.tar.gz
Add AdvancedProcessBlocks for Power8
This increases performance to about 1.6 cpb. We are about 0.5 cpb behind Botan and about 1.0 cpb behind OpenSSL. However, it beats the snot out of C/C++, which runs at 20 to 30 cpb.
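
For context, the new Rijndael_AdvancedProcessBlocks_POWER8 template in this commit follows Crypto++'s AdvancedProcessBlocks convention: run a 4-block-wide path while at least four blocks remain, finish with a single-block loop, and return the number of unprocessed bytes. Below is a minimal, library-free sketch of that dispatch shape only; ToyEncryptBlock, ToyEncrypt4Blocks and ProcessBlocksSketch are hypothetical stand-ins, and the real code additionally handles the round subkeys, the xorBlocks pointer and the BT_* flags (counter mode, XOR input, reverse direction) visible in the diff.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
#include <iostream>

// Hypothetical stand-ins for POWER8_Enc_Block / POWER8_Enc_4_Blocks;
// they just XOR a fixed byte so the example stays self-contained.
static void ToyEncryptBlock(uint8_t* block)
{
    for (size_t i = 0; i < 16; ++i)
        block[i] ^= 0x5A;
}

static void ToyEncrypt4Blocks(uint8_t* b0, uint8_t* b1, uint8_t* b2, uint8_t* b3)
{
    ToyEncryptBlock(b0); ToyEncryptBlock(b1);
    ToyEncryptBlock(b2); ToyEncryptBlock(b3);
}

// Shape of the AdvancedProcessBlocks-style driver: a 4-wide fast path,
// then a single-block tail loop, returning the leftover byte count.
static size_t ProcessBlocksSketch(const uint8_t* in, uint8_t* out, size_t length)
{
    const size_t blockSize = 16;

    while (length >= 4 * blockSize)
    {
        uint8_t b0[16], b1[16], b2[16], b3[16];
        std::memcpy(b0, in +  0, 16); std::memcpy(b1, in + 16, 16);
        std::memcpy(b2, in + 32, 16); std::memcpy(b3, in + 48, 16);

        ToyEncrypt4Blocks(b0, b1, b2, b3);

        std::memcpy(out +  0, b0, 16); std::memcpy(out + 16, b1, 16);
        std::memcpy(out + 32, b2, 16); std::memcpy(out + 48, b3, 16);

        in += 4 * blockSize; out += 4 * blockSize; length -= 4 * blockSize;
    }

    while (length >= blockSize)
    {
        uint8_t b[16];
        std::memcpy(b, in, 16);
        ToyEncryptBlock(b);
        std::memcpy(out, b, 16);
        in += blockSize; out += blockSize; length -= blockSize;
    }

    return length;  // bytes too short to form a full block
}

int main()
{
    std::vector<uint8_t> in(6 * 16, 0x11), out(6 * 16, 0);
    const size_t leftover = ProcessBlocksSketch(in.data(), out.data(), in.size());
    std::cout << "leftover bytes: " << leftover << "\n";  // prints 0
    return 0;
}

The wide path keeps four independent block streams in flight per round-key load, which is presumably where most of the speedup over the one-block-at-a-time path comes from.
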
Diffstat (limited to 'rijndael-simd.cpp')
-rw-r--r--  rijndael-simd.cpp  279
1 file changed, 250 insertions, 29 deletions
diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index 0c624fa8..ff46e564 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -817,6 +817,22 @@ uint8x16_p8 Load8x16(const uint8_t src[16])
#endif
}
+uint8x16_p8 Load8x16(int off, const uint8_t src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+ /* http://stackoverflow.com/q/46124383/608639 */
+ uint8_t* s = (uint8_t*)src;
+# if defined(IS_LITTLE_ENDIAN)
+ return vec_xl_be(off, s);
+# else
+ return vec_xl(off, s);
+# endif
+#else
+ /* GCC, Clang, etc */
+ return (uint8x16_p8)vec_vsx_ld(off, src);
+#endif
+}
+
void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
@@ -854,6 +870,28 @@ uint64x2_p8 Load64x2(const uint8_t src[16])
#endif
}
+uint64x2_p8 Load64x2(int off, const uint8_t src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+ /* http://stackoverflow.com/q/46124383/608639 */
+ uint8_t* s = (uint8_t*)src;
+# if defined(IS_LITTLE_ENDIAN)
+ return (uint64x2_p8)vec_xl_be(off, s);
+# else
+ return (uint64x2_p8)vec_xl(off, s);
+# endif
+#else
+ /* GCC, Clang, etc */
+# if defined(IS_LITTLE_ENDIAN)
+ const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+ const uint8x16_p8 zero = {0};
+ return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), zero, mask);
+# else
+ return (uint64x2_p8)vec_vsx_ld(off, src);
+# endif
+#endif
+}
+
void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
@@ -894,6 +932,15 @@ inline VectorType VectorLoad(const byte src[16])
#endif
}
+inline VectorType VectorLoad(int off, const byte src[16])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+ return Load8x16(off, src);
+#elif defined(CRYPTOPP_GCC_VERSION)
+ return Load64x2(off, src);
+#endif
+}
+
inline VectorType VectorLoadAligned(const byte vec[16])
{
return (VectorType)vec_ld(0, vec);
@@ -973,64 +1020,238 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
#endif
}
-//////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
-void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
- const byte *inBlock, const byte *xorBlock, byte *outBlock)
+inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
- VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(keys);
+ block = VectorXor(block, k);
- s = VectorXor(s, k);
for (size_t i=1; i<rounds-1; i+=2)
{
- s = VectorEncrypt(s, VectorLoadAligned( i*16, keys));
- s = VectorEncrypt(s, VectorLoadAligned((i+1)*16, keys));
+ block = VectorEncrypt(block, VectorLoadAligned( i*16, keys));
+ block = VectorEncrypt(block, VectorLoadAligned((i+1)*16, keys));
}
- s = VectorEncrypt(s, VectorLoadAligned((rounds-1)*16, keys));
- s = VectorEncryptLast(s, VectorLoadAligned(rounds*16, keys));
+ block = VectorEncrypt(block, VectorLoadAligned((rounds-1)*16, keys));
+ block = VectorEncryptLast(block, VectorLoadAligned(rounds*16, keys));
+}
+
+inline void POWER8_Enc_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
+ VectorType &block3, const word32 *subkeys, unsigned int rounds)
+{
+ CRYPTOPP_ASSERT(subkeys);
+ const byte *keys = reinterpret_cast<const byte*>(subkeys);
- // According to benchmarks this is a tad bit slower
- // if (xorBlock)
- // s = VectorXor(s, VectorLoad(xorBlock));
+ VectorType k = VectorLoadAligned(keys);
+ block0 = VectorXor(block0, k);
+ block1 = VectorXor(block1, k);
+ block2 = VectorXor(block2, k);
+ block3 = VectorXor(block3, k);
+
+ for (size_t i=1; i<rounds; ++i)
+ {
+ k = VectorLoadAligned(i*16, keys);
+ block0 = VectorEncrypt(block0, k);
+ block1 = VectorEncrypt(block1, k);
+ block2 = VectorEncrypt(block2, k);
+ block3 = VectorEncrypt(block3, k);
+ }
- VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
- s = VectorXor(s, x);
+ k = VectorLoadAligned(rounds*16, keys);
+ block0 = VectorEncryptLast(block0, k);
+ block1 = VectorEncryptLast(block1, k);
+ block2 = VectorEncryptLast(block2, k);
+ block3 = VectorEncryptLast(block3, k);
- VectorStore(s, outBlock);
}
-void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
- const byte *inBlock, const byte *xorBlock, byte *outBlock)
+inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
- VectorType s = VectorLoad(inBlock);
VectorType k = VectorLoadAligned(rounds*16, keys);
+ block = VectorXor(block, k);
- s = VectorXor(s, k);
for (size_t i=rounds-1; i>1; i-=2)
{
- s = VectorDecrypt(s, VectorLoadAligned( i*16, keys));
- s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
+ block = VectorDecrypt(block, VectorLoadAligned( i*16, keys));
+ block = VectorDecrypt(block, VectorLoadAligned((i-1)*16, keys));
+ }
+
+ block = VectorDecrypt(block, VectorLoadAligned(16, keys));
+ block = VectorDecryptLast(block, VectorLoadAligned(0, keys));
+}
+
+inline void POWER8_Dec_4_Blocks(VectorType &block0, VectorType &block1, VectorType &block2,
+ VectorType &block3, const word32 *subkeys, unsigned int rounds)
+{
+ CRYPTOPP_ASSERT(subkeys);
+ const byte *keys = reinterpret_cast<const byte*>(subkeys);
+
+ VectorType k = VectorLoadAligned(rounds*16, keys);
+ block0 = VectorXor(block0, k);
+ block1 = VectorXor(block1, k);
+ block2 = VectorXor(block2, k);
+ block3 = VectorXor(block3, k);
+
+ for (size_t i=rounds-1; i>0; --i)
+ {
+ k = VectorLoadAligned(i*16, keys);
+ block0 = VectorDecrypt(block0, k);
+ block1 = VectorDecrypt(block1, k);
+ block2 = VectorDecrypt(block2, k);
+ block3 = VectorDecrypt(block3, k);
+ }
+
+ k = VectorLoadAligned(0, keys);
+ block0 = VectorDecryptLast(block0, k);
+ block1 = VectorDecryptLast(block1, k);
+ block2 = VectorDecryptLast(block2, k);
+ block3 = VectorDecryptLast(block3, k);
+}
+
+template <typename F1, typename F4>
+size_t Rijndael_AdvancedProcessBlocks_POWER8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ CRYPTOPP_ASSERT(subKeys);
+ CRYPTOPP_ASSERT(inBlocks);
+ CRYPTOPP_ASSERT(outBlocks);
+ CRYPTOPP_ASSERT(length >= 16);
+
+ const size_t blockSize = 16;
+ size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+ size_t xorIncrement = xorBlocks ? blockSize : 0;
+ size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+ if (flags & BlockTransformation::BT_ReverseDirection)
+ {
+ inBlocks += length - blockSize;
+ xorBlocks += length - blockSize;
+ outBlocks += length - blockSize;
+ inIncrement = 0-inIncrement;
+ xorIncrement = 0-xorIncrement;
+ outIncrement = 0-outIncrement;
}
- s = VectorDecrypt(s, VectorLoadAligned(16, keys));
- s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
+ if (flags & BlockTransformation::BT_AllowParallel)
+ {
+ while (length >= 4*blockSize)
+ {
+ VectorType block0, block1, block2, block3, temp;
+ block0 = VectorLoad(inBlocks);
+
+ if (flags & BlockTransformation::BT_InBlockIsCounter)
+ {
+#if defined(IS_LITTLE_ENDIAN)
+ const VectorType one = {1};
+#else
+ const VectorType one = (VectorType)(uint64x2_p8){0,1};
+#endif
+ block1 = VectorAdd(block0, one);
+ block2 = VectorAdd(block1, one);
+ block3 = VectorAdd(block2, one);
+ temp = VectorAdd(block3, one);
+ VectorStore(temp, const_cast<byte*>(inBlocks));
+ }
+ else
+ {
+ //inBlocks += inIncrement;
+ block1 = VectorLoad(1*inIncrement, inBlocks);
+ //inBlocks += inIncrement;
+ block2 = VectorLoad(2*inIncrement, inBlocks);
+ //inBlocks += inIncrement;
+ block3 = VectorLoad(3*inIncrement, inBlocks);
+ //inBlocks += inIncrement;
+ inBlocks += 4*inIncrement;
+ }
+
+ if (flags & BlockTransformation::BT_XorInput)
+ {
+ block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
+ //xorBlocks += xorIncrement;
+ block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
+ //xorBlocks += xorIncrement;
+ block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
+ //xorBlocks += xorIncrement;
+ block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
+ //xorBlocks += xorIncrement;
+ xorBlocks += 4*xorIncrement;
+ }
+
+ func4(block0, block1, block2, block3, subKeys, rounds);
- // According to benchmarks this is a tad bit slower
- // if (xorBlock)
- // s = VectorXor(s, VectorLoad(xorBlock));
+ if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+ {
+ block0 = VectorXor(block0, VectorLoad(0*xorIncrement, xorBlocks));
+ //xorBlocks += xorIncrement;
+ block1 = VectorXor(block1, VectorLoad(1*xorIncrement, xorBlocks));
+ //xorBlocks += xorIncrement;
+ block2 = VectorXor(block2, VectorLoad(2*xorIncrement, xorBlocks));
+ //xorBlocks += xorIncrement;
+ block3 = VectorXor(block3, VectorLoad(3*xorIncrement, xorBlocks));
+ //xorBlocks += xorIncrement;
+ xorBlocks += 4*xorIncrement;
+ }
- VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
- s = VectorXor(s, x);
+ // I can't get Store to run faster using indexed offsets
+ VectorStore(block0, outBlocks);
+ outBlocks += outIncrement;
+ VectorStore(block1, outBlocks);
+ outBlocks += outIncrement;
+ VectorStore(block2, outBlocks);
+ outBlocks += outIncrement;
+ VectorStore(block3, outBlocks);
+ outBlocks += outIncrement;
- VectorStore(s, outBlock);
+ length -= 4*blockSize;
+ }
+ }
+
+ while (length >= blockSize)
+ {
+ VectorType block = VectorLoad(inBlocks);
+
+ if (flags & BlockTransformation::BT_XorInput)
+ block = VectorXor(block, VectorLoad(xorBlocks));
+
+ if (flags & BlockTransformation::BT_InBlockIsCounter)
+ const_cast<byte *>(inBlocks)[15]++;
+
+ func1(block, subKeys, rounds);
+
+ if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+ block = VectorXor(block, VectorLoad(xorBlocks));
+
+ VectorStore(block, outBlocks);
+
+ inBlocks += inIncrement;
+ outBlocks += outIncrement;
+ xorBlocks += xorIncrement;
+ length -= blockSize;
+ }
+
+ return length;
}
+
+size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Enc_Block, POWER8_Enc_4_Blocks,
+ subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t Rijndael_Dec_AdvancedProcessBlocks_POWER8(const word32 *subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ return Rijndael_AdvancedProcessBlocks_POWER8(POWER8_Dec_Block, POWER8_Dec_4_Blocks,
+ subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
#endif // CRYPTOPP_POWER8_AES_AVAILABLE
NAMESPACE_END