summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author    Jeffrey Walton <noloader@gmail.com>  2017-09-12 05:53:17 -0400
committer Jeffrey Walton <noloader@gmail.com>  2017-09-12 05:53:17 -0400
commit    b090e5f69fe761b08b6fd1bfc51cb16d6d363cb7 (patch)
tree      3ae3bdefc185f92f2149445008adf3cf4f2b9c84
parent    cfb63decec771d660394d7fbf8b5c7b566a09232 (diff)
download  cryptopp-git-b090e5f69fe761b08b6fd1bfc51cb16d6d363cb7.tar.gz
Add Power8 AES decryption
-rw-r--r--  rdtables.cpp       3
-rw-r--r--  rijndael-simd.cpp  62
-rw-r--r--  rijndael.cpp       54
3 files changed, 70 insertions, 49 deletions
diff --git a/rdtables.cpp b/rdtables.cpp
index 8b2cea5b..8ceb800f 100644
--- a/rdtables.cpp
+++ b/rdtables.cpp
@@ -154,10 +154,11 @@ const byte Rijndael::Base::Sd[256] = {
0x55, 0x21, 0x0c, 0x7d,
};
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
const word32 Rijndael::Base::rcon[] = {
0x01000000, 0x02000000, 0x04000000, 0x08000000,
0x10000000, 0x20000000, 0x40000000, 0x80000000,
- 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+ 0x1B000000, 0x36000000
};
NAMESPACE_END
diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index aaae4166..0c624fa8 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -45,8 +45,9 @@
// Don't include <arm_acle.h> when using Apple Clang. Early Apple compilers
// fail to compile with <arm_acle.h> included. Later Apple compilers compile
-// intrinsics without <arm_acle.h> included.
-#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
+// intrinsics without <arm_acle.h> included. Also avoid it with GCC 4.8.
+#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) && \
+ (!defined(CRYPTOPP_GCC_VERSION) || (CRYPTOPP_GCC_VERSION >= 40900))
# include <arm_acle.h>
#endif
@@ -158,6 +159,24 @@ bool CPU_ProbeAES()
}
#endif // ARM32 or ARM64
+ANONYMOUS_NAMESPACE_BEGIN
+
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_one[] = {0, 0, 0, 1<<24};
+
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconLE[] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconBE[] = {
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
+ 0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000
+};
+
+ANONYMOUS_NAMESPACE_END
+
// ***************************** ARMv8 ***************************** //
#if (CRYPTOPP_ARM_AES_AVAILABLE)
@@ -323,15 +342,6 @@ inline void ARMV8_Dec_4_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_
block3 = veorq_u8(block3, vld1q_u8(keys+(i+1)*16));
}
-const word32 s_one[] = {0, 0, 0, 1<<24};
-
-/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
-const word32 rcon[] = {
- 0x01, 0x02, 0x04, 0x08,
- 0x10, 0x20, 0x40, 0x80,
- 0x1B, 0x36
-};
-
template <typename F1, typename F4>
size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4, const word32 *subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
@@ -537,9 +547,6 @@ inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2
block3 = _mm_aesdeclast_si128(block3, rk);
}
-CRYPTOPP_ALIGN_DATA(16)
-static const word32 s_one[] = {0, 0, 0, 1<<24};
-
template <typename F1, typename F4>
inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
@@ -680,16 +687,9 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t ro
sk, rounds, ib, xb, outBlocks, length, flags);
}
-void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk)
+void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk, unsigned int rounds)
{
- const unsigned rounds = static_cast<unsigned int>(keyLen/4 + 6);
- static const word32 rcLE[] = {
- 0x01, 0x02, 0x04, 0x08,
- 0x10, 0x20, 0x40, 0x80,
- 0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
- };
-
- const word32 *ro = rcLE, *rc = rcLE;
+ const word32 *ro = s_rconLE, *rc = s_rconLE;
CRYPTOPP_UNUSED(ro);
__m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16));
@@ -700,7 +700,7 @@ void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, wor
const word32* end = rk + keySize;
while (true)
{
- CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
+ CRYPTOPP_ASSERT(rc < ro + COUNTOF(s_rconLE));
rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
@@ -1011,17 +1011,21 @@ void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds
const byte *keys = reinterpret_cast<const byte*>(subkeys);
VectorType s = VectorLoad(inBlock);
- VectorType k = VectorLoadAligned(keys);
+ VectorType k = VectorLoadAligned(rounds*16, keys);
s = VectorXor(s, k);
- for (size_t i=1; i<rounds-1; i+=2)
+ for (size_t i=rounds-1; i>1; i-=2)
{
s = VectorDecrypt(s, VectorLoadAligned( i*16, keys));
- s = VectorDecrypt(s, VectorLoadAligned((i+1)*16, keys));
+ s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
}
- s = VectorDecrypt(s, VectorLoadAligned((rounds-1)*16, keys));
- s = VectorDecryptLast(s, VectorLoadAligned(rounds*16, keys));
+ s = VectorDecrypt(s, VectorLoadAligned(16, keys));
+ s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
+
+ // According to benchmarks this is a tad bit slower
+ // if (xorBlock)
+ // s = VectorXor(s, VectorLoad(xorBlock));
VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
s = VectorXor(s, x);
diff --git a/rijndael.cpp b/rijndael.cpp
index e5079d9b..3e016d6f 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -113,6 +113,19 @@ CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
static volatile bool s_TeFilled = false, s_TdFilled = false;
+ANONYMOUS_NAMESPACE_BEGIN
+
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_one[] = {0, 0, 0, 1<<24};
+
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconLE[] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+ANONYMOUS_NAMESPACE_END
+
// ************************* Portable Code ************************************
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
@@ -221,7 +234,7 @@ void Rijndael::Base::FillDecTable()
}
#if (CRYPTOPP_AESNI_AVAILABLE)
-extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
+extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk, unsigned int rounds);
extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
@@ -240,8 +253,6 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
#if (CRYPTOPP_POWER8_AES_AVAILABLE)
extern void ByteReverseArrayLE(byte src[16]);
-extern void Rijndael_UncheckedSetKey_POWER8(const byte *userKey, size_t keyLen, word32 *rk, CipherDir dir);
-
extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlock, const byte *xorBlock, byte *outBlock);
extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
@@ -263,7 +274,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
{
// TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
// Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
- Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
+ Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk, m_rounds);
if (!IsForwardTransformation())
Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
@@ -306,6 +317,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
rk = m_key;
+#if CRYPTOPP_POWER8_AES_AVAILABLE
+ if (HasAES())
+ {
+ ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
+ ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
+ ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
+
+#if defined(IS_LITTLE_ENDIAN)
+ // VSX registers are big-endian. The entire subkey table must be byte
+ // reversed on little-endian systems to ensure it loads properly.
+ byte * ptr = reinterpret_cast<byte*>(rk);
+ for (unsigned int i=0; i<=m_rounds; i++)
+ ByteReverseArrayLE(ptr+i*16);
+#endif // IS_LITTLE_ENDIAN
+
+ return;
+ }
+#endif // CRYPTOPP_POWER8_AES_AVAILABLE
+
if (IsForwardTransformation())
{
if (!s_TeFilled)
@@ -351,20 +381,6 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
if (HasAES())
ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
#endif
-#if CRYPTOPP_POWER8_AES_AVAILABLE
- if (IsForwardTransformation() && HasAES())
- {
- ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
-
-#if defined(IS_LITTLE_ENDIAN)
- // VSX registers are big-endian. The entire subkey table must be byte
- // reversed on little-endian systems to ensure it loads properly.
- byte * ptr = reinterpret_cast<byte*>(rk);
- for (unsigned int i=0; i<=m_rounds; i++)
- ByteReverseArrayLE(ptr+i*16);
-#endif // IS_LITTLE_ENDIAN
- }
-#endif // CRYPTOPP_POWER8_AES_AVAILABLE
}
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@@ -483,7 +499,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
}
#endif
-#if (CRYPTOPP_POWER8_AES_AVAILABLE) && 0
+#if (CRYPTOPP_POWER8_AES_AVAILABLE)
if (HasAES())
{
(void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);