From b090e5f69fe761b08b6fd1bfc51cb16d6d363cb7 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Tue, 12 Sep 2017 05:53:17 -0400
Subject: Add Power8 AES decryption

---
 rdtables.cpp      |  3 ++-
 rijndael-simd.cpp | 62 +++++++++++++++++++++++++++++--------------------------
 rijndael.cpp      | 54 +++++++++++++++++++++++++++++++-----------------
 3 files changed, 70 insertions(+), 49 deletions(-)

diff --git a/rdtables.cpp b/rdtables.cpp
index 8b2cea5b..8ceb800f 100644
--- a/rdtables.cpp
+++ b/rdtables.cpp
@@ -154,10 +154,11 @@ const byte Rijndael::Base::Sd[256] = {
     0x55, 0x21, 0x0c, 0x7d,
 };
 
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
 const word32 Rijndael::Base::rcon[] = {
     0x01000000, 0x02000000, 0x04000000, 0x08000000,
     0x10000000, 0x20000000, 0x40000000, 0x80000000,
-    0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+    0x1B000000, 0x36000000
 };
 
 NAMESPACE_END
diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp
index aaae4166..0c624fa8 100644
--- a/rijndael-simd.cpp
+++ b/rijndael-simd.cpp
@@ -45,8 +45,9 @@
 
 // Don't include <arm_neon.h> when using Apple Clang. Early Apple compilers
 // fail to compile with <arm_neon.h> included. Later Apple compilers compile
-// intrinsics without <arm_neon.h> included.
-#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION)
+// intrinsics without <arm_neon.h> included. Also avoid it with GCC 4.8.
+#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) && \
+    (!defined(CRYPTOPP_GCC_VERSION) || (CRYPTOPP_GCC_VERSION >= 40900))
 # include <arm_neon.h>
 #endif
 
@@ -158,6 +159,24 @@ bool CPU_ProbeAES()
 }
 #endif  // ARM32 or ARM64
 
+ANONYMOUS_NAMESPACE_BEGIN
+
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_one[] = {0, 0, 0, 1<<24};
+
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconLE[] = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconBE[] = {
+    0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
+    0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000
+};
+
+ANONYMOUS_NAMESPACE_END
+
 // ***************************** ARMv8 ***************************** //
 
 #if (CRYPTOPP_ARM_AES_AVAILABLE)
@@ -323,15 +342,6 @@ inline void ARMV8_Dec_4_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_
     block3 = veorq_u8(block3, vld1q_u8(keys+(i+1)*16));
 }
 
-const word32 s_one[] = {0, 0, 0, 1<<24};
-
-/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
-const word32 rcon[] = {
-    0x01, 0x02, 0x04, 0x08,
-    0x10, 0x20, 0x40, 0x80,
-    0x1B, 0x36
-};
-
 template <typename F1, typename F4>
 size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4,
         const word32 *subKeys, size_t rounds, const byte *inBlocks,
         const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
@@ -537,9 +547,6 @@ inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2
     block3 = _mm_aesdeclast_si128(block3, rk);
 }
 
-CRYPTOPP_ALIGN_DATA(16)
-static const word32 s_one[] = {0, 0, 0, 1<<24};
-
 template <typename F1, typename F4>
 inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
         MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
@@ -680,16 +687,9 @@ size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subKeys, size_t ro
         sk, rounds, ib, xb, outBlocks, length, flags);
 }
 
-void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk)
+void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk, unsigned int rounds)
 {
-    const unsigned rounds = static_cast<unsigned int>(keyLen/4 + 6);
-    static const word32 rcLE[] = {
-        0x01, 0x02, 0x04, 0x08,
-        0x10, 0x20, 0x40, 0x80,
-        0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
-    };
-
-    const word32 *ro = rcLE, *rc = rcLE;
+    const word32 *ro = s_rconLE, *rc = s_rconLE;
     CRYPTOPP_UNUSED(ro);
 
     __m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16));
@@ -700,7 +700,7 @@ void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, wor
     const word32* end = rk + keySize;
     while (true)
     {
-        CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE));
+        CRYPTOPP_ASSERT(rc < ro + COUNTOF(s_rconLE));
         rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
         rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4];
         rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1];
@@ -1011,17 +1011,21 @@ void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds
     const byte *keys = reinterpret_cast<const byte*>(subkeys);
 
     VectorType s = VectorLoad(inBlock);
-    VectorType k = VectorLoadAligned(keys);
+    VectorType k = VectorLoadAligned(rounds*16, keys);
     s = VectorXor(s, k);
 
-    for (size_t i=1; i<rounds-1; i+=2)
+    for (size_t i=rounds-1; i>1; i-=2)
     {
         s = VectorDecrypt(s, VectorLoadAligned(  i*16,   keys));
-        s = VectorDecrypt(s, VectorLoadAligned((i+1)*16, keys));
+        s = VectorDecrypt(s, VectorLoadAligned((i-1)*16, keys));
     }
 
-    s = VectorDecrypt(s, VectorLoadAligned((rounds-1)*16, keys));
-    s = VectorDecryptLast(s, VectorLoadAligned(rounds*16, keys));
+    s = VectorDecrypt(s, VectorLoadAligned(16, keys));
+    s = VectorDecryptLast(s, VectorLoadAligned(0, keys));
+
+    // According to benchmarks this is a tad bit slower
+    // if (xorBlock)
+    //     s = VectorXor(s, VectorLoad(xorBlock));
 
     VectorType x = xorBlock ? VectorLoad(xorBlock) : (VectorType) {0};
     s = VectorXor(s, x);
diff --git a/rijndael.cpp b/rijndael.cpp
index e5079d9b..3e016d6f 100644
--- a/rijndael.cpp
+++ b/rijndael.cpp
@@ -113,6 +113,19 @@ CRYPTOPP_ALIGN_DATA(16) static word32 Td[256*4];
 
 static volatile bool s_TeFilled = false, s_TdFilled = false;
 
+ANONYMOUS_NAMESPACE_BEGIN
+
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_one[] = {0, 0, 0, 1<<24};
+
+/* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+CRYPTOPP_ALIGN_DATA(16)
+const word32 s_rconLE[] = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+ANONYMOUS_NAMESPACE_END
+
 // ************************* Portable Code ************************************
 
 #define QUARTER_ROUND(L, T, t, a, b, c, d) \
@@ -221,7 +234,7 @@ void Rijndael::Base::FillDecTable()
 }
 
 #if (CRYPTOPP_AESNI_AVAILABLE)
-extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk);
+extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk, unsigned int rounds);
 extern void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds);
 
 extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds,
@@ -240,8 +253,6 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
 extern void ByteReverseArrayLE(byte src[16]);
 
-extern void Rijndael_UncheckedSetKey_POWER8(const byte *userKey, size_t keyLen, word32 *rk, CipherDir dir);
-
 extern void Rijndael_Enc_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
         const byte *inBlock, const byte *xorBlock, byte *outBlock);
 extern void Rijndael_Dec_ProcessAndXorBlock_POWER8(const word32 *subkeys, size_t rounds,
@@ -263,7 +274,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
     {
         // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end
         // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2.
-        Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk);
+        Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk, m_rounds);
 
         if (!IsForwardTransformation())
             Rijndael_UncheckedSetKeyRev_AESNI(m_key, m_rounds);
 
@@ -306,6 +317,25 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
 
     rk = m_key;
 
+#if CRYPTOPP_POWER8_AES_AVAILABLE
+    if (HasAES())
+    {
+        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
+        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
+        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
+
+#if defined(IS_LITTLE_ENDIAN)
+        // VSX registers are big-endian. The entire subkey table must be byte
+        // reversed on little-endian systems to ensure it loads properly.
+        byte * ptr = reinterpret_cast<byte*>(rk);
+        for (unsigned int i=0; i<=m_rounds; i++)
+            ByteReverseArrayLE(ptr+i*16);
+#endif  // IS_LITTLE_ENDIAN
+
+        return;
+    }
+#endif  // CRYPTOPP_POWER8_AES_AVAILABLE
+
     if (IsForwardTransformation())
     {
         if (!s_TeFilled)
@@ -351,20 +381,6 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
     if (HasAES())
         ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
 #endif
-#if CRYPTOPP_POWER8_AES_AVAILABLE
-    if (IsForwardTransformation() && HasAES())
-    {
-        ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
-
-#if defined(IS_LITTLE_ENDIAN)
-        // VSX registers are big-endian. The entire subkey table must be byte
-        // reversed on little-endian systems to ensure it loads properly.
-        byte * ptr = reinterpret_cast<byte*>(rk);
-        for (unsigned int i=0; i<=m_rounds; i++)
-            ByteReverseArrayLE(ptr+i*16);
-#endif  // IS_LITTLE_ENDIAN
-    }
-#endif  // CRYPTOPP_POWER8_AES_AVAILABLE
 }
 
 void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@@ -483,7 +499,7 @@ void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
     }
 #endif
 
-#if (CRYPTOPP_POWER8_AES_AVAILABLE) && 0
+#if (CRYPTOPP_POWER8_AES_AVAILABLE)
     if (HasAES())
     {
         (void)Rijndael_Dec_ProcessAndXorBlock_POWER8(m_key, m_rounds, inBlock, xorBlock, outBlock);
-- 
cgit v1.2.1
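
Note on the rcon tables: the patch consolidates per-file copies of the round constants into shared 16-byte-aligned tables, s_rconLE (raw bytes, as XORed in next to _mm_aeskeygenassist_si128) and s_rconBE (the same values in the top byte of a big-endian word, matching Rijndael::Base::rcon). The ten entries are successive doublings of 0x01 in GF(2^8) under the AES polynomial. A standalone C++ check, not part of the patch, that derives them:

#include <cassert>
#include <cstdint>

int main()
{
    // The ten table entries from the patch, least-significant-byte form.
    const uint8_t expected[10] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
    };

    uint8_t rc = 0x01;
    for (int i = 0; i < 10; ++i)
    {
        assert(rc == expected[i]);
        // xtime: multiply by x modulo the AES polynomial x^8+x^4+x^3+x+1
        rc = static_cast<uint8_t>((rc << 1) ^ ((rc & 0x80) ? 0x1B : 0x00));
    }
    return 0;
}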
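
Note on the POWER8 decrypt loop: Rijndael_Dec_ProcessAndXorBlock_POWER8 now consumes the key schedule back to front, whitening with the last subkey at offset rounds*16, running two VectorDecrypt rounds per iteration downward, and finishing VectorDecryptLast with the subkey at offset 0. That is consistent with dropping Rijndael_UncheckedSetKey_POWER8: the decrypt path reuses the encryption schedule in reverse, presumably because the Power ISA vncipher instruction implements the straight inverse cipher and needs no InvMixColumns-transformed round keys (unlike AES-NI's aesdec/aesimc pairing, handled by Rijndael_UncheckedSetKeyRev_AESNI). A standalone sketch, with a hypothetical helper DecKeyOffsets that is not Crypto++ code, reproducing the loop bounds and verifying every subkey is visited once, in strictly descending order:

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical helper: the byte offsets at which the new POWER8 decrypt
// loop loads round keys, in the order it loads them.
static std::vector<std::size_t> DecKeyOffsets(std::size_t rounds)
{
    std::vector<std::size_t> offsets;
    offsets.push_back(rounds*16);                  // whitening, last subkey
    for (std::size_t i = rounds-1; i > 1; i -= 2)  // same bounds as the patch
    {
        offsets.push_back(i*16);                   // VectorDecrypt
        offsets.push_back((i-1)*16);               // VectorDecrypt
    }
    offsets.push_back(16);                         // last full decrypt round
    offsets.push_back(0);                          // VectorDecryptLast key
    return offsets;
}

int main()
{
    for (std::size_t rounds : {10, 12, 14})        // AES-128/192/256
    {
        const std::vector<std::size_t> offs = DecKeyOffsets(rounds);
        assert(offs.size() == rounds + 1);         // all rounds+1 subkeys hit
        for (std::size_t n = 0; n < offs.size(); ++n)
            assert(offs[n] == (rounds - n) * 16);  // strictly descending walk
    }
    return 0;
}

The i > 1 bound works because rounds is always even for AES (10, 12 or 14): the two-at-a-time walk ends exactly at offset 32, leaving offsets 16 and 0 for the tail.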
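
Note on the key-setup change: UncheckedSetKey now returns early on POWER8, byte-swapping the whole subkey table once so the block path can use aligned vector loads directly; on little-endian targets each 16-byte subkey is additionally reversed in place via ByteReverseArrayLE, since VSX register lanes are big-endian. A rough standalone sketch of that fix-up loop, assuming ByteReverseArrayLE amounts to a full 16-byte element reversal (as a vec_reve-style permute would); names here are illustrative, not the library's:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Illustrative stand-in for Crypto++'s ByteReverseArrayLE: reverse one
// 16-byte round key in place.
static void ByteReverseArray16(uint8_t key[16])
{
    std::reverse(key, key + 16);
}

// Mirrors the patch's little-endian fix-up: the schedule holds rounds+1
// subkeys of 16 bytes each (rounds = 10/12/14), each reversed in place.
static void ReverseSubkeyTable(uint8_t *table, unsigned int rounds)
{
    for (unsigned int i = 0; i <= rounds; ++i)
        ByteReverseArray16(table + i*16);
}

int main()
{
    const unsigned int rounds = 10;        // AES-128
    uint8_t table[(10+1)*16] = {0};
    table[0] = 0xAA;                       // first byte of subkey 0
    ReverseSubkeyTable(table, rounds);
    assert(table[15] == 0xAA);             // now the last byte of subkey 0
    return 0;
}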