From 656be82a8fd798d5242e553ced508760b215288b Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Fri, 9 Nov 2018 11:56:47 -0500 Subject: Cleanup ARIA SSE and NEON code --- aria-simd.cpp | 97 +++++++++++++++++++++++++++++++++++++++++------------------ aria.cpp | 31 +++++++++---------- 2 files changed, 82 insertions(+), 46 deletions(-) diff --git a/aria-simd.cpp b/aria-simd.cpp index ce16ada5..9c741d7e 100644 --- a/aria-simd.cpp +++ b/aria-simd.cpp @@ -29,10 +29,6 @@ #define M128_CAST(x) ((__m128i *)(void *)(x)) #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) -// GCC cast warning -#define UINT32_CAST(x) ((uint32_t *)(void *)(x)) -#define CONST_UINT32_CAST(x) ((const uint32_t *)(const void *)(x)) - NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(ARIATab) @@ -45,6 +41,17 @@ extern const word32 KRK[3][4]; NAMESPACE_END NAMESPACE_END +ANONYMOUS_NAMESPACE_BEGIN + +using CryptoPP::byte; +using CryptoPP::word32; + +inline byte ARIA_BRF(const word32 x, const int y) { + return GETBYTE(x, y); +} + +ANONYMOUS_NAMESPACE_END + NAMESPACE_BEGIN(CryptoPP) using CryptoPP::ARIATab::S1; @@ -58,22 +65,23 @@ using CryptoPP::ARIATab::KRK; template inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16]) { - static const unsigned int Q1 = (4-(N/32)) % 4; - static const unsigned int Q2 = (3-(N/32)) % 4; - static const unsigned int R = N % 32; + enum { Q1 = (4-(N/32)) % 4, + Q2 = (3-(N/32)) % 4, + R = N % 32 + }; - vst1q_u32(UINT32_CAST(RK), + vst1q_u8(RK, vreinterpretq_u8_u32( veorq_u32(X, veorq_u32( vshrq_n_u32(vextq_u32(Y, Y, Q1), R), - vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R)))); + vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R))))); } void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen) { - const uint32x4_t w0 = vld1q_u32(CONST_UINT32_CAST(ws+ 0)); - const uint32x4_t w1 = vld1q_u32(CONST_UINT32_CAST(ws+ 8)); - const uint32x4_t w2 = vld1q_u32(CONST_UINT32_CAST(ws+12)); - const uint32x4_t w3 = vld1q_u32(CONST_UINT32_CAST(ws+16)); + const uint32x4_t w0 = vld1q_u32(ws+ 0); + const uint32x4_t w1 = vld1q_u32(ws+ 8); + const uint32x4_t w2 = vld1q_u32(ws+12); + const uint32x4_t w3 = vld1q_u32(ws+16); ARIA_GSRK_NEON<19>(w0, w1, rk + 0); ARIA_GSRK_NEON<19>(w1, w2, rk + 16); @@ -102,22 +110,49 @@ void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keyle } } -void ARIA_ProcessAndXorBlock_Xor_NEON(const byte* xorBlock, byte* outBlock) +void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t) { - vst1q_u32(UINT32_CAST(outBlock), veorq_u32( - vld1q_u32(CONST_UINT32_CAST(outBlock)), - vld1q_u32(CONST_UINT32_CAST(xorBlock)))); + outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ); + outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8); + outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ); + outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ); + outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ); + outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8); + outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ); + outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ); + outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ); + outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8); + outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ); + outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ); + outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ); + outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8); + outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ); + outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ); + + // 'outBlock' and 'xorBlock' may be unaligned. + if (xorBlock != NULLPTR) + { + vst1q_u8(outBlock, + veorq_u8( + vld1q_u8(xorBlock), + veorq_u8( + vld1q_u8(outBlock), + vrev32q_u8(vld1q_u8((rk)))))); + } + else + { + vst1q_u8(outBlock, + veorq_u8( + vld1q_u8(outBlock), + vrev32q_u8(vld1q_u8(rk)))); + } } #endif // CRYPTOPP_ARM_NEON_AVAILABLE #if (CRYPTOPP_SSSE3_AVAILABLE) -inline byte ARIA_BRF(const word32 x, const int y) { - return GETBYTE(x, y); -} - -void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t) +void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t) { const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); @@ -138,18 +173,22 @@ void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock, byte* outBlock, con outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ); outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ); - // 'outBlock' may be unaligned. - _mm_storeu_si128(M128_CAST(outBlock), - _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(outBlock)), - _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK))); - // 'outBlock' and 'xorBlock' may be unaligned. if (xorBlock != NULLPTR) { _mm_storeu_si128(M128_CAST(outBlock), _mm_xor_si128( - _mm_loadu_si128(CONST_M128_CAST(outBlock)), - _mm_loadu_si128(CONST_M128_CAST(xorBlock)))); + _mm_loadu_si128(CONST_M128_CAST(xorBlock)), + _mm_xor_si128( + _mm_loadu_si128(CONST_M128_CAST(outBlock)), + _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK))) + ); + } + else + { + _mm_storeu_si128(M128_CAST(outBlock), + _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(outBlock)), + _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(rk)), MASK))); } } diff --git a/aria.cpp b/aria.cpp index 52cc2eb2..8f3f221a 100644 --- a/aria.cpp +++ b/aria.cpp @@ -85,11 +85,11 @@ inline byte ARIA_BRF(const word32 x, const int y) { #if (CRYPTOPP_ARM_NEON_AVAILABLE) extern void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen); -extern void ARIA_ProcessAndXorBlock_Xor_NEON(const byte* xorBlock, byte* outblock); +extern void ARIA_ProcessAndXorBlock_NEON(const byte* xorBlock, byte* outblock, const byte *rk, word32 *t); #endif #if (CRYPTOPP_SSSE3_AVAILABLE) -extern void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t); +extern void ARIA_ProcessAndXorBlock_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t); #endif // n-bit right shift of Y XORed to X @@ -283,12 +283,19 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b #if CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS if (HasSSSE3()) { - ARIA_ProcessAndXorBlock_Xor_SSSE3(xorBlock, outBlock, rk, t); + ARIA_ProcessAndXorBlock_SSSE3(xorBlock, outBlock, rk, t); return; } else #endif // CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS - +#if (CRYPTOPP_ARM_NEON_AVAILABLE) + if (HasNEON()) + { + ARIA_ProcessAndXorBlock_NEON(xorBlock, outBlock, rk, t); + return; + } + else +#endif // CRYPTOPP_ARM_NEON_AVAILABLE #if (CRYPTOPP_LITTLE_ENDIAN) { outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ) ^ rk[ 3]; @@ -329,19 +336,9 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b } #endif // CRYPTOPP_LITTLE_ENDIAN -#if CRYPTOPP_ARM_NEON_AVAILABLE - if (HasNEON()) - { - if (xorBlock != NULLPTR) - ARIA_ProcessAndXorBlock_Xor_NEON(xorBlock, outBlock); - } - else -#endif // CRYPTOPP_ARM_NEON_AVAILABLE - { - if (xorBlock != NULLPTR) - for (unsigned int n=0; n