From e9654192f20ec4d16e2fbccb167ce63bd22b48f5 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Tue, 5 Dec 2017 20:35:57 -0500 Subject: Remove unneeded temp[] array --- simon-simd.cpp | 26 ++++++++++++-------------- speck-simd.cpp | 26 ++++++++++++-------------- 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/simon-simd.cpp b/simon-simd.cpp index 186e862a..e95b0820 100644 --- a/simon-simd.cpp +++ b/simon-simd.cpp @@ -1422,7 +1422,7 @@ inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1, // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. Thanks to Peter Cordes for help with the - // SSE permutes below. + // SSE permutes below. // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const __m128 t0 = _mm_castsi128_ps(block0); const __m128 t1 = _mm_castsi128_ps(block1); @@ -1464,7 +1464,7 @@ inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1, // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. Thanks to Peter Cordes for help with the - // SSE permutes below. + // SSE permutes below. // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const __m128 t0 = _mm_castsi128_ps(block0); const __m128 t1 = _mm_castsi128_ps(block1); @@ -1507,7 +1507,7 @@ inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. Thanks to Peter Cordes for help with the - // SSE permutes below. + // SSE permutes below. // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
const __m128 t0 = _mm_castsi128_ps(block0); const __m128 t1 = _mm_castsi128_ps(block1); @@ -1563,7 +1563,7 @@ inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. Thanks to Peter Cordes for help with the - // SSE permutes below. + // SSE permutes below. // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const __m128 t0 = _mm_castsi128_ps(block0); const __m128 t1 = _mm_castsi128_ps(block1); @@ -1629,7 +1629,6 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; size_t xorIncrement = xorBlocks ? xmmBlockSize : 0; size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - CRYPTOPP_ALIGN_DATA(16) word32 temp[4]; if (flags & BlockTransformation::BT_ReverseDirection) { @@ -1728,15 +1727,14 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, while (length >= blockSize) { - // temp[] is an aligned array - std::memcpy(temp, inBlocks, 8); __m128i block, zero = _mm_setzero_si128(); - block = _mm_load_si128(CONST_M128_CAST(temp)); + block = _mm_xor_si128(block, _mm_castpd_si128( + _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks)))); if (flags & BlockTransformation::BT_XorInput) { - std::memcpy(temp, xorBlocks, 8); - block = _mm_xor_si128(block, _mm_load_si128(CONST_M128_CAST(temp))); + block = _mm_xor_si128(block, _mm_castpd_si128( + _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks)))); } if (flags & BlockTransformation::BT_InBlockIsCounter) @@ -1746,12 +1744,12 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { - std::memcpy(temp, xorBlocks, 8); - block = 
_mm_xor_si128(block, _mm_load_si128(CONST_M128_CAST(temp))); + block = _mm_xor_si128(block, _mm_castpd_si128( + _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks)))); } - _mm_store_si128(M128_CAST(temp), block); - std::memcpy(outBlocks, temp, 8); + const word64 temp = _mm_cvtsi128_si64x(block); + std::memcpy(outBlocks, &temp, 8); inBlocks += inIncrement; outBlocks += outIncrement; diff --git a/speck-simd.cpp b/speck-simd.cpp index 4c29c4c8..c2a32d97 100644 --- a/speck-simd.cpp +++ b/speck-simd.cpp @@ -1328,7 +1328,7 @@ inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1, // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. Thanks to Peter Cordes for help with the - // SSE permutes below. + // SSE permutes below. // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const __m128 t0 = _mm_castsi128_ps(block0); const __m128 t1 = _mm_castsi128_ps(block1); @@ -1365,7 +1365,7 @@ inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1, // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. Thanks to Peter Cordes for help with the - // SSE permutes below. + // SSE permutes below. // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const __m128 t0 = _mm_castsi128_ps(block0); const __m128 t1 = _mm_castsi128_ps(block1); @@ -1402,7 +1402,7 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. Thanks to Peter Cordes for help with the - // SSE permutes below. + // SSE permutes below. // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
const __m128 t0 = _mm_castsi128_ps(block0); const __m128 t1 = _mm_castsi128_ps(block1); @@ -1455,7 +1455,7 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. Thanks to Peter Cordes for help with the - // SSE permutes below. + // SSE permutes below. // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... const __m128 t0 = _mm_castsi128_ps(block0); const __m128 t1 = _mm_castsi128_ps(block1); @@ -1517,7 +1517,6 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; size_t xorIncrement = xorBlocks ? xmmBlockSize : 0; size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - CRYPTOPP_ALIGN_DATA(16) word32 temp[4]; if (flags & BlockTransformation::BT_ReverseDirection) { @@ -1616,15 +1615,14 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, while (length >= blockSize) { - // temp[] is an aligned array - std::memcpy(temp, inBlocks, 8); __m128i block, zero = _mm_setzero_si128(); - block = _mm_load_si128(CONST_M128_CAST(temp)); + block = _mm_xor_si128(block, _mm_castpd_si128( + _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks)))); if (flags & BlockTransformation::BT_XorInput) { - std::memcpy(temp, xorBlocks, 8); - block = _mm_xor_si128(block, _mm_load_si128(CONST_M128_CAST(temp))); + block = _mm_xor_si128(block, _mm_castpd_si128( + _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks)))); } if (flags & BlockTransformation::BT_InBlockIsCounter) @@ -1634,12 +1632,12 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { - std::memcpy(temp, xorBlocks, 8); - block = 
_mm_xor_si128(block, _mm_load_si128(CONST_M128_CAST(temp))); + block = _mm_xor_si128(block, _mm_castpd_si128( + _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks)))); } - _mm_store_si128(M128_CAST(temp), block); - std::memcpy(outBlocks, temp, 8); + const word64 temp = _mm_cvtsi128_si64x(block); + std::memcpy(outBlocks, &temp, 8); inBlocks += inIncrement; outBlocks += outIncrement; -- cgit v1.2.1