From 86acc8ed456bd048c030724eecbcd8756157a687 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 6 Dec 2017 06:18:46 -0500 Subject: Use 6x-2x-1x for Simon and Speck on IA-32 For Simon-64 and Speck-64 this means we are effectively using 12x-4x-1x. We are mostly at the threshold for IA-32 and parallelization. At any time 10 to 13 XMM registers are being used. Prefer movsd by way of _mm_load_sd and _mm_store_sd. Fix "error C3861: _mm_cvtsi128_si64x identifier not found". --- simon-simd.cpp | 268 +++++++++++++++++++++++++++++++++++++++++++++++---------- speck-simd.cpp | 252 ++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 444 insertions(+), 76 deletions(-) diff --git a/simon-simd.cpp b/simon-simd.cpp index e95b0820..0f6ea74f 100644 --- a/simon-simd.cpp +++ b/simon-simd.cpp @@ -1073,13 +1073,12 @@ inline __m128i SIMON128_f(const __m128i& v) _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v))); } -inline void SIMON128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) +inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1, const word64 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - __m128i block1 = _mm_setzero_si128(); __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); @@ -1111,11 +1110,12 @@ inline void SIMON128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned y1 = _mm_shuffle_epi8(y1, mask); block0 = _mm_unpacklo_epi64(x1, y1); - // block1 = _mm_unpackhi_epi64(x1, y1); + block1 = _mm_unpackhi_epi64(x1, y1); } -inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1, - __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) +inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. 
Depending on the number of blocks it needs to @@ -1125,12 +1125,16 @@ inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3); + __m128i x3 = _mm_unpacklo_epi64(block4, block5); + __m128i y3 = _mm_unpackhi_epi64(block4, block5); const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); for (size_t i = 0; static_cast(i) < (rounds & ~1) - 1; i += 2) { @@ -1138,11 +1142,13 @@ inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1, _mm_loaddup_pd(reinterpret_cast(subkeys + i))); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1); const __m128i rk2 = _mm_castpd_si128( _mm_loaddup_pd(reinterpret_cast(subkeys + i + 1))); x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2); x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2); } if (rounds & 1) @@ -1151,27 +1157,32 @@ inline void SIMON128_Enc_4_Blocks(__m128i &block0, __m128i &block1, _mm_loaddup_pd(reinterpret_cast(subkeys + rounds - 1))); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk); - Swap128(x1, y1); Swap128(x2, y2); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk); + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); } x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... block0 = _mm_unpacklo_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1); block2 = _mm_unpacklo_epi64(x2, y2); block3 = _mm_unpackhi_epi64(x2, y2); + block4 = _mm_unpacklo_epi64(x3, y3); + block5 = _mm_unpackhi_epi64(x3, y3); } -inline void SIMON128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned int rounds) +inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1, const word64 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to // be permuted to the following. // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... - __m128i block1 = _mm_setzero_si128(); __m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1); @@ -1204,11 +1215,12 @@ inline void SIMON128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned y1 = _mm_shuffle_epi8(y1, mask); block0 = _mm_unpacklo_epi64(x1, y1); - // block1 = _mm_unpackhi_epi64(x1, y1); + block1 = _mm_unpackhi_epi64(x1, y1); } -inline void SIMON128_Dec_4_Blocks(__m128i &block0, __m128i &block1, - __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) +inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. 
Depending on the number of blocks it needs to @@ -1218,21 +1230,26 @@ inline void SIMON128_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3); + __m128i x3 = _mm_unpacklo_epi64(block4, block5); + __m128i y3 = _mm_unpackhi_epi64(block4, block5); const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); if (rounds & 1) { const __m128i rk = _mm_castpd_si128( _mm_loaddup_pd(reinterpret_cast(subkeys + rounds - 1))); - Swap128(x1, y1); Swap128(x2, y2); + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1)); y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2)); + y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3)); rounds--; } @@ -1242,26 +1259,33 @@ inline void SIMON128_Dec_4_Blocks(__m128i &block0, __m128i &block1, _mm_loaddup_pd(reinterpret_cast(subkeys + i + 1))); x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1); x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1); const __m128i rk2 = _mm_castpd_si128( _mm_loaddup_pd(reinterpret_cast(subkeys + i))); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2); } x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
block0 = _mm_unpacklo_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1); block2 = _mm_unpacklo_epi64(x2, y2); block3 = _mm_unpackhi_epi64(x2, y2); + block4 = _mm_unpacklo_epi64(x3, y3); + block5 = _mm_unpackhi_epi64(x3, y3); } -template -inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, +template +inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F2 func2, F6 func6, const word64 *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { @@ -1287,16 +1311,19 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, if (flags & BlockTransformation::BT_AllowParallel) { - while (length >= 4*blockSize) + while (length >= 6*blockSize) { - __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; + __m128i block0, block1, block2, block3, block4, block5; + block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); if (flags & BlockTransformation::BT_InBlockIsCounter) { const __m128i be1 = *CONST_M128_CAST(s_one128); block1 = _mm_add_epi32(block0, be1); block2 = _mm_add_epi32(block1, be1); block3 = _mm_add_epi32(block2, be1); - _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); + block4 = _mm_add_epi32(block3, be1); + block5 = _mm_add_epi32(block4, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1)); } else { @@ -1307,6 +1334,10 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, inBlocks += inIncrement; block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); inBlocks += inIncrement; + block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; } if (flags & BlockTransformation::BT_XorInput) @@ -1321,9 +1352,13 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, xorBlocks += xorIncrement; block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; + block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; } - func4(block0, block1, block2, block3, subKeys, static_cast(rounds)); + func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { @@ -1335,6 +1370,10 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, xorBlocks += xorIncrement; block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; + block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; } _mm_storeu_si128(M128_CAST(outBlocks), block0); @@ -1345,14 +1384,63 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, outBlocks += outIncrement; _mm_storeu_si128(M128_CAST(outBlocks), block3); outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block4); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block5); + outBlocks += outIncrement; - length -= 4*blockSize; + length -= 6*blockSize; + } + + while (length >= 2*blockSize) + { + __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1; + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + const __m128i be1 = 
*CONST_M128_CAST(s_one128); + block1 = _mm_add_epi32(block0, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1)); + } + else + { + inBlocks += inIncrement; + block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + // Coverity finding, appears to be false positive. Assert the condition. + CRYPTOPP_ASSERT(xorBlocks); + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + func2(block0, block1, subKeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + _mm_storeu_si128(M128_CAST(outBlocks), block0); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block1); + outBlocks += outIncrement; + + length -= 2*blockSize; } } while (length >= blockSize) { - __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + __m128i block, zero = _mm_setzero_si128(); + block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); if (flags & BlockTransformation::BT_XorInput) block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); @@ -1360,7 +1448,7 @@ inline size_t SIMON128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4, if (flags & BlockTransformation::BT_InBlockIsCounter) const_cast(inBlocks)[15]++; - func1(block, subKeys, static_cast(rounds)); + func2(block, zero, subKeys, static_cast(rounds)); if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); @@ -1501,8 +1589,9 @@ inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1, block1 = _mm_unpackhi_epi32(x1, y1); } -inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, - __m128i &block3, const word32 *subkeys, unsigned int rounds) +inline void SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. 
Depending on the number of blocks it needs to @@ -1519,21 +1608,30 @@ inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); for (size_t i = 0; static_cast(i) < (rounds & ~1)-1; i += 2) { const __m128i rk1 = _mm_set1_epi32(subkeys[i]); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1); const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]); x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2); x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2); } if (rounds & 1) @@ -1541,13 +1639,16 @@ inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk); - Swap128(x1, y1); Swap128(x2, y2); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk); + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); } x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); // The is roughly the SSE equivalent to ARM vzp32 // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] @@ -1555,10 +1656,13 @@ inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc block1 = _mm_unpackhi_epi32(x1, y1); block2 = _mm_unpacklo_epi32(x2, y2); block3 = _mm_unpackhi_epi32(x2, y2); + block4 = _mm_unpacklo_epi32(x3, y3); + block5 = _mm_unpackhi_epi32(x3, y3); } -inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, - __m128i &block3, const word32 *subkeys, unsigned int rounds) +inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. 
Depending on the number of blocks it needs to @@ -1575,18 +1679,26 @@ inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); if (rounds & 1) { - Swap128(x1, y1); Swap128(x2, y2); + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1)); y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2)); + y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3)); rounds--; } @@ -1595,16 +1707,20 @@ inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]); x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1); x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1); const __m128i rk2 = _mm_set1_epi32(subkeys[i]); y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2); y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2); } x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); // The is roughly the SSE equivalent to ARM vzp32 // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] @@ -1612,10 +1728,12 @@ inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc block1 = _mm_unpackhi_epi32(x1, y1); block2 = _mm_unpacklo_epi32(x2, y2); block3 = _mm_unpackhi_epi32(x2, y2); + block4 = _mm_unpacklo_epi32(x3, y3); + block5 = _mm_unpackhi_epi32(x3, y3); } -template -inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, +template +inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6, const word32 *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { @@ -1642,16 +1760,19 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, if (flags & BlockTransformation::BT_AllowParallel) { - while (length >= 4*xmmBlockSize) + while (length >= 6*xmmBlockSize) { - __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; + __m128i block0, block1, block2, block3, block4, block5; + block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); if (flags & BlockTransformation::BT_InBlockIsCounter) { const __m128i be1 = *CONST_M128_CAST(s_one64); block1 = _mm_add_epi32(block0, be1); block2 = _mm_add_epi32(block1, be1); block3 = _mm_add_epi32(block2, be1); - _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); + block4 = _mm_add_epi32(block3, be1); + block5 = _mm_add_epi32(block4, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1)); } else { @@ -1662,6 +1783,10 @@ inline size_t 
SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, inBlocks += inIncrement; block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); inBlocks += inIncrement; + block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; } if (flags & BlockTransformation::BT_XorInput) @@ -1676,9 +1801,13 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, xorBlocks += xorIncrement; block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; + block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; } - func4(block0, block1, block2, block3, subKeys, static_cast(rounds)); + func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { @@ -1690,6 +1819,10 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, xorBlocks += xorIncrement; block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; + block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; } _mm_storeu_si128(M128_CAST(outBlocks), block0); @@ -1700,8 +1833,56 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, outBlocks += outIncrement; _mm_storeu_si128(M128_CAST(outBlocks), block3); outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block4); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block5); + outBlocks += outIncrement; + + length -= 6*xmmBlockSize; + } + + while (length >= 2*xmmBlockSize) + { + __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1; + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + const __m128i be1 = *CONST_M128_CAST(s_one64); + block1 = _mm_add_epi32(block0, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1)); + } + else + { + inBlocks += inIncrement; + block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + // Coverity finding, appears to be false positive. Assert the condition. 
+ CRYPTOPP_ASSERT(xorBlocks); + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + func2(block0, block1, subKeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + _mm_storeu_si128(M128_CAST(outBlocks), block0); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block1); + outBlocks += outIncrement; - length -= 4*xmmBlockSize; + length -= 2*xmmBlockSize; } } @@ -1728,13 +1909,13 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, while (length >= blockSize) { __m128i block, zero = _mm_setzero_si128(); - block = _mm_xor_si128(block, _mm_castpd_si128( - _mm_loaddup_pd(reinterpret_cast(inBlocks)))); + block = _mm_castpd_si128( + _mm_load_sd(reinterpret_cast(inBlocks))); if (flags & BlockTransformation::BT_XorInput) { block = _mm_xor_si128(block, _mm_castpd_si128( - _mm_loaddup_pd(reinterpret_cast(xorBlocks)))); + _mm_load_sd(reinterpret_cast(xorBlocks)))); } if (flags & BlockTransformation::BT_InBlockIsCounter) @@ -1745,11 +1926,10 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { block = _mm_xor_si128(block, _mm_castpd_si128( - _mm_loaddup_pd(reinterpret_cast(xorBlocks)))); + _mm_load_sd(reinterpret_cast(xorBlocks)))); } - const word64 temp = _mm_cvtsi128_si64x(block); - std::memcpy(outBlocks, &temp, 8); + _mm_store_sd(reinterpret_cast(outBlocks), _mm_castsi128_pd(block)); inBlocks += inIncrement; outBlocks += outIncrement; @@ -1809,14 +1989,14 @@ size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rou size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Enc_Block, SIMON64_Enc_4_Blocks, + return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Dec_Block, SIMON64_Dec_4_Blocks, + return SIMON64_AdvancedProcessBlocks_SSE41(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } #endif @@ -1825,14 +2005,14 @@ size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rou size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Enc_Block, SIMON128_Enc_4_Blocks, + return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 
flags) { - return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Dec_Block, SIMON128_Dec_4_Blocks, + return SIMON128_AdvancedProcessBlocks_SSSE3(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } #endif // CRYPTOPP_SSSE3_AVAILABLE diff --git a/speck-simd.cpp b/speck-simd.cpp index c2a32d97..a5844019 100644 --- a/speck-simd.cpp +++ b/speck-simd.cpp @@ -1043,8 +1043,9 @@ inline void SPECK128_Enc_Block(__m128i &block0, __m128i &block1, block1 = _mm_unpackhi_epi64(x1, y1); } -inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, - __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) +inline void SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. Depending on the number of blocks it needs to @@ -1054,12 +1055,16 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3); + __m128i x3 = _mm_unpacklo_epi64(block4, block5); + __m128i y3 = _mm_unpackhi_epi64(block4, block5); const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); for (size_t i=0; static_cast(i)(x1); x2 = RotateRight64<8>(x2); + x3 = RotateRight64<8>(x3); x1 = _mm_add_epi64(x1, y1); x2 = _mm_add_epi64(x2, y2); + x3 = _mm_add_epi64(x3, y3); x1 = _mm_xor_si128(x1, rk); x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); y1 = RotateLeft64<3>(y1); y2 = RotateLeft64<3>(y2); + y3 = RotateLeft64<3>(y3); y1 = _mm_xor_si128(y1, x1); y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); } x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... block0 = _mm_unpacklo_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1); block2 = _mm_unpacklo_epi64(x2, y2); block3 = _mm_unpackhi_epi64(x2, y2); + block4 = _mm_unpacklo_epi64(x3, y3); + block5 = _mm_unpackhi_epi64(x3, y3); } inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1, @@ -1124,8 +1138,9 @@ inline void SPECK128_Dec_Block(__m128i &block0, __m128i &block1, block1 = _mm_unpackhi_epi64(x1, y1); } -inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, - __m128i &block2, __m128i &block3, const word64 *subkeys, unsigned int rounds) +inline void SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. 
Depending on the number of blocks it needs to @@ -1135,12 +1150,16 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i x2 = _mm_unpacklo_epi64(block2, block3); __m128i y2 = _mm_unpackhi_epi64(block2, block3); + __m128i x3 = _mm_unpacklo_epi64(block4, block5); + __m128i y3 = _mm_unpackhi_epi64(block4, block5); const __m128i mask = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); for (size_t i=rounds-1; static_cast(i)>=0; --i) { @@ -1149,30 +1168,39 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, y1 = _mm_xor_si128(y1, x1); y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); y1 = RotateRight64<3>(y1); y2 = RotateRight64<3>(y2); + y3 = RotateRight64<3>(y3); x1 = _mm_xor_si128(x1, rk); x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); x1 = _mm_sub_epi64(x1, y1); x2 = _mm_sub_epi64(x2, y2); + x3 = _mm_sub_epi64(x3, y3); x1 = RotateLeft64<8>(x1); x2 = RotateLeft64<8>(x2); + x3 = RotateLeft64<8>(x3); } x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... block0 = _mm_unpacklo_epi64(x1, y1); block1 = _mm_unpackhi_epi64(x1, y1); block2 = _mm_unpacklo_epi64(x2, y2); block3 = _mm_unpackhi_epi64(x2, y2); + block4 = _mm_unpacklo_epi64(x3, y3); + block5 = _mm_unpackhi_epi64(x3, y3); } -template -inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4, +template +inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F6 func6, const word64 *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { @@ -1198,16 +1226,19 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4, if (flags & BlockTransformation::BT_AllowParallel) { - while (length >= 4*blockSize) + while (length >= 6*blockSize) { - __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; + __m128i block0, block1, block2, block3, block4, block5; + block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); if (flags & BlockTransformation::BT_InBlockIsCounter) { const __m128i be1 = *CONST_M128_CAST(s_one128); block1 = _mm_add_epi32(block0, be1); block2 = _mm_add_epi32(block1, be1); block3 = _mm_add_epi32(block2, be1); - _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); + block4 = _mm_add_epi32(block3, be1); + block5 = _mm_add_epi32(block4, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1)); } else { @@ -1218,6 +1249,10 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4, inBlocks += inIncrement; block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); inBlocks += inIncrement; + block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; } if (flags & BlockTransformation::BT_XorInput) @@ -1232,9 +1267,13 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4, xorBlocks += xorIncrement; block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; + block4 = _mm_xor_si128(block4, 
_mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; } - func4(block0, block1, block2, block3, subKeys, static_cast(rounds)); + func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { @@ -1246,6 +1285,10 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4, xorBlocks += xorIncrement; block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; + block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; } _mm_storeu_si128(M128_CAST(outBlocks), block0); @@ -1256,8 +1299,57 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F2 func2, F4 func4, outBlocks += outIncrement; _mm_storeu_si128(M128_CAST(outBlocks), block3); outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block4); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block5); + outBlocks += outIncrement; + + length -= 6*blockSize; + } + + while (length >= 2*blockSize) + { + __m128i block0, block1; + block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + const __m128i be1 = *CONST_M128_CAST(s_one128); + block1 = _mm_add_epi32(block0, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1)); + } + else + { + inBlocks += inIncrement; + block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + // Coverity finding, appears to be false positive. Assert the condition. + CRYPTOPP_ASSERT(xorBlocks); + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + func2(block0, block1, subKeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + _mm_storeu_si128(M128_CAST(outBlocks), block0); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block1); + outBlocks += outIncrement; - length -= 4*blockSize; + length -= 2*blockSize; } } @@ -1396,8 +1488,9 @@ inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1, block1 = _mm_unpackhi_epi32(x1, y1); } -inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, - __m128i &block3, const word32 *subkeys, unsigned int rounds) +inline void SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. 
Depending on the number of blocks it needs to @@ -1414,11 +1507,18 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); for (size_t i=0; static_cast(i)(x1); x2 = RotateRight32<8>(x2); + x3 = RotateRight32<8>(x3); x1 = _mm_add_epi32(x1, y1); x2 = _mm_add_epi32(x2, y2); + x3 = _mm_add_epi32(x3, y3); x1 = _mm_xor_si128(x1, rk); x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); y1 = RotateLeft32<3>(y1); y2 = RotateLeft32<3>(y2); + y3 = RotateLeft32<3>(y3); y1 = _mm_xor_si128(y1, x1); y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); } x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); // The is roughly the SSE equivalent to ARM vzp32 // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] @@ -1447,10 +1554,13 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc block1 = _mm_unpackhi_epi32(x1, y1); block2 = _mm_unpacklo_epi32(x2, y2); block3 = _mm_unpackhi_epi32(x2, y2); + block4 = _mm_unpacklo_epi32(x3, y3); + block5 = _mm_unpackhi_epi32(x3, y3); } -inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, - __m128i &block3, const word32 *subkeys, unsigned int rounds) +inline void SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) { // Rearrange the data for vectorization. The incoming data was read from // a big-endian byte array. 
Depending on the number of blocks it needs to @@ -1467,11 +1577,18 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + const __m128i mask = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); for (size_t i=rounds-1; static_cast(i)>=0; --i) { @@ -1479,20 +1596,27 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc y1 = _mm_xor_si128(y1, x1); y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); y1 = RotateRight32<3>(y1); y2 = RotateRight32<3>(y2); + y3 = RotateRight32<3>(y3); x1 = _mm_xor_si128(x1, rk); x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); x1 = _mm_sub_epi32(x1, y1); x2 = _mm_sub_epi32(x2, y2); + x3 = _mm_sub_epi32(x3, y3); x1 = RotateLeft32<8>(x1); x2 = RotateLeft32<8>(x2); + x3 = RotateLeft32<8>(x3); } x1 = _mm_shuffle_epi8(x1, mask); y1 = _mm_shuffle_epi8(y1, mask); x2 = _mm_shuffle_epi8(x2, mask); y2 = _mm_shuffle_epi8(y2, mask); + x3 = _mm_shuffle_epi8(x3, mask); + y3 = _mm_shuffle_epi8(y3, mask); // The is roughly the SSE equivalent to ARM vzp32 // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] @@ -1500,10 +1624,12 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc block1 = _mm_unpackhi_epi32(x1, y1); block2 = _mm_unpacklo_epi32(x2, y2); block3 = _mm_unpackhi_epi32(x2, y2); + block4 = _mm_unpacklo_epi32(x3, y3); + block5 = _mm_unpackhi_epi32(x3, y3); } -template -inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, +template +inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6, const word32 *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { @@ -1530,16 +1656,19 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, if (flags & BlockTransformation::BT_AllowParallel) { - while (length >= 4*xmmBlockSize) + while (length >= 6*xmmBlockSize) { - __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; + __m128i block0, block1, block2, block3, block4, block5; + block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); if (flags & BlockTransformation::BT_InBlockIsCounter) { const __m128i be1 = *CONST_M128_CAST(s_one64); block1 = _mm_add_epi32(block0, be1); block2 = _mm_add_epi32(block1, be1); block3 = _mm_add_epi32(block2, be1); - _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); + block4 = _mm_add_epi32(block3, be1); + block5 = _mm_add_epi32(block4, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1)); } else { @@ -1550,6 +1679,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, inBlocks += inIncrement; block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); inBlocks += inIncrement; + block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; } if (flags & 
BlockTransformation::BT_XorInput) @@ -1564,9 +1697,13 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, xorBlocks += xorIncrement; block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; + block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; } - func4(block0, block1, block2, block3, subKeys, static_cast(rounds)); + func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { @@ -1578,6 +1715,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, xorBlocks += xorIncrement; block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; + block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; } _mm_storeu_si128(M128_CAST(outBlocks), block0); @@ -1588,8 +1729,56 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, outBlocks += outIncrement; _mm_storeu_si128(M128_CAST(outBlocks), block3); outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block4); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block5); + outBlocks += outIncrement; + + length -= 6*xmmBlockSize; + } + + while (length >= 2*xmmBlockSize) + { + __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1; + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + const __m128i be1 = *CONST_M128_CAST(s_one64); + block1 = _mm_add_epi32(block0, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1)); + } + else + { + inBlocks += inIncrement; + block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + // Coverity finding, appears to be false positive. Assert the condition. 
+ CRYPTOPP_ASSERT(xorBlocks); + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + func2(block0, block1, subKeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + _mm_storeu_si128(M128_CAST(outBlocks), block0); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block1); + outBlocks += outIncrement; - length -= 4*xmmBlockSize; + length -= 2*xmmBlockSize; } } @@ -1616,13 +1805,13 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, while (length >= blockSize) { __m128i block, zero = _mm_setzero_si128(); - block = _mm_xor_si128(block, _mm_castpd_si128( - _mm_loaddup_pd(reinterpret_cast(inBlocks)))); + block = _mm_castpd_si128( + _mm_load_sd(reinterpret_cast(inBlocks))); if (flags & BlockTransformation::BT_XorInput) { block = _mm_xor_si128(block, _mm_castpd_si128( - _mm_loaddup_pd(reinterpret_cast(xorBlocks)))); + _mm_load_sd(reinterpret_cast(xorBlocks)))); } if (flags & BlockTransformation::BT_InBlockIsCounter) @@ -1633,11 +1822,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4, if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) { block = _mm_xor_si128(block, _mm_castpd_si128( - _mm_loaddup_pd(reinterpret_cast(xorBlocks)))); + _mm_load_sd(reinterpret_cast(xorBlocks)))); } - const word64 temp = _mm_cvtsi128_si64x(block); - std::memcpy(outBlocks, &temp, 8); + _mm_store_sd(reinterpret_cast(outBlocks), _mm_castsi128_pd(block)); inBlocks += inIncrement; outBlocks += outIncrement; @@ -1697,14 +1885,14 @@ size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rou size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SPECK64_AdvancedProcessBlocks_SSE41(SPECK64_Enc_Block, SPECK64_Enc_4_Blocks, + return SPECK64_AdvancedProcessBlocks_SSE41(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SPECK64_AdvancedProcessBlocks_SSE41(SPECK64_Dec_Block, SPECK64_Dec_4_Blocks, + return SPECK64_AdvancedProcessBlocks_SSE41(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } #endif @@ -1713,14 +1901,14 @@ size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rou size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { - return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Enc_Block, SPECK128_Enc_4_Blocks, + return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 
flags) { - return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks, + return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } #endif // CRYPTOPP_SSSE3_AVAILABLE -- cgit v1.2.1
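
The scheduling change in this patch follows the 6x-2x-1x pattern named in the subject: the AdvancedProcessBlocks templates first peel off groups of six XMM blocks for func6, then pairs for func2, and finally single blocks, which are paired with a zero register so func2 can be reused. The sketch below shows only that control flow under simplified assumptions; the helper name ProcessBlocks_6x2x1x and the flat in/out walk are illustrative, and the counter, xor-input and in-place handling of the real templates are omitted.

#include <emmintrin.h>
#include <cstddef>

// Illustrative only; the real logic lives in SIMON128_AdvancedProcessBlocks_SSSE3
// and friends, which also handle CTR increments, xor blocks and stride flags.
template <typename F2, typename F6>
void ProcessBlocks_6x2x1x(F2 func2, F6 func6, const unsigned long long* subKeys,
                          unsigned int rounds, const unsigned char* in,
                          unsigned char* out, size_t length)
{
    const size_t blockSize = 16;  // one 128-bit block per XMM register

    // Six blocks at a time: three x/y pairs plus the round key and temporaries
    // keep roughly 10 to 13 XMM registers live, about the limit on IA-32.
    while (length >= 6*blockSize)
    {
        __m128i b[6];
        for (int i = 0; i < 6; ++i)
            b[i] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + i*blockSize));

        func6(b[0], b[1], b[2], b[3], b[4], b[5], subKeys, rounds);

        for (int i = 0; i < 6; ++i)
            _mm_storeu_si128(reinterpret_cast<__m128i*>(out + i*blockSize), b[i]);

        in += 6*blockSize; out += 6*blockSize; length -= 6*blockSize;
    }

    // Two blocks at a time.
    while (length >= 2*blockSize)
    {
        __m128i b0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
        __m128i b1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in + blockSize));

        func2(b0, b1, subKeys, rounds);

        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), b0);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out + blockSize), b1);

        in += 2*blockSize; out += 2*blockSize; length -= 2*blockSize;
    }

    // Single block: pair it with a zero register so func2 is reused and no
    // dedicated one-block transform is needed.
    while (length >= blockSize)
    {
        __m128i b0   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
        __m128i zero = _mm_setzero_si128();

        func2(b0, zero, subKeys, rounds);

        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), b0);
        in += blockSize; out += blockSize; length -= blockSize;
    }
}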
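
The single-block tail of the 64-bit ciphers is where the "prefer movsd" part of the commit message applies: an 8-byte SIMON-64 or SPECK-64 block is loaded and stored through _mm_load_sd and _mm_store_sd, which also removes the 64-bit-only _mm_cvtsi128_si64x that 32-bit MSVC rejects with error C3861. A minimal sketch, with hypothetical helper names (the patch inlines the same intrinsics directly):

#include <emmintrin.h>

// Hypothetical helpers mirroring the new single-block tail for SIMON-64/SPECK-64.
inline __m128i LoadBlock64(const unsigned char* inBlock)
{
    // movsd load: the 8-byte block lands in the low 64 bits, upper bits zeroed.
    return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double*>(inBlock)));
}

inline void StoreBlock64(unsigned char* outBlock, __m128i block)
{
    // movsd store: writes only the low 64 bits, so no _mm_cvtsi128_si64x and
    // no memcpy through a word64 temporary is needed.
    _mm_store_sd(reinterpret_cast<double*>(outBlock), _mm_castsi128_pd(block));
}

This keeps the one-block path in the XMM domain end to end instead of bouncing through a 64-bit general-purpose register, which does not exist on IA-32.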