Remove unneeded temp[] array

author: Jeffrey Walton <noloader@gmail.com> 2017-12-05 20:35:57 -0500
committer: Jeffrey Walton <noloader@gmail.com> 2017-12-05 20:35:57 -0500
commit: e9654192f20ec4d16e2fbccb167ce63bd22b48f5 (patch)
tree: 75f45207abc977abda66b605eac0e052a2a2d94d
parent: 490701accaf061a121c2f28c71a097bbca85d06b (diff)
download: cryptopp-git-e9654192f20ec4d16e2fbccb167ce63bd22b48f5.tar.gz
2 files changed, 24 insertions, 28 deletions
diff --git a/simon-simd.cpp b/simon-simd.cpp
index 186e862a..e95b0820 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -1422,7 +1422,7 @@ inline void SIMON64_Enc_Block(__m128i &block0, __m128i &block1,
     // Rearrange the data for vectorization. The incoming data was read from
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
-	// SSE permutes below.
+    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -1464,7 +1464,7 @@ inline void SIMON64_Dec_Block(__m128i &block0, __m128i &block1,
     // Rearrange the data for vectorization. The incoming data was read from
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
-	// SSE permutes below.
+    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -1507,7 +1507,7 @@ inline void SIMON64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
     // Rearrange the data for vectorization. The incoming data was read from
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
-	// SSE permutes below.
+    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -1563,7 +1563,7 @@ inline void SIMON64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
     // Rearrange the data for vectorization. The incoming data was read from
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
-	// SSE permutes below.
+    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -1629,7 +1629,6 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
     size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
     size_t xorIncrement = xorBlocks ? xmmBlockSize : 0;
     size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
-    CRYPTOPP_ALIGN_DATA(16) word32 temp[4];
 
     if (flags & BlockTransformation::BT_ReverseDirection)
     {
@@ -1728,15 +1727,14 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
 
         while (length >= blockSize)
         {
-            // temp[] is an aligned array
-            std::memcpy(temp, inBlocks, 8);
             __m128i block, zero = _mm_setzero_si128();
-            block = _mm_load_si128(CONST_M128_CAST(temp));
+            block = _mm_xor_si128(block, _mm_castpd_si128(
+                _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
             if (flags & BlockTransformation::BT_XorInput)
             {
-                std::memcpy(temp, xorBlocks, 8);
-                block = _mm_xor_si128(block, _mm_load_si128(CONST_M128_CAST(temp)));
+                block = _mm_xor_si128(block, _mm_castpd_si128(
+                    _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks))));
             }
 
             if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1746,12 +1744,12 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
 
             if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
             {
-                std::memcpy(temp, xorBlocks, 8);
-                block = _mm_xor_si128(block, _mm_load_si128(CONST_M128_CAST(temp)));
+                block = _mm_xor_si128(block, _mm_castpd_si128(
+                    _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks))));
             }
 
-            _mm_store_si128(M128_CAST(temp), block);
-            std::memcpy(outBlocks, temp, 8);
+            const word64 temp = _mm_cvtsi128_si64x(block);
+            std::memcpy(outBlocks, &temp, 8);
 
             inBlocks += inIncrement;
             outBlocks += outIncrement;
diff --git a/speck-simd.cpp b/speck-simd.cpp
index 4c29c4c8..c2a32d97 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -1328,7 +1328,7 @@ inline void SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
     // Rearrange the data for vectorization. The incoming data was read from
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
-	// SSE permutes below.
+    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -1365,7 +1365,7 @@ inline void SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
     // Rearrange the data for vectorization. The incoming data was read from
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
-	// SSE permutes below.
+    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -1402,7 +1402,7 @@ inline void SPECK64_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
     // Rearrange the data for vectorization. The incoming data was read from
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
-	// SSE permutes below.
+    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -1455,7 +1455,7 @@ inline void SPECK64_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &bloc
     // Rearrange the data for vectorization. The incoming data was read from
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following. Thanks to Peter Cordes for help with the
-	// SSE permutes below.
+    // SSE permutes below.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     const __m128 t0 = _mm_castsi128_ps(block0);
     const __m128 t1 = _mm_castsi128_ps(block1);
@@ -1517,7 +1517,6 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
     size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
     size_t xorIncrement = xorBlocks ? xmmBlockSize : 0;
     size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
-    CRYPTOPP_ALIGN_DATA(16) word32 temp[4];
 
     if (flags & BlockTransformation::BT_ReverseDirection)
     {
@@ -1616,15 +1615,14 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
 
         while (length >= blockSize)
         {
-            // temp[] is an aligned array
-            std::memcpy(temp, inBlocks, 8);
             __m128i block, zero = _mm_setzero_si128();
-            block = _mm_load_si128(CONST_M128_CAST(temp));
+            block = _mm_xor_si128(block, _mm_castpd_si128(
+                _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
             if (flags & BlockTransformation::BT_XorInput)
             {
-                std::memcpy(temp, xorBlocks, 8);
-                block = _mm_xor_si128(block, _mm_load_si128(CONST_M128_CAST(temp)));
+                block = _mm_xor_si128(block, _mm_castpd_si128(
+                    _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks))));
             }
 
             if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1634,12 +1632,12 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F4 func4,
 
             if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
             {
-                std::memcpy(temp, xorBlocks, 8);
-                block = _mm_xor_si128(block, _mm_load_si128(CONST_M128_CAST(temp)));
+                block = _mm_xor_si128(block, _mm_castpd_si128(
+                    _mm_loaddup_pd(reinterpret_cast<const double*>(xorBlocks))));
             }
 
-            _mm_store_si128(M128_CAST(temp), block);
-            std::memcpy(outBlocks, temp, 8);
+            const word64 temp = _mm_cvtsi128_si64x(block);
+            std::memcpy(outBlocks, &temp, 8);
 
             inBlocks += inIncrement;
             outBlocks += outIncrement;
author	Jeffrey Walton <noloader@gmail.com>	2017-12-05 20:35:57 -0500
committer	Jeffrey Walton <noloader@gmail.com>	2017-12-05 20:35:57 -0500
commit	e9654192f20ec4d16e2fbccb167ce63bd22b48f5 (patch)
tree	75f45207abc977abda66b605eac0e052a2a2d94d
parent	490701accaf061a121c2f28c71a097bbca85d06b (diff)
download	cryptopp-git-e9654192f20ec4d16e2fbccb167ce63bd22b48f5.tar.gz