From c831d6ffeeb80ea805502e62ad795afb8ef6fdff Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Mon, 28 Oct 2019 01:12:58 -0400
Subject: Pre-splat SIMON and SPECK keys when appropriate for Altivec (PR #910)

SIMON and SPECK keys can be pre-splatted in the forward direction when Altivec instructions will be used. Pre-splatting does not work for the reverse transformation because it breaks modes like CBC, so the speed-up is only applied to the forward transformation.
---
 simon128_simd.cpp | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'simon128_simd.cpp')
diff --git a/simon128_simd.cpp b/simon128_simd.cpp
index a91da0f5..b780cf7e 100644
--- a/simon128_simd.cpp
+++ b/simon128_simd.cpp
@@ -66,7 +66,7 @@ using CryptoPP::vec_swap;  // SunCC
 #if defined(_MSC_VER) && !defined(_M_ARM64)
 inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr)
 {
-	return vmovq_n_u64(*ptr);
+    return vmovq_n_u64(*ptr);
 }
 #endif
 
@@ -536,6 +536,7 @@ using CryptoPP::uint64x2_p;
 
 using CryptoPP::VecAnd;
 using CryptoPP::VecXor;
+using CryptoPP::VecLoad;
 using CryptoPP::VecPermute;
 
 // Rotate left by bit count
@@ -576,16 +577,18 @@ inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigne
 
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
-        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
-        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
+        // Round keys are pre-splatted in the forward direction
+        const uint64x2_p rk1 = VecLoad(subkeys+i*2);
+        const uint64x2_p rk2 = VecLoad(subkeys+i*2+2);
 
-        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
+        y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
         x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
     }
 
     if (rounds & 1)
     {
-        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
+        // Round keys are pre-splatted in the forward direction
+        const uint64x2_p rk = VecLoad(subkeys+rounds*2-2);
         y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
         std::swap(x1, y1);
     }
@@ -667,12 +670,14 @@ inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 
     for (int i = 0; i < static_cast<int>(rounds & ~1)-1; i += 2)
     {
-        const uint64x2_p rk1 = vec_splats((unsigned long long)subkeys[i]);
+        // Round keys are pre-splatted in the forward direction
+        const uint64x2_p rk1 = VecLoad(subkeys+i*2);
+        const uint64x2_p rk2 = VecLoad(subkeys+i*2+2);
+
         y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk1);
         y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk1);
         y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk1);
 
-        const uint64x2_p rk2 = vec_splats((unsigned long long)subkeys[i+1]);
         x1 = VecXor(VecXor(x1, SIMON128_f(y1)), rk2);
         x2 = VecXor(VecXor(x2, SIMON128_f(y2)), rk2);
         x3 = VecXor(VecXor(x3, SIMON128_f(y3)), rk2);
@@ -680,7 +685,9 @@ inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
 
     if (rounds & 1)
     {
-        const uint64x2_p rk = vec_splats((unsigned long long)subkeys[rounds-1]);
+        // Round keys are pre-splatted in the forward direction
+        const uint64x2_p rk = VecLoad(subkeys+rounds*2-2);
+
         y1 = VecXor(VecXor(y1, SIMON128_f(x1)), rk);
         y2 = VecXor(VecXor(y2, SIMON128_f(x2)), rk);
         y3 = VecXor(VecXor(y3, SIMON128_f(x3)), rk);
-- 
cgit v1.2.1