From 57ba87bdc9666034dce1a460ecc7b8d00aa1b380 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton <noloader@gmail.com>
Date: Sun, 5 Apr 2020 09:51:34 -0400
Subject: Add 64-bit overload for VecLoadAligned

---
 ppc_simd.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 70 insertions(+), 2 deletions(-)

(limited to 'ppc_simd.h')
diff --git a/ppc_simd.h b/ppc_simd.h
index ac97c376..1a6ad96e 100644
--- a/ppc_simd.h
+++ b/ppc_simd.h
@@ -413,7 +413,7 @@ inline uint32x4_p VecLoad(int off, const word32 src[4])
 /// \brief Loads a vector from a double word array
 /// \param src the double word array
 /// \details VecLoad() loads a vector from a double word array.
-/// \details VecLoad() uses POWER8's and VSX's <tt>vec_xl</tt> if available.
+/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
 ///  The instruction does not require aligned effective memory addresses.
 ///  VecLoad_ALTIVEC() is used if POWER8 or VSX are not available.
 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
@@ -452,7 +452,7 @@ inline uint64x2_p VecLoad(const word64 src[2])
 /// \param src the double word array
 /// \param off offset into the double word array
 /// \details VecLoad() loads a vector from a double word array.
-/// \details VecLoad() uses POWER8's and VSX's <tt>vec_xl</tt> if available.
+/// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
 ///  The instruction does not require aligned effective memory addresses.
 ///  VecLoad_ALTIVEC() is used if POWER8 or VSX are not available.
 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
@@ -605,6 +605,74 @@ inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
 #endif
 }
 
+#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
+
+/// \brief Loads a vector from an aligned double word array
+/// \param src the double word array, two elements are loaded
+/// \details VecLoadAligned() loads a vector from an aligned double word array.
+/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
+///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
+///  <tt>src</tt> must be 16-byte aligned; CRYPTOPP_ASSERT checks the address.
+/// \par Wraps
+///  vec_ld, vec_xl
+/// \since Crypto++ 8.0
+inline uint64x2_p VecLoadAligned(const word64 src[2])
+{
+    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
+    // word pointers. The ISA lacks loads for short* and char*.
+    // Power9/ISA 3.0 provides vec_xl for all datatypes.

+    // GCC and XLC use integer math for the effective address; LLVM uses
+    // pointer math. The offset is 0 here, so eff only feeds the assert.
+    const uintptr_t eff = reinterpret_cast<uintptr_t>(src);
+    CRYPTOPP_ASSERT(eff % 16 == 0);

+#if defined(_ARCH_PWR9)
+    return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
+#elif (defined(_ARCH_PWR7) && defined(__VSX__)) || defined(_ARCH_PWR8)
+    // The 32-bit cast is not a typo. Compiler workaround.
+    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
+#else
+    return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
+#endif
+}
+
+/// \brief Loads a vector from an aligned double word array
+/// \param src the double word array
+/// \param off byte offset into the double word array
+/// \details VecLoadAligned() loads a vector from an aligned double word array.
+///  <tt>vec_xl</tt> is used with POWER7 and VSX, otherwise <tt>vec_ld</tt>.
+///  The effective address <tt>src+off</tt> must be 16-byte aligned for Altivec.
+/// \par Wraps
+///  vec_ld, vec_xl
+/// \since Crypto++ 8.0
+inline uint64x2_p VecLoadAligned(int off, const word64 src[2])
+{
+    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
+    // word pointers. The ISA lacks loads for short* and char*.
+    // Power9/ISA 3.0 provides vec_xl for all datatypes.

+    // GCC and XLC use integer math for the effective address. LLVM uses
+    // pointer math, so Clang is handed the precomputed byte address below.
+    const uintptr_t eff = reinterpret_cast<uintptr_t>(src)+off;
+    CRYPTOPP_ASSERT(eff % 16 == 0);

+#if defined(_ARCH_PWR9)
+    return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
+#elif (defined(_ARCH_PWR7) && defined(__VSX__)) || defined(_ARCH_PWR8)
+    // The 32-bit casts below are not typos. Compiler workaround.
+# if defined(__clang__)
+    return (uint64x2_p)vec_xl(0, CONST_V32_CAST(eff));
+# else
+    return (uint64x2_p)vec_xl(off, CONST_V32_CAST(src));
+# endif
+#else
+    return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
+#endif
+}
+
+#endif
+
 /// \brief Loads a vector from a byte array
 /// \param src the byte array
 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
-- 
cgit v1.2.1