summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xGNUmakefile6
-rwxr-xr-xGNUmakefile-cross5
-rw-r--r--adv-simd.h24
-rw-r--r--cryptest.nmake4
-rw-r--r--cryptlib.vcxproj1
-rw-r--r--cryptlib.vcxproj.filters3
-rw-r--r--lea-simd.cpp495
-rw-r--r--lea.cpp42
-rw-r--r--lea.h12
9 files changed, 582 insertions, 10 deletions
diff --git a/GNUmakefile b/GNUmakefile
index 263eaca7..009feb03 100755
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -250,6 +250,7 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_SSSE3,$(CXXFLAGS)),)
ifeq ($(HAVE_SSSE3),1)
ARIA_FLAG = -mssse3
CHAM_FLAG = -mssse3
+ LEA_FLAG = -mssse3
SSSE3_FLAG = -mssse3
SIMON_FLAG = -mssse3
SPECK_FLAG = -mssse3
@@ -291,6 +292,7 @@ ifeq ($(SUN_COMPILER),1)
SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1
ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1
CHAM_FLAG = -xarch=ssse3 -D__SSSE3__=1
+ LEA_FLAG = -xarch=ssse3 -D__SSSE3__=1
SIMON_FLAG = -xarch=ssse3 -D__SSSE3__=1
SPECK_FLAG = -xarch=ssse3 -D__SSSE3__=1
LDFLAGS += -xarch=ssse3
@@ -1068,6 +1070,10 @@ crc-simd.o : crc-simd.cpp
gcm-simd.o : gcm-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
+# SSSE3 available
+lea-simd.o : lea-simd.cpp
+ $(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $<
+
# NEON available
neon-simd.o : neon-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(NEON_FLAG) -c) $<
diff --git a/GNUmakefile-cross b/GNUmakefile-cross
index b66b3874..820e3fd0 100755
--- a/GNUmakefile-cross
+++ b/GNUmakefile-cross
@@ -277,6 +277,7 @@ ifneq ($(IS_i686)$(IS_x86_64),00)
ifeq ($(HAVE_SSSE3),1)
ARIA_FLAG = -mssse3
CHAM_FLAG = -mssse3
+ LEA_FLAG = -mssse3
SSSE3_FLAG = -mssse3
SIMON_FLAG = -mssse3
SPECK_FLAG = -mssse3
@@ -504,6 +505,10 @@ crc-simd.o : crc-simd.cpp
gcm-simd.o : gcm-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
+# SSSE3 available
+lea-simd.o : lea-simd.cpp
+ $(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $<
+
# NEON available
neon-simd.o : neon-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(NEON_FLAG) -c) $<
diff --git a/adv-simd.h b/adv-simd.h
index cdb0311a..1609faeb 100644
--- a/adv-simd.h
+++ b/adv-simd.h
@@ -6,7 +6,7 @@
// acceleration. After several implementations we noticed a lot of copy and
// paste occuring. adv-simd.h provides a template to avoid the copy and paste.
//
-// There are 8 templates provided in this file. The number following the
+// There are 9 templates provided in this file. The number following the
// function name is the block size of the cipher. The name following that
// is the acceleration and arrangement. For example 4x1_SSE means Intel SSE
// using two encrypt (or decrypt) functions: one that operates on 4 blocks,
@@ -22,6 +22,14 @@
// * AdvancedProcessBlocks64_6x2_ALTIVEC
// * AdvancedProcessBlocks128_6x2_ALTIVEC
//
+// If an arrangement ends in 2, like 6x2, then the template will handle the
+// single block case by padding with 0's and using the two block function.
+// This happens at most one time when processing multiple blocks. The extra
+// processing of a zero block is trivial and worth the tradeoff.
+//
+// The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions
+// of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually
+// results in a failed link due to the const/non-const mismatch.
#ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
#define CRYPTOPP_ADVANCED_SIMD_TEMPLATES
@@ -323,7 +331,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
}
/// \brief AdvancedProcessBlocks for 1 and 6 blocks
-/// \tparam F1 function to process 1 128-bit blocks
+/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_NEON1x6 processes 6 and 2 NEON SIMD words
@@ -721,7 +729,7 @@ NAMESPACE_BEGIN(CryptoPP)
template <typename F1, typename F2, typename W>
inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
- const W *subKeys, size_t rounds, const byte *inBlocks,
+ MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
CRYPTOPP_ASSERT(subKeys);
@@ -878,7 +886,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2,
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
- const W *subKeys, size_t rounds, const byte *inBlocks,
+ MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
CRYPTOPP_ASSERT(subKeys);
@@ -1125,7 +1133,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
/// same word type.
template <typename F2, typename F6, typename W>
inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
- const W *subKeys, size_t rounds, const byte *inBlocks,
+ MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
CRYPTOPP_ASSERT(subKeys);
@@ -1312,12 +1320,12 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
}
/// \brief AdvancedProcessBlocks for 1 and 4 blocks
-/// \tparam F1 function to process 1 128-bit blocks
+/// \tparam F1 function to process 1 128-bit block
/// \tparam F4 function to process 4 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words
/// at a time.
-/// \details The subkey type is usually word32 or word64. F1 and F6 must use the
+/// \details The subkey type is usually word32 or word64. F1 and F4 must use the
/// same word type.
template <typename F1, typename F4, typename W>
inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
@@ -1455,7 +1463,7 @@ NAMESPACE_END // CryptoPP
NAMESPACE_BEGIN(CryptoPP)
/// \brief AdvancedProcessBlocks for 1 and 6 blocks
-/// \tparam F1 function to process 1 128-bit blocks
+/// \tparam F1 function to process 1 128-bit block
/// \tparam F6 function to process 6 128-bit blocks
/// \tparam W word type of the subkey table
/// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words
diff --git a/cryptest.nmake b/cryptest.nmake
index 3c9e7c99..aa0477c7 100644
--- a/cryptest.nmake
+++ b/cryptest.nmake
@@ -47,9 +47,9 @@
# If you use 'make sources' from Linux makefile, then add 'winpipes.cpp' to the list below.
-LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp cham.cpp cham-simd.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp lea.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp simon.cpp simon-simd.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp speck.cpp speck-simd.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp
+LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp cham.cpp cham-simd.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp lea.cpp lea-simd.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp simon.cpp simon-simd.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp speck.cpp speck-simd.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp
-LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj cham.obj cham-simd.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj lea.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj simon.obj simon-simd.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj speck.obj speck-simd.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj
+LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj cham.obj cham-simd.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj lea.obj lea-simd.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj simon.obj simon-simd.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj speck.obj speck-simd.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj
TEST_SRCS = bench1.cpp bench2.cpp test.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp validat4.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp fipstest.cpp
diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj
index a2f7e0be..1bce1d1d 100644
--- a/cryptlib.vcxproj
+++ b/cryptlib.vcxproj
@@ -242,6 +242,7 @@
<ClCompile Include="kalynatab.cpp" />
<ClCompile Include="keccak.cpp" />
<ClCompile Include="lea.cpp" />
+ <ClCompile Include="lea-simd.cpp" />
<ClCompile Include="luc.cpp" />
<ClCompile Include="mars.cpp" />
<ClCompile Include="marss.cpp" />
diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters
index 9867d7ba..d91ce2d2 100644
--- a/cryptlib.vcxproj.filters
+++ b/cryptlib.vcxproj.filters
@@ -224,6 +224,9 @@
<ClCompile Include="lea.cpp">
<Filter>Source Files</Filter>
</ClCompile>
+ <ClCompile Include="lea-simd.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
<ClCompile Include="luc.cpp">
<Filter>Source Files</Filter>
</ClCompile>
diff --git a/lea-simd.cpp b/lea-simd.cpp
new file mode 100644
index 00000000..0076926d
--- /dev/null
+++ b/lea-simd.cpp
@@ -0,0 +1,495 @@
+// lea-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+// source file is needed because additional CXXFLAGS are required to enable
+// the appropriate instructions sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "cham.h"
+#include "misc.h"
+#include "adv-simd.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both simon.cpp and simon-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <pmmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::word32;
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+
+inline __m128i Xor(const __m128i& a, const __m128i& b)
+{
+ return _mm_xor_si128(a, b);
+}
+
+inline __m128i Add(const __m128i& a, const __m128i& b)
+{
+ return _mm_add_epi32(a, b);
+}
+
+inline __m128i Sub(const __m128i& a, const __m128i& b)
+{
+ return _mm_sub_epi32(a, b);
+}
+
+template <unsigned int R>
+inline __m128i RotateLeft(const __m128i& val)
+{
+ return _mm_or_si128(
+ _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+}
+
+template <unsigned int R>
+inline __m128i RotateRight(const __m128i& val)
+{
+ return _mm_or_si128(
+ _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+}
+
+// Faster than two Shifts and an Or.
+template <>
+inline __m128i RotateLeft<8>(const __m128i& val)
+{
+ const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+ return _mm_shuffle_epi8(val, mask);
+}
+
+// Faster than two Shifts and an Or.
+template <>
+inline __m128i RotateRight<8>(const __m128i& val)
+{
+ const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
+ return _mm_shuffle_epi8(val, mask);
+}
+
+template <unsigned int IDX>
+inline __m128i LoadKey(const word32 rkey[])
+{
+ return _mm_loadu_si128((const __m128i*) &rkey[IDX*4]);
+}
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+ // Should not be instantiated
+ CRYPTOPP_ASSERT(0);;
+ return _mm_setzero_si128();
+}
+
+template <>
+inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+ // LEA is little-endian oriented, so there is no need for a separate shuffle.
+ const __m128i r1 = _mm_unpacklo_epi32(a, b);
+ const __m128i r2 = _mm_unpacklo_epi32(c, d);
+ return _mm_unpacklo_epi64(r1, r2);
+}
+
+template <>
+inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+ // LEA is little-endian oriented, so there is no need for a separate shuffle.
+ const __m128i r1 = _mm_unpacklo_epi32(a, b);
+ const __m128i r2 = _mm_unpacklo_epi32(c, d);
+ return _mm_unpackhi_epi64(r1, r2);
+}
+
+template <>
+inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+ // LEA is little-endian oriented, so there is no need for a separate shuffle.
+ const __m128i r1 = _mm_unpackhi_epi32(a, b);
+ const __m128i r2 = _mm_unpackhi_epi32(c, d);
+ return _mm_unpacklo_epi64(r1, r2);
+}
+
+template <>
+inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+ // LEA is little-endian oriented, so there is no need for a separate shuffle.
+ const __m128i r1 = _mm_unpackhi_epi32(a, b);
+ const __m128i r2 = _mm_unpackhi_epi32(c, d);
+ return _mm_unpackhi_epi64(r1, r2);
+}
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(const __m128i& v)
+{
+ // Should not be instantiated
+ CRYPTOPP_ASSERT(0);;
+ return _mm_setzero_si128();
+}
+
+template <>
+inline __m128i UnpackXMM<0>(const __m128i& v)
+{
+ // Splat to all lanes
+ return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+}
+
+template <>
+inline __m128i UnpackXMM<1>(const __m128i& v)
+{
+ // Splat to all lanes
+ return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+}
+
+template <>
+inline __m128i UnpackXMM<2>(const __m128i& v)
+{
+ // Splat to all lanes
+ return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+}
+
+template <>
+inline __m128i UnpackXMM<3>(const __m128i& v)
+{
+ // Splat to all lanes
+ return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+}
+
+template <unsigned int IDX>
+inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+ return UnpackXMM<IDX>(a, b, c, d);
+}
+
+template <unsigned int IDX>
+inline __m128i RepackXMM(const __m128i& v)
+{
+ return UnpackXMM<IDX>(v);
+}
+
+inline void LEA_Encryption(__m128i temp[4], const word32 *subkeys, unsigned int rounds)
+{
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
+
+ if(rounds > 24)
+ {
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
+ }
+
+ if(rounds > 28)
+ {
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
+ }
+}
+
+inline void LEA_Decryption(__m128i temp[4], const word32 *subkeys, unsigned int rounds)
+{
+ if(rounds > 28)
+ {
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
+ }
+
+ if(rounds > 24)
+ {
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
+ }
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
+}
+
+inline void GCC_NO_UBSAN LEA_Enc_Block(__m128i &block0,
+ const word32 *subkeys, unsigned int rounds)
+{
+ __m128i temp[4];
+ temp[0] = UnpackXMM<0>(block0);
+ temp[1] = UnpackXMM<1>(block0);
+ temp[2] = UnpackXMM<2>(block0);
+ temp[3] = UnpackXMM<3>(block0);
+
+ LEA_Encryption(temp, subkeys, rounds);
+
+ block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
+}
+
+inline void GCC_NO_UBSAN LEA_Dec_Block(__m128i &block0,
+ const word32 *subkeys, unsigned int rounds)
+{
+ __m128i temp[4];
+ temp[0] = UnpackXMM<0>(block0);
+ temp[1] = UnpackXMM<1>(block0);
+ temp[2] = UnpackXMM<2>(block0);
+ temp[3] = UnpackXMM<3>(block0);
+
+ LEA_Decryption(temp, subkeys, rounds);
+
+ block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
+}
+
+inline void GCC_NO_UBSAN LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1,
+ __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
+{
+ __m128i temp[4];
+ temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
+ temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
+ temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
+ temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
+
+ LEA_Encryption(temp, subkeys, rounds);
+
+ block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
+ block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
+ block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
+ block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
+}
+
+inline void GCC_NO_UBSAN LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1,
+ __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
+{
+ __m128i temp[4];
+ temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
+ temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
+ temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
+ temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
+
+ LEA_Decryption(temp, subkeys, rounds);
+
+ block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
+ block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
+ block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
+ block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
+}
+
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+ANONYMOUS_NAMESPACE_END
+
+NAMESPACE_BEGIN(CryptoPP)
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys)
+{
+ SecBlock<word32> temp(rkeys.size() * 4);
+ for (size_t i=0, j=0; i<rkeys.size(); i++, j+=4)
+ {
+ _mm_storeu_si128((__m128i*) &temp[j],
+ _mm_castps_si128(_mm_load_ps1((const float*) &rkeys[i])));
+ }
+ std::swap(rkeys, temp);
+}
+
+size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ return AdvancedProcessBlocks128_4x1_SSE(LEA_Enc_Block, LEA_Enc_4_Blocks,
+ subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ return AdvancedProcessBlocks128_4x1_SSE(LEA_Dec_Block, LEA_Dec_4_Blocks,
+ subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+NAMESPACE_END
diff --git a/lea.cpp b/lea.cpp
index 008dcf8b..780bb407 100644
--- a/lea.cpp
+++ b/lea.cpp
@@ -18,6 +18,7 @@
#include "lea.h"
#include "misc.h"
+#include "cpu.h"
ANONYMOUS_NAMESPACE_BEGIN
@@ -554,6 +555,16 @@ inline void SetKey256(word32 rkey[192], const word32 key[8])
NAMESPACE_BEGIN(CryptoPP)
+#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
+extern void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys);
+
+extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
{
CRYPTOPP_UNUSED(params);
@@ -584,6 +595,15 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con
default:
CRYPTOPP_ASSERT(0);;
}
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+ if (HasSSSE3())
+ {
+ // If we pre-splat the round keys at setup then we avoid a shuffle
+ // at runtime for each subkey used during encryption and decryption.
+ LEA_SplatKeys_SSSE3(m_rkey);
+ }
+#endif
}
void LEA::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
@@ -826,4 +846,26 @@ void LEA::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byt
oblock(m_temp[0])(m_temp[1])(m_temp[2])(m_temp[3]);
}
+#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
+size_t LEA::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+ byte *outBlocks, size_t length, word32 flags) const
+{
+ if (HasSSSE3()) {
+ return LEA_Enc_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds,
+ inBlocks, xorBlocks, outBlocks, length, flags);
+ }
+ return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t LEA::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+ byte *outBlocks, size_t length, word32 flags) const
+{
+ if (HasSSSE3()) {
+ return LEA_Dec_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds,
+ inBlocks, xorBlocks, outBlocks, length, flags);
+ }
+ return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
+
NAMESPACE_END
diff --git a/lea.h b/lea.h
index e5a6dc4e..7b34724d 100644
--- a/lea.h
+++ b/lea.h
@@ -15,6 +15,10 @@
#include "secblock.h"
#include "algparam.h"
+#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
+# define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
NAMESPACE_BEGIN(CryptoPP)
/// \brief LEA block cipher information
@@ -59,6 +63,10 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+
+#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
+ size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
};
/// \brief Provides implementation for encryption transformation
@@ -69,6 +77,10 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+
+#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
+ size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
};
typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;