summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeffrey Walton <noloader@gmail.com>2018-06-23 03:54:51 -0400
committerJeffrey Walton <noloader@gmail.com>2018-06-23 03:54:51 -0400
commit9980d307349c3873022de8fa6d5fb557c533f019 (patch)
tree4eaa2e077ad0ee6cfec006058233ed0e569688d8
parent80ae9f4f0a66596a164fee67e3fdd09628d3871f (diff)
downloadcryptopp-git-9980d307349c3873022de8fa6d5fb557c533f019.tar.gz
Add LEA-128 NEON and ARMv8 implementation (GH #669)
LEA-128(128) from 35.6 cpb to 14.11 cpb on a LeMaker HiKey dev-board. LEA-128 from 12.60 cpb to 11.89 cpb on AMD Opteron 1100.
-rw-r--r--Filelist.txt1
-rwxr-xr-xGNUmakefile2
-rwxr-xr-xGNUmakefile-cross6
-rw-r--r--adv-simd.h142
-rw-r--r--lea-simd.cpp505
-rw-r--r--lea.cpp26
-rw-r--r--lea.h2
7 files changed, 681 insertions, 3 deletions
diff --git a/Filelist.txt b/Filelist.txt
index a76d1c61..ae1aeb03 100644
--- a/Filelist.txt
+++ b/Filelist.txt
@@ -164,6 +164,7 @@ keccak.cpp
keccak.h
lubyrack.h
lea.cpp
+lea-simd.cpp
lea.h
luc.cpp
luc.h
diff --git a/GNUmakefile b/GNUmakefile
index 009feb03..cdd798b1 100755
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -378,6 +378,7 @@ ifeq ($(IS_NEON),1)
GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
+ LEA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
SIMON_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
endif
@@ -388,6 +389,7 @@ ifeq ($(IS_ARMV8),1)
ifeq ($(HAVE_NEON),1)
ARIA_FLAG = -march=armv8-a
BLAKE2_FLAG = -march=armv8-a
+ LEA_FLAG = -march=armv8-a
NEON_FLAG = -march=armv8-a
SIMON_FLAG = -march=armv8-a
SPECK_FLAG = -march=armv8-a
diff --git a/GNUmakefile-cross b/GNUmakefile-cross
index 820e3fd0..2317541b 100755
--- a/GNUmakefile-cross
+++ b/GNUmakefile-cross
@@ -224,6 +224,7 @@ ifeq ($(IS_ARMv7),1)
GCM_FLAG = -march=armv7-a
ARIA_FLAG = -march=armv7-a
BLAKE2_FLAG = -march=armv7-a
+ LEA_FLAG = -march=armv7-a
endif
endif
@@ -234,6 +235,7 @@ ifeq ($(IS_NEON),1)
GCM_FLAG += -mfpu=neon
ARIA_FLAG += -mfpu=neon
BLAKE2_FLAG += -mfpu=neon
+ LEA_FLAG += -mfpu=neon
SIMON_FLAG += -mfpu=neon
SPECK_FLAG += -mfpu=neon
ifeq ($(IS_ANDROID),1)
@@ -242,6 +244,7 @@ ifeq ($(IS_NEON),1)
GCM_FLAG += -mfloat-abi=softfp
ARIA_FLAG += -mfloat-abi=softfp
BLAKE2_FLAG += -mfloat-abi=softfp
+ LEA_FLAG += -mfloat-abi=softfp
SIMON_FLAG += -mfloat-abi=softfp
SPECK_FLAG += -mfloat-abi=softfp
endif
@@ -255,6 +258,7 @@ ifneq ($(IS_ARMv8),0)
ifeq ($(IS_NEON),1)
ARIA_FLAG = -march=armv8-a
BLAKE2_FLAG = -march=armv8-a
+ LEA_FLAG = -march=armv8-a
NEON_FLAG = -march=armv8-a
SIMON_FLAG = -march=armv8-a
SPECK_FLAG = -march=armv8-a
@@ -505,7 +509,7 @@ crc-simd.o : crc-simd.cpp
gcm-simd.o : gcm-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
-# SSSE3 available
+# SSSE3 or ARMv8a available
lea-simd.o : lea-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $<
diff --git a/adv-simd.h b/adv-simd.h
index 1609faeb..e86c12ba 100644
--- a/adv-simd.h
+++ b/adv-simd.h
@@ -18,6 +18,7 @@
// * AdvancedProcessBlocks64_6x2_SSE
// * AdvancedProcessBlocks128_6x2_SSE
// * AdvancedProcessBlocks64_6x2_NEON
+// * AdvancedProcessBlocks128_4x1_NEON
// * AdvancedProcessBlocks128_6x2_NEON
// * AdvancedProcessBlocks64_6x2_ALTIVEC
// * AdvancedProcessBlocks128_6x2_ALTIVEC
@@ -489,6 +490,147 @@ inline size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
return length;
}
+/// \brief AdvancedProcessBlocks for 1 and 4 blocks
+/// \tparam F1 function to process 1 128-bit block
+/// \tparam F4 function to process 4 128-bit blocks
+/// \tparam W word type of the subkey table
+/// \tparam V vector type of the NEON data type
+/// \details AdvancedProcessBlocks128_6x2_NEON processes 4 and 1 NEON SIMD words
+/// at a time.
+/// \details The subkey type is usually word32 or word64. V is the vector type and it is
+/// usually uint32x4_t or uint64x2_t. F1, F4, W and V must use the same word and
+/// vector type.
+template <typename F1, typename F4, typename W, typename V>
+inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
+ const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks,
+ const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ CRYPTOPP_ASSERT(subKeys);
+ CRYPTOPP_ASSERT(inBlocks);
+ CRYPTOPP_ASSERT(outBlocks);
+ CRYPTOPP_ASSERT(length >= 16);
+ CRYPTOPP_UNUSED(unused);
+
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+ const word32 s_one32x4[] = {0, 0, 0, 1<<24};
+#else
+ const word32 s_one32x4[] = {0, 0, 0, 1};
+#endif
+
+ const ptrdiff_t blockSize = 16;
+ // const ptrdiff_t neonBlockSize = 16;
+
+ ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+ ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
+ ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+ // Clang and Coverity are generating findings using xorBlocks as a flag.
+ const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
+ const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
+
+ if (flags & BT_ReverseDirection)
+ {
+ inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
+ xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
+ outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
+ inIncrement = 0-inIncrement;
+ xorIncrement = 0-xorIncrement;
+ outIncrement = 0-outIncrement;
+ }
+
+ if (flags & BT_AllowParallel)
+ {
+ while (length >= 4*blockSize)
+ {
+ uint64x2_t block0, block1, block2, block3, block4, block5;
+ if (flags & BT_InBlockIsCounter)
+ {
+ const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
+ block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
+
+ block1 = vaddq_u64(block0, be);
+ block2 = vaddq_u64(block1, be);
+ block3 = vaddq_u64(block2, be);
+ vst1q_u8(const_cast<byte*>(inBlocks),
+ vreinterpretq_u8_u64(vaddq_u64(block3, be)));
+ }
+ else
+ {
+ block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
+ inBlocks += inIncrement;
+ block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
+ inBlocks += inIncrement;
+ block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
+ inBlocks += inIncrement;
+ block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
+ inBlocks += inIncrement;
+ }
+
+ if (xorInput)
+ {
+ block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+ xorBlocks += xorIncrement;
+ block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+ xorBlocks += xorIncrement;
+ block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+ xorBlocks += xorIncrement;
+ block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+ xorBlocks += xorIncrement;
+ }
+
+ func4((V&)block0, (V&)block1, (V&)block2, (V&)block3, subKeys, static_cast<unsigned int>(rounds));
+
+ if (xorOutput)
+ {
+ block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+ xorBlocks += xorIncrement;
+ block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+ xorBlocks += xorIncrement;
+ block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+ xorBlocks += xorIncrement;
+ block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+ xorBlocks += xorIncrement;
+ }
+
+ vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
+ outBlocks += outIncrement;
+ vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
+ outBlocks += outIncrement;
+ vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
+ outBlocks += outIncrement;
+ vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
+ outBlocks += outIncrement;
+
+ length -= 4*blockSize;
+ }
+ }
+
+ while (length >= blockSize)
+ {
+ uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
+
+ if (xorInput)
+ block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+
+ if (flags & BT_InBlockIsCounter)
+ const_cast<byte *>(inBlocks)[15]++;
+
+ func1( (V&)block, subKeys, static_cast<unsigned int>(rounds));
+
+ if (xorOutput)
+ block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
+
+ vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
+
+ inBlocks += inIncrement;
+ outBlocks += outIncrement;
+ xorBlocks += xorIncrement;
+ length -= blockSize;
+ }
+
+ return length;
+}
+
/// \brief AdvancedProcessBlocks for 2 and 6 blocks
/// \tparam F2 function to process 2 128-bit blocks
/// \tparam F6 function to process 6 128-bit blocks
diff --git a/lea-simd.cpp b/lea-simd.cpp
index f91702c9..43060e94 100644
--- a/lea-simd.cpp
+++ b/lea-simd.cpp
@@ -27,6 +27,10 @@
# include <immintrin.h>
#endif
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include <arm_neon.h>
+#endif
+
// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
// compilers don't follow ACLE conventions for the include.
#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
@@ -38,6 +42,489 @@ ANONYMOUS_NAMESPACE_BEGIN
using CryptoPP::word32;
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+
+inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b)
+{
+ return veorq_u32(a, b);
+}
+
+inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b)
+{
+ return vaddq_u32(a, b);
+}
+
+inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b)
+{
+ return vsubq_u32(a, b);
+}
+
+template <unsigned int R>
+inline uint32x4_t RotateLeft(const uint32x4_t& val)
+{
+ const uint32x4_t a(vshlq_n_u32(val, R));
+ const uint32x4_t b(vshrq_n_u32(val, 32 - R));
+ return vorrq_u32(a, b);
+}
+
+template <unsigned int R>
+inline uint32x4_t RotateRight(const uint32x4_t& val)
+{
+ const uint32x4_t a(vshlq_n_u32(val, 32 - R));
+ const uint32x4_t b(vshrq_n_u32(val, R));
+ return vorrq_u32(a, b);
+}
+
+#if defined(__aarch32__) || defined(__aarch64__)
+template <>
+inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
+{
+#if defined(CRYPTOPP_BIG_ENDIAN)
+ const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
+ const uint8x16_t mask = vld1q_u8(maskb);
+#else
+ const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
+ const uint8x16_t mask = vld1q_u8(maskb);
+#endif
+
+ return vreinterpretq_u32_u8(
+ vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
+}
+
+template <>
+inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
+{
+#if defined(CRYPTOPP_BIG_ENDIAN)
+ const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
+ const uint8x16_t mask = vld1q_u8(maskb);
+#else
+ const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 };
+ const uint8x16_t mask = vld1q_u8(maskb);
+#endif
+
+ return vreinterpretq_u32_u8(
+ vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
+}
+#endif
+
+uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b)
+{
+ uint32x2_t a1 = vget_low_u32(a);
+ uint32x2_t b1 = vget_low_u32(b);
+ uint32x2x2_t result = vzip_u32(a1, b1);
+ return vcombine_u32(result.val[0], result.val[1]);
+}
+
+uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b)
+{
+ uint32x2_t a1 = vget_high_u32(a);
+ uint32x2_t b1 = vget_high_u32(b);
+ uint32x2x2_t result = vzip_u32(a1, b1);
+ return vcombine_u32(result.val[0], result.val[1]);
+}
+
+uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b)
+{
+ uint64x1_t a1 = vget_low_u64((uint64x2_t)a);
+ uint64x1_t b1 = vget_low_u64((uint64x2_t)b);
+ return (uint32x4_t)vcombine_u64(a1, b1);
+}
+
+uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
+{
+ uint64x1_t a1 = vget_high_u64((uint64x2_t)a);
+ uint64x1_t b1 = vget_high_u64((uint64x2_t)b);
+ return (uint32x4_t)vcombine_u64(a1, b1);
+}
+
+template <unsigned int IDX>
+inline uint32x4_t LoadKey(const word32 rkey[])
+{
+ return vdupq_n_u32(rkey[IDX]);
+}
+
+template <unsigned int IDX>
+inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
+{
+ // Should not be instantiated
+ CRYPTOPP_ASSERT(0);;
+ return vmovq_n_u32(0);
+}
+
+template <>
+inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
+{
+ // LEA is little-endian oriented, so there is no need for a separate shuffle.
+ const uint32x4_t r1 = UnpackLow32(a, b);
+ const uint32x4_t r2 = UnpackLow32(c, d);
+ return UnpackLow64(r1, r2);
+}
+
+template <>
+inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
+{
+ // LEA is little-endian oriented, so there is no need for a separate shuffle.
+ const uint32x4_t r1 = UnpackLow32(a, b);
+ const uint32x4_t r2 = UnpackLow32(c, d);
+ return UnpackHigh64(r1, r2);
+}
+
+template <>
+inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
+{
+ // LEA is little-endian oriented, so there is no need for a separate shuffle.
+ const uint32x4_t r1 = UnpackHigh32(a, b);
+ const uint32x4_t r2 = UnpackHigh32(c, d);
+ return UnpackLow64(r1, r2);
+}
+
+template <>
+inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
+{
+ // LEA is little-endian oriented, so there is no need for a separate shuffle.
+ const uint32x4_t r1 = UnpackHigh32(a, b);
+ const uint32x4_t r2 = UnpackHigh32(c, d);
+ return UnpackHigh64(r1, r2);
+}
+
+template <unsigned int IDX>
+inline uint32x4_t UnpackNEON(const uint32x4_t& v)
+{
+ // Should not be instantiated
+ CRYPTOPP_ASSERT(0);;
+ return vmovq_n_u32(0);
+}
+
+template <>
+inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v)
+{
+ // Splat to all lanes
+ return vdupq_n_u32(vgetq_lane_u32(v, 0));
+}
+
+template <>
+inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v)
+{
+ // Splat to all lanes
+ return vdupq_n_u32(vgetq_lane_u32(v, 1));
+}
+
+template <>
+inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v)
+{
+ // Splat to all lanes
+ return vdupq_n_u32(vgetq_lane_u32(v, 2));
+}
+
+template <>
+inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v)
+{
+ // Splat to all lanes
+ return vdupq_n_u32(vgetq_lane_u32(v, 3));
+}
+
+template <unsigned int IDX>
+inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
+{
+ return UnpackNEON<IDX>(a, b, c, d);
+}
+
+template <unsigned int IDX>
+inline uint32x4_t RepackNEON(const uint32x4_t& v)
+{
+ return UnpackNEON<IDX>(v);
+}
+
+void LEA_Encryption(uint32x4_t temp[4], const word32 *subkeys, unsigned int rounds)
+{
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
+
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
+
+ if(rounds > 24)
+ {
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
+ }
+
+ if(rounds > 28)
+ {
+ temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
+ temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
+ temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
+ temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
+ temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
+ temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
+ temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
+ temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
+ temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
+ temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
+ temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
+ temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
+ }
+}
+
+void LEA_Decryption(uint32x4_t temp[4], const word32 *subkeys, unsigned int rounds)
+{
+ if(rounds > 28)
+ {
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
+ }
+
+ if(rounds > 24)
+ {
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
+ }
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
+
+ temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
+ temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
+ temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
+ temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
+ temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
+ temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
+ temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
+ temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
+}
+
+inline void LEA_Enc_Block(uint32x4_t &block0,
+ const word32 *subkeys, unsigned int rounds)
+{
+ uint32x4_t temp[4];
+ temp[0] = UnpackNEON<0>(block0);
+ temp[1] = UnpackNEON<1>(block0);
+ temp[2] = UnpackNEON<2>(block0);
+ temp[3] = UnpackNEON<3>(block0);
+
+ LEA_Encryption(temp, subkeys, rounds);
+
+ block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
+}
+
+inline void LEA_Dec_Block(uint32x4_t &block0,
+ const word32 *subkeys, unsigned int rounds)
+{
+ uint32x4_t temp[4];
+ temp[0] = UnpackNEON<0>(block0);
+ temp[1] = UnpackNEON<1>(block0);
+ temp[2] = UnpackNEON<2>(block0);
+ temp[3] = UnpackNEON<3>(block0);
+
+ LEA_Decryption(temp, subkeys, rounds);
+
+ block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
+}
+
+inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
+ uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
+{
+ uint32x4_t temp[4];
+ temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
+ temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
+ temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
+ temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
+
+ LEA_Encryption(temp, subkeys, rounds);
+
+ block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
+ block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
+ block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
+ block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
+}
+
+inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
+ uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
+{
+ uint32x4_t temp[4];
+ temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
+ temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
+ temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
+ temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
+
+ LEA_Decryption(temp, subkeys, rounds);
+
+ block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
+ block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
+ block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
+ block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
+}
+
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
#if (CRYPTOPP_SSSE3_AVAILABLE)
inline __m128i Xor(const __m128i& a, const __m128i& b)
@@ -504,4 +991,22 @@ size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
}
#endif // CRYPTOPP_SSSE3_AVAILABLE
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ uint32x4_t unused; // Avoid template argument deduction/substitution failures
+ return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks,
+ unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ uint32x4_t unused; // Avoid template argument deduction/substitution failures
+ return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks,
+ unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
NAMESPACE_END
diff --git a/lea.cpp b/lea.cpp
index 780bb407..35149157 100644
--- a/lea.cpp
+++ b/lea.cpp
@@ -556,6 +556,7 @@ inline void SetKey256(word32 rkey[192], const word32 key[8])
NAMESPACE_BEGIN(CryptoPP)
#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
+# if defined(CRYPTOPP_SSSE3_AVAILABLE)
extern void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys);
extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
@@ -563,6 +564,15 @@ extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t
extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+# endif
+
+# if (CRYPTOPP_ARM_NEON_AVAILABLE)
+extern size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+# endif
#endif
void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
@@ -596,7 +606,7 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con
CRYPTOPP_ASSERT(0);;
}
-#if (CRYPTOPP_SSSE3_AVAILABLE)
+#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3())
{
// If we pre-splat the round keys at setup then we avoid a shuffle
@@ -850,20 +860,34 @@ void LEA::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byt
size_t LEA::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3()) {
return LEA_Enc_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
}
+#endif
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+ if (HasNEON())
+ return LEA_Enc_AdvancedProcessBlocks_NEON(m_rkey, (size_t)m_rounds,
+ inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
size_t LEA::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
byte *outBlocks, size_t length, word32 flags) const
{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
if (HasSSSE3()) {
return LEA_Dec_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds,
inBlocks, xorBlocks, outBlocks, length, flags);
}
+#endif
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+ if (HasNEON())
+ return LEA_Dec_AdvancedProcessBlocks_NEON(m_rkey, (size_t)m_rounds,
+ inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
diff --git a/lea.h b/lea.h
index 7b34724d..02fccd45 100644
--- a/lea.h
+++ b/lea.h
@@ -15,7 +15,7 @@
#include "secblock.h"
#include "algparam.h"
-#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
+#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
# define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1
#endif