From 9980d307349c3873022de8fa6d5fb557c533f019 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sat, 23 Jun 2018 03:54:51 -0400 Subject: Add LEA-128 NEON and ARMv8 implementation (GH #669) LEA-128(128) from 35.6 cpb to 14.11 cpb on a LeMaker HiKey dev-board. LEA-128 from 12.60 cpb to 11.89 cpb on AMD Opteron 1100. --- Filelist.txt | 1 + GNUmakefile | 2 + GNUmakefile-cross | 6 +- adv-simd.h | 142 +++++++++++++++ lea-simd.cpp | 505 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ lea.cpp | 26 ++- lea.h | 2 +- 7 files changed, 681 insertions(+), 3 deletions(-) diff --git a/Filelist.txt b/Filelist.txt index a76d1c61..ae1aeb03 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -164,6 +164,7 @@ keccak.cpp keccak.h lubyrack.h lea.cpp +lea-simd.cpp lea.h luc.cpp luc.h diff --git a/GNUmakefile b/GNUmakefile index 009feb03..cdd798b1 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -378,6 +378,7 @@ ifeq ($(IS_NEON),1) GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + LEA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon SIMON_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon endif @@ -388,6 +389,7 @@ ifeq ($(IS_ARMV8),1) ifeq ($(HAVE_NEON),1) ARIA_FLAG = -march=armv8-a BLAKE2_FLAG = -march=armv8-a + LEA_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a SIMON_FLAG = -march=armv8-a SPECK_FLAG = -march=armv8-a diff --git a/GNUmakefile-cross b/GNUmakefile-cross index 820e3fd0..2317541b 100755 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -224,6 +224,7 @@ ifeq ($(IS_ARMv7),1) GCM_FLAG = -march=armv7-a ARIA_FLAG = -march=armv7-a BLAKE2_FLAG = -march=armv7-a + LEA_FLAG = -march=armv7-a endif endif @@ -234,6 +235,7 @@ ifeq ($(IS_NEON),1) GCM_FLAG += -mfpu=neon ARIA_FLAG += -mfpu=neon BLAKE2_FLAG += -mfpu=neon + LEA_FLAG += -mfpu=neon SIMON_FLAG += -mfpu=neon SPECK_FLAG += -mfpu=neon ifeq ($(IS_ANDROID),1) @@ -242,6 +244,7 @@ ifeq ($(IS_NEON),1) GCM_FLAG += -mfloat-abi=softfp ARIA_FLAG += -mfloat-abi=softfp BLAKE2_FLAG += -mfloat-abi=softfp + LEA_FLAG += -mfloat-abi=softfp SIMON_FLAG += -mfloat-abi=softfp SPECK_FLAG += -mfloat-abi=softfp endif @@ -255,6 +258,7 @@ ifneq ($(IS_ARMv8),0) ifeq ($(IS_NEON),1) ARIA_FLAG = -march=armv8-a BLAKE2_FLAG = -march=armv8-a + LEA_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a SIMON_FLAG = -march=armv8-a SPECK_FLAG = -march=armv8-a @@ -505,7 +509,7 @@ crc-simd.o : crc-simd.cpp gcm-simd.o : gcm-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $< -# SSSE3 available +# SSSE3 or ARMv8a available lea-simd.o : lea-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $< diff --git a/adv-simd.h b/adv-simd.h index 1609faeb..e86c12ba 100644 --- a/adv-simd.h +++ b/adv-simd.h @@ -18,6 +18,7 @@ // * AdvancedProcessBlocks64_6x2_SSE // * AdvancedProcessBlocks128_6x2_SSE // * AdvancedProcessBlocks64_6x2_NEON +// * AdvancedProcessBlocks128_4x1_NEON // * AdvancedProcessBlocks128_6x2_NEON // * AdvancedProcessBlocks64_6x2_ALTIVEC // * AdvancedProcessBlocks128_6x2_ALTIVEC @@ -489,6 +490,147 @@ inline size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6, return length; } +/// \brief AdvancedProcessBlocks for 1 and 4 blocks +/// \tparam F1 function to process 1 128-bit block +/// \tparam F4 function to process 4 128-bit blocks +/// \tparam W word type of the subkey table +/// \tparam V vector type of the NEON data type +/// \details AdvancedProcessBlocks128_6x2_NEON processes 4 and 1 NEON SIMD words +/// at a time. +/// \details The subkey type is usually word32 or word64. V is the vector type and it is +/// usually uint32x4_t or uint64x2_t. F1, F4, W and V must use the same word and +/// vector type. +template +inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4, + const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks, + const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + CRYPTOPP_ASSERT(subKeys); + CRYPTOPP_ASSERT(inBlocks); + CRYPTOPP_ASSERT(outBlocks); + CRYPTOPP_ASSERT(length >= 16); + CRYPTOPP_UNUSED(unused); + +#if defined(CRYPTOPP_LITTLE_ENDIAN) + const word32 s_one32x4[] = {0, 0, 0, 1<<24}; +#else + const word32 s_one32x4[] = {0, 0, 0, 1}; +#endif + + const ptrdiff_t blockSize = 16; + // const ptrdiff_t neonBlockSize = 16; + + ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize; + ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0; + ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize; + + // Clang and Coverity are generating findings using xorBlocks as a flag. + const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); + const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); + + if (flags & BT_ReverseDirection) + { + inBlocks += static_cast(length) - blockSize; + xorBlocks += static_cast(length) - blockSize; + outBlocks += static_cast(length) - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } + + if (flags & BT_AllowParallel) + { + while (length >= 4*blockSize) + { + uint64x2_t block0, block1, block2, block3, block4, block5; + if (flags & BT_InBlockIsCounter) + { + const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4)); + block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + + block1 = vaddq_u64(block0, be); + block2 = vaddq_u64(block1, be); + block3 = vaddq_u64(block2, be); + vst1q_u8(const_cast(inBlocks), + vreinterpretq_u8_u64(vaddq_u64(block3, be))); + } + else + { + block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + inBlocks += inIncrement; + } + + if (xorInput) + { + block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + } + + func4((V&)block0, (V&)block1, (V&)block2, (V&)block3, subKeys, static_cast(rounds)); + + if (xorOutput) + { + block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + xorBlocks += xorIncrement; + } + + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0)); + outBlocks += outIncrement; + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1)); + outBlocks += outIncrement; + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2)); + outBlocks += outIncrement; + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3)); + outBlocks += outIncrement; + + length -= 4*blockSize; + } + } + + while (length >= blockSize) + { + uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); + + if (xorInput) + block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + + if (flags & BT_InBlockIsCounter) + const_cast(inBlocks)[15]++; + + func1( (V&)block, subKeys, static_cast(rounds)); + + if (xorOutput) + block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); + + vst1q_u8(outBlocks, vreinterpretq_u8_u64(block)); + + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } + + return length; +} + /// \brief AdvancedProcessBlocks for 2 and 6 blocks /// \tparam F2 function to process 2 128-bit blocks /// \tparam F6 function to process 6 128-bit blocks diff --git a/lea-simd.cpp b/lea-simd.cpp index f91702c9..43060e94 100644 --- a/lea-simd.cpp +++ b/lea-simd.cpp @@ -27,6 +27,10 @@ # include #endif +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +# include +#endif + // Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many // compilers don't follow ACLE conventions for the include. #if defined(CRYPTOPP_ARM_ACLE_AVAILABLE) @@ -38,6 +42,489 @@ ANONYMOUS_NAMESPACE_BEGIN using CryptoPP::word32; +#if (CRYPTOPP_ARM_NEON_AVAILABLE) + +inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b) +{ + return veorq_u32(a, b); +} + +inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b) +{ + return vaddq_u32(a, b); +} + +inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b) +{ + return vsubq_u32(a, b); +} + +template +inline uint32x4_t RotateLeft(const uint32x4_t& val) +{ + const uint32x4_t a(vshlq_n_u32(val, R)); + const uint32x4_t b(vshrq_n_u32(val, 32 - R)); + return vorrq_u32(a, b); +} + +template +inline uint32x4_t RotateRight(const uint32x4_t& val) +{ + const uint32x4_t a(vshlq_n_u32(val, 32 - R)); + const uint32x4_t b(vshrq_n_u32(val, R)); + return vorrq_u32(a, b); +} + +#if defined(__aarch32__) || defined(__aarch64__) +template <> +inline uint32x4_t RotateLeft<8>(const uint32x4_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u32_u8( + vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); +} + +template <> +inline uint32x4_t RotateRight<8>(const uint32x4_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u32_u8( + vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); +} +#endif + +uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b) +{ + uint32x2_t a1 = vget_low_u32(a); + uint32x2_t b1 = vget_low_u32(b); + uint32x2x2_t result = vzip_u32(a1, b1); + return vcombine_u32(result.val[0], result.val[1]); +} + +uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b) +{ + uint32x2_t a1 = vget_high_u32(a); + uint32x2_t b1 = vget_high_u32(b); + uint32x2x2_t result = vzip_u32(a1, b1); + return vcombine_u32(result.val[0], result.val[1]); +} + +uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b) +{ + uint64x1_t a1 = vget_low_u64((uint64x2_t)a); + uint64x1_t b1 = vget_low_u64((uint64x2_t)b); + return (uint32x4_t)vcombine_u64(a1, b1); +} + +uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b) +{ + uint64x1_t a1 = vget_high_u64((uint64x2_t)a); + uint64x1_t b1 = vget_high_u64((uint64x2_t)b); + return (uint32x4_t)vcombine_u64(a1, b1); +} + +template +inline uint32x4_t LoadKey(const word32 rkey[]) +{ + return vdupq_n_u32(rkey[IDX]); +} + +template +inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) +{ + // Should not be instantiated + CRYPTOPP_ASSERT(0);; + return vmovq_n_u32(0); +} + +template <> +inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) +{ + // LEA is little-endian oriented, so there is no need for a separate shuffle. + const uint32x4_t r1 = UnpackLow32(a, b); + const uint32x4_t r2 = UnpackLow32(c, d); + return UnpackLow64(r1, r2); +} + +template <> +inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) +{ + // LEA is little-endian oriented, so there is no need for a separate shuffle. + const uint32x4_t r1 = UnpackLow32(a, b); + const uint32x4_t r2 = UnpackLow32(c, d); + return UnpackHigh64(r1, r2); +} + +template <> +inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) +{ + // LEA is little-endian oriented, so there is no need for a separate shuffle. + const uint32x4_t r1 = UnpackHigh32(a, b); + const uint32x4_t r2 = UnpackHigh32(c, d); + return UnpackLow64(r1, r2); +} + +template <> +inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) +{ + // LEA is little-endian oriented, so there is no need for a separate shuffle. + const uint32x4_t r1 = UnpackHigh32(a, b); + const uint32x4_t r2 = UnpackHigh32(c, d); + return UnpackHigh64(r1, r2); +} + +template +inline uint32x4_t UnpackNEON(const uint32x4_t& v) +{ + // Should not be instantiated + CRYPTOPP_ASSERT(0);; + return vmovq_n_u32(0); +} + +template <> +inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v) +{ + // Splat to all lanes + return vdupq_n_u32(vgetq_lane_u32(v, 0)); +} + +template <> +inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v) +{ + // Splat to all lanes + return vdupq_n_u32(vgetq_lane_u32(v, 1)); +} + +template <> +inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v) +{ + // Splat to all lanes + return vdupq_n_u32(vgetq_lane_u32(v, 2)); +} + +template <> +inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v) +{ + // Splat to all lanes + return vdupq_n_u32(vgetq_lane_u32(v, 3)); +} + +template +inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d) +{ + return UnpackNEON(a, b, c, d); +} + +template +inline uint32x4_t RepackNEON(const uint32x4_t& v) +{ + return UnpackNEON(v); +} + +void LEA_Encryption(uint32x4_t temp[4], const word32 *subkeys, unsigned int rounds) +{ + temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys)))); + temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys)))); + temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys)))); + temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys)))); + temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys)))); + temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys)))); + temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys)))); + temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys)))); + temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys)))); + temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys)))); + temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys)))); + temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys)))); + + temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys)))); + temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys)))); + temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys)))); + temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys)))); + temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys)))); + temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys)))); + temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys)))); + temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys)))); + temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys)))); + temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys)))); + temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys)))); + temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys)))); + + temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys)))); + temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys)))); + temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys)))); + temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys)))); + temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys)))); + temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys)))); + temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys)))); + temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys)))); + temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys)))); + temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys)))); + temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys)))); + temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys)))); + + temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys)))); + temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys)))); + temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys)))); + temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys)))); + temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys)))); + temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys)))); + temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys)))); + temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys)))); + temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys)))); + temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys)))); + temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys)))); + temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys)))); + + temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys)))); + temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys)))); + temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys)))); + temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys)))); + temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys)))); + temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys)))); + temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys)))); + temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys)))); + temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys)))); + temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys)))); + temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys)))); + temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys)))); + + temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys)))); + temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys)))); + temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys)))); + temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys)))); + temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys)))); + temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys)))); + temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys)))); + temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys)))); + temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys)))); + temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys)))); + temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys)))); + temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys)))); + + if(rounds > 24) + { + temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys)))); + temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys)))); + temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys)))); + temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys)))); + temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys)))); + temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys)))); + temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys)))); + temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys)))); + temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys)))); + temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys)))); + temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys)))); + temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys)))); + } + + if(rounds > 28) + { + temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys)))); + temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys)))); + temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys)))); + temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys)))); + temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys)))); + temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys)))); + temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys)))); + temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys)))); + temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys)))); + temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys)))); + temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys)))); + temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys)))); + } +} + +void LEA_Decryption(uint32x4_t temp[4], const word32 *subkeys, unsigned int rounds) +{ + if(rounds > 28) + { + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys)); + } + + if(rounds > 24) + { + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys)); + } + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys)); +} + +inline void LEA_Enc_Block(uint32x4_t &block0, + const word32 *subkeys, unsigned int rounds) +{ + uint32x4_t temp[4]; + temp[0] = UnpackNEON<0>(block0); + temp[1] = UnpackNEON<1>(block0); + temp[2] = UnpackNEON<2>(block0); + temp[3] = UnpackNEON<3>(block0); + + LEA_Encryption(temp, subkeys, rounds); + + block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]); +} + +inline void LEA_Dec_Block(uint32x4_t &block0, + const word32 *subkeys, unsigned int rounds) +{ + uint32x4_t temp[4]; + temp[0] = UnpackNEON<0>(block0); + temp[1] = UnpackNEON<1>(block0); + temp[2] = UnpackNEON<2>(block0); + temp[3] = UnpackNEON<3>(block0); + + LEA_Decryption(temp, subkeys, rounds); + + block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]); +} + +inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds) +{ + uint32x4_t temp[4]; + temp[0] = UnpackNEON<0>(block0, block1, block2, block3); + temp[1] = UnpackNEON<1>(block0, block1, block2, block3); + temp[2] = UnpackNEON<2>(block0, block1, block2, block3); + temp[3] = UnpackNEON<3>(block0, block1, block2, block3); + + LEA_Encryption(temp, subkeys, rounds); + + block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]); + block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]); + block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]); + block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]); +} + +inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds) +{ + uint32x4_t temp[4]; + temp[0] = UnpackNEON<0>(block0, block1, block2, block3); + temp[1] = UnpackNEON<1>(block0, block1, block2, block3); + temp[2] = UnpackNEON<2>(block0, block1, block2, block3); + temp[3] = UnpackNEON<3>(block0, block1, block2, block3); + + LEA_Decryption(temp, subkeys, rounds); + + block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]); + block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]); + block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]); + block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]); +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + #if (CRYPTOPP_SSSE3_AVAILABLE) inline __m128i Xor(const __m128i& a, const __m128i& b) @@ -504,4 +991,22 @@ size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, } #endif // CRYPTOPP_SSSE3_AVAILABLE +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + uint32x4_t unused; // Avoid template argument deduction/substitution failures + return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks, + unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + uint32x4_t unused; // Avoid template argument deduction/substitution failures + return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks, + unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + NAMESPACE_END diff --git a/lea.cpp b/lea.cpp index 780bb407..35149157 100644 --- a/lea.cpp +++ b/lea.cpp @@ -556,6 +556,7 @@ inline void SetKey256(word32 rkey[192], const word32 key[8]) NAMESPACE_BEGIN(CryptoPP) #if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS +# if defined(CRYPTOPP_SSSE3_AVAILABLE) extern void LEA_SplatKeys_SSSE3(SecBlock& rkeys); extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, @@ -563,6 +564,15 @@ extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +# endif + +# if (CRYPTOPP_ARM_NEON_AVAILABLE) +extern size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); + +extern size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +# endif #endif void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs ¶ms) @@ -596,7 +606,7 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con CRYPTOPP_ASSERT(0);; } -#if (CRYPTOPP_SSSE3_AVAILABLE) +#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) { // If we pre-splat the round keys at setup then we avoid a shuffle @@ -850,20 +860,34 @@ void LEA::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byt size_t LEA::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { +#if defined(CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) { return LEA_Enc_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); } +#endif +#if (CRYPTOPP_ARM_NEON_AVAILABLE) + if (HasNEON()) + return LEA_Enc_AdvancedProcessBlocks_NEON(m_rkey, (size_t)m_rounds, + inBlocks, xorBlocks, outBlocks, length, flags); +#endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } size_t LEA::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { +#if defined(CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) { return LEA_Dec_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); } +#endif +#if (CRYPTOPP_ARM_NEON_AVAILABLE) + if (HasNEON()) + return LEA_Dec_AdvancedProcessBlocks_NEON(m_rkey, (size_t)m_rounds, + inBlocks, xorBlocks, outBlocks, length, flags); +#endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } #endif // CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS diff --git a/lea.h b/lea.h index 7b34724d..02fccd45 100644 --- a/lea.h +++ b/lea.h @@ -15,7 +15,7 @@ #include "secblock.h" #include "algparam.h" -#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86) +#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) # define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1 #endif -- cgit v1.2.1