From 6138829572f58cfe67e2e1c6a6bb1c69c9ed7872 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Tue, 19 Jun 2018 18:03:28 -0400
Subject: Add CHAM128 SSSE3 implementation (PR #670)

CHAM-128(128) improves from 10.5 cpb to 4.1 cpb. CHAM-128(256) improves
from 12.5 cpb to 4.7 cpb.
---
 cham-simd.cpp | 402 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 402 insertions(+)
 create mode 100644 cham-simd.cpp

diff --git a/cham-simd.cpp b/cham-simd.cpp
new file mode 100644
index 00000000..718e2361
--- /dev/null
+++ b/cham-simd.cpp
@@ -0,0 +1,402 @@
+// cham-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+//    This source file uses intrinsics and built-ins to gain access to
+//    SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+//    source file is needed because additional CXXFLAGS are required to enable
+//    the appropriate instruction sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "cham.h"
+#include "misc.h"
+#include "adv-simd.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both cham.cpp and cham-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <pmmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::word32;
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+
+template <unsigned int R>
+inline __m128i RotateLeft32(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+}
+
+template <unsigned int R>
+inline __m128i RotateRight32(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateLeft32<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateRight32<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
+    return _mm_shuffle_epi8(val, mask);
+}
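The RotateLeft32<8> and RotateRight32<8> specializations replace the generic shift/or rotate with a single byte shuffle: within each 32-bit lane the mask moves byte 3 into the low position and shifts the other bytes up, which is exactly a rotate-left-by-8. A minimal stand-alone check of that equivalence, compiled with -mssse3, might look like the sketch below; the main() harness is an editorial illustration and is not part of the patch.

#include <tmmintrin.h>
#include <cstdint>
#include <cassert>

int main()
{
    const uint32_t w = 0x11223344;
    const __m128i v = _mm_set1_epi32((int)w);

    // Byte-shuffle form of a 32-bit rotate-left-by-8, as specialized above.
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    const uint32_t viaShuffle = (uint32_t)_mm_cvtsi128_si32(_mm_shuffle_epi8(v, mask));

    // Generic two-shifts-and-an-or form.
    const uint32_t viaShifts = (w << 8) | (w >> 24);

    assert(viaShuffle == viaShifts && viaShifts == 0x22334411);
    return 0;
}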
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+    // Should not be instantiated
+    CRYPTOPP_ASSERT(0);
+    return _mm_setzero_si128();
+}
+
+template <>
+inline __m128i UnpackXMM<0>(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpacklo_epi32(a, b);
+    const __m128i r2 = _mm_unpacklo_epi32(c, d);
+    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
+        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
+}
+
+template <>
+inline __m128i UnpackXMM<1>(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpacklo_epi32(a, b);
+    const __m128i r2 = _mm_unpacklo_epi32(c, d);
+    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
+        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
+}
+
+template <>
+inline __m128i UnpackXMM<2>(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpackhi_epi32(a, b);
+    const __m128i r2 = _mm_unpackhi_epi32(c, d);
+    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
+        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
+}
+
+template <>
+inline __m128i UnpackXMM<3>(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+    // The shuffle converts to and from little-endian for SSE. A specialized
+    // CHAM implementation can avoid the shuffle by framing the data for
+    // encryption, decryption and benchmarks. The library cannot take the
+    // speed-up because of the byte oriented API.
+    const __m128i r1 = _mm_unpackhi_epi32(a, b);
+    const __m128i r2 = _mm_unpackhi_epi32(c, d);
+    return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
+        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
+}
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(__m128i v)
+{
+    return UnpackXMM<IDX>(v, v, v, v);
+}
+
+template <unsigned int IDX>
+inline __m128i RepackXMM(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+    return UnpackXMM<IDX>(a, b, c, d);
+}
+
+template <unsigned int IDX>
+inline __m128i RepackXMM(__m128i v)
+{
+    return RepackXMM<IDX>(v, v, v, v);
+}
+
+inline void GCC_NO_UBSAN CHAM128_Enc_Block(__m128i &block0,
+    const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. UnpackXMM includes a
+    // little-endian swap for SSE. Thanks to Peter Cordes for help
+    // with packing and unpacking.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
+    __m128i a = UnpackXMM<0>(block0);
+    __m128i b = UnpackXMM<1>(block0);
+    __m128i c = UnpackXMM<2>(block0);
+    __m128i d = UnpackXMM<3>(block0);
+
+    __m128i counter = _mm_set_epi32(0,0,0,0);
+    __m128i increment = _mm_set_epi32(1,1,1,1);
+
+    const unsigned int MASK = (rounds == 80 ? 7 : 15);
+    for (int i=0; i < static_cast<int>(rounds); i+=4)
+    {
+        __m128i t1, t2, k, k1, k2;
+
+        k = _mm_loadu_si128((const __m128i*) &subkeys[i & MASK]);
+        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+
+        t1 = _mm_xor_si128(a, counter);
+        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
+        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
+
+        counter = _mm_add_epi32(counter, increment);
+
+        t1 = _mm_xor_si128(b, counter);
+        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
+        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
+
+        counter = _mm_add_epi32(counter, increment);
+
+        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+
+        t1 = _mm_xor_si128(c, counter);
+        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
+        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
+
+        counter = _mm_add_epi32(counter, increment);
+
+        t1 = _mm_xor_si128(d, counter);
+        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
+        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
+
+        counter = _mm_add_epi32(counter, increment);
+    }
+
+    // Repack
+    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
+    block0 = RepackXMM<0>(a,b,c,d);
+}
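The loop above unrolls four CHAM rounds per iteration and keeps the four state words in fixed registers rather than rotating the state: round i updates word i mod 4 from word (i+1) mod 4, even rounds use the (1,8) rotate pair, odd rounds use (8,1), and the counter register supplies the round index that is XORed into the word being updated. For reference, a scalar sketch of the same schedule follows; the helper names and the main() harness are editorial assumptions, not the library's reference code.

#include <cstdint>
#include <cstdio>

static inline uint32_t rol32(uint32_t v, unsigned r)
    { return (v << r) | (v >> (32 - r)); }

// Scalar model of the unrolled SSSE3 loop: round j refreshes word j%4 from
// word (j+1)%4. nrk is 8 for CHAM-128(128) and 16 for CHAM-128(256), which
// is what the (rounds == 80 ? 7 : 15) subkey mask encodes.
static void cham128_enc_rounds(uint32_t x[4], const uint32_t* rk,
                               unsigned nrk, unsigned rounds)
{
    for (unsigned j = 0; j < rounds; ++j)
    {
        const uint32_t xn = x[(j + 1) % 4];
        uint32_t& xj = x[j % 4];
        if ((j & 1) == 0)  // even round: rotate pair (1, 8)
            xj = rol32((xj ^ j) + (rol32(xn, 1) ^ rk[j % nrk]), 8);
        else               // odd round: rotate pair (8, 1)
            xj = rol32((xj ^ j) + (rol32(xn, 8) ^ rk[j % nrk]), 1);
    }
}

int main()
{
    uint32_t x[4] = { 0, 1, 2, 3 };   // arbitrary state, not a test vector
    uint32_t rk[8] = { 0 };           // arbitrary round keys
    cham128_enc_rounds(x, rk, 8, 80); // CHAM-128(128): 8 round keys, 80 rounds
    std::printf("%08x %08x %08x %08x\n", x[0], x[1], x[2], x[3]);
    return 0;
}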
+
+inline void GCC_NO_UBSAN CHAM128_Dec_Block(__m128i &block0,
+    const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. UnpackXMM includes a
+    // little-endian swap for SSE. Thanks to Peter Cordes for help
+    // with packing and unpacking.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
+    __m128i a = UnpackXMM<0>(block0);
+    __m128i b = UnpackXMM<1>(block0);
+    __m128i c = UnpackXMM<2>(block0);
+    __m128i d = UnpackXMM<3>(block0);
+
+    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
+    __m128i decrement = _mm_set_epi32(1,1,1,1);
+
+    const unsigned int MASK = (rounds == 80 ? 7 : 15);
+    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
+    {
+        __m128i t1, t2, k, k1, k2;
+
+        k = _mm_loadu_si128((const __m128i*) &subkeys[(i-3) & MASK]);
+        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+
+        // Odd round
+        t1 = RotateRight32<1>(d);
+        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
+        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+        counter = _mm_sub_epi32(counter, decrement);
+
+        // Even round
+        t1 = RotateRight32<8>(c);
+        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
+        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+        counter = _mm_sub_epi32(counter, decrement);
+
+        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+
+        // Odd round
+        t1 = RotateRight32<1>(b);
+        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
+        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+        counter = _mm_sub_epi32(counter, decrement);
+
+        // Even round
+        t1 = RotateRight32<8>(a);
+        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
+        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+        counter = _mm_sub_epi32(counter, decrement);
+    }
+
+    // Repack
+    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
+    block0 = RepackXMM<0>(a,b,c,d);
+}
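Each _mm_loadu_si128 above pulls in four consecutive subkey words, and the repeating 4-byte index patterns then broadcast one of those words to every lane: 3,2,1,0 selects word 0, 7,6,5,4 selects word 1, and so on. Decryption loads the group at (i-3) & MASK and consumes the words from the top down because the rounds run in reverse. A small stand-alone check of the broadcast, compiled with -mssse3, is sketched below; the harness and key values are editorial assumptions.

#include <tmmintrin.h>
#include <cstdint>
#include <cassert>

int main()
{
    // A 4-word subkey group, as loaded by _mm_loadu_si128 above.
    const uint32_t rk[4] = { 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff };
    const __m128i k = _mm_loadu_si128((const __m128i*)rk);

    // The repeating 3,2,1,0 byte pattern copies subkey word 0 into all
    // four 32-bit lanes; 7,6,5,4 would broadcast word 1, and so on.
    const __m128i k1 = _mm_shuffle_epi8(k,
        _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));

    uint32_t lanes[4];
    _mm_storeu_si128((__m128i*)lanes, k1);
    for (int i = 0; i < 4; ++i)
        assert(lanes[i] == rk[0]);
    return 0;
}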
+
+inline void GCC_NO_UBSAN CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
+    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. UnpackXMM includes a
+    // little-endian swap for SSE. Thanks to Peter Cordes for help
+    // with packing and unpacking.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
+    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
+    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
+    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
+    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
+
+    __m128i counter = _mm_set_epi32(0,0,0,0);
+    __m128i increment = _mm_set_epi32(1,1,1,1);
+
+    const unsigned int MASK = (rounds == 80 ? 7 : 15);
+    for (int i=0; i < static_cast<int>(rounds); i+=4)
+    {
+        __m128i t1, t2, k, k1, k2;
+
+        k = _mm_loadu_si128((const __m128i*) &subkeys[i & MASK]);
+        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+
+        t1 = _mm_xor_si128(a, counter);
+        t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
+        a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
+
+        counter = _mm_add_epi32(counter, increment);
+
+        t1 = _mm_xor_si128(b, counter);
+        t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
+        b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
+
+        counter = _mm_add_epi32(counter, increment);
+
+        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+
+        t1 = _mm_xor_si128(c, counter);
+        t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
+        c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
+
+        counter = _mm_add_epi32(counter, increment);
+
+        t1 = _mm_xor_si128(d, counter);
+        t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
+        d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
+
+        counter = _mm_add_epi32(counter, increment);
+    }
+
+    // Repack
+    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
+    block0 = RepackXMM<0>(a,b,c,d);
+    block1 = RepackXMM<1>(a,b,c,d);
+    block2 = RepackXMM<2>(a,b,c,d);
+    block3 = RepackXMM<3>(a,b,c,d);
+}
+
+inline void GCC_NO_UBSAN CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
+    __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. UnpackXMM includes a
+    // little-endian swap for SSE. Thanks to Peter Cordes for help
+    // with packing and unpacking.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
+    __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
+    __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
+    __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
+    __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
+
+    __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
+    __m128i decrement = _mm_set_epi32(1,1,1,1);
+
+    const unsigned int MASK = (rounds == 80 ? 7 : 15);
+    for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
+    {
+        __m128i t1, t2, k, k1, k2;
+
+        k = _mm_loadu_si128((const __m128i*) &subkeys[(i-3) & MASK]);
+        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+
+        // Odd round
+        t1 = RotateRight32<1>(d);
+        t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
+        d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+        counter = _mm_sub_epi32(counter, decrement);
+
+        // Even round
+        t1 = RotateRight32<8>(c);
+        t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
+        c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+        counter = _mm_sub_epi32(counter, decrement);
+
+        k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+        k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+
+        // Odd round
+        t1 = RotateRight32<1>(b);
+        t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
+        b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+        counter = _mm_sub_epi32(counter, decrement);
+
+        // Even round
+        t1 = RotateRight32<8>(a);
+        t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
+        a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+        counter = _mm_sub_epi32(counter, decrement);
+    }
+
+    // Repack
+    // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
+    block0 = RepackXMM<0>(a,b,c,d);
+    block1 = RepackXMM<1>(a,b,c,d);
+    block2 = RepackXMM<2>(a,b,c,d);
+    block3 = RepackXMM<3>(a,b,c,d);
+}
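The four-block paths depend on UnpackXMM performing a 4x4 word transpose plus byte swap: blocks arriving as [A1 A2 A3 A4][B1 B2 B3 B4] ... come out as [A1 B1 C1 D1] ..., so each SSE lane carries one block and the four registers carry the four state words. The stand-alone sketch below checks that layout for the UnpackXMM<0> sequence; unpack0 is an editorial copy of the intrinsic sequence above, the data is arbitrary, and the whole harness needs -mssse3.

#include <tmmintrin.h>
#include <cstdint>
#include <cstring>
#include <cassert>

// Local copy of the UnpackXMM<0> intrinsic sequence above.
static __m128i unpack0(__m128i a, __m128i b, __m128i c, __m128i d)
{
    const __m128i r1 = _mm_unpacklo_epi32(a, b);
    const __m128i r2 = _mm_unpacklo_epi32(c, d);
    return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
        _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
}

int main()
{
    // Four 16-byte blocks laid out as the byte-oriented API delivers them.
    uint8_t blocks[4][16];
    for (int j = 0; j < 4; ++j)
        for (int i = 0; i < 16; ++i)
            blocks[j][i] = (uint8_t)(16*j + i);

    __m128i b0, b1, b2, b3;
    std::memcpy(&b0, blocks[0], 16); std::memcpy(&b1, blocks[1], 16);
    std::memcpy(&b2, blocks[2], 16); std::memcpy(&b3, blocks[3], 16);

    // Lane j of unpack0() should hold the big-endian word formed from
    // bytes 0..3 of block j, i.e. the "A" word of each block.
    uint32_t lanes[4];
    _mm_storeu_si128((__m128i*)lanes, unpack0(b0, b1, b2, b3));
    for (int j = 0; j < 4; ++j) {
        const uint8_t* p = blocks[j];
        const uint32_t be = ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
                            ((uint32_t)p[2] << 8) | (uint32_t)p[3];
        assert(lanes[j] == be);
    }
    return 0;
}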
+
+#endif  // CRYPTOPP_SSSE3_AVAILABLE
+
+ANONYMOUS_NAMESPACE_END
+
+NAMESPACE_BEGIN(CryptoPP)
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_4x1_SSE(CHAM128_Enc_Block, CHAM128_Enc_4_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_4x1_SSE(CHAM128_Dec_Block, CHAM128_Dec_4_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif  // CRYPTOPP_SSSE3_AVAILABLE
+
+NAMESPACE_END
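The two exported entry points hand the single-block and four-block routines to AdvancedProcessBlocks128_4x1_SSE from adv-simd.h, which processes four blocks at a time where it can and uses the single-block routine for the tail. As an end-to-end sanity model of the round structure those paths implement, the stand-alone scalar sketch below encrypts and then decrypts one block and checks the round trip; the function names, key material and harness are editorial assumptions rather than library code.

#include <cstdint>
#include <cassert>

static inline uint32_t rol32(uint32_t v, unsigned r)
    { return (v << r) | (v >> (32 - r)); }
static inline uint32_t ror32(uint32_t v, unsigned r)
    { return (v >> r) | (v << (32 - r)); }

// Scalar mirror of the SSSE3 encryption rounds above.
static void enc_rounds(uint32_t x[4], const uint32_t* rk, unsigned nrk, unsigned rounds)
{
    for (unsigned j = 0; j < rounds; ++j) {
        const uint32_t xn = x[(j + 1) % 4];
        uint32_t& xj = x[j % 4];
        xj = (j & 1) ? rol32((xj ^ j) + (rol32(xn, 8) ^ rk[j % nrk]), 1)
                     : rol32((xj ^ j) + (rol32(xn, 1) ^ rk[j % nrk]), 8);
    }
}

// Scalar mirror of the SSSE3 decryption rounds above, the exact inverse.
static void dec_rounds(uint32_t x[4], const uint32_t* rk, unsigned nrk, unsigned rounds)
{
    for (unsigned j = rounds; j-- > 0; ) {
        const uint32_t xn = x[(j + 1) % 4];
        uint32_t& xj = x[j % 4];
        xj = (j & 1) ? ((ror32(xj, 1) - (rol32(xn, 8) ^ rk[j % nrk])) ^ j)
                     : ((ror32(xj, 8) - (rol32(xn, 1) ^ rk[j % nrk])) ^ j);
    }
}

int main()
{
    const uint32_t rk[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };  // arbitrary round keys
    uint32_t x[4] = { 0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10 };
    const uint32_t orig[4] = { x[0], x[1], x[2], x[3] };

    enc_rounds(x, rk, 8, 80);   // CHAM-128(128): 8 round keys, 80 rounds
    dec_rounds(x, rk, 8, 80);

    for (int i = 0; i < 4; ++i)
        assert(x[i] == orig[i]);
    return 0;
}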