-rw-r--r--  Filelist.txt              |   1
-rwxr-xr-x  GNUmakefile               |   6
-rwxr-xr-x  GNUmakefile-cross         |   5
-rw-r--r--  cham-simd.cpp             | 402
-rw-r--r--  cham.cpp                  |  33
-rw-r--r--  cham.h                    |  14
-rw-r--r--  cryptlib.vcxproj          |   1
-rw-r--r--  cryptlib.vcxproj.filters  |   3
8 files changed, 464 insertions(+), 1 deletion(-)
diff --git a/Filelist.txt b/Filelist.txt
index 49264165..a76d1c61 100644
--- a/Filelist.txt
+++ b/Filelist.txt
@@ -50,6 +50,7 @@ ccm.h
chacha.cpp
chacha.h
cham.cpp
+cham-simd.cpp
cham.h
channels.cpp
channels.h
diff --git a/GNUmakefile b/GNUmakefile
index 3963fdd1..263eaca7 100755
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -249,6 +249,7 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_SSSE3,$(CXXFLAGS)),)
HAVE_SSSE3 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -mssse3 -dM -E - 2>/dev/null | $(GREP) -i -c __SSSE3__)
ifeq ($(HAVE_SSSE3),1)
ARIA_FLAG = -mssse3
+ CHAM_FLAG = -mssse3
SSSE3_FLAG = -mssse3
SIMON_FLAG = -mssse3
SPECK_FLAG = -mssse3
@@ -289,6 +290,7 @@ ifeq ($(SUN_COMPILER),1)
ifeq ($(COUNT),0)
SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1
ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1
+ CHAM_FLAG = -xarch=ssse3 -D__SSSE3__=1
SIMON_FLAG = -xarch=ssse3 -D__SSSE3__=1
SPECK_FLAG = -xarch=ssse3 -D__SSSE3__=1
LDFLAGS += -xarch=ssse3
@@ -1050,6 +1052,10 @@ aria-simd.o : aria-simd.cpp
blake2-simd.o : blake2-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(BLAKE2_FLAG) -c) $<
+# SSSE3 available
+cham-simd.o : cham-simd.cpp
+ $(CXX) $(strip $(CXXFLAGS) $(CHAM_FLAG) -c) $<
+
# SSE2 on i586
sse-simd.o : sse-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SSE_FLAG) -c) $<
diff --git a/GNUmakefile-cross b/GNUmakefile-cross
index a74df747..b66b3874 100755
--- a/GNUmakefile-cross
+++ b/GNUmakefile-cross
@@ -276,6 +276,7 @@ ifneq ($(IS_i686)$(IS_x86_64),00)
HAVE_SSSE3 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -mssse3 -dM -E - 2>/dev/null | $(EGREP) -i -c __SSSE3__)
ifeq ($(HAVE_SSSE3),1)
ARIA_FLAG = -mssse3
+ CHAM_FLAG = -mssse3
SSSE3_FLAG = -mssse3
SIMON_FLAG = -mssse3
SPECK_FLAG = -mssse3
@@ -487,6 +488,10 @@ aria-simd.o : aria-simd.cpp
blake2-simd.o : blake2-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(BLAKE2_FLAG) -c) $<
+# SSSE3 available
+cham-simd.o : cham-simd.cpp
+ $(CXX) $(strip $(CXXFLAGS) $(CHAM_FLAG) -c) $<
+
# SSE2 on i586
cpu.o : cpu.cpp
$(CXX) $(strip $(CXXFLAGS) $(CPU_FLAG) -c) $<
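Both makefiles use the same detection idiom: pipe an empty translation unit through $(CXX) with -mssse3 and count whether __SSSE3__ shows up among the predefined macros; only then is CHAM_FLAG set and applied when compiling cham-simd.o. A minimal standalone probe along the same lines (illustrative only; the file name ssse3-probe.cpp is hypothetical and not part of this commit):

// ssse3-probe.cpp - illustrative only, not part of this commit.
// Mirrors the HAVE_SSSE3 test above: when this file is compiled with
// -mssse3 (or -xarch=ssse3 -D__SSSE3__=1 under Sun Studio), the compiler
// predefines __SSSE3__ and SSSE3 intrinsics such as _mm_shuffle_epi8
// become usable in this translation unit.
#include <cstdio>

int main()
{
#if defined(__SSSE3__)
    std::printf("__SSSE3__ is predefined; the per-file SSSE3 flag took effect\n");
#else
    std::printf("__SSSE3__ is not predefined; compile this file with -mssse3\n");
#endif
    return 0;
}

Building it twice, with and without -mssse3, shows the macro toggling, which is exactly what the grep -c in the HAVE_SSSE3 test counts.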
diff --git a/cham-simd.cpp b/cham-simd.cpp
new file mode 100644
index 00000000..718e2361
--- /dev/null
+++ b/cham-simd.cpp
@@ -0,0 +1,402 @@
+// cham-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+// source file is needed because additional CXXFLAGS are required to enable
+// the appropriate instruction sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "cham.h"
+#include "misc.h"
+#include "adv-simd.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both cham.cpp and cham-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <pmmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::word32;
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+
+template <unsigned int R>
+inline __m128i RotateLeft32(const __m128i& val)
+{
+ return _mm_or_si128(
+ _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+}
+
+template <unsigned int R>
+inline __m128i RotateRight32(const __m128i& val)
+{
+ return _mm_or_si128(
+ _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateLeft32<8>(const __m128i& val)
+{
+ const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+ return _mm_shuffle_epi8(val, mask);
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateRight32<8>(const __m128i& val)
+{
+ const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
+ return _mm_shuffle_epi8(val, mask);
+}
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ // Should not be instantiated
+ CRYPTOPP_ASSERT(0);
+ return _mm_setzero_si128();
+}
+
+template <>
+inline __m128i UnpackXMM<0>(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ // The shuffle converts to and from little-endian for SSE. A specialized
+ // CHAM implementation can avoid the shuffle by framing the data for
+ // encryption, decryption and benchmarks. The library cannot take the
+ // speed-up because of the byte oriented API.
+ const __m128i r1 = _mm_unpacklo_epi32(a, b);
+ const __m128i r2 = _mm_unpacklo_epi32(c, d);
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
+ _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
+}
+
+template <>
+inline __m128i UnpackXMM<1>(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ // The shuffle converts to and from little-endian for SSE. A specialized
+ // CHAM implementation can avoid the shuffle by framing the data for
+ // encryption, decryption and benchmarks. The library cannot take the
+ // speed-up because of the byte oriented API.
+ const __m128i r1 = _mm_unpacklo_epi32(a, b);
+ const __m128i r2 = _mm_unpacklo_epi32(c, d);
+ return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
+ _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
+}
+
+template <>
+inline __m128i UnpackXMM<2>(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ // The shuffle converts to and from little-endian for SSE. A specialized
+ // CHAM implementation can avoid the shuffle by framing the data for
+ // encryption, decryption and benchmarks. The library cannot take the
+ // speed-up because of the byte oriented API.
+ const __m128i r1 = _mm_unpackhi_epi32(a, b);
+ const __m128i r2 = _mm_unpackhi_epi32(c, d);
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(r1, r2),
+ _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
+}
+
+template <>
+inline __m128i UnpackXMM<3>(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ // The shuffle converts to and from little-endian for SSE. A specialized
+ // CHAM implementation can avoid the shuffle by framing the data for
+ // encryption, decryption and benchmarks. The library cannot take the
+ // speed-up because of the byte oriented API.
+ const __m128i r1 = _mm_unpackhi_epi32(a, b);
+ const __m128i r2 = _mm_unpackhi_epi32(c, d);
+ return _mm_shuffle_epi8(_mm_unpackhi_epi64(r1, r2),
+ _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3));
+}
+
+template <unsigned int IDX>
+inline __m128i UnpackXMM(__m128i v)
+{
+ return UnpackXMM<IDX>(v, v, v, v);
+}
+
+template <unsigned int IDX>
+inline __m128i RepackXMM(__m128i a, __m128i b, __m128i c, __m128i d)
+{
+ return UnpackXMM<IDX>(a, b, c, d);
+}
+
+template <unsigned int IDX>
+inline __m128i RepackXMM(__m128i v)
+{
+ return RepackXMM<IDX>(v, v, v, v);
+}
+
+inline void GCC_NO_UBSAN CHAM128_Enc_Block(__m128i &block0,
+ const word32 *subkeys, unsigned int rounds)
+{
+ // Rearrange the data for vectorization. UnpackXMM includes a
+ // little-endian swap for SSE. Thanks to Peter Cordes for help
+ // with packing and unpacking.
+ // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
+ __m128i a = UnpackXMM<0>(block0);
+ __m128i b = UnpackXMM<1>(block0);
+ __m128i c = UnpackXMM<2>(block0);
+ __m128i d = UnpackXMM<3>(block0);
+
+ __m128i counter = _mm_set_epi32(0,0,0,0);
+ __m128i increment = _mm_set_epi32(1,1,1,1);
+
+ const unsigned int MASK = (rounds == 80 ? 7 : 15);
+ for (int i=0; i<static_cast<int>(rounds); i+=4)
+ {
+ __m128i t1, t2, k, k1, k2;
+
+ k = _mm_loadu_si128((const __m128i*) &subkeys[i & MASK]);
+ k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+ k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+
+ t1 = _mm_xor_si128(a, counter);
+ t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
+ a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
+
+ counter = _mm_add_epi32(counter, increment);
+
+ t1 = _mm_xor_si128(b, counter);
+ t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
+ b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
+
+ counter = _mm_add_epi32(counter, increment);
+
+ k1 = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+ k2 = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+
+ t1 = _mm_xor_si128(c, counter);
+ t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
+ c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
+
+ counter = _mm_add_epi32(counter, increment);
+
+ t1 = _mm_xor_si128(d, counter);
+ t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
+ d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
+
+ counter = _mm_add_epi32(counter, increment);
+ }
+
+ // Repack
+ // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
+ block0 = RepackXMM<0>(a,b,c,d);
+}
+
+inline void GCC_NO_UBSAN CHAM128_Dec_Block(__m128i &block0,
+ const word32 *subkeys, unsigned int rounds)
+{
+ // Rearrange the data for vectorization. UnpackXMM includes a
+ // little-endian swap for SSE. Thanks to Peter Cordes for help
+ // with packing and unpacking.
+ // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
+ __m128i a = UnpackXMM<0>(block0);
+ __m128i b = UnpackXMM<1>(block0);
+ __m128i c = UnpackXMM<2>(block0);
+ __m128i d = UnpackXMM<3>(block0);
+
+ __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
+ __m128i decrement = _mm_set_epi32(1,1,1,1);
+
+ const unsigned int MASK = (rounds == 80 ? 7 : 15);
+ for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
+ {
+ __m128i t1, t2, k, k1, k2;
+
+ k = _mm_loadu_si128((const __m128i*) &subkeys[(i-3) & MASK]);
+ k1 = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+ k2 = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+
+ // Odd round
+ t1 = RotateRight32<1>(d);
+ t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
+ d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+ counter = _mm_sub_epi32(counter, decrement);
+
+ // Even round
+ t1 = RotateRight32<8>(c);
+ t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
+ c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+ counter = _mm_sub_epi32(counter, decrement);
+
+ k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+ k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+
+ // Odd round
+ t1 = RotateRight32<1>(b);
+ t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
+ b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+ counter = _mm_sub_epi32(counter, decrement);
+
+ // Even round
+ t1 = RotateRight32<8>(a);
+ t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
+ a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+ counter = _mm_sub_epi32(counter, decrement);
+ }
+
+ // Repack
+ // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
+ block0 = RepackXMM<0>(a,b,c,d);
+}
+
+inline void GCC_NO_UBSAN CHAM128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
+ __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
+{
+ // Rearrange the data for vectorization. UnpackXMM includes a
+ // little-endian swap for SSE. Thanks to Peter Cordes for help
+ // with packing and unpacking.
+ // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
+ __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
+ __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
+ __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
+ __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
+
+ __m128i counter = _mm_set_epi32(0,0,0,0);
+ __m128i increment = _mm_set_epi32(1,1,1,1);
+
+ const unsigned int MASK = (rounds == 80 ? 7 : 15);
+ for (int i=0; i<static_cast<int>(rounds); i+=4)
+ {
+ __m128i t1, t2, k, k1, k2;
+
+ k = _mm_loadu_si128((const __m128i*) &subkeys[i & MASK]);
+ k1 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+ k2 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+
+ t1 = _mm_xor_si128(a, counter);
+ t2 = _mm_xor_si128(RotateLeft32<1>(b), k1);
+ a = RotateLeft32<8>(_mm_add_epi32(t1, t2));
+
+ counter = _mm_add_epi32(counter, increment);
+
+ t1 = _mm_xor_si128(b, counter);
+ t2 = _mm_xor_si128(RotateLeft32<8>(c), k2);
+ b = RotateLeft32<1>(_mm_add_epi32(t1, t2));
+
+ counter = _mm_add_epi32(counter, increment);
+
+ k1 = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+ k2 = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+
+ t1 = _mm_xor_si128(c, counter);
+ t2 = _mm_xor_si128(RotateLeft32<1>(d), k1);
+ c = RotateLeft32<8>(_mm_add_epi32(t1, t2));
+
+ counter = _mm_add_epi32(counter, increment);
+
+ t1 = _mm_xor_si128(d, counter);
+ t2 = _mm_xor_si128(RotateLeft32<8>(a), k2);
+ d = RotateLeft32<1>(_mm_add_epi32(t1, t2));
+
+ counter = _mm_add_epi32(counter, increment);
+ }
+
+ // Repack
+ // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
+ block0 = RepackXMM<0>(a,b,c,d);
+ block1 = RepackXMM<1>(a,b,c,d);
+ block2 = RepackXMM<2>(a,b,c,d);
+ block3 = RepackXMM<3>(a,b,c,d);
+}
+
+inline void GCC_NO_UBSAN CHAM128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
+ __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
+{
+ // Rearrange the data for vectorization. UnpackXMM includes a
+ // little-endian swap for SSE. Thanks to Peter Cordes for help
+ // with packing and unpacking.
+ // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 B1 C1 D1][A2 B2 C2 D2] ...
+ __m128i a = UnpackXMM<0>(block0, block1, block2, block3);
+ __m128i b = UnpackXMM<1>(block0, block1, block2, block3);
+ __m128i c = UnpackXMM<2>(block0, block1, block2, block3);
+ __m128i d = UnpackXMM<3>(block0, block1, block2, block3);
+
+ __m128i counter = _mm_set_epi32(rounds-1,rounds-1,rounds-1,rounds-1);
+ __m128i decrement = _mm_set_epi32(1,1,1,1);
+
+ const unsigned int MASK = (rounds == 80 ? 7 : 15);
+ for (int i = static_cast<int>(rounds)-1; i >= 0; i-=4)
+ {
+ __m128i t1, t2, k, k1, k2;
+
+ k = _mm_loadu_si128((const __m128i*) &subkeys[(i-3) & MASK]);
+ k1 = _mm_shuffle_epi8(k, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+ k2 = _mm_shuffle_epi8(k, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+
+ // Odd round
+ t1 = RotateRight32<1>(d);
+ t2 = _mm_xor_si128(RotateLeft32<8>(a), k1);
+ d = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+ counter = _mm_sub_epi32(counter, decrement);
+
+ // Even round
+ t1 = RotateRight32<8>(c);
+ t2 = _mm_xor_si128(RotateLeft32<1>(d), k2);
+ c = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+ counter = _mm_sub_epi32(counter, decrement);
+
+ k1 = _mm_shuffle_epi8(k, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+ k2 = _mm_shuffle_epi8(k, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+
+ // Odd round
+ t1 = RotateRight32<1>(b);
+ t2 = _mm_xor_si128(RotateLeft32<8>(c), k1);
+ b = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+ counter = _mm_sub_epi32(counter, decrement);
+
+ // Even round
+ t1 = RotateRight32<8>(a);
+ t2 = _mm_xor_si128(RotateLeft32<1>(b), k2);
+ a = _mm_xor_si128(_mm_sub_epi32(t1, t2), counter);
+
+ counter = _mm_sub_epi32(counter, decrement);
+ }
+
+ // Repack
+ // [A1 B1 C1 D1][A2 B2 C2 D2] ... => [A1 A2 A3 A4][B1 B2 B3 B4] ...
+ block0 = RepackXMM<0>(a,b,c,d);
+ block1 = RepackXMM<1>(a,b,c,d);
+ block2 = RepackXMM<2>(a,b,c,d);
+ block3 = RepackXMM<3>(a,b,c,d);
+}
+
+#endif  // CRYPTOPP_SSSE3_AVAILABLE
+ANONYMOUS_NAMESPACE_END
+
+NAMESPACE_BEGIN(CryptoPP)
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ return AdvancedProcessBlocks128_4x1_SSE(CHAM128_Enc_Block, CHAM128_Enc_4_Blocks,
+ subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+ return AdvancedProcessBlocks128_4x1_SSE(CHAM128_Dec_Block, CHAM128_Dec_4_Blocks,
+ subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+NAMESPACE_END
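The RotateLeft32<8> and RotateRight32<8> specializations above replace two shifts and an OR with a single byte shuffle. A short standalone check of that equivalence for one lane (illustrative only, not part of this commit; the file name rotl8-check.cpp is hypothetical, and it must be compiled with -mssse3 or equivalent):

// rotl8-check.cpp - illustrative only, not part of this commit.
// Verifies that the byte shuffle used in RotateLeft32<8> matches the
// plain shift-and-or rotate on a 32-bit lane.
#include <tmmintrin.h>
#include <cstdio>

int main()
{
    const unsigned int x = 0x12345678;

    // Reference: rotate the 32-bit word left by 8 with two shifts and an OR.
    const unsigned int expected = (x << 8) | (x >> 24);   // 0x34567812

    // Shuffle version: the same mask as the RotateLeft32<8> specialization.
    const __m128i val  = _mm_set1_epi32(static_cast<int>(x));
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    unsigned int r[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(r), _mm_shuffle_epi8(val, mask));

    std::printf("shift/or: 0x%08x  pshufb: 0x%08x  %s\n",
        expected, r[0], (expected == r[0]) ? "match" : "MISMATCH");
    return 0;
}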
diff --git a/cham.cpp b/cham.cpp
index 53fe3c16..efd869af 100644
--- a/cham.cpp
+++ b/cham.cpp
@@ -8,6 +8,7 @@
#include "cham.h"
#include "misc.h"
+#include "cpu.h"
// CHAM table of parameters
// +-------------------------------------------------
@@ -95,6 +96,14 @@ ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
+#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
+extern size_t CHAM128_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t CHAM128_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
+ const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
void CHAM64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
{
CRYPTOPP_UNUSED(params);
@@ -299,4 +308,28 @@ void CHAM128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock,
oblock(m_x[0])(m_x[1])(m_x[2])(m_x[3]);
}
+#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
+size_t CHAM128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+ byte *outBlocks, size_t length, word32 flags) const
+{
+ if (HasSSSE3()) {
+ const size_t rounds = (m_kw == 4 ? 80 : 96);
+ return CHAM128_Enc_AdvancedProcessBlocks_SSSE3(m_rk, rounds,
+ inBlocks, xorBlocks, outBlocks, length, flags);
+ }
+ return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t CHAM128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+ byte *outBlocks, size_t length, word32 flags) const
+{
+ if (HasSSSE3()) {
+ const size_t rounds = (m_kw == 4 ? 80 : 96);
+ return CHAM128_Dec_AdvancedProcessBlocks_SSSE3(m_rk, rounds,
+ inBlocks, xorBlocks, outBlocks, length, flags);
+ }
+ return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
+
NAMESPACE_END
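The overrides above select the SSSE3 kernels at run time via HasSSSE3() and otherwise defer to BlockTransformation::AdvancedProcessBlocks, so callers benefit simply by handing the cipher several blocks per call; block cipher modes such as ECB and CTR do this when they batch blocks and pass BT_AllowParallel. A rough usage sketch that drives the new entry point directly (illustrative only, not part of this commit):

// cham128-apb.cpp - illustrative only, not part of this commit.
// Encrypts four 16-byte blocks in one AdvancedProcessBlocks call so the
// 4-block SSSE3 kernel (CHAM128_Enc_4_Blocks) can be taken when available.
#include "cryptlib.h"
#include "cham.h"
#include "osrng.h"
#include "secblock.h"
#include <cstdio>

int main()
{
    using namespace CryptoPP;

    AutoSeededRandomPool prng;
    SecByteBlock key(CHAM128::DEFAULT_KEYLENGTH);
    prng.GenerateBlock(key, key.size());

    byte in[4 * CHAM128::BLOCKSIZE] = {0};
    byte out[4 * CHAM128::BLOCKSIZE];

    CHAM128::Encryption enc;
    enc.SetKey(key, key.size());

    // BT_AllowParallel permits the implementation to batch blocks; the
    // return value is the number of bytes left unprocessed (0 here).
    const size_t remaining = enc.AdvancedProcessBlocks(in, NULLPTR, out,
        sizeof(in), BlockTransformation::BT_AllowParallel);

    std::printf("bytes not processed: %u\n", static_cast<unsigned int>(remaining));
    return 0;
}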
diff --git a/cham.h b/cham.h
index 52edc4dc..1c8b02b1 100644
--- a/cham.h
+++ b/cham.h
@@ -15,6 +15,10 @@
#include "secblock.h"
#include "algparam.h"
+#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
+# define CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
NAMESPACE_BEGIN(CryptoPP)
/// \brief CHAM block cipher information
@@ -92,7 +96,7 @@ typedef CHAM64::Decryption CHAM64Decryption;
/// \brief CHAM 128-bit block cipher
/// \details CHAM128 provides 128-bit block size. The valid key size is 128-bit and 256-bit.
/// \note Crypto++ provides a byte oriented implementation
-/// \sa CHAM128, <a href="http://www.cryptopp.com/wiki/CHAM">CHAM</a>, <a href=
+/// \sa CHAM64, <a href="http://www.cryptopp.com/wiki/CHAM">CHAM</a>, <a href=
/// "https://pdfs.semanticscholar.org/2f57/61b5c2614cffd58a09cc83c375a2b32a2ed3.pdf">
/// CHAM: A Family of Lightweight Block Ciphers for Resource-Constrained Devices</a>
/// \since Crypto++ 7.1
@@ -120,6 +124,10 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+
+#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
+ size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
};
/// \brief Provides implementation for encryption transformation
@@ -130,6 +138,10 @@ public:
{
public:
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+
+#if CRYPTOPP_CHAM128_ADVANCED_PROCESS_BLOCKS
+ size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
};
typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj
index 774705cc..a2f7e0be 100644
--- a/cryptlib.vcxproj
+++ b/cryptlib.vcxproj
@@ -192,6 +192,7 @@
<ClCompile Include="ccm.cpp" />
<ClCompile Include="chacha.cpp" />
<ClCompile Include="cham.cpp" />
+ <ClCompile Include="cham-simd.cpp" />
<ClCompile Include="channels.cpp" />
<ClCompile Include="cmac.cpp" />
<ClCompile Include="crc.cpp" />
diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters
index 954a74ce..9867d7ba 100644
--- a/cryptlib.vcxproj.filters
+++ b/cryptlib.vcxproj.filters
@@ -89,6 +89,9 @@
<ClCompile Include="cham.cpp">
<Filter>Source Files</Filter>
</ClCompile>
+ <ClCompile Include="cham-simd.cpp">
+ <Filter>Source Files</Filter>
+ </ClCompile>
<ClCompile Include="channels.cpp">
<Filter>Source Files</Filter>
</ClCompile>