From 081afde0fdf6202073dc9dfc46b501a0f06388a6 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Sun, 3 Dec 2017 04:10:55 -0500
Subject: Add SIMON-64 SSE intrinsics

Performance went from about 29 cpb (C++) to about 11.1 cpb (SSE)
---
 simon.cpp | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

(limited to 'simon.cpp')

diff --git a/simon.cpp b/simon.cpp
index 565a5205..a966a4ca 100644
--- a/simon.cpp
+++ b/simon.cpp
@@ -7,9 +7,10 @@
 #include "misc.h"
 #include "cpu.h"
 
-// Uncomment for benchmarking C++ against SSE2 or NEON.
-// Do so in both speck.cpp and speck-simd.cpp.
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both simon.cpp and simon-simd.cpp.
 // #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
 
 // Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
@@ -206,6 +207,14 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, siz
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
 #endif
 
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+extern size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
 extern size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -376,6 +385,38 @@ void SIMON128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
 }
 
 #if defined(CRYPTOPP_SIMON_ADVANCED_PROCESS_BLOCKS)
+size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+        byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SIMON64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+        byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SIMON64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
 size_t SIMON128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
         byte *outBlocks, size_t length, word32 flags) const
 {
-- 
cgit v1.2.1