From 8e8e95cea204319d99380cbedd0159746876355a Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 13 Oct 2019 16:17:37 -0400 Subject: Make XTS mode parallelizable (GH #891) On CoffeeLake performance increased from 3.4 cpb to 1.75 cpb. On Core2Duo performance increased from 27 cpb to 19 cpb. --- xts.cpp | 177 ++++++++++++++++++++++++++++++---------------------------------- 1 file changed, 82 insertions(+), 95 deletions(-) (limited to 'xts.cpp') diff --git a/xts.cpp b/xts.cpp index 29f2722e..e2dec0cf 100644 --- a/xts.cpp +++ b/xts.cpp @@ -1,18 +1,11 @@ // xts.cpp - written and placed in the public domain by Jeffrey Walton -// -// The best performance is achieved on machines with AES hardware acceleration. -// However, 64-bit machines without hardware acceleration profit the most with -// separate calls to ProcessBlock followed by XorBuffer rather than a single -// call to AdvancedProcessBlocks. That's because we did not parallelize, and -// XorBuffer uses SSE2 and ASIMD when available. Parallelizing slowed things -// down due to copying m_register for GF_Double. XorBuffer profits without -// AESNI and friends since XorBuffer only uses load, store and xor. #include "pch.h" #include "xts.h" #include "misc.h" #include "modes.h" +#include "cpu.h" #if defined(CRYPTOPP_DEBUG) # include "aes.h" @@ -40,41 +33,6 @@ ANONYMOUS_NAMESPACE_BEGIN using namespace CryptoPP; -// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the -// base architecture. We can use the SIMD code below without an -// architecture option. No runtime tests are required. Unfortunately, -// we can't use it on Altivec because an architecture switch is required. -// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for -// 16-byte block sizes. count must be a multiple of 16 since SIMD words -// are used. -inline void XorBuffer(byte *buf, const byte *mask, size_t count) -{ - CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0)); - CRYPTOPP_UNUSED(count); - -#if defined(__SSE2__) || defined(_M_X64) - #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS) - for (size_t i=0; i= 16 && (count % 16 == 0)); - CRYPTOPP_UNUSED(count); #if defined(__SSE2__) || defined(_M_X64) - #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS) for (size_t i=0; i(false, LITTLE_ENDIAN_ORDER, k+idx); + x = GetWord(false, LITTLE_ENDIAN_ORDER, in+idx); word64 y = (x >> 63); x = (x << 1) + carry; - PutWord(false, LITTLE_ENDIAN_ORDER, k+idx, x); + PutWord(false, LITTLE_ENDIAN_ORDER, out+idx, x); carry = y; } #else word32 carry = 0, x; for (size_t i=0, idx=0; i(false, LITTLE_ENDIAN_ORDER, k+idx); + x = GetWord(false, LITTLE_ENDIAN_ORDER, in+idx); word32 y = (x >> 31); x = (x << 1) + carry; - PutWord(false, LITTLE_ENDIAN_ORDER, k+idx, x); + PutWord(false, LITTLE_ENDIAN_ORDER, out+idx, x); carry = y; } #endif @@ -139,6 +94,7 @@ inline void GF_Double(byte *k, unsigned int len) CRYPTOPP_ASSERT(len >= 16); CRYPTOPP_ASSERT(len <= 128); + byte* k = out; if (carry) { switch (len) @@ -184,6 +140,7 @@ inline void GF_Double(byte *k, unsigned int len) #else CRYPTOPP_ASSERT(len == 16); + byte* k = out; if (carry) { k[0] ^= 0x87; @@ -192,6 +149,11 @@ inline void GF_Double(byte *k, unsigned int len) #endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS } +inline void GF_Double(byte *inout, unsigned int len) +{ + GF_Double(inout, inout, len); +} + #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING) using CryptoPP::AES; @@ -247,7 +209,8 @@ void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs & void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength) { BlockOrientedCipherModeBase::Resynchronize(iv, ivLength); - GetTweakCipher().ProcessBlock(m_register); + std::memcpy(m_xregister, m_register, ivLength); + GetTweakCipher().ProcessBlock(m_xregister); } void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order) @@ -257,37 +220,61 @@ void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order) std::memset(iv+8, 0x00, iv.size()-8); BlockOrientedCipherModeBase::Resynchronize(iv, iv.size()); - GetTweakCipher().ProcessBlock(m_register); + std::memcpy(m_xregister, iv, iv.size()); + GetTweakCipher().ProcessBlock(m_xregister); } void XTS_ModeBase::ResizeBuffers() { BlockOrientedCipherModeBase::ResizeBuffers(); - m_workspace.New(GetBlockCipher().BlockSize()); + m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks); + m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks); } void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length) { const unsigned int blockSize = GetBlockCipher().BlockSize(); + const size_t parallelSize = blockSize*ParallelBlocks; + size_t i = 0; // data unit is multiple of 16 bytes CRYPTOPP_ASSERT(length % blockSize == 0); - // now encrypt the data unit, AES_BLK_BYTES at a time - for (size_t i=0; i= inLength); const unsigned int blockSize = GetBlockCipher().BlockSize(); - const unsigned int blocks = inLength / blockSize; - const unsigned int tail = inLength % blockSize; + const size_t blocks = inLength / blockSize; + const size_t tail = inLength % blockSize; outLength = inLength; if (tail == 0) @@ -327,22 +314,22 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co ProcessData(outString, inString, inLength-head); outString += head; - inString += head; inLength -= head; + inString += head; inLength -= head; } ///// handle the full block ///// // merge the tweak into the input block - XorBuffer(m_workspace, inString, m_register, blockSize); + XorBuffer(m_xworkspace, inString, m_xregister, blockSize); // encrypt one block - GetBlockCipher().ProcessBlock(m_workspace); + GetBlockCipher().ProcessBlock(m_xworkspace); // merge the tweak into the output block - XorBuffer(outString, m_workspace, m_register, blockSize); + XorBuffer(outString, m_xworkspace, m_xregister, blockSize); // Multiply T by alpha - GF_Double(m_register, m_register.size()); + GF_Double(m_xregister, blockSize); ///// handle final partial block ///// @@ -351,20 +338,20 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co const size_t len = inLength-blockSize; // copy in the final plaintext bytes - std::memcpy(m_workspace, inString, len); + std::memcpy(m_xworkspace, inString, len); // and copy out the final ciphertext bytes std::memcpy(outString, outString-blockSize, len); // "steal" ciphertext to complete the block - std::memcpy(m_workspace+len, outString-blockSize+len, blockSize-len); + std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len); // merge the tweak into the input block - XorBuffer(m_workspace, m_register, blockSize); + XorBuffer(m_xworkspace, m_xregister, blockSize); // encrypt one block - GetBlockCipher().ProcessBlock(m_workspace); + GetBlockCipher().ProcessBlock(m_xworkspace); // merge the tweak into the previous output block - XorBuffer(outString-blockSize, m_workspace, m_register, blockSize); + XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize); return outLength; } @@ -375,8 +362,8 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c CRYPTOPP_ASSERT(outLength >= inLength); const unsigned int blockSize = GetBlockCipher().BlockSize(); - const unsigned int blocks = inLength / blockSize; - const unsigned int tail = inLength % blockSize; + const size_t blocks = inLength / blockSize; + const size_t tail = inLength % blockSize; outLength = inLength; if (tail == 0) @@ -392,12 +379,12 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c ProcessData(outString, inString, inLength-head); outString += head; - inString += head; inLength -= head; + inString += head; inLength -= head; } - SecByteBlock poly1(m_register); - SecByteBlock poly2(m_register); - GF_Double(poly2, poly2.size()); + #define poly1 (m_xregister+0*blockSize) + #define poly2 (m_xregister+1*blockSize) + GF_Double(poly2, poly1, blockSize); ///// handle final partial block ///// @@ -406,20 +393,20 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c const size_t len = inLength-blockSize; // merge the tweak into the input block - XorBuffer(m_workspace, inString-blockSize, poly2, blockSize); + XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize); // encrypt one block - GetBlockCipher().ProcessBlock(m_workspace); + GetBlockCipher().ProcessBlock(m_xworkspace); // merge the tweak into the output block - XorBuffer(m_workspace, poly2, blockSize); + XorBuffer(m_xworkspace, poly2, blockSize); // copy in the final plaintext bytes std::memcpy(outString-blockSize, inString, len); // and copy out the final ciphertext bytes - std::memcpy(outString, m_workspace, len); + std::memcpy(outString, m_xworkspace, len); // "steal" ciphertext to complete the block - std::memcpy(outString-blockSize+len, m_workspace+len, blockSize-len); + std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len); ///// handle the full previous block ///// @@ -427,13 +414,13 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c outString -= blockSize; // merge the tweak into the input block - XorBuffer(m_workspace, outString, poly1, blockSize); + XorBuffer(m_xworkspace, outString, poly1, blockSize); // encrypt one block - GetBlockCipher().ProcessBlock(m_workspace); + GetBlockCipher().ProcessBlock(m_xworkspace); // merge the tweak into the output block - XorBuffer(outString, m_workspace, poly1, blockSize); + XorBuffer(outString, m_xworkspace, poly1, blockSize); return outLength; } -- cgit v1.2.1