summaryrefslogtreecommitdiff
path: root/xts.cpp
diff options
context:
space:
mode:
authorJeffrey Walton <noloader@gmail.com>2019-10-13 16:17:37 -0400
committerJeffrey Walton <noloader@gmail.com>2019-10-13 16:17:37 -0400
commit8e8e95cea204319d99380cbedd0159746876355a (patch)
tree1bf4e060c569cf89352179b81af4284836e7616d /xts.cpp
parentc9b8452d570ce0d5f883c249b942518ef509fc18 (diff)
downloadcryptopp-git-8e8e95cea204319d99380cbedd0159746876355a.tar.gz
Make XTS mode parallelizable (GH #891)
On CoffeeLake performance increased from 3.4 cpb to 1.75 cpb. On Core2Duo performance increased from 27 cpb to 19 cpb.
Diffstat (limited to 'xts.cpp')
-rw-r--r--xts.cpp177
1 files changed, 82 insertions, 95 deletions
diff --git a/xts.cpp b/xts.cpp
index 29f2722e..e2dec0cf 100644
--- a/xts.cpp
+++ b/xts.cpp
@@ -1,18 +1,11 @@
// xts.cpp - written and placed in the public domain by Jeffrey Walton
-//
-// The best performance is achieved on machines with AES hardware acceleration.
-// However, 64-bit machines without hardware acceleration profit the most with
-// separate calls to ProcessBlock followed by XorBuffer rather than a single
-// call to AdvancedProcessBlocks. That's because we did not parallelize, and
-// XorBuffer uses SSE2 and ASIMD when available. Parallelizing slowed things
-// down due to copying m_register for GF_Double. XorBuffer profits without
-// AESNI and friends since XorBuffer only uses load, store and xor.
#include "pch.h"
#include "xts.h"
#include "misc.h"
#include "modes.h"
+#include "cpu.h"
#if defined(CRYPTOPP_DEBUG)
# include "aes.h"
@@ -47,88 +40,50 @@ using namespace CryptoPP;
// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
// 16-byte block sizes. count must be a multiple of 16 since SIMD words
// are used.
-inline void XorBuffer(byte *buf, const byte *mask, size_t count)
-{
- CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
- CRYPTOPP_UNUSED(count);
-
-#if defined(__SSE2__) || defined(_M_X64)
- #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
- for (size_t i=0; i<count; i+=16)
- _mm_storeu_si128(M128_CAST(buf+i), _mm_xor_si128(
- _mm_loadu_si128(CONST_M128_CAST(mask+i)), _mm_loadu_si128(CONST_M128_CAST(buf+i))));
- #else
- _mm_storeu_si128(M128_CAST(buf), _mm_xor_si128(
- _mm_loadu_si128(CONST_M128_CAST(mask)), _mm_loadu_si128(CONST_M128_CAST(buf))));
- #endif
-
-#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
- #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
- for (size_t i=0; i<count; i+=16)
- vst1q_u8(buf+i, veorq_u8(vld1q_u8(mask+i), vld1q_u8(buf+i)));
- #else
- vst1q_u8(buf, veorq_u8(vld1q_u8(mask), vld1q_u8(buf)));
- #endif
-
-#else
- xorbuf(buf, mask, count);
-#endif
-}
-
-// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
-// base architecture. We can use the SIMD code below without an
-// architecture option. No runtime tests are required. Unfortunately,
-// we can't use it on Altivec because an architecture switch is required.
-// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
-// 16-byte block sizes. count must be a multiple of 16 since SIMD words
-// are used.
inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
{
CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
- CRYPTOPP_UNUSED(count);
#if defined(__SSE2__) || defined(_M_X64)
- #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
for (size_t i=0; i<count; i+=16)
- _mm_storeu_si128(M128_CAST(output+i), _mm_xor_si128(
- _mm_loadu_si128(CONST_M128_CAST(input+i)), _mm_loadu_si128(CONST_M128_CAST(mask+i))));
- #else
- _mm_storeu_si128(M128_CAST(output), _mm_xor_si128(
- _mm_loadu_si128(CONST_M128_CAST(input)), _mm_loadu_si128(CONST_M128_CAST(mask))));
- #endif
+ _mm_storeu_si128(M128_CAST(output+i),
+ _mm_xor_si128(
+ _mm_loadu_si128(CONST_M128_CAST(input+i)),
+ _mm_loadu_si128(CONST_M128_CAST(mask+i))));
#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
- #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
for (size_t i=0; i<count; i+=16)
vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
- #else
- vst1q_u8(output, veorq_u8(vld1q_u8(input), vld1q_u8(mask)));
- #endif
#else
xorbuf(output, input, mask, count);
#endif
}
+inline void XorBuffer(byte *buf, const byte *mask, size_t count)
+{
+ XorBuffer(buf, buf, mask, count);
+}
+
// Borrowed from CMAC, but little-endian representation
-inline void GF_Double(byte *k, unsigned int len)
+inline void GF_Double(byte *out, const byte* in, unsigned int len)
{
#if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
word64 carry = 0, x;
for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
{
- x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx);
+ x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
word64 y = (x >> 63); x = (x << 1) + carry;
- PutWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx, x);
+ PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
carry = y;
}
#else
word32 carry = 0, x;
for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
{
- x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx);
+ x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
word32 y = (x >> 31); x = (x << 1) + carry;
- PutWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx, x);
+ PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
carry = y;
}
#endif
@@ -139,6 +94,7 @@ inline void GF_Double(byte *k, unsigned int len)
CRYPTOPP_ASSERT(len >= 16);
CRYPTOPP_ASSERT(len <= 128);
+ byte* k = out;
if (carry)
{
switch (len)
@@ -184,6 +140,7 @@ inline void GF_Double(byte *k, unsigned int len)
#else
CRYPTOPP_ASSERT(len == 16);
+ byte* k = out;
if (carry)
{
k[0] ^= 0x87;
@@ -192,6 +149,11 @@ inline void GF_Double(byte *k, unsigned int len)
#endif // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
}
+inline void GF_Double(byte *inout, unsigned int len)
+{
+ GF_Double(inout, inout, len);
+}
+
#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
using CryptoPP::AES;
@@ -247,7 +209,8 @@ void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &
void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
{
BlockOrientedCipherModeBase::Resynchronize(iv, ivLength);
- GetTweakCipher().ProcessBlock(m_register);
+ std::memcpy(m_xregister, m_register, ivLength);
+ GetTweakCipher().ProcessBlock(m_xregister);
}
void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
@@ -257,37 +220,61 @@ void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
std::memset(iv+8, 0x00, iv.size()-8);
BlockOrientedCipherModeBase::Resynchronize(iv, iv.size());
- GetTweakCipher().ProcessBlock(m_register);
+ std::memcpy(m_xregister, iv, iv.size());
+ GetTweakCipher().ProcessBlock(m_xregister);
}
void XTS_ModeBase::ResizeBuffers()
{
BlockOrientedCipherModeBase::ResizeBuffers();
- m_workspace.New(GetBlockCipher().BlockSize());
+ m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
+ m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
}
void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
{
const unsigned int blockSize = GetBlockCipher().BlockSize();
+ const size_t parallelSize = blockSize*ParallelBlocks;
+ size_t i = 0;
// data unit is multiple of 16 bytes
CRYPTOPP_ASSERT(length % blockSize == 0);
- // now encrypt the data unit, AES_BLK_BYTES at a time
- for (size_t i=0; i<length; i+=blockSize)
+ // encrypt the data unit, optimal size at a time
+ for ( ; i+parallelSize<=length; i+=parallelSize)
{
+ // m_xregister[0] always points to the next tweak.
+ GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
+ GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
+ GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
+
// merge the tweak into the input block
- XorBuffer(m_workspace, inString+i, m_register, blockSize);
+ XorBuffer(m_xworkspace, inString+i, m_xregister, parallelSize);
+
+ // encrypt one block, merge the tweak into the output block
+ GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister, outString+i, parallelSize, BlockTransformation::BT_AllowParallel);
+
+ // Multiply T by alpha. m_xregister[0] always points to the next tweak.
+ GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
+ }
+
+ // encrypt the data unit, blocksize at a time
+ for ( ; i<length; i+=blockSize)
+ {
+ // merge the tweak into the input block
+ XorBuffer(m_xworkspace, inString+i, m_xregister, blockSize);
// encrypt one block
- GetBlockCipher().ProcessBlock(m_workspace);
+ GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block
- XorBuffer(outString+i, m_workspace, m_register, blockSize);
+ XorBuffer(outString+i, m_xworkspace, m_xregister, blockSize);
// Multiply T by alpha
- GF_Double(m_register, m_register.size());
+ GF_Double(m_xregister, blockSize);
}
+
+ CRYPTOPP_ASSERT(i == length);
}
size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
@@ -310,8 +297,8 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
CRYPTOPP_ASSERT(outLength >= inLength);
const unsigned int blockSize = GetBlockCipher().BlockSize();
- const unsigned int blocks = inLength / blockSize;
- const unsigned int tail = inLength % blockSize;
+ const size_t blocks = inLength / blockSize;
+ const size_t tail = inLength % blockSize;
outLength = inLength;
if (tail == 0)
@@ -327,22 +314,22 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
ProcessData(outString, inString, inLength-head);
outString += head;
- inString += head; inLength -= head;
+ inString += head; inLength -= head;
}
///// handle the full block /////
// merge the tweak into the input block
- XorBuffer(m_workspace, inString, m_register, blockSize);
+ XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
// encrypt one block
- GetBlockCipher().ProcessBlock(m_workspace);
+ GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block
- XorBuffer(outString, m_workspace, m_register, blockSize);
+ XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
// Multiply T by alpha
- GF_Double(m_register, m_register.size());
+ GF_Double(m_xregister, blockSize);
///// handle final partial block /////
@@ -351,20 +338,20 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
const size_t len = inLength-blockSize;
// copy in the final plaintext bytes
- std::memcpy(m_workspace, inString, len);
+ std::memcpy(m_xworkspace, inString, len);
// and copy out the final ciphertext bytes
std::memcpy(outString, outString-blockSize, len);
// "steal" ciphertext to complete the block
- std::memcpy(m_workspace+len, outString-blockSize+len, blockSize-len);
+ std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
// merge the tweak into the input block
- XorBuffer(m_workspace, m_register, blockSize);
+ XorBuffer(m_xworkspace, m_xregister, blockSize);
// encrypt one block
- GetBlockCipher().ProcessBlock(m_workspace);
+ GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the previous output block
- XorBuffer(outString-blockSize, m_workspace, m_register, blockSize);
+ XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
return outLength;
}
@@ -375,8 +362,8 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
CRYPTOPP_ASSERT(outLength >= inLength);
const unsigned int blockSize = GetBlockCipher().BlockSize();
- const unsigned int blocks = inLength / blockSize;
- const unsigned int tail = inLength % blockSize;
+ const size_t blocks = inLength / blockSize;
+ const size_t tail = inLength % blockSize;
outLength = inLength;
if (tail == 0)
@@ -392,12 +379,12 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
ProcessData(outString, inString, inLength-head);
outString += head;
- inString += head; inLength -= head;
+ inString += head; inLength -= head;
}
- SecByteBlock poly1(m_register);
- SecByteBlock poly2(m_register);
- GF_Double(poly2, poly2.size());
+ #define poly1 (m_xregister+0*blockSize)
+ #define poly2 (m_xregister+1*blockSize)
+ GF_Double(poly2, poly1, blockSize);
///// handle final partial block /////
@@ -406,20 +393,20 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
const size_t len = inLength-blockSize;
// merge the tweak into the input block
- XorBuffer(m_workspace, inString-blockSize, poly2, blockSize);
+ XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
// encrypt one block
- GetBlockCipher().ProcessBlock(m_workspace);
+ GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block
- XorBuffer(m_workspace, poly2, blockSize);
+ XorBuffer(m_xworkspace, poly2, blockSize);
// copy in the final plaintext bytes
std::memcpy(outString-blockSize, inString, len);
// and copy out the final ciphertext bytes
- std::memcpy(outString, m_workspace, len);
+ std::memcpy(outString, m_xworkspace, len);
// "steal" ciphertext to complete the block
- std::memcpy(outString-blockSize+len, m_workspace+len, blockSize-len);
+ std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
///// handle the full previous block /////
@@ -427,13 +414,13 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
outString -= blockSize;
// merge the tweak into the input block
- XorBuffer(m_workspace, outString, poly1, blockSize);
+ XorBuffer(m_xworkspace, outString, poly1, blockSize);
// encrypt one block
- GetBlockCipher().ProcessBlock(m_workspace);
+ GetBlockCipher().ProcessBlock(m_xworkspace);
// merge the tweak into the output block
- XorBuffer(outString, m_workspace, poly1, blockSize);
+ XorBuffer(outString, m_xworkspace, poly1, blockSize);
return outLength;
}