Make XTS mode parallelizable (GH #891)

On CoffeeLake performance increased from 3.4 cpb to 1.75 cpb. On Core2Duo performance increased from 27 cpb to 19 cpb.
author: Jeffrey Walton <noloader@gmail.com> 2019-10-13 16:17:37 -0400
committer: Jeffrey Walton <noloader@gmail.com> 2019-10-13 16:17:37 -0400
commit: 8e8e95cea204319d99380cbedd0159746876355a (patch)
tree: 1bf4e060c569cf89352179b81af4284836e7616d /xts.cpp
parent: c9b8452d570ce0d5f883c249b942518ef509fc18 (diff)
download: cryptopp-git-8e8e95cea204319d99380cbedd0159746876355a.tar.gz
1 files changed, 82 insertions, 95 deletions
diff --git a/xts.cpp b/xts.cpp
index 29f2722e..e2dec0cf 100644
--- a/xts.cpp
+++ b/xts.cpp
@@ -1,18 +1,11 @@
 // xts.cpp - written and placed in the public domain by Jeffrey Walton
-//
-// The best performance is achieved on machines with AES hardware acceleration.
-// However, 64-bit machines without hardware acceleration profit the most with
-// separate calls to ProcessBlock followed by XorBuffer rather than a single
-// call to AdvancedProcessBlocks. That's because we did not parallelize, and
-// XorBuffer uses SSE2 and ASIMD when available. Parallelizing slowed things
-// down due to copying m_register for GF_Double. XorBuffer profits without
-// AESNI and friends since XorBuffer only uses load, store and xor.
 
 #include "pch.h"
 
 #include "xts.h"
 #include "misc.h"
 #include "modes.h"
+#include "cpu.h"
 
 #if defined(CRYPTOPP_DEBUG)
 # include "aes.h"
@@ -47,88 +40,50 @@ using namespace CryptoPP;
 // The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
 // 16-byte block sizes. count must be a multiple of 16 since SIMD words
 // are used.
-inline void XorBuffer(byte *buf, const byte *mask, size_t count)
-{
-    CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
-    CRYPTOPP_UNUSED(count);
-
-#if defined(__SSE2__) || defined(_M_X64)
-    #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
-    for (size_t i=0; i<count; i+=16)
-        _mm_storeu_si128(M128_CAST(buf+i), _mm_xor_si128(
-            _mm_loadu_si128(CONST_M128_CAST(mask+i)), _mm_loadu_si128(CONST_M128_CAST(buf+i))));
-    #else
-        _mm_storeu_si128(M128_CAST(buf), _mm_xor_si128(
-            _mm_loadu_si128(CONST_M128_CAST(mask)), _mm_loadu_si128(CONST_M128_CAST(buf))));
-    #endif
-
-#elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
-    #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
-    for (size_t i=0; i<count; i+=16)
-        vst1q_u8(buf+i, veorq_u8(vld1q_u8(mask+i), vld1q_u8(buf+i)));
-    #else
-        vst1q_u8(buf, veorq_u8(vld1q_u8(mask), vld1q_u8(buf)));
-    #endif
-
-#else
-    xorbuf(buf, mask, count);
-#endif
-}
-
-// Aarch32, Aarch64, Altivec and X86_64 include SIMD as part of the
-// base architecture. We can use the SIMD code below without an
-// architecture option. No runtime tests are required. Unfortunately,
-// we can't use it on Altivec because an architecture switch is required.
-// The updated XorBuffer gains 0.3 to 1.5 cpb on the architectures for
-// 16-byte block sizes. count must be a multiple of 16 since SIMD words
-// are used.
 inline void XorBuffer(byte *output, const byte *input, const byte *mask, size_t count)
 {
     CRYPTOPP_ASSERT(count >= 16 && (count % 16 == 0));
-    CRYPTOPP_UNUSED(count);
 
 #if defined(__SSE2__) || defined(_M_X64)
-    #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
     for (size_t i=0; i<count; i+=16)
-        _mm_storeu_si128(M128_CAST(output+i), _mm_xor_si128(
-            _mm_loadu_si128(CONST_M128_CAST(input+i)), _mm_loadu_si128(CONST_M128_CAST(mask+i))));
-    #else
-        _mm_storeu_si128(M128_CAST(output), _mm_xor_si128(
-            _mm_loadu_si128(CONST_M128_CAST(input)), _mm_loadu_si128(CONST_M128_CAST(mask))));
-    #endif
+        _mm_storeu_si128(M128_CAST(output+i),
+            _mm_xor_si128(
+                _mm_loadu_si128(CONST_M128_CAST(input+i)),
+                _mm_loadu_si128(CONST_M128_CAST(mask+i))));
 
 #elif defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
-    #if (CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS)
     for (size_t i=0; i<count; i+=16)
         vst1q_u8(output+i, veorq_u8(vld1q_u8(input+i), vld1q_u8(mask+i)));
-    #else
-        vst1q_u8(output, veorq_u8(vld1q_u8(input), vld1q_u8(mask)));
-    #endif
 
 #else
     xorbuf(output, input, mask, count);
 #endif
 }
 
+inline void XorBuffer(byte *buf, const byte *mask, size_t count)
+{
+    XorBuffer(buf, buf, mask, count);
+}
+
 // Borrowed from CMAC, but little-endian representation
-inline void GF_Double(byte *k, unsigned int len)
+inline void GF_Double(byte *out, const byte* in, unsigned int len)
 {
 #if defined(_M_X64) || defined(_M_ARM64) || defined(_LP64) || defined(__LP64__)
     word64 carry = 0, x;
     for (size_t i=0, idx=0; i<len/8; ++i, idx+=8)
     {
-        x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx);
+        x = GetWord<word64>(false, LITTLE_ENDIAN_ORDER, in+idx);
         word64 y = (x >> 63); x = (x << 1) + carry;
-        PutWord<word64>(false, LITTLE_ENDIAN_ORDER, k+idx, x);
+        PutWord<word64>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
         carry = y;
     }
 #else
     word32 carry = 0, x;
     for (size_t i=0, idx=0; i<len/4; ++i, idx+=4)
     {
-        x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx);
+        x = GetWord<word32>(false, LITTLE_ENDIAN_ORDER, in+idx);
         word32 y = (x >> 31); x = (x << 1) + carry;
-        PutWord<word32>(false, LITTLE_ENDIAN_ORDER, k+idx, x);
+        PutWord<word32>(false, LITTLE_ENDIAN_ORDER, out+idx, x);
         carry = y;
     }
 #endif
@@ -139,6 +94,7 @@ inline void GF_Double(byte *k, unsigned int len)
     CRYPTOPP_ASSERT(len >= 16);
     CRYPTOPP_ASSERT(len <= 128);
 
+    byte* k = out;
     if (carry)
     {
         switch (len)
@@ -184,6 +140,7 @@ inline void GF_Double(byte *k, unsigned int len)
 #else
     CRYPTOPP_ASSERT(len == 16);
 
+    byte* k = out;
     if (carry)
     {
         k[0] ^= 0x87;
@@ -192,6 +149,11 @@ inline void GF_Double(byte *k, unsigned int len)
 #endif  // CRYPTOPP_XTS_WIDE_BLOCK_CIPHERS
 }
 
+inline void GF_Double(byte *inout, unsigned int len)
+{
+    GF_Double(inout, inout, len);
+}
+
 #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
 
 using CryptoPP::AES;
@@ -247,7 +209,8 @@ void XTS_ModeBase::SetKey(const byte *key, size_t length, const NameValuePairs &
 void XTS_ModeBase::Resynchronize(const byte *iv, int ivLength)
 {
     BlockOrientedCipherModeBase::Resynchronize(iv, ivLength);
-    GetTweakCipher().ProcessBlock(m_register);
+    std::memcpy(m_xregister, m_register, ivLength);
+    GetTweakCipher().ProcessBlock(m_xregister);
 }
 
 void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
@@ -257,37 +220,61 @@ void XTS_ModeBase::Resynchronize(word64 sector, ByteOrder order)
     std::memset(iv+8, 0x00, iv.size()-8);
 
     BlockOrientedCipherModeBase::Resynchronize(iv, iv.size());
-    GetTweakCipher().ProcessBlock(m_register);
+    std::memcpy(m_xregister, iv, iv.size());
+    GetTweakCipher().ProcessBlock(m_xregister);
 }
 
 void XTS_ModeBase::ResizeBuffers()
 {
     BlockOrientedCipherModeBase::ResizeBuffers();
-    m_workspace.New(GetBlockCipher().BlockSize());
+    m_xworkspace.New(GetBlockCipher().BlockSize()*ParallelBlocks);
+    m_xregister.New(GetBlockCipher().BlockSize()*ParallelBlocks);
 }
 
 void XTS_ModeBase::ProcessData(byte *outString, const byte *inString, size_t length)
 {
     const unsigned int blockSize = GetBlockCipher().BlockSize();
+    const size_t parallelSize = blockSize*ParallelBlocks;
+    size_t i = 0;
 
     // data unit is multiple of 16 bytes
     CRYPTOPP_ASSERT(length % blockSize == 0);
 
-    // now encrypt the data unit, AES_BLK_BYTES at a time
-    for (size_t i=0; i<length; i+=blockSize)
+    // encrypt the data unit, optimal size at a time
+    for ( ; i+parallelSize<=length; i+=parallelSize)
     {
+        // m_xregister[0] always points to the next tweak.
+        GF_Double(m_xregister+1*blockSize, m_xregister+0*blockSize, blockSize);
+        GF_Double(m_xregister+2*blockSize, m_xregister+1*blockSize, blockSize);
+        GF_Double(m_xregister+3*blockSize, m_xregister+2*blockSize, blockSize);
+
         // merge the tweak into the input block
-        XorBuffer(m_workspace, inString+i, m_register, blockSize);
+        XorBuffer(m_xworkspace, inString+i, m_xregister, parallelSize);
+
+        // encrypt one block, merge the tweak into the output block
+        GetBlockCipher().AdvancedProcessBlocks(m_xworkspace, m_xregister, outString+i, parallelSize, BlockTransformation::BT_AllowParallel);
+
+        // Multiply T by alpha. m_xregister[0] always points to the next tweak.
+        GF_Double(m_xregister+0, m_xregister+3*blockSize, blockSize);
+    }
+
+    // encrypt the data unit, blocksize at a time
+    for ( ; i<length; i+=blockSize)
+    {
+        // merge the tweak into the input block
+        XorBuffer(m_xworkspace, inString+i, m_xregister, blockSize);
 
         // encrypt one block
-        GetBlockCipher().ProcessBlock(m_workspace);
+        GetBlockCipher().ProcessBlock(m_xworkspace);
 
         // merge the tweak into the output block
-        XorBuffer(outString+i, m_workspace, m_register, blockSize);
+        XorBuffer(outString+i, m_xworkspace, m_xregister, blockSize);
 
         // Multiply T by alpha
-        GF_Double(m_register, m_register.size());
+        GF_Double(m_xregister, blockSize);
     }
+
+    CRYPTOPP_ASSERT(i == length);
 }
 
 size_t XTS_ModeBase::ProcessLastBlock(byte *outString, size_t outLength, const byte *inString, size_t inLength)
@@ -310,8 +297,8 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
     CRYPTOPP_ASSERT(outLength >= inLength);
 
     const unsigned int blockSize = GetBlockCipher().BlockSize();
-    const unsigned int blocks = inLength / blockSize;
-    const unsigned int tail = inLength % blockSize;
+    const size_t blocks = inLength / blockSize;
+    const size_t tail = inLength % blockSize;
     outLength = inLength;
 
     if (tail == 0)
@@ -327,22 +314,22 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
         ProcessData(outString, inString, inLength-head);
 
         outString += head;
-        inString  += head; inLength  -= head;
+        inString  += head; inLength -= head;
     }
 
     ///// handle the full block /////
 
     // merge the tweak into the input block
-    XorBuffer(m_workspace, inString, m_register, blockSize);
+    XorBuffer(m_xworkspace, inString, m_xregister, blockSize);
 
     // encrypt one block
-    GetBlockCipher().ProcessBlock(m_workspace);
+    GetBlockCipher().ProcessBlock(m_xworkspace);
 
     // merge the tweak into the output block
-    XorBuffer(outString, m_workspace, m_register, blockSize);
+    XorBuffer(outString, m_xworkspace, m_xregister, blockSize);
 
     // Multiply T by alpha
-    GF_Double(m_register, m_register.size());
+    GF_Double(m_xregister, blockSize);
 
     ///// handle final partial block /////
 
@@ -351,20 +338,20 @@ size_t XTS_ModeBase::ProcessLastPlainBlock(byte *outString, size_t outLength, co
     const size_t len = inLength-blockSize;
 
     // copy in the final plaintext bytes
-    std::memcpy(m_workspace, inString, len);
+    std::memcpy(m_xworkspace, inString, len);
     // and copy out the final ciphertext bytes
     std::memcpy(outString, outString-blockSize, len);
     // "steal" ciphertext to complete the block
-    std::memcpy(m_workspace+len, outString-blockSize+len, blockSize-len);
+    std::memcpy(m_xworkspace+len, outString-blockSize+len, blockSize-len);
 
     // merge the tweak into the input block
-    XorBuffer(m_workspace, m_register, blockSize);
+    XorBuffer(m_xworkspace, m_xregister, blockSize);
 
     // encrypt one block
-    GetBlockCipher().ProcessBlock(m_workspace);
+    GetBlockCipher().ProcessBlock(m_xworkspace);
 
     // merge the tweak into the previous output block
-    XorBuffer(outString-blockSize, m_workspace, m_register, blockSize);
+    XorBuffer(outString-blockSize, m_xworkspace, m_xregister, blockSize);
 
     return outLength;
 }
@@ -375,8 +362,8 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
     CRYPTOPP_ASSERT(outLength >= inLength);
 
     const unsigned int blockSize = GetBlockCipher().BlockSize();
-    const unsigned int blocks = inLength / blockSize;
-    const unsigned int tail = inLength % blockSize;
+    const size_t blocks = inLength / blockSize;
+    const size_t tail = inLength % blockSize;
     outLength = inLength;
 
     if (tail == 0)
@@ -392,12 +379,12 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
         ProcessData(outString, inString, inLength-head);
 
         outString += head;
-        inString  += head; inLength  -= head;
+        inString  += head; inLength -= head;
     }
 
-    SecByteBlock poly1(m_register);
-    SecByteBlock poly2(m_register);
-    GF_Double(poly2, poly2.size());
+    #define poly1 (m_xregister+0*blockSize)
+    #define poly2 (m_xregister+1*blockSize)
+    GF_Double(poly2, poly1, blockSize);
 
     ///// handle final partial block /////
 
@@ -406,20 +393,20 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
     const size_t len = inLength-blockSize;
 
     // merge the tweak into the input block
-    XorBuffer(m_workspace, inString-blockSize, poly2, blockSize);
+    XorBuffer(m_xworkspace, inString-blockSize, poly2, blockSize);
 
     // encrypt one block
-    GetBlockCipher().ProcessBlock(m_workspace);
+    GetBlockCipher().ProcessBlock(m_xworkspace);
 
     // merge the tweak into the output block
-    XorBuffer(m_workspace, poly2, blockSize);
+    XorBuffer(m_xworkspace, poly2, blockSize);
 
     // copy in the final plaintext bytes
     std::memcpy(outString-blockSize, inString, len);
     // and copy out the final ciphertext bytes
-    std::memcpy(outString, m_workspace, len);
+    std::memcpy(outString, m_xworkspace, len);
     // "steal" ciphertext to complete the block
-    std::memcpy(outString-blockSize+len, m_workspace+len, blockSize-len);
+    std::memcpy(outString-blockSize+len, m_xworkspace+len, blockSize-len);
 
     ///// handle the full previous block /////
 
@@ -427,13 +414,13 @@ size_t XTS_ModeBase::ProcessLastCipherBlock(byte *outString, size_t outLength, c
     outString -= blockSize;
 
     // merge the tweak into the input block
-    XorBuffer(m_workspace, outString, poly1, blockSize);
+    XorBuffer(m_xworkspace, outString, poly1, blockSize);
 
     // encrypt one block
-    GetBlockCipher().ProcessBlock(m_workspace);
+    GetBlockCipher().ProcessBlock(m_xworkspace);
 
     // merge the tweak into the output block
-    XorBuffer(outString, m_workspace, poly1, blockSize);
+    XorBuffer(outString, m_xworkspace, poly1, blockSize);
 
     return outLength;
 }
author	Jeffrey Walton <noloader@gmail.com>	2019-10-13 16:17:37 -0400
committer	Jeffrey Walton <noloader@gmail.com>	2019-10-13 16:17:37 -0400
commit	8e8e95cea204319d99380cbedd0159746876355a (patch)
tree	1bf4e060c569cf89352179b81af4284836e7616d /xts.cpp
parent	c9b8452d570ce0d5f883c249b942518ef509fc18 (diff)
download	cryptopp-git-8e8e95cea204319d99380cbedd0159746876355a.tar.gz