author     Snappy Team <no-reply@google.com>   2020-12-10 00:30:54 +0000
committer  Victor Costan <costan@google.com>   2020-12-14 02:48:03 +0000
commit     3b571656fa4d73a5f6d21629a041b867045d5d49 (patch)
tree       860f1a968d4d776e62a8df2378e5f90c31f91c48
parent     a9730ed50526b714eff8731066a70006cf816420 (diff)
download   snappy-git-3b571656fa4d73a5f6d21629a041b867045d5d49.tar.gz
1) Improve the lookup table data to require fewer instructions to extract the necessary values. We now store len - offset as a signed int16; this removes the need to mask off the offset in the calculations, and the subtraction itself produces exactly the flags we need for the correctness check.
2) Replace offset extraction with a lookup mask. This uses fewer uops and is needed because type 3 tags must always yield 0 in order to properly trigger the fallback.
3) Unroll the loop twice. This removes some loop-condition checks and improves the generated assembly: the loop variables tend to end up in different registers, which would require mov's, and having two consecutive copies allows those mov's to be elided.
PiperOrigin-RevId: 346663328
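
For context, here is a minimal standalone sketch of the idea behind 1) (hypothetical names, not the library's actual code): store len minus the tag-encoded high offset bits as a signed 16-bit entry, so that one subtraction of the trailing offset byte both recovers len - offset and sets the flags needed for the offset >= len check.

    // Build: g++ -std=c++17 table_entry_sketch.cc && ./a.out
    #include <cstdint>
    #include <cstdio>

    // Hypothetical, simplified entry for a copy-1 tag: the length minus the
    // tag-encoded high offset bits, stored as a signed 16-bit value.
    constexpr int16_t MakeEntry(int16_t len, int16_t offset_high) {
      return len - (offset_high << 8);
    }

    int main() {
      // Example copy-1 tag: type=1 (low 2 bits), len-4=4 (bits 2..4),
      // offset high bits=1 (bits 5..7); the next stream byte holds the low
      // offset byte 0x02, so offset = (1 << 8) + 0x02 = 258 and len = 8.
      uint8_t tag = 0b001'100'01;
      uint8_t offset_low = 0x02;

      int16_t entry = MakeEntry(((tag >> 2) & 7) + 4, tag >> 5);
      int len = entry & 0xFF;                    // low byte still holds len
      int len_minus_offset = entry - offset_low; // len - offset in one sub
      std::printf("len=%d len-offset=%d\n", len, len_minus_offset);

      // The subtraction that recovers len - offset also sets the sign flag
      // for the offset >= len check, so the cmp folds into the sub.
      if (len_minus_offset > 0)
        std::printf("offset < len: take the pattern-extension fallback\n");
      else
        std::printf("offset >= len: a plain 64-byte copy is safe\n");
    }

The same quantity also drives the bounds check in the diff below: delta = op + (len - offset) - len equals op - offset, so delta < 0 detects a copy reaching before the buffer start without ever materializing the offset.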
-rw-r--r--  snappy.cc           310
-rw-r--r--  snappy_unittest.cc    4
2 files changed, 188 insertions, 126 deletions
diff --git a/snappy.cc b/snappy.cc
index 7578901..8449b6e 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -70,6 +70,7 @@
#include <algorithm>
#include <array>
+#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
@@ -79,6 +80,8 @@
namespace snappy {
+namespace {
+
// The amount of slop bytes writers are using for unconditional copies.
constexpr int kSlopBytes = 64;
@@ -90,25 +93,30 @@ using internal::kMaximumTagLength;
using internal::LITERAL;
// We translate the information encoded in a tag through a lookup table to a
-// format that requires fewer instructions to decode.
-// The returned format encodes the offset in the high byte and the length
-// in the low byte. Because length will never be 0, we use zero as an indicator
-// for an exceptional value (copy 3 tag or a literal > 60 bytes).
-constexpr size_t kLiteralOffset = 256;
-inline constexpr uint16_t MakeEntry(uint16_t len, uint16_t offset) {
- return len | (offset << 8);
+// format that requires fewer instructions to decode. Effectively we store
+// the length minus the tag part of the offset. The least significant byte
+// thus stores the length, while the total length - offset is given by
+// entry - ExtractOffset(type). The nice thing is that the subtraction
+// immediately sets the flags for the necessary check that offset >= length,
+// which folds the cmp with the sub. We engineer the long literals and copy-4
+// to always fail this check, so their presence doesn't affect the fast path.
+// To prevent literals from triggering the guard against offset < length
+// (offset does not apply to literals) the table gives them a spurious offset
+// of 256.
+inline constexpr int16_t MakeEntry(int16_t len, int16_t offset) {
+ return len - (offset << 8);
}
-inline constexpr uint16_t OffsetAndLength(uint8_t tag, int type) {
- return type == 3 ? 0 // copy-4 (or type == 3)
- : type == 2 ? MakeEntry((tag >> 2) + 1, 0) // copy-2
- : type == 1 ? MakeEntry(((tag >> 2) & 7) + 4, tag >> 5) // copy-1
- : tag < 60 * 4 ? MakeEntry((tag >> 2) + 1, 1)
- : 0; // long literal
+inline constexpr int16_t LengthMinusOffset(int data, int type) {
+ return type == 3 ? 0xFF // copy-4 (or type == 3)
+ : type == 2 ? MakeEntry(data + 1, 0) // copy-2
+ : type == 1 ? MakeEntry((data & 7) + 4, data >> 3) // copy-1
+ : data < 60 ? MakeEntry(data + 1, 1) // note spurious offset.
+ : 0xFF; // long literal
}
-inline constexpr uint16_t OffsetAndLength(uint8_t tag) {
- return OffsetAndLength(tag, tag & 3);
+inline constexpr int16_t LengthMinusOffset(uint8_t tag) {
+ return LengthMinusOffset(tag >> 2, tag & 3);
}
template <size_t... Ints>
@@ -121,23 +129,29 @@ template <size_t... Is>
struct make_index_sequence<0, Is...> : index_sequence<Is...> {};
template <size_t... seq>
-constexpr std::array<uint16_t, 256> MakeTable(index_sequence<seq...>) {
- return std::array<uint16_t, 256>{OffsetAndLength(seq)...};
+constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) {
+ return std::array<int16_t, 256>{LengthMinusOffset(seq)...};
}
-alignas(64) constexpr std::array<uint16_t, 256> offset_and_length_table =
- MakeTable(make_index_sequence<256>{});
+// We maximally co-locate the two tables so that only one register needs to be
+// reserved for the table address.
+struct {
+ alignas(64) const std::array<int16_t, 256> length_minus_offset;
+ uint32_t extract_masks[4]; // Used for extracting offset based on tag type.
+} table = {MakeTable(make_index_sequence<256>{}), {0, 0xFF, 0xFFFF, 0}};
// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
// input. Of course, it doesn't hurt if the hash function is reasonably fast
// either, as it gets called a lot.
-static inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
+inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
constexpr uint32_t kMagic = 0x1e35a7bd;
return ((kMagic * bytes) >> (32 - kMaxHashTableBits)) & mask;
}
+} // namespace
+
size_t MaxCompressedLength(size_t source_bytes) {
// Compressed data can be defined as:
// compressed := item* literal*
@@ -967,6 +981,72 @@ static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
return (value & masks[shift]) != 0;
}
+inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
+ return offset != 0;
+}
+
+void MemCopy(char* dst, const uint8_t* src, size_t size) {
+ std::memcpy(dst, src, size);
+}
+
+void MemCopy(ptrdiff_t dst, const uint8_t* src, size_t size) {}
+
+void MemMove(char* dst, const void* src, size_t size) {
+ std::memmove(dst, src, size);
+}
+
+void MemMove(ptrdiff_t dst, const void* src, size_t size) {}
+
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+size_t AdvanceToNextTag(const uint8_t** ip_p, size_t* tag) {
+ const uint8_t*& ip = *ip_p;
+ // This section is crucial for the throughput of the decompression loop.
+ // The latency of an iteration is fundamentally constrained by the
+ // following data chain on ip.
+ // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
+ // ip2 = ip + 2 + (c >> 2)
+ // This amounts to 8 cycles.
+ // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
+ size_t literal_len = *tag >> 2;
+ size_t tag_type = *tag;
+ bool is_literal;
+#if defined(__GNUC__) && defined(__x86_64__)
+ // TODO clang misses the fact that the (c & 3) already correctly
+ // sets the zero flag.
+ asm("and $3, %k[tag_type]\n\t"
+ : [tag_type] "+r"(tag_type), "=@ccz"(is_literal));
+#else
+ tag_type &= 3;
+ is_literal = (tag_type == 0);
+#endif
+ // TODO
+ // This code is subtle. Loading the values first and then doing the cmov has
+ // lower latency than cmov'ing ip and then loading. However, clang would move
+ // the loads in an optimization phase; volatile prevents this transformation.
+ // Note that we have enough slop bytes (64) that the loads are always valid.
+ size_t tag_literal =
+ static_cast<const volatile uint8_t*>(ip)[1 + literal_len];
+ size_t tag_copy = static_cast<const volatile uint8_t*>(ip)[tag_type];
+ *tag = is_literal ? tag_literal : tag_copy;
+ const uint8_t* ip_copy = ip + 1 + tag_type;
+ const uint8_t* ip_literal = ip + 2 + literal_len;
+ ip = is_literal ? ip_literal : ip_copy;
+#if defined(__GNUC__) && defined(__x86_64__)
+ // TODO Clang is "optimizing" away the zero-extension (a totally free
+ // operation); this means that after the cmov of tag, it emits another movzb
+ // tag, byte(tag). It really matters as it's on the critical chain. This dummy
+ // asm persuades clang to do the zero-extension at the load (where it's
+ // automatic), removing the expensive movzb.
+ asm("" ::"r"(tag_copy));
+#endif
+ return tag_type;
+}
+
+// Extracts the offset for copy-1 and copy-2; returns 0 for literals or copy-4.
+inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
+ return val & table.extract_masks[tag_type];
+};
+
// Core decompression loop, when there is enough data available.
// Decompresses the input buffer [ip, ip_limit) into the output buffer
// [op, op_limit_min_slop). Returning when either we are too close to the end
@@ -976,113 +1056,92 @@ static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) {
// TODO This function probably does not need to be inlined, as it
// should decode large chunks at a time. This allows runtime dispatch to
// implementations based on CPU capability (BMI2 / perhaps 32 / 64 byte memcpy).
-std::pair<const uint8_t*, char*> DecompressBranchless(
- const uint8_t* ip, const uint8_t* ip_limit, char* op_ptr, char* op_base,
- char* op_limit_min_slop_ptr) {
- std::ptrdiff_t op = op_ptr - op_base;
- std::ptrdiff_t op_limit_min_slop = op_limit_min_slop_ptr - op_base;
- std::ptrdiff_t op_limit = op_limit_min_slop + kSlopBytes - 1;
- if (kSlopBytes < ip_limit - ip && op < op_limit_min_slop) {
- const uint8_t* const ip_limit_min_slop = ip_limit - kSlopBytes + 1;
+template <typename T>
+std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
+ const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base,
+ ptrdiff_t op_limit_min_slop) {
+ // We unroll the inner loop twice so we need twice the spare room.
+ op_limit_min_slop -= kSlopBytes;
+ if (2 * (kSlopBytes + 1) < ip_limit - ip && op < op_limit_min_slop) {
+ const uint8_t* const ip_limit_min_slop = ip_limit - 2 * kSlopBytes - 1;
ip++;
// ip points just past the tag and we are touching at maximum kSlopBytes
// in an iteration.
+ size_t tag = ip[-1];
do {
- const uint8_t* old_ip = ip;
- const uint64_t tag = ip[-1];
- uint32_t offset_and_length = offset_and_length_table[tag];
- if (SNAPPY_PREDICT_FALSE(offset_and_length == 0)) {
- // Exception case (long literal or copy 4).
- if ((tag & 3) == 3) {
- break_loop:
- ip = old_ip;
- break;
+ // The throughput is limited by instructions; unrolling the inner loop
+ // twice reduces the number of instructions checking limits and also
+ // leads to fewer mov's.
+ for (int i = 0; i < 2; i++) {
+ const uint8_t* old_ip = ip;
+ assert(tag == ip[-1]);
+ // For literals tag_type = 0, hence we will always obtain 0 from
+ // ExtractOffset. For literals the offset thus stays at the spurious 256.
+ ptrdiff_t len_min_offset = table.length_minus_offset[tag];
+ size_t tag_type = AdvanceToNextTag(&ip, &tag);
+ uint32_t next = LittleEndian::Load32(old_ip);
+ size_t len = len_min_offset & 0xFF;
+ len_min_offset -= ExtractOffset(next, tag_type);
+ if (SNAPPY_PREDICT_FALSE(len_min_offset > 0)) {
+ if (SNAPPY_PREDICT_FALSE(len & 0x80)) {
+ // Exceptional case (long literal or copy 4).
+ // Actually doing the copy here negatively impacts the main
+ // loop due to the compiler incorrectly allocating a register for
+ // this fallback. Hence we just break.
+ break_loop:
+ ip = old_ip;
+ goto exit;
+ }
+ // Only copy-1 or copy-2 tags can get here.
+ assert(tag_type == 1 || tag_type == 2);
+ std::ptrdiff_t delta = op + len_min_offset - len;
+ // Guard against copies before the buffer start.
+ if (SNAPPY_PREDICT_FALSE(delta < 0 ||
+ !Copy64BytesWithPatternExtension(
+ op_base + op, len - len_min_offset))) {
+ goto break_loop;
+ }
+ op += len;
+ continue;
}
- assert((tag & 3) == 0); // This is a literal
- assert(tag >= 60 * 4); // It's a long literal
- uint32_t next = LittleEndian::Load32(ip);
- uint32_t length_bytes = (tag >> 2) - 59;
- uint32_t literal_len = ExtractLowBytes(next, length_bytes) + 1;
- ip += length_bytes;
- // Note we use >= instead of > because we also need to increment ip
- // because we have to leave it past the tag.
- if (literal_len >= ip_limit - ip) goto break_loop;
- if (literal_len > op_limit - op) goto break_loop;
- memcpy(op_base + op, ip, literal_len);
- ip += literal_len;
- op += literal_len;
- assert(ip < ip_limit); // See above test
- ip++;
- continue;
- }
- size_t tag_type;
- {
- // This section is crucial for the throughput of the decompression loop.
- // The latency of an iteration is fundamentally constrained by the
- // following data chain on ip.
- // ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
- // ip2 = ip + 2 + (c >> 2)
- // This amounts to 8 cycles.
- // 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
- size_t literal_len = tag >> 2;
+ std::ptrdiff_t delta = op + len_min_offset - len;
+ if (SNAPPY_PREDICT_FALSE(delta < 0)) {
#if defined(__GNUC__) && defined(__x86_64__)
- // TODO
- // clang misses the fact that the (c & 3) already correctly sets
- // the zero flag.
- tag_type = tag;
- bool is_literal;
- asm("and $3, %0\n\t" : "+r"(tag_type), "=@ccz"(is_literal));
- bool is_copy = !is_literal;
-#else
- tag_type = tag & 3;
- bool is_copy = (tag_type != 0);
+ // TODO
+ // When validating, both code paths reduce to `op += len`. I.e. this
+ // becomes effectively
+ //
+ // if (delta < 0) if (tag_type != 0) goto break_loop;
+ // op += len;
+ //
+ // The compiler interchanges the predictable and almost always false
+ // first if-statement with the completely unpredictable second
+ // if-statement, putting an unpredictable branch on every iteration.
+ // This empty asm is worth almost 2x, which I think qualifies for an
+ // award for the most load-bearing empty statement.
+ asm("");
#endif
- const uint8_t* ip_copy = ip + 1 + tag_type;
- const uint8_t* ip_literal = ip + 2 + literal_len;
- ip = is_copy ? ip_copy : ip_literal;
- }
- uint32_t next = LittleEndian::Load32(old_ip);
- // For literals tag_type = 0, hence we will always obtain 0 from
- // ExtractLowBytes. For literals offset will thus be kLiteralOffset.
- std::ptrdiff_t offset =
- (offset_and_length & 0x700) + ExtractLowBytes(next, tag_type);
- size_t len = offset_and_length & 0xFF;
- std::ptrdiff_t delta = op - offset;
- if (SNAPPY_PREDICT_FALSE(delta < 0)) {
- if (tag_type != 0) goto break_loop;
- std::memcpy(op_base + op, old_ip, 64);
- op += len;
- continue;
- }
- // By choosing literals to have kLiteralOffset this test will
- // always succeed for literals.
- if (SNAPPY_PREDICT_FALSE(std::size_t(offset) < len)) {
- assert(tag_type != 0);
- // offset 0 is an error.
- if (!Copy64BytesWithPatternExtension(op_base + op, offset)) {
- break;
+
+ // Due to the spurious offset given to literals, this will trigger
+ // at the start of a block when op is still smaller than 256.
+ if (tag_type != 0) goto break_loop;
+ MemCopy(op_base + op, old_ip, 64);
+ op += len;
+ continue;
}
+
+ // For copies we need to copy from op_base + delta; for literals
+ // we need to copy from old_ip (the input) instead of from the output.
+ const void* from =
+ tag_type ? reinterpret_cast<void*>(op_base + delta) : old_ip;
+ MemMove(op_base + op, from, 64);
op += len;
- continue;
}
-
- const uint8_t* from =
- tag_type ? reinterpret_cast<const uint8_t*>(op_base + delta) : old_ip;
- // For literals we need to copy from ip instead of from the stream.
- // We also need to compensate for the offset in that case.
- std::memmove(op_base + op, from, 64);
- op += len;
} while (ip < ip_limit_min_slop && op < op_limit_min_slop);
+ exit:
ip--;
assert(ip <= ip_limit);
}
- return {ip, op_base + op};
-}
-
-template <typename T>
-std::pair<const uint8_t*, T> DecompressBranchless(const uint8_t* ip,
- const uint8_t*, T op, char*,
- char*) {
return {ip, op};
}
@@ -1179,15 +1238,15 @@ class SnappyDecompressor {
MAYBE_REFILL();
for (;;) {
{
- char* op_limit_min_slop;
+ ptrdiff_t op_limit_min_slop;
auto op_base = writer->GetBase(&op_limit_min_slop);
if (op_base) {
auto res =
DecompressBranchless(reinterpret_cast<const uint8_t*>(ip),
reinterpret_cast<const uint8_t*>(ip_limit_),
- op, op_base, op_limit_min_slop);
+ op - op_base, op_base, op_limit_min_slop);
ip = reinterpret_cast<const char*>(res.first);
- op = res.second;
+ op = op_base + res.second;
MAYBE_REFILL();
}
}
@@ -1250,7 +1309,7 @@ class SnappyDecompressor {
if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
} else {
- const uint32_t entry = offset_and_length_table[c];
+ const ptrdiff_t entry = table.length_minus_offset[c];
preload = LittleEndian::Load32(ip);
const uint32_t trailer = ExtractLowBytes(preload, c & 3);
const uint32_t length = entry & 0xff;
@@ -1259,7 +1318,7 @@ class SnappyDecompressor {
// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
// bit 8).
- const uint32_t copy_offset = (entry & 0x700) + trailer;
+ const uint32_t copy_offset = trailer - entry + length;
if (!writer->AppendFromSelf(copy_offset, length, &op)) goto exit;
ip += (c & 3);
@@ -1523,7 +1582,7 @@ class SnappyIOVecWriter {
}
char* GetOutputPtr() { return nullptr; }
- char* GetBase(char**) { return nullptr; }
+ char* GetBase(ptrdiff_t*) { return nullptr; }
void SetOutputPtr(char* op) {
// TODO: Switch to [[maybe_unused]] when we can assume C++17.
(void)op;
@@ -1688,8 +1747,8 @@ class SnappyArrayWriter {
inline bool CheckLength() const { return op_ == op_limit_; }
char* GetOutputPtr() { return op_; }
- char* GetBase(char** op_limit_min_slop) {
- *op_limit_min_slop = op_limit_min_slop_;
+ char* GetBase(ptrdiff_t* op_limit_min_slop) {
+ *op_limit_min_slop = op_limit_min_slop_ - base_;
return base_;
}
void SetOutputPtr(char* op) { op_ = op; }
@@ -1782,7 +1841,10 @@ class SnappyDecompressionValidator {
inline SnappyDecompressionValidator() : expected_(0), produced_(0) {}
inline void SetExpectedLength(size_t len) { expected_ = len; }
size_t GetOutputPtr() { return produced_; }
- char* GetBase(char**) { return nullptr; }
+ size_t GetBase(ptrdiff_t* op_limit_min_slop) {
+ *op_limit_min_slop = std::numeric_limits<ptrdiff_t>::max() - kSlopBytes + 1;
+ return 1;
+ }
void SetOutputPtr(size_t op) { produced_ = op; }
inline bool CheckLength() const { return expected_ == produced_; }
inline bool Append(const char* ip, size_t len, size_t* produced) {
@@ -1887,8 +1949,8 @@ class SnappyScatteredWriter {
op_limit_(NULL),
op_limit_min_slop_(NULL) {}
char* GetOutputPtr() { return op_ptr_; }
- char* GetBase(char** op_limit_min_slop) {
- *op_limit_min_slop = op_limit_min_slop_;
+ char* GetBase(ptrdiff_t* op_limit_min_slop) {
+ *op_limit_min_slop = op_limit_min_slop_ - op_base_;
return op_base_;
}
void SetOutputPtr(char* op) { op_ptr_ = op; }
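
Before the snappy_unittest.cc changes, here is a small standalone sketch of the extract-mask idea from 2) (hypothetical names kExtractMasks and Load32LE; not the library's code): one mask lookup indexed by tag type replaces the per-type byte extraction, and maps copy-4 tags to a zero contribution so they fall through to the slow path.

    // Build: g++ -std=c++17 extract_mask_sketch.cc && ./a.out
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Indexed by tag type (0=literal, 1=copy-1, 2=copy-2, 3=copy-4).
    // Copy-1 keeps one trailing byte, copy-2 keeps two; literals and copy-4
    // keep nothing and therefore contribute 0 to the offset.
    static const uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0};

    // Unaligned little-endian 32-bit load of the bytes following the tag
    // (assumes a little-endian host, as the fast path does).
    static uint32_t Load32LE(const uint8_t* p) {
      uint32_t v;
      std::memcpy(&v, p, sizeof(v));
      return v;
    }

    int main() {
      // A copy-2 tag (type 2): length = (tag >> 2) + 1 = 8, offset in the
      // next two bytes, little endian, here 0x0102 = 258. Two padding bytes
      // keep the 4-byte load in bounds, mirroring the kSlopBytes guarantee.
      const uint8_t stream[] = {(7 << 2) | 2, 0x02, 0x01, 0, 0};
      size_t tag_type = stream[0] & 3;
      uint32_t next = Load32LE(stream + 1);
      // One unconditional mask replaces per-type branching and forces 0 for
      // type 3, which triggers the copy-4 fallback in the real loop.
      uint32_t offset = next & kExtractMasks[tag_type];
      std::printf("tag_type=%zu offset=%u (expected 258)\n", tag_type, offset);
    }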
diff --git a/snappy_unittest.cc b/snappy_unittest.cc
index 0b568b2..7b220d1 100644
--- a/snappy_unittest.cc
+++ b/snappy_unittest.cc
@@ -1368,7 +1368,7 @@ struct SourceFiles {
size_t max_size = 0;
};
-static void BM_UFlatMedley(int iters, int) {
+static void BM_UFlatMedley(int iters) {
static const SourceFiles* const source = new SourceFiles();
std::vector<char> dst(source->max_size);
@@ -1408,7 +1408,7 @@ static void BM_UValidate(int iters, int arg) {
}
BENCHMARK(BM_UValidate)->DenseRange(0, ARRAYSIZE(files) - 1);
-static void BM_UValidateMedley(int iters, int) {
+static void BM_UValidateMedley(int iters) {
static const SourceFiles* const source = new SourceFiles();
size_t processed = 0;