Speed up decompression by caching ip_.

It is seemingly hard for the compiler to understand that ip_, the current input pointer into the compressed data stream, cannot alias anything else, and thus using it directly will incur memory traffic as it cannot be kept in a register. The code already knew about this and cached it into a local variable, but since Step() only decoded one tag, it had to move ip_ back into place between every tag. This seems to have cost us a significant amount of performance, so change Step() into a function that decodes as much as it can before it saves ip_ back and returns. (Note that Step() was already inlined, so it is not the manual inlining that buys the performance here.) The wins are about 3-6% on Core 2, 6-13% on Core i7 and 5-12% on Opteron (for plain array-to-array decompression, in 64-bit opt mode). There is a tiny difference in the behavior here: if an invalid literal is encountered (i.e., the writer refuses the Append() operation), ip_ will now point to the byte past the tag byte, instead of where the literal was originally thought to end. However, we don't use ip_ for anything after DecompressAllTags() has returned, so this should not change external behavior in any way.
Microbenchmark results for Core i7, 64-bit (Opteron results are similar):

Benchmark      Time(ns)  CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0        79134    79110       8835 1.2GB/s   html    [ +6.2%]
BM_UFlat/1       786126   786096        891 851.8MB/s urls    [+10.0%]
BM_UFlat/2         9948     9948      69125 11.9GB/s  jpg     [ -1.3%]
BM_UFlat/3        31999    31998      21898 2.7GB/s   pdf     [ +6.5%]
BM_UFlat/4       318909   318829       2204 1.2GB/s   html4   [ +6.5%]
BM_UFlat/5        31384    31390      22363 747.5MB/s cp      [ +9.2%]
BM_UFlat/6        14037    14034      49858 757.7MB/s c       [+10.6%]
BM_UFlat/7         4612     4612     151395 769.5MB/s lsp     [ +9.5%]
BM_UFlat/8      1203174  1203007        582 816.3MB/s xls     [+19.3%]
BM_UFlat/9       253869   253955       2757 571.1MB/s txt1    [+11.4%]
BM_UFlat/10      219292   219290       3194 544.4MB/s txt2    [+12.1%]
BM_UFlat/11      672135   672131       1000 605.5MB/s txt3    [+11.2%]
BM_UFlat/12      902512   902492        776 509.2MB/s txt4    [+12.5%]
BM_UFlat/13      372110   371998       1881 1.3GB/s   bin     [ +5.8%]
BM_UFlat/14       50407    50407      10000 723.5MB/s sum     [+13.5%]
BM_UFlat/15        5699     5701     100000 707.2MB/s man     [+12.4%]
BM_UFlat/16       83448    83424       8383 1.3GB/s   pb      [ +5.7%]
BM_UFlat/17      256958   256963       2723 684.1MB/s gaviota [ +7.9%]
BM_UValidate/0    42795    42796      16351 2.2GB/s   html    [+25.8%]
BM_UValidate/1   490672   490622       1427 1.3GB/s   urls    [+22.7%]
BM_UValidate/2      237      237    2950297 499.0GB/s jpg     [+24.9%]
BM_UValidate/3    14610    14611      47901 6.0GB/s   pdf     [+26.8%]
BM_UValidate/4   171973   171990       4071 2.2GB/s   html4   [+25.7%]

git-svn-id: http://snappy.googlecode.com/svn/trunk@38 03e5f5b5-db94-4691-08a0-1a8bf15f6143
author: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-06-02 17:59:40 +0000
committer: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-06-02 17:59:40 +0000
commit: f56157f8b866453cf9096aacbffc629550482cd6 (patch)
tree: 82a418618cd831e21badd8f2269c536c77209881
parent: 5258607248a000affb2f72b87b9bbd044806677d (diff)
download: snappy-f56157f8b866453cf9096aacbffc629550482cd6.tar.gz
1 files changed, 45 insertions, 36 deletions
diff --git a/snappy.cc b/snappy.cc
index 625029c..b3045a3 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -653,45 +653,54 @@ class SnappyDecompressor {
   // Process the next item found in the input.
   // Returns true if successful, false on error or end of input.
   template <class Writer>
-  bool Step(Writer* writer) {
+  void DecompressAllTags(Writer* writer) {
     const char* ip = ip_;
-    if (ip_limit_ - ip < 5) {
-      if (!RefillTag()) return false;
-      ip = ip_;
-    }
-
-    const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
-    const uint32 entry = char_table[c];
-    const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
-    ip += entry >> 11;
-    const uint32 length = entry & 0xff;
+    for ( ;; ) {
+      if (ip_limit_ - ip < 5) {
+        ip_ = ip;
+        if (!RefillTag()) return;
+        ip = ip_;
+      }
 
-    if ((c & 0x3) == LITERAL) {
-      uint32 literal_length = length + trailer;
-      uint32 avail = ip_limit_ - ip;
-      while (avail < literal_length) {
+      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
+      const uint32 entry = char_table[c];
+      const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+      ip += entry >> 11;
+      const uint32 length = entry & 0xff;
+
+      if ((c & 0x3) == LITERAL) {
+        uint32 literal_length = length + trailer;
+        uint32 avail = ip_limit_ - ip;
+        while (avail < literal_length) {
+          bool allow_fast_path = (avail >= 16);
+          if (!writer->Append(ip, avail, allow_fast_path)) goto end;
+          literal_length -= avail;
+          reader_->Skip(peeked_);
+          size_t n;
+          ip = reader_->Peek(&n);
+          avail = n;
+          peeked_ = avail;
+          if (avail == 0) goto end;  // Premature end of input
+          ip_limit_ = ip + avail;
+        }
         bool allow_fast_path = (avail >= 16);
-        if (!writer->Append(ip, avail, allow_fast_path)) return false;
-        literal_length -= avail;
-        reader_->Skip(peeked_);
-        size_t n;
-        ip = reader_->Peek(&n);
-        avail = n;
-        peeked_ = avail;
-        if (avail == 0) return false;  // Premature end of input
-        ip_limit_ = ip + avail;
+        if (!writer->Append(ip, literal_length, allow_fast_path)) {
+          goto end;
+        }
+        ip += literal_length;
+      } else {
+        // copy_offset/256 is encoded in bits 8..10.  By just fetching
+        // those bits, we get copy_offset (since the bit-field starts at
+        // bit 8).
+        const uint32 copy_offset = entry & 0x700;
+        if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
+          goto end;
+        }
       }
-      ip_ = ip + literal_length;
-      bool allow_fast_path = (avail >= 16);
-      return writer->Append(ip, literal_length, allow_fast_path);
-    } else {
-      ip_ = ip;
-      // copy_offset/256 is encoded in bits 8..10.  By just fetching
-      // those bits, we get copy_offset (since the bit-field starts at
-      // bit 8).
-      const uint32 copy_offset = entry & 0x700;
-      return writer->AppendFromSelf(copy_offset + trailer, length);
     }
+
+end:
+    ip_ = ip;
   }
 };
 
@@ -770,7 +779,7 @@ static bool InternalUncompress(Source* r,
   writer->SetExpectedLength(uncompressed_len);
 
   // Process the entire input
-  while (decompressor.Step(writer)) { }
+  decompressor.DecompressAllTags(writer);
   return (decompressor.eof() && writer->CheckLength());
 }
 
@@ -866,7 +875,7 @@ size_t Compress(Source* reader, Sink* writer) {
 
 // A type that writes to a flat array.
 // Note that this is not a "ByteSink", but a type that matches the
-// Writer template argument to SnappyDecompressor::Step().
+// Writer template argument to SnappyDecompressor::DecompressAllTags().
 class SnappyArrayWriter {
  private:
   char* base_;
author	snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>	2011-06-02 17:59:40 +0000
committer	snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>	2011-06-02 17:59:40 +0000
commit	f56157f8b866453cf9096aacbffc629550482cd6 (patch)
tree	82a418618cd831e21badd8f2269c536c77209881
parent	5258607248a000affb2f72b87b9bbd044806677d (diff)
download	snappy-f56157f8b866453cf9096aacbffc629550482cd6.tar.gz