Speed up decompression by moving the refill check to the end of the loop.

This seems to work because in most of the branches, the compiler can evaluate “ip_limit_ - ip” in a more efficient way than reloading ip_limit_ from memory (either by already having the entire expression in a register, or reconstructing it from “avail”, or something else). Memory loads, even from L1, are seemingly costly in the big picture at the current decompression speeds.

Microbenchmarks (64-bit, opt mode):

Westmere (Intel Core i7):

  Benchmark      Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0        74492      74491     187894    1.3GB/s  html     [ +5.9%]
  BM_UFlat/1       712268     712263      19644  940.0MB/s  urls     [ +3.8%]
  BM_UFlat/2        10591      10590    1000000   11.2GB/s  jpg      [ -6.8%]
  BM_UFlat/3        29643      29643     469915    3.0GB/s  pdf      [ +7.9%]
  BM_UFlat/4       304669     304667      45930    1.3GB/s  html4    [ +4.8%]
  BM_UFlat/5        28508      28507     490077  823.1MB/s  cp       [ +4.0%]
  BM_UFlat/6        12415      12415    1000000  856.5MB/s  c        [ +8.6%]
  BM_UFlat/7         3415       3415    4084723 1039.0MB/s  lsp      [+18.0%]
  BM_UFlat/8       979569     979563      14261 1002.5MB/s  xls      [ +5.8%]
  BM_UFlat/9       230150     230148      60934  630.2MB/s  txt1     [ +5.2%]
  BM_UFlat/10      197167     197166      71135  605.5MB/s  txt2     [ +4.7%]
  BM_UFlat/11      607394     607390      23041  670.1MB/s  txt3     [ +5.6%]
  BM_UFlat/12      808502     808496      17316  568.4MB/s  txt4     [ +5.0%]
  BM_UFlat/13      372791     372788      37564    1.3GB/s  bin      [ +3.3%]
  BM_UFlat/14       44541      44541     313969  818.8MB/s  sum      [ +5.7%]
  BM_UFlat/15        4833       4833    2898697  834.1MB/s  man      [ +4.8%]
  BM_UFlat/16       79855      79855     175356    1.4GB/s  pb       [ +4.8%]
  BM_UFlat/17      245845     245843      56838  715.0MB/s  gaviota  [ +5.8%]

Clovertown (Intel Core 2):

  Benchmark      Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0       107911     107890     100000  905.1MB/s  html     [ +2.2%]
  BM_UFlat/1      1011237    1011041      10000  662.3MB/s  urls     [ +2.5%]
  BM_UFlat/2        26775      26770     523089    4.4GB/s  jpg      [ +0.0%]
  BM_UFlat/3        48103      48095     290618    1.8GB/s  pdf      [ +3.4%]
  BM_UFlat/4       437724     437644      31937  892.6MB/s  html4    [ +2.1%]
  BM_UFlat/5        39607      39600     358284  592.5MB/s  cp       [ +2.4%]
  BM_UFlat/6        18227      18224     768191  583.5MB/s  c        [ +2.7%]
  BM_UFlat/7         5171       5170    2709437  686.4MB/s  lsp      [ +3.9%]
  BM_UFlat/8      1560291    1559989       8970  629.5MB/s  xls      [ +3.6%]
  BM_UFlat/9       335401     335343      41731  432.5MB/s  txt1     [ +3.0%]
  BM_UFlat/10      287014     286963      48758  416.0MB/s  txt2     [ +2.8%]
  BM_UFlat/11      888522     888356      15752  458.1MB/s  txt3     [ +2.9%]
  BM_UFlat/12     1186600    1186378      10000  387.3MB/s  txt4     [ +3.1%]
  BM_UFlat/13      572295     572188      24468  855.4MB/s  bin      [ +2.1%]
  BM_UFlat/14       64060      64049     218401  569.4MB/s  sum      [ +4.1%]
  BM_UFlat/15        7264       7263    1916168  555.0MB/s  man      [ +1.4%]
  BM_UFlat/16      108853     108836     100000 1039.1MB/s  pb       [ +1.7%]
  BM_UFlat/17      364289     364223      38419  482.6MB/s  gaviota  [ +4.9%]

Barcelona (AMD Opteron):

  Benchmark      Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0       103900     103871     100000  940.2MB/s  html     [ +8.3%]
  BM_UFlat/1      1000435    1000107      10000  669.5MB/s  urls     [ +6.6%]
  BM_UFlat/2        24659      24652     567362    4.8GB/s  jpg      [ +0.1%]
  BM_UFlat/3        48206      48193     291121    1.8GB/s  pdf      [ +5.0%]
  BM_UFlat/4       421980     421850      33174  926.0MB/s  html4    [ +7.3%]
  BM_UFlat/5        40368      40357     346994  581.4MB/s  cp       [ +8.7%]
  BM_UFlat/6        19836      19830     708695  536.2MB/s  c        [ +8.0%]
  BM_UFlat/7         6100       6098    2292774  581.9MB/s  lsp      [ +9.0%]
  BM_UFlat/8      1693093    1692514       8261  580.2MB/s  xls      [ +8.0%]
  BM_UFlat/9       365991     365886      38225  396.4MB/s  txt1     [ +7.1%]
  BM_UFlat/10      311330     311238      44950  383.6MB/s  txt2     [ +7.6%]
  BM_UFlat/11      975037     974737      14376  417.5MB/s  txt3     [ +6.9%]
  BM_UFlat/12     1303558    1303175      10000  352.6MB/s  txt4     [ +7.3%]
  BM_UFlat/13      517448     517290      27144  946.2MB/s  bin      [ +5.5%]
  BM_UFlat/14       66537      66518     210352  548.3MB/s  sum      [ +7.5%]
  BM_UFlat/15        7976       7974    1760383  505.6MB/s  man      [ +5.6%]
  BM_UFlat/16      103121     103092     100000 1097.0MB/s  pb       [ +8.7%]
  BM_UFlat/17      391431     391314      35733  449.2MB/s  gaviota  [ +6.5%]

R=sanjay

git-svn-id: http://snappy.googlecode.com/svn/trunk@54 03e5f5b5-db94-4691-08a0-1a8bf15f6143
author: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-12-05 21:27:26 +0000
committer: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-12-05 21:27:26 +0000
commit: 95a111b29c7be2240a41672153f1b2e01b9fc7d0 (patch)
tree: 445d694b5d8e8cfc6c07c116546ac71df385cbbb
parent: dc9e55f2949e4518742d029c724f416abae3f5a5 (diff)
download: snappy-95a111b29c7be2240a41672153f1b2e01b9fc7d0.tar.gz
1 files changed, 18 insertions, 6 deletions
diff --git a/snappy.cc b/snappy.cc
index 799a640..b8b41cb 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -669,13 +669,20 @@ class SnappyDecompressor {
   template <class Writer>
   void DecompressAllTags(Writer* writer) {
     const char* ip = ip_;
-    for ( ;; ) {
-      if (ip_limit_ - ip < 5) {
-        ip_ = ip;
-        if (!RefillTag()) return;
-        ip = ip_;
-      }
 
+    // We could have put this refill fragment only at the beginning of the loop.
+    // However, duplicating it at the end of each branch gives the compiler more
+    // scope to optimize the <ip_limit_ - ip> expression based on the local
+    // context, which overall increases speed.
+    #define MAYBE_REFILL() \
+        if (ip_limit_ - ip < 5) { \
+          ip_ = ip; \
+          if (!RefillTag()) return; \
+          ip = ip_; \
+        }
+
+    MAYBE_REFILL();
+    for ( ;; ) {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
 
       if ((c & 0x3) == LITERAL) {
@@ -683,6 +690,7 @@ class SnappyDecompressor {
         if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
           DCHECK_LT(literal_length, 61);
           ip += literal_length;
+          MAYBE_REFILL();
           continue;
         }
         if (PREDICT_FALSE(literal_length >= 61)) {
@@ -709,6 +717,7 @@ class SnappyDecompressor {
           return;
         }
         ip += literal_length;
+        MAYBE_REFILL();
       } else {
         const uint32 entry = char_table[c];
         const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
@@ -722,8 +731,11 @@ class SnappyDecompressor {
         if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
           return;
         }
+        MAYBE_REFILL();
       }
     }
+
+#undef MAYBE_REFILL
   }
 };
author	snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>	2011-12-05 21:27:26 +0000
committer	snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>	2011-12-05 21:27:26 +0000
commit	95a111b29c7be2240a41672153f1b2e01b9fc7d0 (patch)
tree	445d694b5d8e8cfc6c07c116546ac71df385cbbb
parent	dc9e55f2949e4518742d029c724f416abae3f5a5 (diff)
download	snappy-95a111b29c7be2240a41672153f1b2e01b9fc7d0.tar.gz