summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorsnappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>2011-12-05 21:27:26 +0000
committersnappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>2011-12-05 21:27:26 +0000
commit95a111b29c7be2240a41672153f1b2e01b9fc7d0 (patch)
tree445d694b5d8e8cfc6c07c116546ac71df385cbbb
parentdc9e55f2949e4518742d029c724f416abae3f5a5 (diff)
downloadsnappy-95a111b29c7be2240a41672153f1b2e01b9fc7d0.tar.gz
Speed up decompression by moving the refill check to the end of the loop.
This seems to work because in most of the branches, the compiler can evaluate “ip_limit_ - ip” in a more efficient way than reloading ip_limit_ from memory (either by already having the entire expression in a register, or reconstructing it from “avail”, or something else). Memory loads, even from L1, are seemingly costly in the big picture at the current decompression speeds. Microbenchmarks (64-bit, opt mode): Westmere (Intel Core i7): Benchmark Time(ns) CPU(ns) Iterations -------------------------------------------- BM_UFlat/0 74492 74491 187894 1.3GB/s html [ +5.9%] BM_UFlat/1 712268 712263 19644 940.0MB/s urls [ +3.8%] BM_UFlat/2 10591 10590 1000000 11.2GB/s jpg [ -6.8%] BM_UFlat/3 29643 29643 469915 3.0GB/s pdf [ +7.9%] BM_UFlat/4 304669 304667 45930 1.3GB/s html4 [ +4.8%] BM_UFlat/5 28508 28507 490077 823.1MB/s cp [ +4.0%] BM_UFlat/6 12415 12415 1000000 856.5MB/s c [ +8.6%] BM_UFlat/7 3415 3415 4084723 1039.0MB/s lsp [+18.0%] BM_UFlat/8 979569 979563 14261 1002.5MB/s xls [ +5.8%] BM_UFlat/9 230150 230148 60934 630.2MB/s txt1 [ +5.2%] BM_UFlat/10 197167 197166 71135 605.5MB/s txt2 [ +4.7%] BM_UFlat/11 607394 607390 23041 670.1MB/s txt3 [ +5.6%] BM_UFlat/12 808502 808496 17316 568.4MB/s txt4 [ +5.0%] BM_UFlat/13 372791 372788 37564 1.3GB/s bin [ +3.3%] BM_UFlat/14 44541 44541 313969 818.8MB/s sum [ +5.7%] BM_UFlat/15 4833 4833 2898697 834.1MB/s man [ +4.8%] BM_UFlat/16 79855 79855 175356 1.4GB/s pb [ +4.8%] BM_UFlat/17 245845 245843 56838 715.0MB/s gaviota [ +5.8%] Clovertown (Intel Core 2): Benchmark Time(ns) CPU(ns) Iterations -------------------------------------------- BM_UFlat/0 107911 107890 100000 905.1MB/s html [ +2.2%] BM_UFlat/1 1011237 1011041 10000 662.3MB/s urls [ +2.5%] BM_UFlat/2 26775 26770 523089 4.4GB/s jpg [ +0.0%] BM_UFlat/3 48103 48095 290618 1.8GB/s pdf [ +3.4%] BM_UFlat/4 437724 437644 31937 892.6MB/s html4 [ +2.1%] BM_UFlat/5 39607 39600 358284 592.5MB/s cp [ +2.4%] BM_UFlat/6 18227 18224 768191 583.5MB/s c [ +2.7%] BM_UFlat/7 5171 5170 2709437 686.4MB/s lsp [ +3.9%] BM_UFlat/8 1560291 1559989 8970 629.5MB/s xls [ +3.6%] BM_UFlat/9 335401 335343 41731 432.5MB/s txt1 [ +3.0%] BM_UFlat/10 287014 286963 48758 416.0MB/s txt2 [ +2.8%] BM_UFlat/11 888522 888356 15752 458.1MB/s txt3 [ +2.9%] BM_UFlat/12 1186600 1186378 10000 387.3MB/s txt4 [ +3.1%] BM_UFlat/13 572295 572188 24468 855.4MB/s bin [ +2.1%] BM_UFlat/14 64060 64049 218401 569.4MB/s sum [ +4.1%] BM_UFlat/15 7264 7263 1916168 555.0MB/s man [ +1.4%] BM_UFlat/16 108853 108836 100000 1039.1MB/s pb [ +1.7%] BM_UFlat/17 364289 364223 38419 482.6MB/s gaviota [ +4.9%] Barcelona (AMD Opteron): Benchmark Time(ns) CPU(ns) Iterations -------------------------------------------- BM_UFlat/0 103900 103871 100000 940.2MB/s html [ +8.3%] BM_UFlat/1 1000435 1000107 10000 669.5MB/s urls [ +6.6%] BM_UFlat/2 24659 24652 567362 4.8GB/s jpg [ +0.1%] BM_UFlat/3 48206 48193 291121 1.8GB/s pdf [ +5.0%] BM_UFlat/4 421980 421850 33174 926.0MB/s html4 [ +7.3%] BM_UFlat/5 40368 40357 346994 581.4MB/s cp [ +8.7%] BM_UFlat/6 19836 19830 708695 536.2MB/s c [ +8.0%] BM_UFlat/7 6100 6098 2292774 581.9MB/s lsp [ +9.0%] BM_UFlat/8 1693093 1692514 8261 580.2MB/s xls [ +8.0%] BM_UFlat/9 365991 365886 38225 396.4MB/s txt1 [ +7.1%] BM_UFlat/10 311330 311238 44950 383.6MB/s txt2 [ +7.6%] BM_UFlat/11 975037 974737 14376 417.5MB/s txt3 [ +6.9%] BM_UFlat/12 1303558 1303175 10000 352.6MB/s txt4 [ +7.3%] BM_UFlat/13 517448 517290 27144 946.2MB/s bin [ +5.5%] BM_UFlat/14 66537 66518 210352 548.3MB/s sum [ +7.5%] BM_UFlat/15 7976 7974 1760383 505.6MB/s man [ +5.6%] BM_UFlat/16 103121 103092 100000 1097.0MB/s pb [ +8.7%] BM_UFlat/17 391431 391314 35733 449.2MB/s gaviota [ +6.5%] R=sanjay git-svn-id: http://snappy.googlecode.com/svn/trunk@54 03e5f5b5-db94-4691-08a0-1a8bf15f6143
-rw-r--r--snappy.cc24
1 files changed, 18 insertions, 6 deletions
diff --git a/snappy.cc b/snappy.cc
index 799a640..b8b41cb 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -669,13 +669,20 @@ class SnappyDecompressor {
template <class Writer>
void DecompressAllTags(Writer* writer) {
const char* ip = ip_;
- for ( ;; ) {
- if (ip_limit_ - ip < 5) {
- ip_ = ip;
- if (!RefillTag()) return;
- ip = ip_;
- }
+ // We could have put this refill fragment only at the beginning of the loop.
+ // However, duplicating it at the end of each branch gives the compiler more
+ // scope to optimize the <ip_limit_ - ip> expression based on the local
+ // context, which overall increases speed.
+ #define MAYBE_REFILL() \
+ if (ip_limit_ - ip < 5) { \
+ ip_ = ip; \
+ if (!RefillTag()) return; \
+ ip = ip_; \
+ }
+
+ MAYBE_REFILL();
+ for ( ;; ) {
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
if ((c & 0x3) == LITERAL) {
@@ -683,6 +690,7 @@ class SnappyDecompressor {
if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
DCHECK_LT(literal_length, 61);
ip += literal_length;
+ MAYBE_REFILL();
continue;
}
if (PREDICT_FALSE(literal_length >= 61)) {
@@ -709,6 +717,7 @@ class SnappyDecompressor {
return;
}
ip += literal_length;
+ MAYBE_REFILL();
} else {
const uint32 entry = char_table[c];
const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
@@ -722,8 +731,11 @@ class SnappyDecompressor {
if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
return;
}
+ MAYBE_REFILL();
}
}
+
+#undef MAYBE_REFILL
}
};