summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-06-03 20:47:14 +0000
committer: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-06-03 20:47:14 +0000
commit: d737ce7568c794f0a92cf1190706b9b7b79e395d (patch)
tree: fc7161209a92d0fcc2d7a4b0851b847372d1c064
parent: 4e2b06ffbd764ebf3ce892a5c5f1e1255759dd58 (diff)
download: snappy-d737ce7568c794f0a92cf1190706b9b7b79e395d.tar.gz
Speed up decompression by not needing a lookup table for literal items.
Looking up into and decoding the values from char_table has long shown up as a hotspot in the decompressor. While it turns out that it's hard to make a more efficient decoder for the copy ops, the literals are simple enough that we can decode them without needing a table lookup. (This means that 1/4 of the table is now unused, although that in itself doesn't buy us anything.)

The gains are small, but definitely present; some tests win as much as 10%, but 1-4% is more typical. These results are from Core i7, in 64-bit mode; Core 2 and Opteron show similar results. (I've run with more iterations than usual to make sure the smaller gains don't drown entirely in noise.)

Benchmark            Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0              74665      74428     182055   1.3GB/s  html     [ +3.1%]
BM_UFlat/1             714106     711997      19663 940.4MB/s  urls     [ +4.4%]
BM_UFlat/2               9820       9789    1427115  12.1GB/s  jpg      [ -1.2%]
BM_UFlat/3              30461      30380     465116   2.9GB/s  pdf      [ +0.8%]
BM_UFlat/4             301445     300568      46512   1.3GB/s  html4    [ +2.2%]
BM_UFlat/5              29338      29263     479452 801.8MB/s  cp       [ +1.6%]
BM_UFlat/6              13004      12970    1000000 819.9MB/s  c        [ +2.1%]
BM_UFlat/7               4180       4168    3349282 851.4MB/s  lsp      [ +1.3%]
BM_UFlat/8            1026149    1024000      10000 959.0MB/s  xls      [+10.7%]
BM_UFlat/9             237441     236830      59072 612.4MB/s  txt1     [ +0.3%]
BM_UFlat/10            203966     203298      69307 587.2MB/s  txt2     [ +0.8%]
BM_UFlat/11            627230     625000      22400 651.2MB/s  txt3     [ +0.7%]
BM_UFlat/12            836188     833979      16787 551.0MB/s  txt4     [ +1.3%]
BM_UFlat/13            351904     350750      39886   1.4GB/s  bin      [ +3.8%]
BM_UFlat/14             45685      45562     308370 800.4MB/s  sum      [ +5.9%]
BM_UFlat/15              5286       5270    2656546 764.9MB/s  man      [ +1.5%]
BM_UFlat/16             78774      78544     178117   1.4GB/s  pb       [ +4.3%]
BM_UFlat/17            242270     241345      58091 728.3MB/s  gaviota  [ +1.2%]
BM_UValidate/0          42149      42000     333333   2.3GB/s  html     [ -3.0%]
BM_UValidate/1         432741     431303      32483   1.5GB/s  urls     [ +7.8%]
BM_UValidate/2            198        197   71428571 600.7GB/s  jpg      [+16.8%]
BM_UValidate/3          14560      14521     965517   6.1GB/s  pdf      [ -4.1%]
BM_UValidate/4         169065     168671      83832   2.3GB/s  html4    [ -2.9%]

R=jeff

Revision created by MOE tool push_codebase.

git-svn-id: http://snappy.googlecode.com/svn/trunk@41 03e5f5b5-db94-4691-08a0-1a8bf15f6143
-rw-r--r-- snappy.cc 20
1 files changed, 15 insertions, 5 deletions
diff --git a/snappy.cc b/snappy.cc
index a591aba..dc6c2f3 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -663,13 +663,18 @@ class SnappyDecompressor {
}
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
- const uint32 entry = char_table[c];
- const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
- ip += entry >> 11;
- const uint32 length = entry & 0xff;
if ((c & 0x3) == LITERAL) {
- uint32 literal_length = length + trailer;
+ uint32 literal_length = c >> 2;
+ if (PREDICT_FALSE(literal_length >= 60)) {
+ // Long literal.
+ const uint32 literal_length_length = literal_length - 59;
+ literal_length =
+ LittleEndian::Load32(ip) & wordmask[literal_length_length];
+ ip += literal_length_length;
+ }
+ ++literal_length;
+
uint32 avail = ip_limit_ - ip;
while (avail < literal_length) {
bool allow_fast_path = (avail >= 16);
@@ -689,6 +694,11 @@ class SnappyDecompressor {
}
ip += literal_length;
} else {
+ const uint32 entry = char_table[c];
+ const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ const uint32 length = entry & 0xff;
+ ip += entry >> 11;
+
// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
// bit 8).