summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-06-02 17:59:40 +0000
committer: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143> 2011-06-02 17:59:40 +0000
commit: f56157f8b866453cf9096aacbffc629550482cd6 (patch)
tree: 82a418618cd831e21badd8f2269c536c77209881
parent: 5258607248a000affb2f72b87b9bbd044806677d (diff)
download: snappy-f56157f8b866453cf9096aacbffc629550482cd6.tar.gz
Speed up decompression by caching ip_.
It is seemingly hard for the compiler to understand that ip_, the current input pointer into the compressed data stream, cannot alias anything else, and thus using it directly will incur memory traffic as it cannot be kept in a register. The code already knew about this and cached it into a local variable, but since Step() only decoded one tag, it had to move ip_ back into place between every tag. This seems to have cost us a significant amount of performance, so this change turns Step() into a function that decodes as much as it can before it saves ip_ back and returns. (Note that Step() was already inlined, so it is not the manual inlining that buys the performance here.)

The wins are about 3-6% for Core 2, 6-13% on Core i7 and 5-12% on Opteron (for plain array-to-array decompression, in 64-bit opt mode).

There is a tiny difference in the behavior here; if an invalid literal is encountered (i.e., the writer refuses the Append() operation), ip_ will now point to the byte past the tag byte, instead of where the literal was originally thought to end. However, we don't use ip_ for anything after DecompressAllTags() has returned, so this should not change external behavior in any way.
Microbenchmark results for Core i7, 64-bit (Opteron results are similar):

Benchmark          Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0            79134      79110       8835    1.2GB/s  html     [ +6.2%]
BM_UFlat/1           786126     786096        891  851.8MB/s  urls     [+10.0%]
BM_UFlat/2             9948       9948      69125   11.9GB/s  jpg      [ -1.3%]
BM_UFlat/3            31999      31998      21898    2.7GB/s  pdf      [ +6.5%]
BM_UFlat/4           318909     318829       2204    1.2GB/s  html4    [ +6.5%]
BM_UFlat/5            31384      31390      22363  747.5MB/s  cp       [ +9.2%]
BM_UFlat/6            14037      14034      49858  757.7MB/s  c        [+10.6%]
BM_UFlat/7             4612       4612     151395  769.5MB/s  lsp      [ +9.5%]
BM_UFlat/8          1203174    1203007        582  816.3MB/s  xls      [+19.3%]
BM_UFlat/9           253869     253955       2757  571.1MB/s  txt1     [+11.4%]
BM_UFlat/10          219292     219290       3194  544.4MB/s  txt2     [+12.1%]
BM_UFlat/11          672135     672131       1000  605.5MB/s  txt3     [+11.2%]
BM_UFlat/12          902512     902492        776  509.2MB/s  txt4     [+12.5%]
BM_UFlat/13          372110     371998       1881    1.3GB/s  bin      [ +5.8%]
BM_UFlat/14           50407      50407      10000  723.5MB/s  sum      [+13.5%]
BM_UFlat/15            5699       5701     100000  707.2MB/s  man      [+12.4%]
BM_UFlat/16           83448      83424       8383    1.3GB/s  pb       [ +5.7%]
BM_UFlat/17          256958     256963       2723  684.1MB/s  gaviota  [ +7.9%]
BM_UValidate/0        42795      42796      16351    2.2GB/s  html     [+25.8%]
BM_UValidate/1       490672     490622       1427    1.3GB/s  urls     [+22.7%]
BM_UValidate/2          237        237    2950297  499.0GB/s  jpg      [+24.9%]
BM_UValidate/3        14610      14611      47901    6.0GB/s  pdf      [+26.8%]
BM_UValidate/4       171973     171990       4071    2.2GB/s  html4    [+25.7%]

git-svn-id: http://snappy.googlecode.com/svn/trunk@38 03e5f5b5-db94-4691-08a0-1a8bf15f6143
-rw-r--r-- snappy.cc 81
1 files changed, 45 insertions, 36 deletions
diff --git a/snappy.cc b/snappy.cc
index 625029c..b3045a3 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -653,45 +653,54 @@ class SnappyDecompressor {
// Process the next item found in the input.
// Returns true if successful, false on error or end of input.
template <class Writer>
- bool Step(Writer* writer) {
+ void DecompressAllTags(Writer* writer) {
const char* ip = ip_;
- if (ip_limit_ - ip < 5) {
- if (!RefillTag()) return false;
- ip = ip_;
- }
-
- const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
- const uint32 entry = char_table[c];
- const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
- ip += entry >> 11;
- const uint32 length = entry & 0xff;
+ for ( ;; ) {
+ if (ip_limit_ - ip < 5) {
+ ip_ = ip;
+ if (!RefillTag()) return;
+ ip = ip_;
+ }
- if ((c & 0x3) == LITERAL) {
- uint32 literal_length = length + trailer;
- uint32 avail = ip_limit_ - ip;
- while (avail < literal_length) {
+ const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
+ const uint32 entry = char_table[c];
+ const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
+ ip += entry >> 11;
+ const uint32 length = entry & 0xff;
+
+ if ((c & 0x3) == LITERAL) {
+ uint32 literal_length = length + trailer;
+ uint32 avail = ip_limit_ - ip;
+ while (avail < literal_length) {
+ bool allow_fast_path = (avail >= 16);
+ if (!writer->Append(ip, avail, allow_fast_path)) goto end;
+ literal_length -= avail;
+ reader_->Skip(peeked_);
+ size_t n;
+ ip = reader_->Peek(&n);
+ avail = n;
+ peeked_ = avail;
+ if (avail == 0) goto end; // Premature end of input
+ ip_limit_ = ip + avail;
+ }
bool allow_fast_path = (avail >= 16);
- if (!writer->Append(ip, avail, allow_fast_path)) return false;
- literal_length -= avail;
- reader_->Skip(peeked_);
- size_t n;
- ip = reader_->Peek(&n);
- avail = n;
- peeked_ = avail;
- if (avail == 0) return false; // Premature end of input
- ip_limit_ = ip + avail;
+ if (!writer->Append(ip, literal_length, allow_fast_path)) {
+ goto end;
+ }
+ ip += literal_length;
+ } else {
+ // copy_offset/256 is encoded in bits 8..10. By just fetching
+ // those bits, we get copy_offset (since the bit-field starts at
+ // bit 8).
+ const uint32 copy_offset = entry & 0x700;
+ if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
+ goto end;
+ }
}
- ip_ = ip + literal_length;
- bool allow_fast_path = (avail >= 16);
- return writer->Append(ip, literal_length, allow_fast_path);
- } else {
- ip_ = ip;
- // copy_offset/256 is encoded in bits 8..10. By just fetching
- // those bits, we get copy_offset (since the bit-field starts at
- // bit 8).
- const uint32 copy_offset = entry & 0x700;
- return writer->AppendFromSelf(copy_offset + trailer, length);
}
+
+end:
+ ip_ = ip;
}
};
@@ -770,7 +779,7 @@ static bool InternalUncompress(Source* r,
writer->SetExpectedLength(uncompressed_len);
// Process the entire input
- while (decompressor.Step(writer)) { }
+ decompressor.DecompressAllTags(writer);
return (decompressor.eof() && writer->CheckLength());
}
@@ -866,7 +875,7 @@ size_t Compress(Source* reader, Sink* writer) {
// A type that writes to a flat array.
// Note that this is not a "ByteSink", but a type that matches the
-// Writer template argument to SnappyDecompressor::Step().
+// Writer template argument to SnappyDecompressor::DecompressAllTags().
class SnappyArrayWriter {
private:
char* base_;