author     snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>  2012-02-23 17:00:36 +0000
committer  snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>  2012-02-23 17:00:36 +0000
commit     07aad56d4785780c2bbf5fedd69675dd16e13517 (patch)
tree       0bdc1caa2138bd691ad80037d86f112f709931c0
parent     9d7fa341db0083c68f4a769c6f9fdbc6b9598737 (diff)
download   snappy-07aad56d4785780c2bbf5fedd69675dd16e13517.tar.gz
For 32-bit platforms, do not try to accelerate multiple neighboring
32-bit loads with a 64-bit load during compression (it's not a win).

The main target for this optimization is ARM, but 32-bit x86 gets a small
gain, too, although there is noise in the microbenchmarks. It's a no-op
for 64-bit x86. It does not affect decompression.

Microbenchmark results on a Cortex-A9 1GHz, using g++ 4.6.2 (from
Ubuntu/Linaro), -O2 -DNDEBUG -Wa,-march=armv7a -mtune=cortex-a9
-mthumb-interwork, minimum 1000 iterations:

Benchmark      Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_ZFlat/0      1158277    1160000       1000   84.2MB/s  html     (23.57 %)  [ +4.3%]
BM_ZFlat/1     14861782   14860000       1000   45.1MB/s  urls     (50.89 %)  [ +1.1%]
BM_ZFlat/2       393595     390000       1000  310.5MB/s  jpg      (99.88 %)  [ +0.0%]
BM_ZFlat/3       650583     650000       1000  138.4MB/s  pdf      (82.13 %)  [ +3.1%]
BM_ZFlat/4      4661480    4660000       1000   83.8MB/s  html4    (23.55 %)  [ +4.3%]
BM_ZFlat/5       491973     490000       1000   47.9MB/s  cp       (48.12 %)  [ +2.0%]
BM_ZFlat/6       193575     192678       1038   55.2MB/s  c        (42.40 %)  [ +9.0%]
BM_ZFlat/7        62343      62754       3187   56.5MB/s  lsp      (48.37 %)  [ +2.6%]
BM_ZFlat/8     17708468   17710000       1000   55.5MB/s  xls      (41.34 %)  [ -0.3%]
BM_ZFlat/9      3755345    3760000       1000   38.6MB/s  txt1     (59.81 %)  [ +8.2%]
BM_ZFlat/10     3324217    3320000       1000   36.0MB/s  txt2     (64.07 %)  [ +4.2%]
BM_ZFlat/11    10139932   10140000       1000   40.1MB/s  txt3     (57.11 %)  [ +6.4%]
BM_ZFlat/12    13532109   13530000       1000   34.0MB/s  txt4     (68.35 %)  [ +5.0%]
BM_ZFlat/13     4690847    4690000       1000  104.4MB/s  bin      (18.21 %)  [ +4.1%]
BM_ZFlat/14      830682     830000       1000   43.9MB/s  sum      (51.88 %)  [ +1.2%]
BM_ZFlat/15       84784      85011       2235   47.4MB/s  man      (59.36 %)  [ +1.1%]
BM_ZFlat/16     1293254    1290000       1000   87.7MB/s  pb       (23.15 %)  [ +2.3%]
BM_ZFlat/17     2775155    2780000       1000   63.2MB/s  gaviota  (38.27 %)  [+12.2%]

Core i7 in 32-bit mode (only one run and 100 iterations, though, so noisy):

Benchmark      Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_ZFlat/0       227582     223464       3043  437.0MB/s  html     (23.57 %)  [ +7.4%]
BM_ZFlat/1      2982430    2918455        233  229.4MB/s  urls     (50.89 %)  [ +2.9%]
BM_ZFlat/2        46967      46658      15217    2.5GB/s  jpg      (99.88 %)  [ +0.0%]
BM_ZFlat/3       115298     114864       5833  783.2MB/s  pdf      (82.13 %)  [ +1.5%]
BM_ZFlat/4       913440     899743        778  434.2MB/s  html4    (23.55 %)  [ +0.3%]
BM_ZFlat/5       110302     108571       7000  216.1MB/s  cp       (48.12 %)  [ +0.0%]
BM_ZFlat/6        44409      43372      15909  245.2MB/s  c        (42.40 %)  [ +0.8%]
BM_ZFlat/7        15713      15643      46667  226.9MB/s  lsp      (48.37 %)  [ +2.7%]
BM_ZFlat/8      2625539    2602230        269  377.4MB/s  xls      (41.34 %)  [ +1.4%]
BM_ZFlat/9       808884     811429        875  178.8MB/s  txt1     (59.81 %)  [ -3.9%]
BM_ZFlat/10      709532     700000       1000  170.5MB/s  txt2     (64.07 %)  [ +0.0%]
BM_ZFlat/11     2177682    2162162        333  188.2MB/s  txt3     (57.11 %)  [ -1.4%]
BM_ZFlat/12     2849640    2840000        250  161.8MB/s  txt4     (68.35 %)  [ -1.4%]
BM_ZFlat/13      849760     835476        778  585.8MB/s  bin      (18.21 %)  [ +1.2%]
BM_ZFlat/14      165940     164571       4375  221.6MB/s  sum      (51.88 %)  [ +1.4%]
BM_ZFlat/15       20939      20571      35000  196.0MB/s  man      (59.36 %)  [ +2.1%]
BM_ZFlat/16      239209     236544       2917  478.1MB/s  pb       (23.15 %)  [ +4.2%]
BM_ZFlat/17      616206     610000       1000  288.2MB/s  gaviota  (38.27 %)  [ -1.6%]

R=sanjay

git-svn-id: http://snappy.googlecode.com/svn/trunk@60 03e5f5b5-db94-4691-08a0-1a8bf15f6143
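To make the change concrete, here is a minimal stand-alone sketch of the compile-time dispatch this commit introduces. It is not snappy's exact code: UnalignedLoad32/UnalignedLoad64 are portable memcpy-based stand-ins for snappy's UNALIGNED_LOAD32/UNALIGNED_LOAD64 macros, USE_64BIT_LOADS stands in for the ARCH_K8 test, and the 64-bit path assumes a little-endian target for brevity.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Portable stand-ins for snappy's UNALIGNED_LOAD32/UNALIGNED_LOAD64.
static inline uint32_t UnalignedLoad32(const char* p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}
static inline uint64_t UnalignedLoad64(const char* p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

#ifdef USE_64BIT_LOADS  // stand-in for ARCH_K8: one wide load, then shifts
typedef uint64_t EightBytesReference;
static inline EightBytesReference GetEightBytesAt(const char* p) {
  return UnalignedLoad64(p);  // the 8-byte load happens once, here
}
static inline uint32_t GetUint32AtOffset(uint64_t v, int offset) {
  return static_cast<uint32_t>(v >> (8 * offset));  // little-endian only
}
#else  // 32-bit platforms: keep the pointer, load 4 bytes on demand
typedef const char* EightBytesReference;
static inline EightBytesReference GetEightBytesAt(const char* p) {
  return p;  // no load yet; each GetUint32AtOffset does its own 32-bit load
}
static inline uint32_t GetUint32AtOffset(const char* v, int offset) {
  return UnalignedLoad32(v + offset);
}
#endif

int main() {
  const char buf[] = "abcdefgh";
  EightBytesReference bytes = GetEightBytesAt(buf);
  // Either path yields the same five overlapping 32-bit words.
  for (int off = 0; off <= 4; ++off)
    std::printf("offset %d: %08x\n", off,
                (unsigned)GetUint32AtOffset(bytes, off));
  return 0;
}

Either path satisfies the invariant documented in the diff below: for 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) equals UNALIGNED_LOAD32(p + offset). The two differ only in where the memory traffic happens, which is the whole point of the patch.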
-rw-r--r--  snappy.cc  41
1 file changed, 37 insertions(+), 4 deletions(-)
diff --git a/snappy.cc b/snappy.cc
index 6c6bd6b..4d4eb42 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -272,16 +272,49 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
}
} // end namespace internal
-// For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will
+// For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
// equal UNALIGNED_LOAD32(p + offset). Motivation: On x86-64 hardware we have
// empirically found that overlapping loads such as
// UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
// are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
+//
+// We have different versions for 64- and 32-bit; ideally we would avoid the
+// two functions and just inline the UNALIGNED_LOAD64 call into
+// GetUint32AtOffset, but GCC (at least as of 4.6) is seemingly not clever
+// enough to avoid loading the value multiple times then. For 64-bit, the load
+// is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
+// done at GetUint32AtOffset() time.
+
+#ifdef ARCH_K8
+
+typedef uint64 EightBytesReference;
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+ return UNALIGNED_LOAD64(ptr);
+}
+
static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
- DCHECK(0 <= offset && offset <= 4) << offset;
+ DCHECK_GE(offset, 0);
+ DCHECK_LE(offset, 4);
return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
}
+#else
+
+typedef const char* EightBytesReference;
+
+static inline EightBytesReference GetEightBytesAt(const char* ptr) {
+ return ptr;
+}
+
+static inline uint32 GetUint32AtOffset(const char* v, int offset) {
+ DCHECK_GE(offset, 0);
+ DCHECK_LE(offset, 4);
+ return UNALIGNED_LOAD32(v + offset);
+}
+
+#endif
+
// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
@@ -378,7 +411,7 @@ char* CompressFragment(const char* input,
// though we don't yet know how big the literal will be. We handle that
// by proceeding to the next iteration of the main loop. We also can exit
// this loop via goto if we get close to exhausting the input.
- uint64 input_bytes = 0;
+ EightBytesReference input_bytes;
uint32 candidate_bytes = 0;
do {
@@ -397,7 +430,7 @@ char* CompressFragment(const char* input,
if (PREDICT_FALSE(ip >= ip_limit)) {
goto emit_remainder;
}
- input_bytes = UNALIGNED_LOAD64(insert_tail);
+ input_bytes = GetEightBytesAt(insert_tail);
uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
table[prev_hash] = ip - base_ip - 1;
uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
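To close, a hedged sketch of the call-site pattern the second hunk rewrites, spelled out with the 32-bit path only so the snippet stands alone. HashBytes mirrors snappy's multiplicative hash; InsertTwoHashes is a hypothetical wrapper added for illustration, not a function in snappy.

#include <cstdint>
#include <cstring>

// 32-bit path from the earlier sketch, repeated so this snippet is
// self-contained: the "reference" is just the pointer.
typedef const char* EightBytesReference;
static inline EightBytesReference GetEightBytesAt(const char* p) { return p; }
static inline uint32_t GetUint32AtOffset(const char* v, int offset) {
  uint32_t x;
  std::memcpy(&x, v + offset, sizeof(x));
  return x;
}

// Multiplicative hash in the style of snappy's HashBytes.
static inline uint32_t HashBytes(uint32_t bytes, int shift) {
  return (bytes * 0x1e35a7bd) >> shift;
}

// Hypothetical wrapper around the changed call site: fetch eight bytes once,
// hash two overlapping 32-bit words out of them, record the first in the
// table, and return the second for the caller's next table probe.
static uint32_t InsertTwoHashes(const char* insert_tail, const char* ip,
                                const char* base_ip, uint16_t* table,
                                int shift) {
  EightBytesReference input_bytes = GetEightBytesAt(insert_tail);
  uint32_t prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
  table[prev_hash] = static_cast<uint16_t>(ip - base_ip - 1);
  return HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
}

int main() {
  const char data[] = "some input long enough for two overlapping words";
  uint16_t table[256] = {0};  // 2^8 entries, so shift = 32 - 8 = 24
  uint32_t cur_hash = InsertTwoHashes(data + 4, data + 5, data, table, 24);
  (void)cur_hash;  // the real compressor would probe table[cur_hash] next
  return 0;
}

On a 64-bit build the same two GetUint32AtOffset calls would instead be served by shifts out of a single 8-byte load, which is exactly the asymmetry the EightBytesReference typedef hides from this caller.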