-rw-r--r--  CMakeLists.txt     13
-rw-r--r--  cmake/config.h.in   7
-rw-r--r--  snappy.cc          79
3 files changed, 79 insertions, 20 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6eef485..2a0bc10 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,6 +175,19 @@ int main() {
check_cxx_source_compiles("
#include <immintrin.h>
int main() {
+ return _mm_crc32_u32(0, 1);
+}" SNAPPY_HAVE_X86_CRC32)
+
+check_cxx_source_compiles("
+#include <arm_neon.h>
+#include <arm_acle.h>
+int main() {
+ return __crc32cw(0, 1);
+}" SNAPPY_HAVE_NEON_CRC32)
+
+check_cxx_source_compiles("
+#include <immintrin.h>
+int main() {
return _bzhi_u32(0, 1);
}" SNAPPY_HAVE_BMI2)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 5ea2b5a..d1de25c 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -46,12 +46,19 @@
/* Define to 1 if you target processors with SSSE3+ and have <tmmintrin.h>. */
#cmakedefine01 SNAPPY_HAVE_SSSE3
+/* Define to 1 if you target processors with SSE4.2 and have <crc32intrin.h>. */
+#cmakedefine01 SNAPPY_HAVE_X86_CRC32
+
/* Define to 1 if you target processors with BMI2+ and have <bmi2intrin.h>. */
#cmakedefine01 SNAPPY_HAVE_BMI2
/* Define to 1 if you target processors with NEON and have <arm_neon.h>. */
#cmakedefine01 SNAPPY_HAVE_NEON
+/* Define to 1 if you have <arm_neon.h> and <arm_acle.h> and want to optimize
+ compression speed by using __crc32cw from <arm_acle.h>. */
+#cmakedefine01 SNAPPY_HAVE_NEON_CRC32
+
/* Define to 1 if your processor stores words with the most significant byte
first (like Motorola and SPARC, unlike Intel and VAX). */
#cmakedefine01 SNAPPY_IS_BIG_ENDIAN
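When configure_file() processes config.h.in, each #cmakedefine01 VAR line becomes #define VAR 1 if the corresponding check succeeded and #define VAR 0 if it failed, so the macros are always defined and safe to test with #if. For example, a build where only the x86 CRC32 probe succeeds would generate roughly this excerpt (values are illustrative and depend on the toolchain):

/* excerpt of a generated config.h; illustrative values only */
#define SNAPPY_HAVE_X86_CRC32 1
#define SNAPPY_HAVE_NEON_CRC32 0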
diff --git a/snappy.cc b/snappy.cc
index 932f59f..57d7319 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -45,10 +45,28 @@
#endif
#endif // !defined(SNAPPY_HAVE_BMI2)
-#if SNAPPY_HAVE_BMI2
+#if !defined(SNAPPY_HAVE_X86_CRC32)
+#if defined(__SSE4_2__)
+#define SNAPPY_HAVE_X86_CRC32 1
+#else
+#define SNAPPY_HAVE_X86_CRC32 0
+#endif
+#endif // !defined(SNAPPY_HAVE_X86_CRC32)
+
+#if !defined(SNAPPY_HAVE_NEON_CRC32)
+#if SNAPPY_HAVE_NEON && defined(__ARM_FEATURE_CRC32)
+#define SNAPPY_HAVE_NEON_CRC32 1
+#else
+#define SNAPPY_HAVE_NEON_CRC32 0
+#endif
+#endif // !defined(SNAPPY_HAVE_NEON_CRC32)
+
+#if SNAPPY_HAVE_BMI2 || SNAPPY_HAVE_X86_CRC32
// Please do not replace with <x86intrin.h>, or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <immintrin.h>
+#elif SNAPPY_HAVE_NEON_CRC32
+#include <arm_acle.h>
#endif
#include <algorithm>
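The new fallback block lets the CRC paths kick in even without the CMake-generated config.h, purely from compiler feature macros: with GCC or Clang, -msse4.2 defines __SSE4_2__ and -march=armv8-a+crc defines __ARM_FEATURE_CRC32. A quick illustrative probe that mirrors the #if chain above (it assumes SNAPPY_HAVE_NEON ultimately tracks __ARM_NEON; not part of the patch):

#include <cstdio>

// Prints which hashing path the compiler flags alone would select,
// following the same precedence as the header-selection block above.
int main() {
#if defined(__SSE4_2__)
  std::puts("x86 CRC32C path (_mm_crc32_u32 via <immintrin.h>)");
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_CRC32)
  std::puts("NEON CRC32C path (__crc32cw via <arm_acle.h>)");
#else
  std::puts("portable multiplicative-hash path");
#endif
  return 0;
}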
@@ -127,14 +145,34 @@ constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) {
alignas(64) const std::array<int16_t, 256> kLengthMinusOffset =
MakeTable(make_index_sequence<256>{});
-// Any hash function will produce a valid compressed bitstream, but a good
-// hash function reduces the number of collisions and thus yields better
-// compression for compressible input, and more speed for incompressible
-// input. Of course, it doesn't hurt if the hash function is reasonably fast
-// either, as it gets called a lot.
-inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
+// Given a table of uint16_t whose size is mask / 2 + 1, return a pointer to the
+// relevant entry, if any, for the given bytes. Any hash function will do,
+// but a good hash function reduces the number of collisions and thus yields
+// better compression for compressible input.
+//
+// REQUIRES: mask is 2 * (table_size - 1), and table_size is a power of two.
+inline uint16_t* TableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) {
+ // Our choice is quicker-and-dirtier than the typical hash function;
+ // empirically, that seems beneficial. The upper bits of kMagic * bytes are a
+ // higher-quality hash than the lower bits, so when using kMagic * bytes we
+ // also shift right to get a higher-quality end result. There's no similar
+ // issue with a CRC because all of the output bits of a CRC are equally good
+ // "hashes." So, a CPU instruction for CRC, if available, tends to be a good
+ // choice.
+#if SNAPPY_HAVE_NEON_CRC32
+ // We use mask as the second arg to the CRC function, as it's about to
+ // be used anyway; it'd be equally correct to use 0 or some constant.
+ // Mathematically, _mm_crc32_u32 (or similar) is a function of the
+ // xor of its arguments.
+ const uint32_t hash = __crc32cw(bytes, mask);
+#elif SNAPPY_HAVE_X86_CRC32
+ const uint32_t hash = _mm_crc32_u32(bytes, mask);
+#else
constexpr uint32_t kMagic = 0x1e35a7bd;
- return ((kMagic * bytes) >> (32 - kMaxHashTableBits)) & mask;
+ const uint32_t hash = (kMagic * bytes) >> (31 - kMaxHashTableBits);
+#endif
+ return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
+ (hash & mask));
}
} // namespace
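The hash is now used directly as a byte offset into the uint16_t table, which is why the shift drops from (32 - kMaxHashTableBits) to (31 - kMaxHashTableBits) and why CompressFragment below switches to mask = 2 * (table_size - 1): masking with that even value yields an even offset no larger than 2 * table_size - 2, i.e. a properly aligned slot inside the table. A small self-contained sketch of the addressing trick using the portable fallback hash (the table size, helper names and test values are illustrative, not from the patch):

#include <cassert>
#include <cstdint>
#include <vector>

constexpr int kSampleTableBits = 14;  // illustrative stand-in for kMaxHashTableBits
constexpr uint32_t kSampleTableSize = 1u << kSampleTableBits;

// Same addressing scheme as TableEntry(): hash & mask is an even byte offset,
// so adding it to the table's address lands on an aligned uint16_t slot.
inline uint16_t* SampleTableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) {
  constexpr uint32_t kMagic = 0x1e35a7bd;
  const uint32_t hash = (kMagic * bytes) >> (31 - kSampleTableBits);
  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
                                     (hash & mask));
}

int main() {
  std::vector<uint16_t> table(kSampleTableSize, 0);
  const uint32_t mask = 2 * (kSampleTableSize - 1);  // 0x7FFE for a 2^14 table
  uint16_t* slot = SampleTableEntry(table.data(), /*bytes=*/0xDEADBEEF, mask);
  assert(slot >= table.data() && slot < table.data() + kSampleTableSize);
  *slot = 42;  // store an offset, as CompressFragment does with ip - base_ip
  return 0;
}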
@@ -727,7 +765,7 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
const char* ip = input;
assert(input_size <= kBlockSize);
assert((table_size & (table_size - 1)) == 0); // table must be power of two
- const uint32_t mask = table_size - 1;
+ const uint32_t mask = 2 * (table_size - 1);
const char* ip_end = input + input_size;
const char* base_ip = ip;
@@ -778,11 +816,11 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
// loaded in preload.
uint32_t dword = i == 0 ? preload : static_cast<uint32_t>(data);
assert(dword == LittleEndian::Load32(ip + i));
- uint32_t hash = HashBytes(dword, mask);
- candidate = base_ip + table[hash];
+ uint16_t* table_entry = TableEntry(table, dword, mask);
+ candidate = base_ip + *table_entry;
assert(candidate >= base_ip);
assert(candidate < ip + i);
- table[hash] = delta + i;
+ *table_entry = delta + i;
if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) == dword)) {
*op = LITERAL | (i << 2);
UnalignedCopy128(next_emit, op + 1);
@@ -799,7 +837,7 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
}
while (true) {
assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));
- uint32_t hash = HashBytes(data, mask);
+ uint16_t* table_entry = TableEntry(table, data, mask);
uint32_t bytes_between_hash_lookups = skip >> 5;
skip += bytes_between_hash_lookups;
const char* next_ip = ip + bytes_between_hash_lookups;
@@ -807,11 +845,11 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
ip = next_emit;
goto emit_remainder;
}
- candidate = base_ip + table[hash];
+ candidate = base_ip + *table_entry;
assert(candidate >= base_ip);
assert(candidate < ip);
- table[hash] = ip - base_ip;
+ *table_entry = ip - base_ip;
if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
LittleEndian::Load32(candidate))) {
break;
@@ -857,12 +895,13 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
assert((data & 0xFFFFFFFFFF) ==
(LittleEndian::Load64(ip) & 0xFFFFFFFFFF));
// We are now looking for a 4-byte match again. We read
- // table[Hash(ip, shift)] for that. To improve compression,
+ // table[Hash(ip, mask)] for that. To improve compression,
// we also update table[Hash(ip - 1, mask)] and table[Hash(ip, mask)].
- table[HashBytes(LittleEndian::Load32(ip - 1), mask)] = ip - base_ip - 1;
- uint32_t hash = HashBytes(data, mask);
- candidate = base_ip + table[hash];
- table[hash] = ip - base_ip;
+ *TableEntry(table, LittleEndian::Load32(ip - 1), mask) =
+ ip - base_ip - 1;
+ uint16_t* table_entry = TableEntry(table, data, mask);
+ candidate = base_ip + *table_entry;
+ *table_entry = ip - base_ip;
// Measurements on the benchmarks have shown the following probabilities
// for the loop to exit (ie. avg. number of iterations is reciprocal).
// BM_Flat/6 txt1 p = 0.3-0.4