summary refs log tree commit diff
diff options
context:
space:
mode:
author Snappy Team <no-reply@google.com> 2021-07-07 19:22:28 +0000
committer Victor Costan <costan@google.com> 2021-08-02 14:49:57 +0000
commit 9cc3689b2120bd25a74f264f05ed8692c9e22dfe (patch)
tree 07458abb3846abcb0f4f42f709900163b51361c6
parent b4888f76161debdbcde30a64be577b82fd40de29 (diff)
download snappy-git-9cc3689b2120bd25a74f264f05ed8692c9e22dfe.tar.gz
Optimize memset to pure SIMD because compilers (clang for ARM and gcc for x86) generate consistently bad code for it; see https://gcc.godbolt.org/z/oxeGG7aEx
PiperOrigin-RevId: 383467656
-rw-r--r-- snappy-internal.h 8
-rw-r--r-- snappy.cc 8
2 files changed, 15 insertions, 1 deletion
diff --git a/snappy-internal.h b/snappy-internal.h
index ad2b36a..f1aafa9 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -56,6 +56,9 @@ inline void V128_StoreU(V128* dst, V128 val);
// Each packed integer in the shuffle mask must be in [0,16).
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);
+// Constructs V128 with 16 chars |c|.
+inline V128 V128_DupChar(char c);
+
#if SNAPPY_HAVE_SSSE3
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
@@ -66,6 +69,9 @@ inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
return _mm_shuffle_epi8(input, shuffle_mask);
}
+
+inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }
+
#else
inline V128 V128_Load(const V128* src) {
return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
@@ -83,6 +89,8 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
return vqtbl1q_u8(input, shuffle_mask);
}
+
+inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
#endif
#endif // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
diff --git a/snappy.cc b/snappy.cc
index 7d0ff71..632ab85 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -80,6 +80,7 @@ using internal::V128_Load;
using internal::V128_LoadU;
using internal::V128_Shuffle;
using internal::V128_StoreU;
+using internal::V128_DupChar;
#endif
// We translate the information encoded in a tag through a lookup table to a
@@ -308,7 +309,12 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
case 0:
return false;
case 1: {
- std::memset(dst, dst[-1], 64);
+ // TODO: Ideally we should memset, move back once the
+ // codegen issues are fixed.
+ V128 pattern = V128_DupChar(dst[-1]);
+ for (int i = 0; i < 4; i++) {
+ V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
+ }
return true;
}
case 2: