author | atdt <atdt@google.com> | 2021-06-24 17:09:34 +0000 |
---|---|---|
committer | Victor Costan <costan@google.com> | 2021-07-05 01:05:44 +0000 |
commit | b3fb0b5b4b076f1af12f5c727b33e0abf723fe12 (patch) | |
tree | cfadc71482af5ac16d8d366ca1ebb3b5c7dc7ebc /snappy-internal.h | |
parent | b638ebe5d95ec4559921a72f8c2bbc4b1b5a2fd0 (diff) | |
download | snappy-git-b3fb0b5b4b076f1af12f5c727b33e0abf723fe12.tar.gz |
Enable vector byte shuffle optimizations on ARM NEON
The SSSE3 intrinsics we use have their direct analogues in NEON, so making this optimization portable requires a very thin translation layer.
PiperOrigin-RevId: 381280165
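To make the "thin translation layer" concrete, here is a minimal, self-contained sketch of the idea (illustrative only: the compiler feature-test macros, the reduced `V128_*` helper set, and the byte-reversal demo are assumptions of this sketch, not snappy's actual build configuration). The same source-level shuffle compiles to `PSHUFB` (`_mm_shuffle_epi8`) on x86-64 with SSSE3 and to `TBL` (`vqtbl1q_u8`) on AArch64 NEON.

```cpp
// Hedged sketch of a SSSE3/NEON byte-shuffle translation layer.
// Not snappy's real header; macro choices and the demo are illustrative.
#include <cstdint>
#include <cstdio>

#if defined(__SSSE3__)
#include <tmmintrin.h>
using V128 = __m128i;

inline V128 V128_LoadU(const void* src) {
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
}
inline void V128_StoreU(void* dst, V128 val) {
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), val);
}
inline V128 V128_Shuffle(V128 input, V128 mask) {
  return _mm_shuffle_epi8(input, mask);  // PSHUFB
}
#elif defined(__aarch64__)
#include <arm_neon.h>
using V128 = uint8x16_t;

inline V128 V128_LoadU(const void* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}
inline void V128_StoreU(void* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}
inline V128 V128_Shuffle(V128 input, V128 mask) {
  return vqtbl1q_u8(input, mask);  // TBL
}
#else
#error "This sketch assumes SSSE3 or AArch64 NEON."
#endif

int main() {
  // Reverse 16 bytes with a single table-lookup shuffle; the same source
  // compiles to PSHUFB on x86-64 and TBL on AArch64.
  uint8_t data[16], mask[16], out[16];
  for (int i = 0; i < 16; ++i) {
    data[i] = static_cast<uint8_t>(i);
    mask[i] = static_cast<uint8_t>(15 - i);  // every lane stays in [0,16)
  }
  V128 v = V128_LoadU(data);
  V128 m = V128_LoadU(mask);
  V128_StoreU(out, V128_Shuffle(v, m));
  for (int i = 0; i < 16; ++i) std::printf("%d ", out[i]);
  std::printf("\n");
  return 0;
}
```

As long as the mask bytes stay in [0,16), the two intrinsics behave identically (PSHUFB zeroes a lane only when the mask's high bit is set; TBL zeroes only out-of-range indices), which is why the NEON path in the actual diff asserts that precondition.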
Diffstat (limited to 'snappy-internal.h')
-rw-r--r-- | snappy-internal.h | 50 |
1 file changed, 50 insertions, 0 deletions
diff --git a/snappy-internal.h b/snappy-internal.h
index 720ccd8..ad2b36a 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -36,6 +36,56 @@
 namespace snappy {
 namespace internal {
 
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+#if SNAPPY_HAVE_SSSE3
+using V128 = __m128i;
+#else
+using V128 = uint8x16_t;
+#endif
+
+// Load 128 bits of integer data. `src` must be 16-byte aligned.
+inline V128 V128_Load(const V128* src);
+
+// Load 128 bits of integer data. `src` does not need to be aligned.
+inline V128 V128_LoadU(const V128* src);
+
+// Store 128 bits of integer data. `dst` does not need to be aligned.
+inline void V128_StoreU(V128* dst, V128 val);
+
+// Shuffle packed 8-bit integers using a shuffle mask.
+// Each packed integer in the shuffle mask must be in [0,16).
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);
+
+#if SNAPPY_HAVE_SSSE3
+inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
+
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+
+inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  return _mm_shuffle_epi8(input, shuffle_mask);
+}
+#else
+inline V128 V128_Load(const V128* src) {
+  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline V128 V128_LoadU(const V128* src) {
+  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline void V128_StoreU(V128* dst, V128 val) {
+  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
+}
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
+  return vqtbl1q_u8(input, shuffle_mask);
+}
+#endif
+#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+
 // Working memory performs a single allocation to hold all scratch space
 // required for compression.
 class WorkingMemory {
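For readers unfamiliar with byte shuffles, here is a scalar illustration (not part of the commit; the helper name and the pattern-replication demo are this note's own) of what `V128_Shuffle` computes and why the mask entries must stay in [0,16): output byte `i` is `input[shuffle_mask[i]]`. One place such a shuffle pays off in a decompressor is replicating a short pattern across a whole register in a single instruction.

```cpp
// Hedged scalar model of a 16-byte shuffle: out[i] = input[mask[i]],
// valid only while every mask entry is in [0,16).
#include <array>
#include <cstdint>
#include <cstdio>

std::array<uint8_t, 16> ScalarShuffle(const std::array<uint8_t, 16>& input,
                                      const std::array<uint8_t, 16>& mask) {
  std::array<uint8_t, 16> out{};
  for (int i = 0; i < 16; ++i) out[i] = input[mask[i]];  // mask[i] < 16 assumed
  return out;
}

int main() {
  // Replicate a 3-byte pattern "abc" across 16 bytes, the kind of copy
  // expansion a single vector byte shuffle makes cheap.
  std::array<uint8_t, 16> input{};
  input[0] = 'a'; input[1] = 'b'; input[2] = 'c';
  std::array<uint8_t, 16> mask{};
  for (int i = 0; i < 16; ++i) mask[i] = static_cast<uint8_t>(i % 3);
  auto out = ScalarShuffle(input, mask);
  for (uint8_t b : out) std::printf("%c", b);  // prints "abcabcabcabcabca"
  std::printf("\n");
  return 0;
}
```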