author | atdt <atdt@google.com> | 2021-06-24 17:09:34 +0000 |
---|---|---|
committer | Victor Costan <costan@google.com> | 2021-07-05 01:05:44 +0000 |
commit | b3fb0b5b4b076f1af12f5c727b33e0abf723fe12 (patch) | |
tree | cfadc71482af5ac16d8d366ca1ebb3b5c7dc7ebc /snappy-internal.h | |
parent | b638ebe5d95ec4559921a72f8c2bbc4b1b5a2fd0 (diff) | |
download | snappy-git-b3fb0b5b4b076f1af12f5c727b33e0abf723fe12.tar.gz |
Enable vector byte shuffle optimizations on ARM NEON
The SSSE3 intrinsics we use have their direct analogues in NEON, so making this optimization portable requires a very thin translation layer.
PiperOrigin-RevId: 381280165
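To make the "thin translation layer" concrete, here is a minimal, self-contained sketch of the idea (illustrative only: the compiler feature-test macros, the reduced `V128_*` helper set, and the byte-reversal demo are assumptions of this sketch, not snappy's actual build configuration). The same source-level shuffle compiles to `PSHUFB` (`_mm_shuffle_epi8`) on x86-64 with SSSE3 and to `TBL` (`vqtbl1q_u8`) on AArch64 NEON.

```cpp
// Hedged sketch of a SSSE3/NEON byte-shuffle translation layer.
// Not snappy's real header; macro choices and the demo are illustrative.
#include <cstdint>
#include <cstdio>

#if defined(__SSSE3__)
#include <tmmintrin.h>
using V128 = __m128i;

inline V128 V128_LoadU(const void* src) {
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
}
inline void V128_StoreU(void* dst, V128 val) {
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), val);
}
inline V128 V128_Shuffle(V128 input, V128 mask) {
  return _mm_shuffle_epi8(input, mask);  // PSHUFB
}
#elif defined(__aarch64__)
#include <arm_neon.h>
using V128 = uint8x16_t;

inline V128 V128_LoadU(const void* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}
inline void V128_StoreU(void* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}
inline V128 V128_Shuffle(V128 input, V128 mask) {
  return vqtbl1q_u8(input, mask);  // TBL
}
#else
#error "This sketch assumes SSSE3 or AArch64 NEON."
#endif

int main() {
  // Reverse 16 bytes with a single table-lookup shuffle; the same source
  // compiles to PSHUFB on x86-64 and TBL on AArch64.
  uint8_t data[16], mask[16], out[16];
  for (int i = 0; i < 16; ++i) {
    data[i] = static_cast<uint8_t>(i);
    mask[i] = static_cast<uint8_t>(15 - i);  // every lane stays in [0,16)
  }
  V128 v = V128_LoadU(data);
  V128 m = V128_LoadU(mask);
  V128_StoreU(out, V128_Shuffle(v, m));
  for (int i = 0; i < 16; ++i) std::printf("%d ", out[i]);
  std::printf("\n");
  return 0;
}
```

As long as the mask bytes stay in [0,16), the two intrinsics behave identically (PSHUFB zeroes a lane only when the mask's high bit is set; TBL zeroes only out-of-range indices), which is why the NEON path in the actual diff asserts that precondition.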
Diffstat (limited to 'snappy-internal.h')
-rw-r--r-- | snappy-internal.h | 50 |
1 file changed, 50 insertions, 0 deletions
diff --git a/snappy-internal.h b/snappy-internal.h
index 720ccd8..ad2b36a 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -36,6 +36,56 @@
 namespace snappy {
 namespace internal {
 
+#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+#if SNAPPY_HAVE_SSSE3
+using V128 = __m128i;
+#else
+using V128 = uint8x16_t;
+#endif
+
+// Load 128 bits of integer data. `src` must be 16-byte aligned.
+inline V128 V128_Load(const V128* src);
+
+// Load 128 bits of integer data. `src` does not need to be aligned.
+inline V128 V128_LoadU(const V128* src);
+
+// Store 128 bits of integer data. `dst` does not need to be aligned.
+inline void V128_StoreU(V128* dst, V128 val);
+
+// Shuffle packed 8-bit integers using a shuffle mask.
+// Each packed integer in the shuffle mask must be in [0,16).
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);
+
+#if SNAPPY_HAVE_SSSE3
+inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }
+
+inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }
+
+inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  return _mm_shuffle_epi8(input, shuffle_mask);
+}
+#else
+inline V128 V128_Load(const V128* src) {
+  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline V128 V128_LoadU(const V128* src) {
+  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
+}
+
+inline void V128_StoreU(V128* dst, V128 val) {
+  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
+}
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
+  return vqtbl1q_u8(input, shuffle_mask);
+}
+#endif
+#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
+
 // Working memory performs a single allocation to hold all scratch space
 // required for compression.
 class WorkingMemory {
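For readers unfamiliar with byte shuffles, here is a scalar illustration (not part of the commit; the helper name and the pattern-replication demo are this note's own) of what `V128_Shuffle` computes and why the mask entries must stay in [0,16): output byte `i` is `input[shuffle_mask[i]]`. One place such a shuffle pays off in a decompressor is replicating a short pattern across a whole register in a single instruction.

```cpp
// Hedged scalar model of a 16-byte shuffle: out[i] = input[mask[i]],
// valid only while every mask entry is in [0,16).
#include <array>
#include <cstdint>
#include <cstdio>

std::array<uint8_t, 16> ScalarShuffle(const std::array<uint8_t, 16>& input,
                                      const std::array<uint8_t, 16>& mask) {
  std::array<uint8_t, 16> out{};
  for (int i = 0; i < 16; ++i) out[i] = input[mask[i]];  // mask[i] < 16 assumed
  return out;
}

int main() {
  // Replicate a 3-byte pattern "abc" across 16 bytes, the kind of copy
  // expansion a single vector byte shuffle makes cheap.
  std::array<uint8_t, 16> input{};
  input[0] = 'a'; input[1] = 'b'; input[2] = 'c';
  std::array<uint8_t, 16> mask{};
  for (int i = 0; i < 16; ++i) mask[i] = static_cast<uint8_t>(i % 3);
  auto out = ScalarShuffle(input, mask);
  for (uint8_t b : out) std::printf("%c", b);  // prints "abcabcabcabcabca"
  std::printf("\n");
  return 0;
}
```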