diff options
author | David Green <david.green@arm.com> | 2022-01-17 11:41:53 +0000 |
---|---|---|
committer | David Green <david.green@arm.com> | 2022-01-19 07:14:46 +0000 |
commit | 6c6e890ef961260b60f9f9a2051ac4618dbd06bd (patch) | |
tree | 3a0f1caa72ba90717ec95e197a3e6fb8f1cdad29 | |
parent | 8b07ff196aa007e9b073aa4f062dbb23443594ec (diff) | |
download | snappy-git-6c6e890ef961260b60f9f9a2051ac4618dbd06bd.tar.gz |
Change LittleEndian loads/stores to use memcpy
The existing code uses a series of 8bit loads with shifts and ors to
emulate an (unaligned) load of a larger type. These are then expected to
become single loads in the compiler, producing optimal assembly. Whilst
this is true it happens very late in the compiler, meaning that
throughout most of the pipeline it is treated (and cost-modelled) as
multiple loads, shifts and ors. This can make the compiler make poor
decisions (such as not unrolling loops that should be), or to break up
the pattern before it is turned into a single load.
For example the loops in CompressFragment do not get unrolled as
expected due to a higher cost than the unroll threshold in clang.
Instead this patch uses a more conventional methods of loading unaligned
data, using a memcpy directly which the compiler will be able to deal
with much more straight forwardly, modelling it as a single unaligned
load. The old code is left as-is for big-endian systems.
This helps improve the performance of the BM_ZFlat benchmarks by up to
10-15% on an Arm Neoverse N1.
Change-Id: I986f845ebd0a0806d052d2be3e4dbcbee91713d7
-rw-r--r-- | snappy-stubs-internal.h | 48 |
1 files changed, 36 insertions, 12 deletions
diff --git a/snappy-stubs-internal.h b/snappy-stubs-internal.h index 7d43c92..8e28b3d 100644 --- a/snappy-stubs-internal.h +++ b/snappy-stubs-internal.h @@ -171,27 +171,37 @@ class LittleEndian { public: // Functions to do unaligned loads and stores in little-endian order. static inline uint16_t Load16(const void *ptr) { - const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr); - // Compiles to a single mov/str on recent clang and gcc. +#if SNAPPY_IS_BIG_ENDIAN + const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr); return (static_cast<uint16_t>(buffer[0])) | (static_cast<uint16_t>(buffer[1]) << 8); +#else + uint16_t x; + memcpy(&x, ptr, 2); + return x; +#endif } static inline uint32_t Load32(const void *ptr) { - const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr); - // Compiles to a single mov/str on recent clang and gcc. +#if SNAPPY_IS_BIG_ENDIAN + const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr); return (static_cast<uint32_t>(buffer[0])) | (static_cast<uint32_t>(buffer[1]) << 8) | (static_cast<uint32_t>(buffer[2]) << 16) | (static_cast<uint32_t>(buffer[3]) << 24); +#else + uint32_t x; + memcpy(&x, ptr, 4); + return x; +#endif } static inline uint64_t Load64(const void *ptr) { - const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr); - // Compiles to a single mov/str on recent clang and gcc. +#if SNAPPY_IS_BIG_ENDIAN + const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr); return (static_cast<uint64_t>(buffer[0])) | (static_cast<uint64_t>(buffer[1]) << 8) | (static_cast<uint64_t>(buffer[2]) << 16) | @@ -200,30 +210,41 @@ class LittleEndian { (static_cast<uint64_t>(buffer[5]) << 40) | (static_cast<uint64_t>(buffer[6]) << 48) | (static_cast<uint64_t>(buffer[7]) << 56); +#else + uint64_t x; + memcpy(&x, ptr, 8); + return x; +#endif } static inline void Store16(void *dst, uint16_t value) { - uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst); - // Compiles to a single mov/str on recent clang and gcc. +#if SNAPPY_IS_BIG_ENDIAN + uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst); buffer[0] = static_cast<uint8_t>(value); buffer[1] = static_cast<uint8_t>(value >> 8); +#else + memcpy(dst, &value, 2); +#endif } static void Store32(void *dst, uint32_t value) { - uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst); - // Compiles to a single mov/str on recent clang and gcc. +#if SNAPPY_IS_BIG_ENDIAN + uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst); buffer[0] = static_cast<uint8_t>(value); buffer[1] = static_cast<uint8_t>(value >> 8); buffer[2] = static_cast<uint8_t>(value >> 16); buffer[3] = static_cast<uint8_t>(value >> 24); +#else + memcpy(dst, &value, 4); +#endif } static void Store64(void* dst, uint64_t value) { - uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst); - // Compiles to a single mov/str on recent clang and gcc. +#if SNAPPY_IS_BIG_ENDIAN + uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst); buffer[0] = static_cast<uint8_t>(value); buffer[1] = static_cast<uint8_t>(value >> 8); buffer[2] = static_cast<uint8_t>(value >> 16); @@ -232,6 +253,9 @@ class LittleEndian { buffer[5] = static_cast<uint8_t>(value >> 40); buffer[6] = static_cast<uint8_t>(value >> 48); buffer[7] = static_cast<uint8_t>(value >> 56); +#else + memcpy(dst, &value, 8); +#endif } static inline constexpr bool IsLittleEndian() { |