From d643b9a98888bdc746e8b19621ab19208e79fdac Mon Sep 17 00:00:00 2001 From: Jun He Date: Tue, 3 Aug 2021 15:36:20 +0800 Subject: decompress: add hint to remove extra AND Clang doesn't realize that the load comes with free zero-extension, and emits an extra 'and xn, xm, 0xff' to calculate the offset. With this change, this extra op is removed, and a consistent 1.7% performance uplift is observed. Signed-off-by: Jun He Change-Id: Ica4617852c4b93eadc6c5c551dc3961ffbadb8f0 --- snappy.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/snappy.cc b/snappy.cc index 3f446c6..72a5899 100644 --- a/snappy.cc +++ b/snappy.cc @@ -1108,6 +1108,15 @@ std::pair DecompressBranchless( // ip points just past the tag and we are touching at maximum kSlopBytes // in an iteration. size_t tag = ip[-1]; +#if defined(__clang__) && defined(__aarch64__) + // Workaround for https://bugs.llvm.org/show_bug.cgi?id=51317 + // when loading 1 byte, clang for aarch64 doesn't realize that it(ldrb) + // comes with free zero-extension, so clang generates another + // 'and xn, xm, 0xff' before it use that as the offset. This 'and' is + // redundant and can be removed by adding this dummy asm, which gives + // clang a hint that we're doing the zero-extension at the load. + asm("" ::"r"(tag)); +#endif do { // The throughput is limited by instructions, unrolling the inner loop // twice reduces the amount of instructions checking limits and also -- cgit v1.2.1