From b4888f76161debdbcde30a64be577b82fd40de29 Mon Sep 17 00:00:00 2001
From: Snappy Team
Date: Fri, 2 Jul 2021 07:52:56 +0000
Subject: Optimize tag extraction for ARM with conditional increment instruction generation (csinc). For codegen see https://gcc.godbolt.org/z/a8z9j95Pv

PiperOrigin-RevId: 382688740
---
 snappy.cc | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/snappy.cc b/snappy.cc
index 31f1575..7d0ff71 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -1004,7 +1004,26 @@ void MemMove(ptrdiff_t dst, const void* src, size_t size) {
 }
 
 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
-size_t AdvanceToNextTag(const uint8_t** ip_p, size_t* tag) {
+size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
+  const uint8_t*& ip = *ip_p;
+  // This section is crucial for the throughput of the decompression loop.
+  // The latency of an iteration is fundamentally constrained by the
+  // following data chain on ip.
+  // ip -> c = Load(ip) -> delta1 = (c & 3)        -> ip += delta1 or delta2
+  //                       delta2 = ((c >> 2) + 1)    ip++
+  // This is different from X86 optimizations because ARM has conditional add
+  // instruction (csinc) and it removes several register moves.
+  const size_t literal_tag_offset = (*tag >> 2) + 1;
+  const size_t tag_type = *tag & 3;
+  const bool is_literal = (tag_type == 0);
+  *tag = is_literal ? ip[literal_tag_offset] : ip[tag_type];
+  ip += is_literal ? literal_tag_offset : tag_type;
+  ip++;
+  return tag_type;
+}
+
+SNAPPY_ATTRIBUTE_ALWAYS_INLINE
+size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) {
   const uint8_t*& ip = *ip_p;
   // This section is crucial for the throughput of the decompression loop.
   // The latency of an iteration is fundamentally constrained by the
@@ -1084,7 +1103,11 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
         // For literals tag_type = 0, hence we will always obtain 0 from
         // ExtractLowBytes. For literals offset will thus be kLiteralOffset.
         ptrdiff_t len_min_offset = table.length_minus_offset[tag];
-        size_t tag_type = AdvanceToNextTag(&ip, &tag);
+#if defined(__aarch64__)
+        size_t tag_type = AdvanceToNextTagARMOptimized(&ip, &tag);
+#else
+        size_t tag_type = AdvanceToNextTagX86Optimized(&ip, &tag);
+#endif
         uint32_t next = LittleEndian::Load32(old_ip);
         size_t len = len_min_offset & 0xFF;
         len_min_offset -= ExtractOffset(next, tag_type);
-- 
cgit v1.2.1