summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJun He <jun.he@arm.com>2021-08-17 16:36:43 +0800
committerJun He <jun.he@arm.com>2021-08-30 09:51:37 +0800
commitaeb5de55a9c646ca334e4f6252af28536ca22349 (patch)
tree84902cec7dfeea4ab7be23b73c950462521cd2fd
parent7062d7f1d8a5ca7ade075579f36681ff48616dc3 (diff)
downloadsnappy-git-aeb5de55a9c646ca334e4f6252af28536ca22349.tar.gz
decompress: refine data dependency
The final ip advance value doesn't have to wait for the result of the offset that is used to load *tag. It can be computed along with the offset, so the codegen will use one csinc in parallel with ldrb. This will improve the throughput. With this change, an uplift of ~4.2% in UFlat/10 and ~3.7% in UFlatMedley is observed. Signed-off-by: Jun He <jun.he@arm.com> Change-Id: I20ab211235bbf578c6c978f2bbd9160a49e920da
-rw-r--r--snappy.cc12
1 files changed, 8 insertions, 4 deletions
diff --git a/snappy.cc b/snappy.cc
index 4008e76..670b87e 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -1015,12 +1015,16 @@ size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) {
// delta2 = ((c >> 2) + 1) ip++
// This is different from X86 optimizations because ARM has conditional add
// instruction (csinc) and it removes several register moves.
- const size_t literal_tag_offset = (*tag >> 2) + 1;
const size_t tag_type = *tag & 3;
const bool is_literal = (tag_type == 0);
- *tag = is_literal ? ip[literal_tag_offset] : ip[tag_type];
- ip += is_literal ? literal_tag_offset : tag_type;
- ip++;
+ if (is_literal) {
+ size_t next_literal_tag = (*tag >> 2) + 1;
+ *tag = ip[next_literal_tag];
+ ip += next_literal_tag + 1;
+ } else {
+ *tag = ip[tag_type];
+ ip += tag_type + 1;
+ }
return tag_type;
}