From f9ca7d45162664ddd9fb70e19335c2426e5d75bb Mon Sep 17 00:00:00 2001
From: Lasse Collin
Date: Thu, 1 Dec 2022 18:51:52 +0200
Subject: liblzma: Omit zero-skipping from ARM64 filter.

It has some complicated downsides and its usefulness is more limited
than I originally thought. So this change is bad for certain very
specific situations but a generic solution that works for other
filters (and is otherwise better too) is planned anyway. And this way
7-Zip can use the same compatible filter for the .7z format.

This is still marked as experimental with a new temporary Filter ID.
---
 src/liblzma/api/lzma/bcj.h |  2 +-
 src/liblzma/simple/arm64.c | 81 +++++++++++++-----------------------------------
 2 files changed, 24 insertions(+), 59 deletions(-)

diff --git a/src/liblzma/api/lzma/bcj.h b/src/liblzma/api/lzma/bcj.h
index b68b6ba..15775d7 100644
--- a/src/liblzma/api/lzma/bcj.h
+++ b/src/liblzma/api/lzma/bcj.h
@@ -49,7 +49,7 @@
 	 * Filter for SPARC binaries.
 	 */
 
-#define LZMA_FILTER_ARM64	LZMA_VLI_C(0x3FDB87B33B27010B)
+#define LZMA_FILTER_ARM64	LZMA_VLI_C(0x3FDB87B33B27020B)
 	/**<
 	 * Filter for ARM64 binaries.
 	 *
diff --git a/src/liblzma/simple/arm64.c b/src/liblzma/simple/arm64.c
index f7ad657..5e7f265 100644
--- a/src/liblzma/simple/arm64.c
+++ b/src/liblzma/simple/arm64.c
@@ -6,13 +6,6 @@
 /// This converts ARM64 relative addresses in the BL and ADRP immediates
 /// to absolute values to increase redundancy of ARM64 code.
 ///
-/// Unlike the older BCJ filters, this handles zeros specially. This way
-/// the filter won't be counterproductive on Linux kernel modules, object
-/// files, and static libraries where the immediates are all zeros (to be
-/// filled later by a linker). Usually this has no downsides but with bad
-/// luck it can reduce the effectiveness of the filter and trying a different
-/// start offset can mitigate the problem.
-///
 /// Converting B or ADR instructions was also tested but it's not useful.
 /// A majority of the jumps for the B instruction are very small (+/- 0xFF).
 /// These are typical for loops and if-statements. Encoding them to their
@@ -30,20 +23,6 @@
 #include "simple_private.h"
 
 
-static uint32_t
-arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)
-{
-	if (!is_encoder)
-		pc = 0U - pc;
-
-	uint32_t dest = src + pc;
-	if ((dest & mask) == 0)
-		dest = pc;
-
-	return dest;
-}
-
-
 static size_t
 arm64_code(void *simple lzma_attribute((__unused__)),
 		uint32_t now_pos, bool is_encoder,
@@ -51,29 +30,15 @@ arm64_code(void *simple lzma_attribute((__unused__)),
 {
 	size_t i;
 
 	// Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
 	// with auto-vectorization that is enabled by default with -O2.
-	// Even -Os, which doesn't use vectorization, produces faster code.
-	// Disabling vectorization with -O2 gives good speed (faster than -Os)
-	// and reasonable code size.
-	//
 	// Such vectorization bloat happens with -O2 when targeting ARM64 too
 	// but performance hasn't been tested.
-	//
-	// Clang 14 and 15 won't auto-vectorize this loop if the condition
-	// for ADRP is replaced with the commented-out version. However,
-	// at least Clang 14.0.6 doesn't generate as fast code with that
-	// condition. The commented-out code is also bigger.
-	//
-	// GCC 12.2 on x86-64 with -O2 produces good code with both versions
-	// of the ADRP if-statement although the single-branch version is
-	// slightly faster and smaller than the commented-out version.
-	//
-	// Speed is similar to non-vectorized clang -O2.
 #ifdef __clang__
 #	pragma clang loop vectorize(disable)
 #endif
 	for (i = 0; i + 4 <= size; i += 4) {
-		const uint32_t pc = (uint32_t)(now_pos + i);
+		uint32_t pc = (uint32_t)(now_pos + i);
 		uint32_t instr = read32le(buffer + i);
 
 		if ((instr >> 26) == 0x25) {
@@ -87,27 +52,17 @@ arm64_code(void *simple lzma_attribute((__unused__)),
 			// so this is a compromise that slightly favors big
 			// files. With the full range only six bits of the 32
 			// need to match to trigger a conversion.
-			const uint32_t mask26 = 0x03FFFFFF;
-			const uint32_t src = instr & mask26;
+			const uint32_t src = instr;
 			instr = 0x94000000;
 
-			if (src == 0)
-				continue;
+			pc >>= 2;
+			if (!is_encoder)
+				pc = 0U - pc;
 
-			instr |= arm64_conv(src, pc >> 2, mask26, is_encoder)
-					& mask26;
+			instr |= (src + pc) & 0x03FFFFFF;
 			write32le(buffer + i, instr);
-/*
-		// This is a more readable version of the one below but this
-		// has two branches. It results in bigger and slower code.
-		} else if ((instr & 0x9FF00000) == 0x90000000
-				|| (instr & 0x9FF00000) == 0x90F00000) {
-*/
-		// This is only a rotation, addition, and testing that
-		// none of the bits covered by the bitmask are set.
-		} else if (((((instr << 8) | (instr >> 24))
-				+ (0x10000000 - 0x90)) & 0xE000009F) == 0) {
+		} else if ((instr & 0x9F000000) == 0x90000000) {
 			// ADRP instruction:
 			// Only values in the range +/-512 MiB are converted.
 			//
@@ -120,15 +75,25 @@ arm64_code(void *simple lzma_attribute((__unused__)),
 			// range, nine bits of 32 need to match to trigger a
 			// conversion (two 10-bit match choices = 9 bits).
 			const uint32_t src = ((instr >> 29) & 3)
-					| ((instr >> 3) & 0x0003FFFC);
-			instr &= 0x9000001F;
+					| ((instr >> 3) & 0x001FFFFC);
 
-			if (src == 0)
+			// With the addition only one branch is needed to
+			// check the +/- range. This is usually false when
+			// processing ARM64 code so branch prediction will
+			// handle it well in terms of performance.
+			//
+			//if ((src & 0x001E0000) != 0
+			//		&& (src & 0x001E0000) != 0x001E0000)
+			if ((src + 0x00020000) & 0x001C0000)
 				continue;
 
-			const uint32_t dest = arm64_conv(
-					src, pc >> 12, 0x3FFFF, is_encoder);
+			instr &= 0x9000001F;
+
+			pc >>= 12;
+			if (!is_encoder)
+				pc = 0U - pc;
+
+			const uint32_t dest = src + pc;
 			instr |= (dest & 3) << 29;
 			instr |= (dest & 0x0003FFFC) << 3;
 			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
-- 
cgit v1.2.1
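
Below is an illustrative sketch, not part of the patch above: a small stand-alone C
program that mirrors the BL branch of arm64_code() after this change, showing how the
same add-the-PC transform both encodes and (by adding the negated PC) decodes. The
helper bl_convert() and the sample instruction word are hypothetical test values, not
liblzma API.

/* Sketch only: bl_convert() is hypothetical; the real conversion is the
 * arm64_code() loop shown in the patch above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Convert one 32-bit BL instruction word. "pc" is the byte offset of the
 * instruction from the start of the filtered stream (now_pos + i). The
 * encoder adds pc/4 to the 26-bit immediate; the decoder adds the two's
 * complement of pc/4, which undoes the addition modulo 2^26. */
static uint32_t
bl_convert(uint32_t instr, uint32_t pc, bool is_encoder)
{
	if ((instr >> 26) != 0x25)
		return instr;		/* Not a BL instruction. */

	const uint32_t src = instr;	/* Immediate is in the low 26 bits. */
	instr = 0x94000000;		/* Keep only the BL opcode bits. */

	pc >>= 2;			/* The immediate counts 4-byte words. */
	if (!is_encoder)
		pc = 0U - pc;

	return instr | ((src + pc) & 0x03FFFFFF);
}

int
main(void)
{
	/* A BL jumping back 0x40 bytes, placed 0x1000 bytes into the stream. */
	const uint32_t original = 0x97FFFFF0;
	const uint32_t pc = 0x1000;

	const uint32_t encoded = bl_convert(original, pc, true);
	const uint32_t decoded = bl_convert(encoded, pc, false);

	printf("0x%08X -> 0x%08X -> 0x%08X\n", original, encoded, decoded);
	return decoded == original ? 0 : 1;
}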