diff options
author | James Zern <jzern@google.com> | 2015-01-31 09:47:15 -0800 |
---|---|---|
committer | James Zern <jzern@google.com> | 2015-03-03 17:53:45 -0800 |
commit | 0339fa26eb233d2a756625e507ee51dd62c0562f (patch) | |
tree | d2da6bd1d74d012b39704acb0d21dce8fdab82e0 | |
parent | 5a0c2207f4411a83213f22a05313afc14ca97955 (diff) | |
download | libwebp-0339fa26eb233d2a756625e507ee51dd62c0562f.tar.gz |
lossless_neon: enable subtract green for aarch64
similar to:
1ba61b0 enable NEON intrinsics in aarch64 builds
vtbl1_u8 is available everywhere except Xcode-based iOS arm64 builds;
use vtbl1q_u8 there.
Performance varies based on the input; 1-3% on encode was observed.
(cherry picked from commit 416e1cea9b7f7a626341005cced947add7da5c54)
Change-Id: Ifec35b37eb856acfcf69ed7f16fa078cd40b7034
-rw-r--r-- | src/dsp/lossless_neon.c | 48 |
1 files changed, 36 insertions, 12 deletions
diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c index 987767b5..9f8fa9da 100644 --- a/src/dsp/lossless_neon.c +++ b/src/dsp/lossless_neon.c @@ -259,20 +259,44 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { //------------------------------------------------------------------------------ // Subtract-Green Transform -// vtbl? are unavailable in iOS/arm64 builds. -#if !defined(__aarch64__) +// vtbl?_u8 are marked unavailable for iOS arm64, use wider versions there. +#if defined(__APPLE__) && defined(__aarch64__) && \ + defined(__apple_build_version__) +#define USE_VTBLQ +#endif + +#ifdef USE_VTBLQ +// 255 = byte will be zeroed +static const uint8_t kGreenShuffle[16] = { + 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255 +}; -// 255 = byte will be zero'd +static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, + const uint8x16_t shuffle) { + return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)), + vtbl1q_u8(argb, vget_high_u8(shuffle))); +} +#else // !USE_VTBLQ +// 255 = byte will be zeroed static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; +static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb, + const uint8x8_t shuffle) { + return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), + vtbl1_u8(vget_high_u8(argb), shuffle)); +} +#endif // USE_VTBLQ + static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); +#ifdef USE_VTBLQ + const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); +#else const uint8x8_t shuffle = vld1_u8(kGreenShuffle); +#endif for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); - const uint8x16_t greens = - vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), - vtbl1_u8(vget_high_u8(argb), shuffle)); + const uint8x16_t greens = DoGreenShuffle(argb, shuffle); vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens)); } // 
fallthrough and finish off with plain-C @@ -281,19 +305,21 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); +#ifdef USE_VTBLQ + const uint8x16_t shuffle = vld1q_u8(kGreenShuffle); +#else const uint8x8_t shuffle = vld1_u8(kGreenShuffle); +#endif for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); - const uint8x16_t greens = - vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), - vtbl1_u8(vget_high_u8(argb), shuffle)); + const uint8x16_t greens = DoGreenShuffle(argb, shuffle); vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); } // fallthrough and finish off with plain-C VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); } -#endif // !__aarch64__ +#undef USE_VTBLQ #endif // USE_INTRINSICS @@ -320,11 +346,9 @@ void VP8LDspInitNEON(void) { VP8LPredictors[12] = Predictor12; VP8LPredictors[13] = Predictor13; -#if !defined(__aarch64__) VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed; VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed; #endif -#endif #endif // WEBP_USE_NEON } |