summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJames Zern <jzern@google.com>2015-01-31 09:47:15 -0800
committerJames Zern <jzern@google.com>2015-03-03 17:53:45 -0800
commit0339fa26eb233d2a756625e507ee51dd62c0562f (patch)
treed2da6bd1d74d012b39704acb0d21dce8fdab82e0
parent5a0c2207f4411a83213f22a05313afc14ca97955 (diff)
downloadlibwebp-0339fa26eb233d2a756625e507ee51dd62c0562f.tar.gz
lossless_neon: enable subtract green for aarch64
similar to: 1ba61b0 enable NEON intrinsics in aarch64 builds vtbl1_u8 is available everywhere but Xcode-based iOS arm64 builds, use vtbl1q_u8 there. performance varies based on the input, 1-3% on encode was observed (cherry picked from commit 416e1cea9b7f7a626341005cced947add7da5c54) Change-Id: Ifec35b37eb856acfcf69ed7f16fa078cd40b7034
-rw-r--r--src/dsp/lossless_neon.c48
1 files changed, 36 insertions, 12 deletions
diff --git a/src/dsp/lossless_neon.c b/src/dsp/lossless_neon.c
index 987767b5..9f8fa9da 100644
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@@ -259,20 +259,44 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
//------------------------------------------------------------------------------
// Subtract-Green Transform
-// vtbl? are unavailable in iOS/arm64 builds.
-#if !defined(__aarch64__)
+// vtbl?_u8 are marked unavailable for iOS arm64, use wider versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+ defined(__apple_build_version__)
+#define USE_VTBLQ
+#endif
+
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+ 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
-// 255 = byte will be zero'd
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+ const uint8x16_t shuffle) {
+ return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+ vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else // !USE_VTBLQ
+// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+ const uint8x8_t shuffle) {
+ return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+ vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif // USE_VTBLQ
+
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
- const uint8x16_t greens =
- vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
- vtbl1_u8(vget_high_u8(argb), shuffle));
+ const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
@@ -281,19 +305,21 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
- const uint8x16_t greens =
- vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
- vtbl1_u8(vget_high_u8(argb), shuffle));
+ const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
}
-#endif // !__aarch64__
+#undef USE_VTBLQ
#endif // USE_INTRINSICS
@@ -320,11 +346,9 @@ void VP8LDspInitNEON(void) {
VP8LPredictors[12] = Predictor12;
VP8LPredictors[13] = Predictor13;
-#if !defined(__aarch64__)
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
#endif
-#endif
#endif // WEBP_USE_NEON
}