diff options
author | Frank Galligan <fgalligan@google.com> | 2015-01-20 14:38:39 -0800 |
---|---|---|
committer | Gerrit Code Review <gerrit@gerrit.golo.chromium.org> | 2015-01-20 14:38:39 -0800 |
commit | 469ff48d7bdbd2e39ca4c8ec2a530a6e79f85b39 (patch) | |
tree | 6aa61284d9e5a6b5fb459ac50bbf62921188bd08 /vp9/encoder/arm | |
parent | 79b88cc2a57a9c27bf12e053204dbc5d0c141ec6 (diff) | |
parent | 6e7e1cf32f85f91ddfcb49a807e598e8ead131fe (diff) | |
download | libvpx-469ff48d7bdbd2e39ca4c8ec2a530a6e79f85b39.tar.gz |
Merge "Add Neon intrinsics for vp9_avg_8x8_neon"
Diffstat (limited to 'vp9/encoder/arm')
-rw-r--r-- | vp9/encoder/arm/neon/vp9_avg_neon.c | 49 |
1 files changed, 49 insertions, 0 deletions
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c new file mode 100644 index 000000000..f505fcb7a --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_avg_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { + const uint32x4_t a = vpaddlq_u16(v_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { + uint8x8_t v_s0 = vld1_u8(s); + const uint8x8_t v_s1 = vld1_u8(s + p); + uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); + + v_s0 = vld1_u8(s + 2 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 3 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 4 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 5 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 6 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 7 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + return (horizontal_add_u16x8(v_sum) + 32) >> 6; +} |