Merge "Add Neon intrinsics for vp9_avg_8x8_neon"

author: Frank Galligan <fgalligan@google.com> 2015-01-20 14:38:39 -0800
committer: Gerrit Code Review <gerrit@gerrit.golo.chromium.org> 2015-01-20 14:38:39 -0800
commit: 469ff48d7bdbd2e39ca4c8ec2a530a6e79f85b39 (patch)
tree: 6aa61284d9e5a6b5fb459ac50bbf62921188bd08 /vp9/encoder/arm
parent: 79b88cc2a57a9c27bf12e053204dbc5d0c141ec6 (diff)
parent: 6e7e1cf32f85f91ddfcb49a807e598e8ead131fe (diff)
download: libvpx-469ff48d7bdbd2e39ca4c8ec2a530a6e79f85b39.tar.gz
1 files changed, 49 insertions, 0 deletions
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
new file mode 100644
index 000000000..f505fcb7a
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {  // Returns the sum of the eight 16-bit lanes of v_16x8 as a scalar.
+  const uint32x4_t a = vpaddlq_u16(v_16x8);  // Pairwise widening add: 8 x u16 -> 4 x u32.
+  const uint64x2_t b = vpaddlq_u32(a);  // Pairwise widening add: 4 x u32 -> 2 x u64.
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),  // Add the two 64-bit halves, each viewed as a pair of u32 lanes.
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);  // Eight u16 values sum to < 2^32, so one 32-bit lane holds the total; NOTE(review): assumes lane 0 is the low word -- confirm for big-endian targets.
+}
+
+unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {  // Returns the rounded mean of the 8x8 block of bytes at s, whose rows are p bytes apart.
+  uint8x8_t v_s0 = vld1_u8(s);  // Row 0 (8 bytes).
+  const uint8x8_t v_s1 = vld1_u8(s + p);  // Row 1.
+  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);  // Widening add of rows 0 and 1 gives eight 16-bit column sums.
+
+  v_s0 = vld1_u8(s + 2 * p);  // Rows 2..7 below reuse v_s0 as the load target.
+  v_sum = vaddw_u8(v_sum, v_s0);  // Widen the 8-bit row and accumulate it into the 16-bit column sums.
+
+  v_s0 = vld1_u8(s + 3 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 4 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 5 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 6 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 7 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);  // Each lane now holds at most 8 * 255 = 2040, well within 16 bits.
+
+  return (horizontal_add_u16x8(v_sum) + 32) >> 6;  // Divide the 64-value total by 64, adding 32 first so the result rounds to nearest.
+}
author	Frank Galligan <fgalligan@google.com>	2015-01-20 14:38:39 -0800
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>	2015-01-20 14:38:39 -0800
commit	469ff48d7bdbd2e39ca4c8ec2a530a6e79f85b39 (patch)
tree	6aa61284d9e5a6b5fb459ac50bbf62921188bd08 /vp9/encoder/arm
parent	79b88cc2a57a9c27bf12e053204dbc5d0c141ec6 (diff)
parent	6e7e1cf32f85f91ddfcb49a807e598e8ead131fe (diff)
download	libvpx-469ff48d7bdbd2e39ca4c8ec2a530a6e79f85b39.tar.gz