summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
authorSalome Thirot <salome.thirot@arm.com>2023-03-10 16:30:36 +0000
committerSalome Thirot <salome.thirot@arm.com>2023-03-22 10:50:17 +0000
commit5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a (patch)
tree55710015d76eca92f8ef322062e653776bf45615 /vpx_dsp
parent882399bd54a82aa72ba766356d8fda31fbe40450 (diff)
downloadlibvpx-5c7867beacb35f8f937ad03f8ca5e2f1ae9c7a6a.tar.gz
Add Neon implementations of vpx_highbd_avg_<w>x<h>_c
Add Neon implementations of vpx_highbd_avg_4x4_c and vpx_highbd_avg_8x8_c, as well as the corresponding tests.
Change-Id: Ib1b06af5206774347690c9c56e194b76aa409c91
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/arm/highbd_avg_neon.c24
-rw-r--r--vpx_dsp/arm/mem_neon.h21
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl4
3 files changed, 47 insertions, 2 deletions
diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c
index b84a7875d..fc10197d7 100644
--- a/vpx_dsp/arm/highbd_avg_neon.c
+++ b/vpx_dsp/arm/highbd_avg_neon.c
@@ -16,6 +16,30 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
+uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) {  // Rounded average of 16 16-bit samples (two 8-lane vectors) read from a.
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);  // Presumably recovers the uint16_t sample pointer carried in the uint8_t* handle; macro defined elsewhere.
+ const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * a_stride, a_stride);  // Presumably rows 0 and 1 (4 samples each) packed into one vector; helper not shown here, verify in mem_neon.h.
+ const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * a_stride, a_stride);  // Same load starting two rows further down (presumably rows 2 and 3).
+ return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4;  // Lane-wise add, reduce across lanes, then (sum + 8) >> 4, i.e. divide by 16 rounding to nearest.
+}
+
+uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) {  // Rounded average of the 64 16-bit samples of an 8x8 block read from a.
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);  // Presumably recovers the uint16_t sample pointer carried in the uint8_t* handle; macro defined elsewhere.
+ uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
+
+ load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);  // Eight rows of eight samples, rows a_stride uint16_t elements apart.
+
+ sum = vaddq_u16(a0, a1);  // Column sums are accumulated lane-wise in 16 bits. NOTE(review): vaddq_u16 wraps modulo 2^16, so this assumes 8 samples fit in 16 bits (samples of at most 13 bits) - confirm against the supported bit depths.
+ sum = vaddq_u16(sum, a2);
+ sum = vaddq_u16(sum, a3);
+ sum = vaddq_u16(sum, a4);
+ sum = vaddq_u16(sum, a5);
+ sum = vaddq_u16(sum, a6);
+ sum = vaddq_u16(sum, a7);
+
+ return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;  // (sum + 32) >> 6, i.e. divide by 64 rounding to nearest; presumably the horizontal add widens to 32 bits - verify in sum_neon.h.
+}
+
// coeff: 32 bits, dynamic range [-2147483648, 2147483647].
// length: value range {16, 64, 256, 1024}.
// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024]
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index fa14f80b2..1a20da70e 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -419,4 +419,25 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
vst1q_u8(s, s7);
}
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+ uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+ uint16x8_t *s6, uint16x8_t *s7) {  // Loads 8 rows of 8 uint16_t starting at s into *s0..*s7 (row i goes to *s<i>).
+ *s0 = vld1q_u16(s);  // Each vld1q_u16 reads 8 consecutive uint16_t values from s.
+ s += p;  // Advance to the next row; p is a stride in uint16_t elements, not bytes.
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+ s += p;
+ *s7 = vld1q_u16(s);  // No stride step after the last row, so s never moves past row 7.
+}
+
#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 7cd3a0be8..6637186f8 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -995,10 +995,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Avg
#
add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
- specialize qw/vpx_highbd_avg_8x8 sse2/;
+ specialize qw/vpx_highbd_avg_8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
- specialize qw/vpx_highbd_avg_4x4 sse2/;
+ specialize qw/vpx_highbd_avg_4x4 sse2 neon/;
add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
specialize qw/vpx_highbd_minmax_8x8 neon/;