summary refs log tree commit diff
diff options
context:
space:
mode:
authoryuanhecai <yuanhecai@loongson.cn>2022-04-12 16:02:55 +0800
committeryuanhecai <yuanhecai@loongson.cn>2022-05-13 15:18:03 +0800
commit0d51bb2fc5e1e5581d8d378aad3ac61b3205b3b7 (patch)
tree830f59f5e56d78c3ee0c91138124fe32eb93b48b
parenta6bff83a603affa2799bbacedc24f9ca8632a5c6 (diff)
downloadlibvpx-0d51bb2fc5e1e5581d8d378aad3ac61b3205b3b7.tar.gz
vp9[loongarch]: Optimize vpx_hadamard_16x16/8x8
1. vpx_hadamard_16x16_lsx
2. vpx_hadamard_8x8_lsx

Bug: webm:1755
Change-Id: I3b1e0a2c026c3806b7bbbd191d0edf0e78912af7
-rw-r--r--test/hadamard_test.cc7
-rw-r--r--vpx_dsp/loongarch/avg_lsx.c90
-rw-r--r--vpx_dsp/loongarch/bitdepth_conversion_lsx.h48
-rw-r--r--vpx_dsp/vpx_dsp.mk4
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl8
5 files changed, 153 insertions, 4 deletions
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index dab945a56..10b1e79c1 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -285,6 +285,13 @@ INSTANTIATE_TEST_SUITE_P(
HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16)));
#endif // HAVE_VSX
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+ LSX, HadamardLowbdTest,
+ ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8),
+ HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16)));
+#endif // HAVE_LSX
+
#if CONFIG_VP9_HIGHBITDEPTH
class HadamardHighbdTest : public HadamardTestBase {
protected:
diff --git a/vpx_dsp/loongarch/avg_lsx.c b/vpx_dsp/loongarch/avg_lsx.c
new file mode 100644
index 000000000..750c9de29
--- /dev/null
+++ b/vpx_dsp/loongarch/avg_lsx.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h"
+
+/* Computes the 8x8 Hadamard transform of the int16 residual block at
+ * |src| (rows |src_stride| elements apart) and writes the 64 output
+ * coefficients contiguously to |dst|.
+ *
+ * Offset bookkeeping is subtle: __lsx_vldx takes a BYTE offset, so
+ * src_stride2 (= src_stride * 2 = one row in bytes, since
+ * sizeof(int16_t) == 2) addresses the next row, src_stride4 two rows
+ * ahead and src_stride6 three rows ahead.  By contrast the pointer
+ * bump "src_tmp += src_stride4" below is ELEMENT arithmetic and
+ * therefore advances four rows. */
+void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride,
+                          tran_low_t *dst) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  ptrdiff_t src_stride2 = src_stride << 1;
+  ptrdiff_t src_stride3 = src_stride2 + src_stride;
+  ptrdiff_t src_stride4 = src_stride2 << 1;
+  ptrdiff_t src_stride6 = src_stride3 << 1;
+
+  /* const is cast away only to satisfy the load intrinsics; nothing is
+   * written through src_tmp. */
+  int16_t *src_tmp = (int16_t *)src;
+  /* Load rows 0-3 via byte offsets, step four rows, then load rows 4-7. */
+  src0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2);
+  src3 = __lsx_vldx(src_tmp, src_stride6);
+  src_tmp += src_stride4;
+  src4 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6);
+  src7 = __lsx_vldx(src_tmp, src_stride6);
+
+  /* First 1-D pass: three add/sub butterfly stages across the eight row
+   * vectors.  The argument/result orderings encode the Hadamard sign
+   * pattern and must not be reordered. */
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  /* Transpose so the second pass operates on the other dimension. */
+  LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  /* Second 1-D pass: same three butterfly stages as above. */
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  /* store_tran_low() widens to tran_low_t when CONFIG_VP9_HIGHBITDEPTH;
+   * the third argument is an offset in dst ELEMENTS. */
+  store_tran_low(tmp0, dst, 0);
+  store_tran_low(tmp1, dst, 8);
+  store_tran_low(tmp2, dst, 16);
+  store_tran_low(tmp3, dst, 24);
+  store_tran_low(tmp4, dst, 32);
+  store_tran_low(tmp5, dst, 40);
+  store_tran_low(tmp6, dst, 48);
+  store_tran_low(tmp7, dst, 56);
+}
+
+/* Computes the 16x16 Hadamard transform of the int16 residual block at
+ * |src|: the four 8x8 quadrants are transformed independently into the
+ * four 64-coefficient quarters of |dst|, then combined in place. */
+void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride,
+                            tran_low_t *dst) {
+  int i;
+  __m128i a0, a1, a2, a3, b0, b1, b2, b3;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0);
+  /* Top right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192);
+
+  /* Combine stage: butterfly corresponding coefficients of the four
+   * sub-blocks, eight at a time.  The >>1 between the two butterflies
+   * gives each output the (a0 +/- a1 +/- a2 +/- a3) / 2 normalization
+   * used by the C reference (vpx_hadamard_16x16_c). */
+  for (i = 0; i < 64; i += 8) {
+    a0 = load_tran_low(dst);
+    a1 = load_tran_low(dst + 64);
+    a2 = load_tran_low(dst + 128);
+    a3 = load_tran_low(dst + 192);
+
+    LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1);
+    DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3);
+    LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2);
+
+    /* Results are written back over the same quarters they were read
+     * from, so the pass is in place. */
+    store_tran_low(a0, dst, 0);
+    store_tran_low(a1, dst, 64);
+    store_tran_low(a2, dst, 128);
+    store_tran_low(a3, dst, 192);
+
+    dst += 8;
+  }
+}
diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
new file mode 100644
index 000000000..4834f18fc
--- /dev/null
+++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* tran_low_t is int32_t in highbitdepth builds: read eight 32-bit
+ * coefficients (two vectors) from |s| and narrow them into one vector
+ * of eight int16 lanes.  Uses a GNU statement expression so the macro
+ * yields a value.
+ * NOTE(review): __lsx_vsrlni_h_w(v0_m, v1_m, 0) narrows both operands
+ * with a zero logical shift; confirm against the LSX spec that this
+ * operand order keeps the eight coefficients in memory order. */
+#define load_tran_low(s) \
+  ({ \
+    __m128i res0_m; \
+    __m128i v0_m = __lsx_vld(s, 0); \
+    __m128i v1_m = __lsx_vld(s + 4, 0); \
+    res0_m = __lsx_vsrlni_h_w(v0_m, v1_m, 0); \
+    res0_m; \
+  })
+
+/* Widen eight int16 lanes of |v| back to int32 and store at element
+ * offset |c| from |s|.  vsllwil.w.h with a zero shift presumably
+ * sign-extends the low four lanes and vexth.w.h the high four -- verify
+ * against loongson_intrinsics.h.  Note |s| + |c| is tran_low_t element
+ * arithmetic; the trailing 0 is the vst BYTE offset. */
+#define store_tran_low(v, s, c) \
+  { \
+    __m128i v0_m, v1_m; \
+    v1_m = __lsx_vexth_w_h(v); \
+    v0_m = __lsx_vsllwil_w_h(v, 0); \
+    __lsx_vst(v0_m, s + c, 0); \
+    __lsx_vst(v1_m, s + c + 4, 0); \
+  }
+#else
+/* tran_low_t is int16_t here, so load/store are straight vector moves. */
+#define load_tran_low(s) \
+  ({ \
+    __m128i res0_m; \
+    res0_m = __lsx_vld(s, 0); \
+    res0_m; \
+  })
+
+#define store_tran_low(v, s, c) __lsx_vst(v, s + c, 0)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index ddccfc1f4..7de8b0205 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -339,6 +339,7 @@ DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c
ifeq ($(VPX_ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
@@ -439,6 +440,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
+# LSX utilities
+DSP_SRCS-$(HAVE_LSX) += loongarch/bitdepth_conversion_lsx.h
+
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b441b337b..1c88dcdfa 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -789,10 +789,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
+ specialize qw/vpx_hadamard_8x8 sse2 neon vsx lsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_32x32 sse2 avx2/;
@@ -813,10 +813,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_highbd_satd avx2/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
+ specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_32x32 sse2 avx2/;