summary refs log tree commit diff
diff options
context:
space:
mode:
authoryuanhecai <yuanhecai@loongson.cn>2022-04-12 16:02:55 +0800
committeryuanhecai <yuanhecai@loongson.cn>2022-05-13 15:18:03 +0800
commit0d51bb2fc5e1e5581d8d378aad3ac61b3205b3b7 (patch)
tree830f59f5e56d78c3ee0c91138124fe32eb93b48b
parenta6bff83a603affa2799bbacedc24f9ca8632a5c6 (diff)
downloadlibvpx-0d51bb2fc5e1e5581d8d378aad3ac61b3205b3b7.tar.gz
vp9[loongarch]: Optimize vpx_hadamard_16x16/8x8
1. vpx_hadamard_16x16_lsx
2. vpx_hadamard_8x8_lsx

Bug: webm:1755
Change-Id: I3b1e0a2c026c3806b7bbbd191d0edf0e78912af7
-rw-r--r--test/hadamard_test.cc7
-rw-r--r--vpx_dsp/loongarch/avg_lsx.c90
-rw-r--r--vpx_dsp/loongarch/bitdepth_conversion_lsx.h48
-rw-r--r--vpx_dsp/vpx_dsp.mk4
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl8
5 files changed, 153 insertions, 4 deletions
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index dab945a56..10b1e79c1 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -285,6 +285,13 @@ INSTANTIATE_TEST_SUITE_P(
HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16)));
#endif // HAVE_VSX
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+ LSX, HadamardLowbdTest,
+ ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8),
+ HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16)));
+#endif // HAVE_LSX
+
#if CONFIG_VP9_HIGHBITDEPTH
class HadamardHighbdTest : public HadamardTestBase {
protected:
diff --git a/vpx_dsp/loongarch/avg_lsx.c b/vpx_dsp/loongarch/avg_lsx.c
new file mode 100644
index 000000000..750c9de29
--- /dev/null
+++ b/vpx_dsp/loongarch/avg_lsx.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h"
+
+/* Computes the 8x8 Hadamard transform of the int16 residual block at
+ * |src| (rows |src_stride| elements apart) and writes the 64 output
+ * coefficients contiguously to |dst|.
+ *
+ * Offset bookkeeping is subtle: __lsx_vldx takes a BYTE offset, so
+ * src_stride2 (= src_stride * 2 = one row in bytes, since
+ * sizeof(int16_t) == 2) addresses the next row, src_stride4 two rows
+ * ahead and src_stride6 three rows ahead.  By contrast the pointer
+ * bump "src_tmp += src_stride4" below is ELEMENT arithmetic and
+ * therefore advances four rows. */
+void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride,
+                          tran_low_t *dst) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  ptrdiff_t src_stride2 = src_stride << 1;
+  ptrdiff_t src_stride3 = src_stride2 + src_stride;
+  ptrdiff_t src_stride4 = src_stride2 << 1;
+  ptrdiff_t src_stride6 = src_stride3 << 1;
+
+  /* const is cast away only to satisfy the load intrinsics; nothing is
+   * written through src_tmp. */
+  int16_t *src_tmp = (int16_t *)src;
+  /* Load rows 0-3 via byte offsets, step four rows, then load rows 4-7. */
+  src0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2);
+  src3 = __lsx_vldx(src_tmp, src_stride6);
+  src_tmp += src_stride4;
+  src4 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6);
+  src7 = __lsx_vldx(src_tmp, src_stride6);
+
+  /* First 1-D pass: three add/sub butterfly stages across the eight row
+   * vectors.  The argument/result orderings encode the Hadamard sign
+   * pattern and must not be reordered. */
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  /* Transpose so the second pass operates on the other dimension. */
+  LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  /* Second 1-D pass: same three butterfly stages as above. */
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  /* store_tran_low() widens to tran_low_t when CONFIG_VP9_HIGHBITDEPTH;
+   * the third argument is an offset in dst ELEMENTS. */
+  store_tran_low(tmp0, dst, 0);
+  store_tran_low(tmp1, dst, 8);
+  store_tran_low(tmp2, dst, 16);
+  store_tran_low(tmp3, dst, 24);
+  store_tran_low(tmp4, dst, 32);
+  store_tran_low(tmp5, dst, 40);
+  store_tran_low(tmp6, dst, 48);
+  store_tran_low(tmp7, dst, 56);
+}
+
+/* Computes the 16x16 Hadamard transform of the int16 residual block at
+ * |src|: the four 8x8 quadrants are transformed independently into the
+ * four 64-coefficient quarters of |dst|, then combined in place. */
+void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride,
+                            tran_low_t *dst) {
+  int i;
+  __m128i a0, a1, a2, a3, b0, b1, b2, b3;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0);
+  /* Top right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192);
+
+  /* Combine stage: butterfly corresponding coefficients of the four
+   * sub-blocks, eight at a time.  The >>1 between the two butterflies
+   * gives each output the (a0 +/- a1 +/- a2 +/- a3) / 2 normalization
+   * used by the C reference (vpx_hadamard_16x16_c). */
+  for (i = 0; i < 64; i += 8) {
+    a0 = load_tran_low(dst);
+    a1 = load_tran_low(dst + 64);
+    a2 = load_tran_low(dst + 128);
+    a3 = load_tran_low(dst + 192);
+
+    LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1);
+    DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3);
+    LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2);
+
+    /* Results are written back over the same quarters they were read
+     * from, so the pass is in place. */
+    store_tran_low(a0, dst, 0);
+    store_tran_low(a1, dst, 64);
+    store_tran_low(a2, dst, 128);
+    store_tran_low(a3, dst, 192);
+
+    dst += 8;
+  }
+}
diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
new file mode 100644
index 000000000..4834f18fc
--- /dev/null
+++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* tran_low_t is int32_t in highbitdepth builds: read eight 32-bit
+ * coefficients (two vectors) from |s| and narrow them into one vector
+ * of eight int16 lanes.  Uses a GNU statement expression so the macro
+ * yields a value.
+ * NOTE(review): __lsx_vsrlni_h_w(v0_m, v1_m, 0) narrows both operands
+ * with a zero logical shift; confirm against the LSX spec that this
+ * operand order keeps the eight coefficients in memory order. */
+#define load_tran_low(s) \
+  ({ \
+    __m128i res0_m; \
+    __m128i v0_m = __lsx_vld(s, 0); \
+    __m128i v1_m = __lsx_vld(s + 4, 0); \
+    res0_m = __lsx_vsrlni_h_w(v0_m, v1_m, 0); \
+    res0_m; \
+  })
+
+/* Widen eight int16 lanes of |v| back to int32 and store at element
+ * offset |c| from |s|.  vsllwil.w.h with a zero shift presumably
+ * sign-extends the low four lanes and vexth.w.h the high four -- verify
+ * against loongson_intrinsics.h.  Note |s| + |c| is tran_low_t element
+ * arithmetic; the trailing 0 is the vst BYTE offset. */
+#define store_tran_low(v, s, c) \
+  { \
+    __m128i v0_m, v1_m; \
+    v1_m = __lsx_vexth_w_h(v); \
+    v0_m = __lsx_vsllwil_w_h(v, 0); \
+    __lsx_vst(v0_m, s + c, 0); \
+    __lsx_vst(v1_m, s + c + 4, 0); \
+  }
+#else
+/* tran_low_t is int16_t here, so load/store are straight vector moves. */
+#define load_tran_low(s) \
+  ({ \
+    __m128i res0_m; \
+    res0_m = __lsx_vld(s, 0); \
+    res0_m; \
+  })
+
+#define store_tran_low(v, s, c) __lsx_vst(v, s + c, 0)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index ddccfc1f4..7de8b0205 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -339,6 +339,7 @@ DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c
ifeq ($(VPX_ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
@@ -439,6 +440,9 @@ DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
+# LSX utilities
+DSP_SRCS-$(HAVE_LSX) += loongarch/bitdepth_conversion_lsx.h
+
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b441b337b..1c88dcdfa 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -789,10 +789,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
+ specialize qw/vpx_hadamard_8x8 sse2 neon vsx lsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_32x32 sse2 avx2/;
@@ -813,10 +813,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_highbd_satd avx2/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
+ specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
+ specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_32x32 sse2 avx2/;