summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author yuanhecai <yuanhecai@loongson.cn> 2022-04-12 21:01:53 +0800
committer yuanhecai <yuanhecai@loongson.cn> 2022-05-13 15:18:08 +0800
commit 65d9ac5b5a3dd1c72c15a1fc5bcc004a43ad4c90 (patch)
tree cb1b9d352b5cd74671b8321c93194fc4ae9bca69
parent 0d51bb2fc5e1e5581d8d378aad3ac61b3205b3b7 (diff)
download libvpx-65d9ac5b5a3dd1c72c15a1fc5bcc004a43ad4c90.tar.gz
vp9[loongarch]: Optimize fdct4x4/8x8_lsx

1. vpx_fdct4x4_lsx
2. vpx_fdct8x8_lsx

Bug: webm:1755
Change-Id: If283fc08f9bedcbecd2c4052adb210f8fe00d4f0
-rw-r--r-- test/dct_test.cc 6
-rw-r--r-- test/fdct8x8_test.cc 7
-rw-r--r-- vpx_dsp/loongarch/fwd_txfm_lsx.c 92
-rw-r--r-- vpx_dsp/loongarch/fwd_txfm_lsx.h 99
-rw-r--r-- vpx_dsp/vpx_dsp_rtcd_defs.pl 4
5 files changed, 204 insertions, 4 deletions
diff --git a/test/dct_test.cc b/test/dct_test.cc
index 6178f8e2c..2182f87e5 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -587,7 +587,9 @@ INSTANTIATE_TEST_SUITE_P(VSX, TransDCT,
#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
-static const FuncInfo dct_lsx_func_info[2] = {
+static const FuncInfo dct_lsx_func_info[4] = {
+ { &fdct_wrapper<vpx_fdct4x4_lsx>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+ { &fdct_wrapper<vpx_fdct8x8_lsx>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
{ &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
16, 1 },
{ &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_lsx>,
@@ -596,7 +598,7 @@ static const FuncInfo dct_lsx_func_info[2] = {
INSTANTIATE_TEST_SUITE_P(
LSX, TransDCT,
- ::testing::Combine(::testing::Range(0, 2),
+ ::testing::Combine(::testing::Range(0, 4),
::testing::Values(dct_lsx_func_info),
::testing::Values(0), ::testing::Values(VPX_BITS_8)));
#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 0822666e7..83d1ff142 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -768,4 +768,11 @@ INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT,
&vpx_idct8x8_64_add_vsx,
0, VPX_BITS_8)));
#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT,
+ ::testing::Values(make_tuple(&vpx_fdct8x8_lsx,
+ &vpx_idct8x8_64_add_c, 0,
+ VPX_BITS_8)));
+#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c
index 03f194b43..6f2d4d6fe 100644
--- a/vpx_dsp/loongarch/fwd_txfm_lsx.c
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -11,6 +11,20 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \
+ \
+ DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \
+ DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _t2 = __lsx_vilvl_h(_s3, _s2); \
+ _t3 = __lsx_vilvh_h(_s3, _s2); \
+ DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \
+ DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \
+ }
+
#if !CONFIG_VP9_HIGHBITDEPTH
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
int32_t src_stride) {
@@ -240,6 +254,84 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) {
__lsx_vst(in7, output, 240);
}
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ __m128i in0, in1, in2, in3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+
+ in0 = __lsx_vld(input, 0);
+ DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+ in3 = __lsx_vldx(input, src_stride6);
+
+ /* fdct4 pre-process */
+ {
+ __m128i vec, mask;
+ __m128i zero = __lsx_vldi(0);
+
+ mask = __lsx_vinsgr2vr_b(zero, 1, 0);
+ DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
+ in3);
+ vec = __lsx_vseqi_h(in0, 0);
+ vec = __lsx_vxori_b(vec, 255);
+ vec = __lsx_vand_v(mask, vec);
+ in0 = __lsx_vadd_h(in0, vec);
+ }
+
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in2, output, 16);
+}
+
+void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+ int16_t *input_tmp = (int16_t *)input;
+
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
+ in2);
+ in3 = __lsx_vldx(input_tmp, src_stride6);
+ input_tmp += src_stride4;
+ in4 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
+ in6);
+ in7 = __lsx_vldx(input_tmp, src_stride6);
+
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in1, output, 16);
+ __lsx_vst(in2, output, 32);
+ __lsx_vst(in3, output, 48);
+ __lsx_vst(in4, output, 64);
+ __lsx_vst(in5, output, 80);
+ __lsx_vst(in6, output, 96);
+ __lsx_vst(in7, output, 112);
+}
+
void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
int32_t src_stride) {
int32_t i;
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h
index 9ed810226..d04427a6e 100644
--- a/vpx_dsp/loongarch/fwd_txfm_lsx.h
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h
@@ -14,6 +14,105 @@
#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
#include "vpx_dsp/txfm_common.h"
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
+ __m128i vec4_m, vec5_m, vec6_m, vec7_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \
+ \
+ LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+ DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \
+ cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \
+ vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+ vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \
+ cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \
+ vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+ DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \
+ vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
+ vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \
+ }
+
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
+ __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
+ \
+ /* FDCT stage1 */ \
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+ s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+ LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
+ x2_m = __lsx_vneg_h(x2_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
+ \
+ /* stage2 */ \
+ s1_m = __lsx_vilvl_h(s5_m, s6_m); \
+ s0_m = __lsx_vilvh_h(s5_m, s6_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
+ \
+ /* stage3 */ \
+ LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x0_m, x1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
+ \
+ x1_m = __lsx_vreplvei_h(coeff_m, 5); \
+ x0_m = __lsx_vneg_h(x0_m); \
+ x0_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 6); \
+ x3_m = __lsx_vneg_h(x3_m); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
+ }
+
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
+ { \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+ DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \
+ vec1_m, vec2_m, vec3_m); \
+ DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \
+ vec5_m, vec6_m, vec7_m); \
+ DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \
+ in3, in0, in1, in2, in3); \
+ DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \
+ in7, in4, in5, in6, in7); \
+ }
+
#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
{ \
__m128i tp0_m, tp1_m; \
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 1c88dcdfa..f17fc3b49 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -573,13 +573,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
} else {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct4x4 neon sse2 msa/;
+ specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2 neon/;
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+ specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64";
add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct8x8_1 sse2 neon msa/;