diff options
author | yuanhecai <yuanhecai@loongson.cn> | 2022-04-12 09:10:27 +0800 |
---|---|---|
committer | yuanhecai <yuanhecai@loongson.cn> | 2022-04-28 09:35:30 +0800 |
commit | 1b00ad52630a0379d2df16a4fc7351f4e3d0896e (patch) | |
tree | 83d9d09f162ab5155c565a9cb36cace949fab269 | |
parent | b1ed8e08a21b33c0f5039559113004bee7943dc4 (diff) | |
download | libvpx-1b00ad52630a0379d2df16a4fc7351f4e3d0896e.tar.gz |
vp9[loongarch]: Optimize sad8x8/32x64/64x32x4d
1. vpx_sad8x8x4d_lsx
2. vpx_sad32x64x4d_lsx
3. vpx_sad64x32x4d_lsx
Bug: webm:1755
Change-Id: I08a2b8717ec8623ffdd4451a04e68fa3a7228668
-rw-r--r-- | test/sad_test.cc | 3 | ||||
-rw-r--r-- | vpx_dsp/loongarch/sad_lsx.c | 97 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 |
3 files changed, 99 insertions, 7 deletions
diff --git a/test/sad_test.cc b/test/sad_test.cc index 12a6206b9..7ce25343f 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1147,8 +1147,11 @@ INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests)); const SadMxNx4Param x4d_lsx_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_lsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_lsx), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_lsx), }; INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests)); #endif // HAVE_LSX diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c index 30464b366..4764acbf8 100644 --- a/vpx_dsp/loongarch/sad_lsx.c +++ b/vpx_dsp/loongarch/sad_lsx.c @@ -165,6 +165,81 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, return sad; } +static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 2); + uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src0, src1, src2, src3, sad_tmp; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src0 = __lsx_vld(src_ptr, 0); + DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_ptr, src_stride3); + src_ptr += src_stride4; + ref0 = __lsx_vld(ref0_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1, + ref2); + ref3 = __lsx_vldx(ref0_ptr, ref_stride3); + ref0_ptr += ref_stride4; + ref4 = __lsx_vld(ref1_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5, + ref6); + ref7 = __lsx_vldx(ref1_ptr, ref_stride3); + ref1_ptr += ref_stride4; + ref8 = __lsx_vld(ref2_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9, + ref10); + ref11 = __lsx_vldx(ref2_ptr, ref_stride3); + ref2_ptr += ref_stride4; + ref12 = __lsx_vld(ref3_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13, + ref14); + ref15 = __lsx_vldx(ref3_ptr, ref_stride3); + ref3_ptr += ref_stride4; + + DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); + sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -527,6 +602,13 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \ } +#define VPX_SAD_8xHTx4D_LSX(height) \ + void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + #define VPX_SAD_16xHTx4D_LSX(height) \ void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -564,13 +646,15 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, second_pred); \ } -#define SAD64 \ - VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_AVGSAD_64xHT_LSX(64) +#define SAD64 \ + VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \ + VPX_AVGSAD_64xHT_LSX(64) SAD64 -#define SAD32 \ - VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_AVGSAD_32xHT_LSX(32) +#define SAD32 \ + VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \ + VPX_AVGSAD_32xHT_LSX(32) SAD32 @@ -578,6 +662,11 @@ SAD32 SAD16 +#define SAD8 VPX_SAD_8xHTx4D_LSX(8) + +SAD8 + #undef SAD64 #undef SAD32 #undef SAD16 +#undef SAD8 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 68d4f86f2..b441b337b 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -881,10 +881,10 @@ add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, con specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; @@ -905,7 +905,7 @@ add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, cons specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; +specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/; add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; |