summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoryuanhecai <yuanhecai@loongson.cn>2022-05-17 19:06:04 +0800
committeryuanhecai <yuanhecai@loongson.cn>2022-05-20 10:58:38 +0800
commitf92c451e6c03685e28217f2080cc52a994938664 (patch)
tree63b1b20b1643fe338170dd1a7770bc93cb38733f
parent63378a94f996304e2784ecd6584e70cf487991e9 (diff)
downloadlibvpx-f92c451e6c03685e28217f2080cc52a994938664.tar.gz
loongarch: Modify the representation of macros
Some macros have been changed to "#define do {...} while (0)"; change the rest to "static INLINE ..." Bug: webm:1755 Change-Id: I445ac0c543f12df38f086b479394b111058367d0
-rw-r--r--vp8/common/loongarch/idct_lsx.c39
-rw-r--r--vp8/common/loongarch/loopfilter_filters_lsx.c16
-rw-r--r--vp8/common/loongarch/sixtap_filter_lsx.c322
-rw-r--r--vpx_dsp/loongarch/bitdepth_conversion_lsx.h43
-rw-r--r--vpx_dsp/loongarch/fwd_dct32x32_lsx.c4
-rw-r--r--vpx_dsp/loongarch/fwd_txfm_lsx.c4
-rw-r--r--vpx_dsp/loongarch/fwd_txfm_lsx.h40
-rw-r--r--vpx_dsp/loongarch/idct32x32_lsx.c4
-rw-r--r--vpx_dsp/loongarch/loopfilter_16_lsx.c8
-rw-r--r--vpx_dsp/loongarch/loopfilter_lsx.h20
-rw-r--r--vpx_dsp/loongarch/quantize_lsx.c192
-rw-r--r--vpx_dsp/loongarch/sad_lsx.c231
-rw-r--r--vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c102
-rw-r--r--vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c28
-rw-r--r--vpx_dsp/loongarch/vpx_convolve8_lsx.c110
-rw-r--r--vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c44
-rw-r--r--vpx_dsp/loongarch/vpx_convolve_lsx.h100
17 files changed, 654 insertions, 653 deletions
diff --git a/vp8/common/loongarch/idct_lsx.c b/vp8/common/loongarch/idct_lsx.c
index 679019ff6..eee871eec 100644
--- a/vp8/common/loongarch/idct_lsx.c
+++ b/vp8/common/loongarch/idct_lsx.c
@@ -16,47 +16,44 @@ static const int32_t cospi8sqrt2minus1 = 20091;
static const int32_t sinpi8sqrt2 = 35468;
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
+ do { \
__m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \
DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \
DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
- }
+ } while (0)
#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
+ do { \
__m128i s4_m, s5_m, s6_m, s7_m; \
\
TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \
DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \
out1 = __lsx_vilvh_d(s6_m, s4_m); \
out3 = __lsx_vilvh_d(s7_m, s5_m); \
- }
+ } while (0)
-#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in) \
- ({ \
- __m128i out_m; \
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1) \
+ do { \
__m128i zero_m = __lsx_vldi(0); \
__m128i tmp1_m, tmp2_m; \
__m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \
\
- tmp1_m = __lsx_vilvl_h(in, zero_m); \
- tmp2_m = __lsx_vilvh_h(in, zero_m); \
+ tmp1_m = __lsx_vilvl_h(in0, zero_m); \
+ tmp2_m = __lsx_vilvh_h(in0, zero_m); \
tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \
tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \
tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \
tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \
tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \
tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \
- out_m = __lsx_vpickev_h(tmp2_m, tmp1_m); \
- \
- out_m; \
- })
+ in1 = __lsx_vpickev_h(tmp2_m, tmp1_m); \
+ } while (0)
#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
+ do { \
__m128i a1_m, b1_m, c1_m, d1_m; \
__m128i c_tmp1_m, c_tmp2_m; \
__m128i d_tmp1_m, d_tmp2_m; \
@@ -65,7 +62,7 @@ static const int32_t sinpi8sqrt2 = 35468;
const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \
a1_m = __lsx_vadd_h(in0, in2); \
b1_m = __lsx_vsub_h(in0, in2); \
- c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1); \
+ EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m); \
\
c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \
c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \
@@ -77,13 +74,13 @@ static const int32_t sinpi8sqrt2 = 35468;
d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \
d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \
d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \
- d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3); \
+ EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m); \
d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \
LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
- }
+ } while (0)
#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
+ do { \
__m128i a1_m, b1_m, c1_m, d1_m; \
__m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \
__m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \
@@ -105,13 +102,13 @@ static const int32_t sinpi8sqrt2 = 35468;
d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \
d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \
LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
- }
+ } while (0)
#define UNPCK_SH_SW(in, out0, out1) \
- { \
+ do { \
out0 = __lsx_vsllwil_w_h(in, 0); \
out1 = __lsx_vexth_w_h(in); \
- }
+ } while (0)
static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred,
int32_t pred_stride, uint8_t *dest,
diff --git a/vp8/common/loongarch/loopfilter_filters_lsx.c b/vp8/common/loongarch/loopfilter_filters_lsx.c
index a3ac76d25..f743ec0c5 100644
--- a/vp8/common/loongarch/loopfilter_filters_lsx.c
+++ b/vp8/common/loongarch/loopfilter_filters_lsx.c
@@ -14,7 +14,7 @@
#include "vpx_util/loongson_intrinsics.h"
#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \
- { \
+ do { \
__m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
const __m128i cnst4b = __lsx_vldi(4); \
const __m128i cnst3b = __lsx_vldi(3); \
@@ -46,10 +46,10 @@
q1 = __lsx_vxori_b(q1_m, 0x80); \
p1_m = __lsx_vsadd_b(p1_m, filt); \
p1 = __lsx_vxori_b(p1_m, 0x80); \
- }
+ } while (0)
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
- { \
+ do { \
__m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
__m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \
__m128i filt_r, filt_l; \
@@ -113,12 +113,12 @@
p0_m = __lsx_vsadd_b(p0_m, u); \
q0 = __lsx_vxori_b(q0_m, 0x80); \
p0 = __lsx_vxori_b(p0_m, 0x80); \
- }
+ } while (0)
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
flat_out) \
- { \
+ do { \
__m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
__m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
\
@@ -143,13 +143,13 @@
mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \
mask_out = __lsx_vslt_bu(limit_in, mask_out); \
mask_out = __lsx_vxori_b(mask_out, 0xff); \
- }
+ } while (0)
#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \
- { \
+ do { \
__lsx_vstelm_w(in0, pdst, 0, in0_idx); \
__lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \
- }
+ } while (0)
static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
const uint8_t *b_limit0_ptr,
diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c
index a23ed16d2..cd7ba5474 100644
--- a/vp8/common/loongarch/sixtap_filter_lsx.c
+++ b/vp8/common/loongarch/sixtap_filter_lsx.c
@@ -33,37 +33,61 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
-#define DPADD_H3(in0, in1, in2, coeff0, coeff1, coeff2) \
- ({ \
- __m128i out0_m; \
- \
- out0_m = __lsx_vdp2_h_b(in0, coeff0); \
- out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); \
- out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); \
- \
- out0_m; \
- })
-
-#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
- filt_h2) \
- ({ \
- __m128i vec0_m, vec1_m, vec2_m; \
- __m128i hz_out_m; \
- \
- DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \
- vec1_m); \
- vec2_m = __lsx_vshuf_b(src1, src0, mask2); \
- hz_out_m = DPADD_H3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \
- \
- hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \
- hz_out_m = __lsx_vsat_h(hz_out_m, 7); \
- \
- hz_out_m; \
- })
+static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2,
+ __m128i coeff0, __m128i coeff1, __m128i coeff2) {
+ __m128i out0_m;
+
+ out0_m = __lsx_vdp2_h_b(in0, coeff0);
+ out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);
+ out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);
+
+ return out0_m;
+}
+
+static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+ __m128i mask1, __m128i mask2,
+ __m128i filt_h0, __m128i filt_h1,
+ __m128i filt_h2) {
+ __m128i vec0_m, vec1_m, vec2_m;
+ __m128i hz_out_m;
+
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
+ vec1_m);
+ vec2_m = __lsx_vshuf_b(src1, src0, mask2);
+ hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);
+ hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
+ hz_out_m = __lsx_vsat_h(hz_out_m, 7);
+
+ return hz_out_m;
+}
+
+static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1,
+ __m128i filt0, __m128i filt1) {
+ __m128i tmp_m;
+
+ tmp_m = __lsx_vdp2_h_b(vec0, filt0);
+ tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1);
+
+ return tmp_m;
+}
+
+static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+ __m128i mask1, __m128i filt_h0,
+ __m128i filt_h1) {
+ __m128i vec0_m, vec1_m, hz_out_m;
+
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
+ vec1_m);
+ hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1);
+ hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
+ hz_out_m = __lsx_vsat_h(hz_out_m, 7);
+
+ return hz_out_m;
+}
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
mask2, filt0, filt1, filt2, out0, out1) \
- { \
+ do { \
__m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \
\
DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
@@ -77,12 +101,12 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
vec5_m); \
DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
out0, out1); \
- }
+ } while (0)
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
mask2, filt0, filt1, filt2, out0, out1, \
out2, out3) \
- ({ \
+ do { \
__m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
\
DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
@@ -105,35 +129,11 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \
out3); \
- })
-
-#define FILT_4TAP_DPADD_H(vec0, vec1, filt0, filt1) \
- ({ \
- __m128i tmp0; \
- \
- tmp0 = __lsx_vdp2_h_b(vec0, filt0); \
- tmp0 = __lsx_vdp2add_h_b(tmp0, vec1, filt1); \
- \
- tmp0; \
- })
-
-#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \
- ({ \
- __m128i vec0_m, vec1_m; \
- __m128i hz_out_m; \
- \
- DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, \
- vec1_m); \
- hz_out_m = FILT_4TAP_DPADD_H(vec0_m, vec1_m, filt_h0, filt_h1); \
- hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); \
- hz_out_m = __lsx_vsat_h(hz_out_m, 7); \
- \
- hz_out_m; \
- })
+ } while (0)
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
filt0, filt1, out0, out1) \
- { \
+ do { \
__m128i vec0_m, vec1_m, vec2_m, vec3_m; \
\
DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
@@ -143,11 +143,11 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
vec3_m); \
DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
out0, out1); \
- }
+ } while (0)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
filt0, filt1, out0, out1, out2, out3) \
- ({ \
+ do { \
__m128i vec0_m, vec1_m, vec2_m, vec3_m; \
\
DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
@@ -163,7 +163,7 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \
out3); \
- })
+ } while (0)
static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src,
int32_t src_stride,
@@ -424,8 +424,8 @@ static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554,
src8776);
DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776);
- out0 = DPADD_H3(src2110, src4332, src6554, filt0, filt1, filt2);
- out1 = DPADD_H3(src4332, src6554, src8776, filt0, filt1, filt2);
+ out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2);
+ out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2);
out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
out0 = __lsx_vxori_b(out0, 128);
@@ -487,10 +487,10 @@ static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9,
src76_r, src87_r, src98_r, src109_r);
- out0_r = DPADD_H3(src10_r, src32_r, src76_r, filt0, filt1, filt2);
- out1_r = DPADD_H3(src21_r, src43_r, src87_r, filt0, filt1, filt2);
- out2_r = DPADD_H3(src32_r, src76_r, src98_r, filt0, filt1, filt2);
- out3_r = DPADD_H3(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+ out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+ out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+ out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+ out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2);
DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
@@ -555,14 +555,14 @@ static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src54_r, src65_r, src76_r, src87_r);
DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
src54_l, src65_l, src76_l, src87_l);
- out0_r = DPADD_H3(src10_r, src32_r, src54_r, filt0, filt1, filt2);
- out1_r = DPADD_H3(src21_r, src43_r, src65_r, filt0, filt1, filt2);
- out2_r = DPADD_H3(src32_r, src54_r, src76_r, filt0, filt1, filt2);
- out3_r = DPADD_H3(src43_r, src65_r, src87_r, filt0, filt1, filt2);
- out0_l = DPADD_H3(src10_l, src32_l, src54_l, filt0, filt1, filt2);
- out1_l = DPADD_H3(src21_l, src43_l, src65_l, filt0, filt1, filt2);
- out2_l = DPADD_H3(src32_l, src54_l, src76_l, filt0, filt1, filt2);
- out3_l = DPADD_H3(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+ out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+ out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+ out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+ out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+ out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+ out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+ out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+ out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2);
DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
@@ -621,12 +621,12 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src1, src2, src3);
src4 = __lsx_vxori_b(src4, 128);
- hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
- hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
- hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
@@ -636,7 +636,7 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src += src_stride_x2;
DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
- hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+ hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
@@ -645,15 +645,15 @@ static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src += src_stride_x2;
DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8);
- hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
+ hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
out2 = __lsx_vpackev_b(hz_out5, hz_out4);
- tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+ tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
out3 = __lsx_vpackev_b(hz_out7, hz_out6);
- tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+ tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
tmp0 = __lsx_vxori_b(tmp0, 128);
@@ -710,15 +710,15 @@ static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src1, src2, src3);
src4 = __lsx_vxori_b(src4, 128);
- hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
- hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
- hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
- hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
- hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
filt = __lsx_vld(filter_vert, 0);
DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
@@ -734,25 +734,25 @@ static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
src6, src7, src8);
- hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+ hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
out2 = __lsx_vpackev_b(hz_out5, hz_out4);
- tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+ tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
- hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+ hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
out5 = __lsx_vpackev_b(hz_out6, hz_out5);
- tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+ tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
- hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
+ hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
out7 = __lsx_vpackev_b(hz_out7, hz_out6);
- tmp2 = DPADD_H3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+ tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
- hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
+ hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
out6 = __lsx_vpackev_b(hz_out8, hz_out7);
- tmp3 = DPADD_H3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+ tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
VP8_FILTER_SHIFT, vec0, vec1);
@@ -997,14 +997,14 @@ static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
src4332 = __lsx_vilvl_d(src43_r, src32_r);
src4332 = __lsx_vxori_b(src4332, 128);
- out0 = FILT_4TAP_DPADD_H(src2110, src4332, filt0, filt1);
+ out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1);
src2 = __lsx_vld(src, 0);
src += src_stride;
DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r);
src2110 = __lsx_vilvl_d(src65_r, src54_r);
src2110 = __lsx_vxori_b(src2110, 128);
- out1 = FILT_4TAP_DPADD_H(src4332, src2110, filt0, filt1);
+ out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1);
out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
out0 = __lsx_vxori_b(out0, 128);
@@ -1055,10 +1055,10 @@ static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src8, src9, src10);
DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
src72_r, src87_r, src98_r, src109_r);
- out0_r = FILT_4TAP_DPADD_H(src10_r, src72_r, filt0, filt1);
- out1_r = FILT_4TAP_DPADD_H(src21_r, src87_r, filt0, filt1);
- out2_r = FILT_4TAP_DPADD_H(src72_r, src98_r, filt0, filt1);
- out3_r = FILT_4TAP_DPADD_H(src87_r, src109_r, filt0, filt1);
+ out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1);
+ out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1);
+ out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1);
+ out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1);
DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
@@ -1114,14 +1114,14 @@ static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src32_r, src43_r, src54_r, src65_r);
DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5,
src32_l, src43_l, src54_l, src65_l);
- out0_r = FILT_4TAP_DPADD_H(src10_r, src32_r, filt0, filt1);
- out1_r = FILT_4TAP_DPADD_H(src21_r, src43_r, filt0, filt1);
- out2_r = FILT_4TAP_DPADD_H(src32_r, src54_r, filt0, filt1);
- out3_r = FILT_4TAP_DPADD_H(src43_r, src65_r, filt0, filt1);
- out0_l = FILT_4TAP_DPADD_H(src10_l, src32_l, filt0, filt1);
- out1_l = FILT_4TAP_DPADD_H(src21_l, src43_l, filt0, filt1);
- out2_l = FILT_4TAP_DPADD_H(src32_l, src54_l, filt0, filt1);
- out3_l = FILT_4TAP_DPADD_H(src43_l, src65_l, filt0, filt1);
+ out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1);
+ out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1);
+ out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1);
+ out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1);
+ out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1);
+ out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1);
+ out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1);
+ out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1);
DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
@@ -1168,8 +1168,8 @@ static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
src2 = __lsx_vxori_b(src2, 128);
- hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
- hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
@@ -1182,16 +1182,16 @@ static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src += src_stride_x4;
DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4);
- hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
- tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1);
+ tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
- hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
- tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1);
+ tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
tmp0 = __lsx_vxori_b(tmp0, 128);
@@ -1239,9 +1239,9 @@ static inline void common_hv_4ht_4vt_8w_lsx(
DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
src2 = __lsx_vxori_b(src2, 128);
- hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
- hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
- hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
filt = __lsx_vld(filter_vert, 0);
@@ -1254,21 +1254,21 @@ static inline void common_hv_4ht_4vt_8w_lsx(
DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
src4, src5, src6);
- hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
- tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1);
+ tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
- hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
- tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1);
+ tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
- hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
vec4 = __lsx_vpackev_b(hz_out1, hz_out0);
- tmp2 = FILT_4TAP_DPADD_H(vec1, vec4, filt_vt0, filt_vt1);
+ tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1);
- hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1);
- tmp3 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1);
+ tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
@@ -1324,9 +1324,9 @@ static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
src2 = __lsx_vxori_b(src2, 128);
- hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
- hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
@@ -1341,17 +1341,17 @@ static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
src4, src5, src6);
- hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
+ hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
- tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1);
+ tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
- hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
+ hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
- tmp1 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1);
+ tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
@@ -1402,11 +1402,11 @@ static inline void common_hv_6ht_4vt_8w_lsx(
DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
src2 = __lsx_vxori_b(src2, 128);
- hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
- hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
- hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+ hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
filt_hz2);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
@@ -1420,25 +1420,25 @@ static inline void common_hv_6ht_4vt_8w_lsx(
DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
src4, src5, src6);
- hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
+ hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
- tmp0 = FILT_4TAP_DPADD_H(vec0, vec1, filt_vt0, filt_vt1);
+ tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
- hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
+ hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
- tmp1 = FILT_4TAP_DPADD_H(vec2, vec3, filt_vt0, filt_vt1);
+ tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
- hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
+ hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
- tmp2 = FILT_4TAP_DPADD_H(vec1, vec0, filt_vt0, filt_vt1);
+ tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1);
- hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
+ hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
filt_hz1, filt_hz2);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
- tmp3 = FILT_4TAP_DPADD_H(vec1, vec2, filt_vt0, filt_vt1);
+ tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
@@ -1492,9 +1492,9 @@ static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
src1, src2, src3);
src4 = __lsx_vxori_b(src4, 128);
- hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
- hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
- hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
@@ -1510,15 +1510,15 @@ static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
src6, src7, src8);
src += src_stride_x4;
- hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
out2 = __lsx_vpackev_b(hz_out5, hz_out4);
- tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+ tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
- hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
out3 = __lsx_vpackev_b(hz_out7, hz_out6);
- tmp1 = DPADD_H3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+ tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
tmp0 = __lsx_vxori_b(tmp0, 128);
@@ -1571,11 +1571,11 @@ static inline void common_hv_4ht_6vt_8w_lsx(
DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
src1, src2, src3);
src4 = __lsx_vxori_b(src4, 128);
- hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
- hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
- hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
- hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
- hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
@@ -1590,21 +1590,21 @@ static inline void common_hv_4ht_6vt_8w_lsx(
DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
src6, src7, src8);
- hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
out2 = __lsx_vpackev_b(hz_out5, hz_out4);
- tmp0 = DPADD_H3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+ tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
- hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
out5 = __lsx_vpackev_b(hz_out6, hz_out5);
- tmp1 = DPADD_H3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+ tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
- hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
out6 = __lsx_vpackev_b(hz_out7, hz_out6);
- tmp2 = DPADD_H3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+ tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
- hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+ hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
out7 = __lsx_vpackev_b(hz_out8, hz_out7);
- tmp3 = DPADD_H3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+ tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1);
DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
__lsx_vstelm_d(vec0, dst, 0, 0);
diff --git a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
index 4834f18fc..b0db1e99c 100644
--- a/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
+++ b/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
@@ -16,33 +16,26 @@
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_util/loongson_intrinsics.h"
+static INLINE __m128i load_tran_low(const tran_low_t *s) {
#if CONFIG_VP9_HIGHBITDEPTH
-#define load_tran_low(s) \
- ({ \
- __m128i res0_m; \
- __m128i v0_m = __lsx_vld(s, 0); \
- __m128i v1_m = __lsx_vld(s + 4, 0); \
- res0_m = __lsx_vsrlni_h_w(v0_m, v1_m, 0); \
- res0_m; \
- })
-
-#define store_tran_low(v, s, c) \
- { \
- __m128i v0_m, v1_m; \
- v1_m = __lsx_vexth_w_h(v); \
- v0_m = __lsx_vsllwil_w_h(v, 0); \
- __lsx_vst(v0_m, s + c, 0); \
- __lsx_vst(v1_m, s + c + 4, 0); \
- }
+ __m128i v0_m = __lsx_vld(s, 0);
+ __m128i v1_m = __lsx_vld(s + 4, 0);
+ return __lsx_vsrlni_h_w(v0_m, v1_m, 0);
#else
-#define load_tran_low(s) \
- ({ \
- __m128i res0_m; \
- res0_m = __lsx_vld(s, 0); \
- res0_m; \
- })
+ return __lsx_vld(s, 0);
+#endif
+}
-#define store_tran_low(v, s, c) __lsx_vst(v, s + c, 0)
-#endif // CONFIG_VP9_HIGHBITDEPTH
+static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ __m128i v0_m, v1_m;
+ v1_m = __lsx_vexth_w_h(v);
+ v0_m = __lsx_vsllwil_w_h(v, 0);
+ __lsx_vst(v0_m, s + c, 0);
+ __lsx_vst(v1_m, s + c + 4, 0);
+#else
+ __lsx_vst(v, s + c, 0);
+#endif
+}
#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
diff --git a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
index e5c301b2c..9bb387721 100644
--- a/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
+++ b/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
@@ -13,10 +13,10 @@
#include "vpx_dsp/fwd_txfm.h"
#define UNPCK_SH_SW(in, out0, out1) \
- { \
+ do { \
out0 = __lsx_vsllwil_w_h(in, 0); \
out1 = __lsx_vexth_w_h(in); \
- }
+ } while (0)
static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
int32_t src_stride,
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.c b/vpx_dsp/loongarch/fwd_txfm_lsx.c
index 6f2d4d6fe..508532b9d 100644
--- a/vpx_dsp/loongarch/fwd_txfm_lsx.c
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -12,7 +12,7 @@
#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
- { \
+ do { \
__m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \
\
DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \
@@ -23,7 +23,7 @@
_t3 = __lsx_vilvh_h(_s3, _s2); \
DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \
DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \
- }
+ } while (0)
#if !CONFIG_VP9_HIGHBITDEPTH
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
diff --git a/vpx_dsp/loongarch/fwd_txfm_lsx.h b/vpx_dsp/loongarch/fwd_txfm_lsx.h
index d04427a6e..4a9fce9a3 100644
--- a/vpx_dsp/loongarch/fwd_txfm_lsx.h
+++ b/vpx_dsp/loongarch/fwd_txfm_lsx.h
@@ -15,7 +15,7 @@
#include "vpx_dsp/txfm_common.h"
#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
+ do { \
__m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
__m128i vec0_m, vec1_m, vec2_m, vec3_m; \
__m128i vec4_m, vec5_m, vec6_m, vec7_m; \
@@ -38,11 +38,11 @@
DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \
vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \
- }
+ } while (0)
#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
out3, out4, out5, out6, out7) \
- { \
+ do { \
__m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
__m128i s7_m, x0_m, x1_m, x2_m, x3_m; \
__m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
@@ -97,10 +97,10 @@
x3_m = __lsx_vneg_h(x3_m); \
x2_m = __lsx_vpackev_h(x2_m, x3_m); \
DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
- }
+ } while (0)
#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
- { \
+ do { \
__m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
\
DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \
@@ -111,10 +111,10 @@
in3, in0, in1, in2, in3); \
DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \
in7, in4, in5, in6, in7); \
- }
+ } while (0)
#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
- { \
+ do { \
__m128i tp0_m, tp1_m; \
__m128i one = __lsx_vreplgr2vr_h(1); \
\
@@ -130,10 +130,10 @@
vec1 = __lsx_vadd_h(vec1, tp1_m); \
vec0 = __lsx_vsrai_h(vec0, 2); \
vec1 = __lsx_vsrai_h(vec1, 2); \
- }
+ } while (0)
#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
- { \
+ do { \
__m128i tp0_m, tp1_m; \
__m128i one_m = __lsx_vldi(0x401); \
\
@@ -147,10 +147,10 @@
vec1 = __lsx_vadd_h(vec1, tp1_m); \
vec0 = __lsx_vsrai_h(vec0, 2); \
vec1 = __lsx_vsrai_h(vec1, 2); \
- }
+ } while (0)
#define FDCT32_POSTPROC_NEG_W(vec) \
- { \
+ do { \
__m128i temp_m; \
__m128i one_m = __lsx_vreplgr2vr_w(1); \
\
@@ -159,11 +159,11 @@
temp_m = __lsx_vand_v(one_m, temp_m); \
vec = __lsx_vadd_w(vec, temp_m); \
vec = __lsx_vsrai_w(vec, 2); \
- }
+ } while (0)
#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \
const0, const1, out0, out1, out2, out3) \
- { \
+ do { \
__m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
__m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \
__m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \
@@ -188,11 +188,11 @@
DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \
DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
DCT_CONST_BITS, out2, out3); \
- }
+ } while (0)
#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \
in3) \
- { \
+ do { \
__m128i dst0_m, dst1_m, dst2_m, dst3_m; \
__m128i tmp0_m, tmp1_m; \
__m128i res0_m, res1_m, res2_m, res3_m; \
@@ -210,11 +210,11 @@
__lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \
__lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \
__lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \
- }
+ } while (0)
#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, out4, out5, out6, out7) \
- { \
+ do { \
__m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
__m128i x0_m, x1_m, x2_m, x3_m; \
__m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
@@ -270,12 +270,12 @@
x3_m = __lsx_vneg_h(x3_m); \
x2_m = __lsx_vpackev_h(x2_m, x3_m); \
DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
- }
+ } while (0)
#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \
input7, out1, out3, out5, out7, out9, out11, out13, \
out15) \
- { \
+ do { \
__m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
__m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
__m128i stp36_m, stp37_m, vec0_m, vec1_m; \
@@ -373,7 +373,7 @@
cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \
cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \
DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \
- }
+ } while (0)
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
int32_t src_stride);
diff --git a/vpx_dsp/loongarch/idct32x32_lsx.c b/vpx_dsp/loongarch/idct32x32_lsx.c
index d6890c28e..ec07f57d9 100644
--- a/vpx_dsp/loongarch/idct32x32_lsx.c
+++ b/vpx_dsp/loongarch/idct32x32_lsx.c
@@ -12,10 +12,10 @@
#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
#define UNPCK_UB_SH(_in, _out0, _out1) \
- { \
+ do { \
_out0 = __lsx_vsllwil_hu_bu(_in, 0); \
_out1 = __lsx_vexth_hu_bu(_in); \
- }
+ } while (0)
static void idct32x8_row_transpose_store(const int16_t *input,
int16_t *tmp_buf) {
diff --git a/vpx_dsp/loongarch/loopfilter_16_lsx.c b/vpx_dsp/loongarch/loopfilter_16_lsx.c
index cbaefcd6e..539817777 100644
--- a/vpx_dsp/loongarch/loopfilter_16_lsx.c
+++ b/vpx_dsp/loongarch/loopfilter_16_lsx.c
@@ -15,7 +15,7 @@
#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \
_in2, _in3, _in4, _in5, _in6, _in7) \
- { \
+ do { \
_in0 = __lsx_vld(_src, 0); \
_in1 = __lsx_vldx(_src, _stride); \
_in2 = __lsx_vldx(_src, _stride2); \
@@ -25,11 +25,11 @@
_in5 = __lsx_vldx(_src, _stride); \
_in6 = __lsx_vldx(_src, _stride2); \
_in7 = __lsx_vldx(_src, _stride3); \
- }
+ } while (0)
#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \
_stride, _stride2, _stride3, _stride4) \
- { \
+ do { \
__lsx_vst(_dst0, _dst, 0); \
__lsx_vstx(_dst1, _dst, _stride); \
__lsx_vstx(_dst2, _dst, _stride2); \
@@ -39,7 +39,7 @@
__lsx_vstx(_dst5, _dst, _stride); \
__lsx_vstx(_dst6, _dst, _stride2); \
__lsx_vstx(_dst7, _dst, _stride3); \
- }
+ } while (0)
static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride,
uint8_t *filter48,
diff --git a/vpx_dsp/loongarch/loopfilter_lsx.h b/vpx_dsp/loongarch/loopfilter_lsx.h
index 53e15fe6d..1c4383650 100644
--- a/vpx_dsp/loongarch/loopfilter_lsx.h
+++ b/vpx_dsp/loongarch/loopfilter_lsx.h
@@ -16,7 +16,7 @@
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
flat_out) \
- { \
+ do { \
__m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
__m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
\
@@ -47,10 +47,10 @@
\
mask_out = __lsx_vslt_bu(limit_in, mask_out); \
mask_out = __lsx_vxori_b(mask_out, 0xff); \
- }
+ } while (0)
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
- { \
+ do { \
__m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \
__m128i flat4_tmp = __lsx_vldi(1); \
\
@@ -64,11 +64,11 @@
flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \
flat_out = __lsx_vxori_b(flat_out, 0xff); \
flat_out = flat_out & (mask); \
- }
+ } while (0)
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
q6_in, q7_in, flat_in, flat2_out) \
- { \
+ do { \
__m128i flat5_tmp = __lsx_vldi(1); \
__m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \
__m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \
@@ -87,11 +87,11 @@
flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \
flat2_out = __lsx_vxori_b(flat2_out, 0xff); \
flat2_out = flat2_out & flat_in; \
- }
+ } while (0)
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \
p0_out, q0_out, q1_out) \
- { \
+ do { \
__m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
const __m128i cnst4b = __lsx_vldi(4); \
const __m128i cnst3b = __lsx_vldi(3); \
@@ -118,12 +118,12 @@
q1_m = __lsx_vssub_b(q1_m, filt); \
p1_m = __lsx_vsadd_b(p1_m, filt); \
DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \
- }
+ } while (0)
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
q1_filt8_out, q2_filt8_out) \
- { \
+ do { \
__m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
\
tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \
@@ -162,6 +162,6 @@
tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \
tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \
q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \
- }
+ } while (0)
#endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c
index e3fbb9e9e..2fc33b06b 100644
--- a/vpx_dsp/loongarch/quantize_lsx.c
+++ b/vpx_dsp/loongarch/quantize_lsx.c
@@ -12,79 +12,83 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_util/loongson_intrinsics.h"
-#define CALCULATE_QCOEFF(coeff, coeff_abs, round, quant, shift, cmp_mask) \
- ({ \
- __m128i rounded, qcoeff; \
- \
- rounded = __lsx_vsadd_h(coeff_abs, round); \
- qcoeff = __lsx_vmuh_h(rounded, quant); \
- qcoeff = __lsx_vadd_h(rounded, qcoeff); \
- qcoeff = __lsx_vmuh_h(qcoeff, shift); \
- qcoeff = __lsx_vsigncov_h(coeff, qcoeff); \
- qcoeff = __lsx_vand_v(qcoeff, cmp_mask); \
- \
- qcoeff; \
- })
-
-#define CALCULATE_DQCOEFF_AND_STORE(qcoeff, dequant, dqcoeff) \
- { \
- __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); \
- __lsx_vst(dqcoeff16, dqcoeff, 0); \
- }
+static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs,
+ __m128i round, __m128i quant,
+ __m128i shift, __m128i cmp_mask) {
+ __m128i rounded, qcoeff;
+
+ rounded = __lsx_vsadd_h(coeff_abs, round);
+ qcoeff = __lsx_vmuh_h(rounded, quant);
+ qcoeff = __lsx_vadd_h(rounded, qcoeff);
+ qcoeff = __lsx_vmuh_h(qcoeff, shift);
+ qcoeff = __lsx_vsigncov_h(coeff, qcoeff);
+ qcoeff = __lsx_vand_v(qcoeff, cmp_mask);
+
+ return qcoeff;
+}
-#define CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff, dequant, dqcoeff) \
- { \
- __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; \
- __m128i zero = __lsx_vldi(0); \
- __m128i coeff = __lsx_vabsd_h(qcoeff, zero); \
- \
- __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); \
- __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); \
- \
- low = __lsx_vmul_h(coeff, dequant); \
- high = __lsx_vmuh_h(coeff, dequant); \
- dqcoeff32_0 = __lsx_vilvl_h(high, low); \
- dqcoeff32_1 = __lsx_vilvh_h(high, low); \
- \
- dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); \
- dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); \
- dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); \
- dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); \
- res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); \
- __lsx_vst(res, dqcoeff, 0); \
- }
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+ int16_t *dqcoeff) {
+ __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant);
+ __lsx_vst(dqcoeff16, dqcoeff, 0);
+}
+
+static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
+ __m128i dequant,
+ int16_t *dqcoeff) {
+ // Un-sign to bias rounding like C.
+ __m128i low, high, dqcoeff32_0, dqcoeff32_1, res;
+ __m128i zero = __lsx_vldi(0);
+ __m128i coeff = __lsx_vabsd_h(qcoeff, zero);
+
+ const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero);
+ const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero);
+
+ low = __lsx_vmul_h(coeff, dequant);
+ high = __lsx_vmuh_h(coeff, dequant);
+ dqcoeff32_0 = __lsx_vilvl_h(high, low);
+ dqcoeff32_1 = __lsx_vilvh_h(high, low);
+
+ // "Divide" by 2.
+ dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1);
+ dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1);
+ dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0);
+ dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1);
+ res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0);
+ __lsx_vst(res, dqcoeff, 0);
+}
+
+static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
+ __m128i zbin_mask0, __m128i zbin_mask1,
+ const int16_t *scan, int index,
+ __m128i zero) {
+ const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
+ const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero);
+ __m128i scan0 = __lsx_vld(scan + index, 0);
+ __m128i scan1 = __lsx_vld(scan + index + 8, 0);
+ __m128i eob0, eob1;
+
+ scan0 = __lsx_vsub_h(scan0, zbin_mask0);
+ scan1 = __lsx_vsub_h(scan1, zbin_mask1);
+ eob0 = __lsx_vandn_v(zero_coeff0, scan0);
+ eob1 = __lsx_vandn_v(zero_coeff1, scan1);
+ return __lsx_vmax_h(eob0, eob1);
+}
-#define SCAN_FOR_EOB(coeff0, coeff1, zbin_mask0, zbin_mask1, scan, index, \
- zero) \
- ({ \
- __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); \
- __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); \
- __m128i scan0 = __lsx_vld(scan + index, 0); \
- __m128i scan1 = __lsx_vld(scan + index + 8, 0); \
- __m128i eob0, eob1, eob_max; \
- \
- scan0 = __lsx_vsub_h(scan0, zbin_mask0); \
- scan1 = __lsx_vsub_h(scan1, zbin_mask1); \
- eob0 = __lsx_vandn_v(zero_coeff0, scan0); \
- eob1 = __lsx_vandn_v(zero_coeff1, scan1); \
- eob_max = __lsx_vmax_h(eob0, eob1); \
- eob_max; \
- })
-
-#define ACCUMULATE_EOB(eob) \
- ({ \
- __m128i eob_shuffled; \
- int16_t res_m; \
- \
- eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); \
- eob = __lsx_vmax_h(eob, eob_shuffled); \
- eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); \
- eob = __lsx_vmax_h(eob, eob_shuffled); \
- eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); \
- eob = __lsx_vmax_h(eob, eob_shuffled); \
- res_m = __lsx_vpickve2gr_h(eob, 1); \
- res_m; \
- })
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ int16_t res_m;
+
+ eob_shuffled = __lsx_vshuf4i_w(eob, 0xe);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ eob_shuffled = __lsx_vshuf4i_h(eob, 0xe);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ eob_shuffled = __lsx_vshuf4i_h(eob, 0x1);
+ eob = __lsx_vmax_h(eob, eob_shuffled);
+ res_m = __lsx_vpickve2gr_h(eob, 1);
+
+ return res_m;
+}
#if !CONFIG_VP9_HIGHBITDEPTH
void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
@@ -120,21 +124,21 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
qcoeff0 =
- CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
round = __lsx_vilvh_d(round, round);
quant = __lsx_vilvh_d(quant, quant);
quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
qcoeff1 =
- CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
__lsx_vst(qcoeff0, qcoeff_ptr, 0);
__lsx_vst(qcoeff1, qcoeff_ptr, 16);
- CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr);
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
dequant = __lsx_vilvh_d(dequant, dequant);
- CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + 8);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
coeff0 = __lsx_vld(coeff_ptr + index, 0);
@@ -147,24 +151,24 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
qcoeff0 =
- CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
qcoeff1 =
- CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
__lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
__lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
- CALCULATE_DQCOEFF_AND_STORE(qcoeff0, dequant, dqcoeff_ptr + index);
- CALCULATE_DQCOEFF_AND_STORE(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
zero);
eob = __lsx_vmax_h(eob, eob0);
index += 16;
}
- *eob_ptr = ACCUMULATE_EOB(eob);
+ *eob_ptr = accumulate_eob(eob);
}
void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
@@ -204,20 +208,20 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
qcoeff0 =
- CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
// remove DC in quant_shift, quant, quant_shift
round = __lsx_vilvh_d(round, round);
quant = __lsx_vilvh_d(quant, quant);
quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
qcoeff1 =
- CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
__lsx_vst(qcoeff0, qcoeff_ptr, 0);
__lsx_vst(qcoeff1, qcoeff_ptr, 16);
- CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr);
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
dequant = __lsx_vilvh_d(dequant, dequant);
- CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
+ eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
// AC only loop.
for (index = 16; index < 32 * 32; index += 16) {
coeff0 = __lsx_vld(coeff_ptr + index, 0);
@@ -230,20 +234,20 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
qcoeff0 =
- CALCULATE_QCOEFF(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+ calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
qcoeff1 =
- CALCULATE_QCOEFF(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+ calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
__lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
__lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
- CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
- CALCULATE_DQCOEFF_AND_STORE_32x32(qcoeff1, dequant,
+ calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
dqcoeff_ptr + 8 + index);
- eob0 = SCAN_FOR_EOB(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
zero);
eob = __lsx_vmax_h(eob, eob0);
}
- *eob_ptr = ACCUMULATE_EOB(eob);
+ *eob_ptr = accumulate_eob(eob);
}
-#endif // !CONFIG_VP9_HIGHBITDEPTH
+#endif
diff --git a/vpx_dsp/loongarch/sad_lsx.c b/vpx_dsp/loongarch/sad_lsx.c
index 5eaebfb51..b6fbedb0d 100644
--- a/vpx_dsp/loongarch/sad_lsx.c
+++ b/vpx_dsp/loongarch/sad_lsx.c
@@ -8,59 +8,63 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_util/loongson_intrinsics.h"
-#define SAD_UB2_UH(in0, in1, ref0, ref1) \
- ({ \
- __m128i diff0_m, diff1_m, sad_m0; \
- __m128i sad_m = __lsx_vldi(0); \
- \
- diff0_m = __lsx_vabsd_bu(in0, ref0); \
- diff1_m = __lsx_vabsd_bu(in1, ref1); \
- \
- sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); \
- sad_m = __lsx_vadd_h(sad_m, sad_m0); \
- sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); \
- sad_m = __lsx_vadd_h(sad_m, sad_m0); \
- \
- sad_m; \
- })
-
-#define HADD_UW_U32(in) \
- ({ \
- __m128i res0_m; \
- uint32_t sum_m; \
- res0_m = __lsx_vhaddw_du_wu(in, in); \
- res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); \
- sum_m = __lsx_vpickve2gr_w(res0_m, 0); \
- sum_m; \
- })
-
-#define HADD_UH_U32(in) \
- ({ \
- __m128i res_m; \
- uint32_t sum_m; \
- res_m = __lsx_vhaddw_wu_hu(in, in); \
- sum_m = HADD_UW_U32(res_m); \
- sum_m; \
- })
-
-#define HADD_SW_S32(in) \
- ({ \
- __m128i res0_m; \
- int32_t sum_m; \
- \
- res0_m = __lsx_vhaddw_d_w(in, in); \
- res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \
- sum_m = __lsx_vpickve2gr_w(res0_m, 0); \
- sum_m; \
- })
+static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
+ __m128i ref1) {
+ __m128i diff0_m, diff1_m, sad_m0;
+ __m128i sad_m = __lsx_vldi(0);
+
+ diff0_m = __lsx_vabsd_bu(in0, ref0);
+ diff1_m = __lsx_vabsd_bu(in1, ref1);
+
+ sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
+ sad_m = __lsx_vadd_h(sad_m, sad_m0);
+ sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
+ sad_m = __lsx_vadd_h(sad_m, sad_m0);
+
+ return sad_m;
+}
+
+static INLINE uint32_t hadd_uw_u32(__m128i in) {
+ __m128i res0_m;
+ uint32_t sum_m;
+
+ res0_m = __lsx_vhaddw_du_wu(in, in);
+ res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
+ sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+ return sum_m;
+}
+
+static INLINE uint32_t hadd_uh_u32(__m128i in) {
+ __m128i res_m;
+ uint32_t sum_m;
+
+ res_m = __lsx_vhaddw_wu_hu(in, in);
+ sum_m = hadd_uw_u32(res_m);
+
+ return sum_m;
+}
+
+static INLINE int32_t hadd_sw_s32(__m128i in) {
+ __m128i res0_m;
+ int32_t sum_m;
+
+ res0_m = __lsx_vhaddw_d_w(in, in);
+ res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
+ sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+ return sum_m;
+}
static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride,
int32_t height) {
int32_t ht_cnt;
+ uint32_t res;
__m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
__m128i sad = __lsx_vldi(0);
@@ -79,16 +83,18 @@ static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
ref += ref_stride;
DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
src0, src1, ref0, ref1);
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad = __lsx_vadd_h(sad, sad_tmp);
}
- return HADD_UH_U32(sad);
+ res = hadd_uh_u32(sad);
+ return res;
}
static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride,
int32_t height) {
int32_t ht_cnt = (height >> 2);
+ uint32_t res;
__m128i src0, src1, ref0, ref1, sad_tmp;
__m128i sad = __lsx_vldi(0);
int32_t src_stride2 = src_stride << 1;
@@ -99,23 +105,26 @@ static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
src += src_stride2;
ref += ref_stride2;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad = __lsx_vadd_h(sad, sad_tmp);
DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
src += src_stride2;
ref += ref_stride2;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad = __lsx_vadd_h(sad, sad_tmp);
}
- return HADD_UH_U32(sad);
+
+ res = hadd_uh_u32(sad);
+ return res;
}
static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride,
int32_t height) {
int32_t ht_cnt = (height >> 2);
+ uint32_t res;
__m128i src0, src1, ref0, ref1;
__m128i sad_tmp;
__m128i sad = __lsx_vldi(0);
@@ -125,31 +134,32 @@ static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
src += src_stride;
DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
ref += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad = __lsx_vadd_h(sad, sad_tmp);
DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
src += src_stride;
DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
ref += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad = __lsx_vadd_h(sad, sad_tmp);
DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
src += src_stride;
DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
ref += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad = __lsx_vadd_h(sad, sad_tmp);
DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
src += src_stride;
DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
ref += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad = __lsx_vadd_h(sad, sad_tmp);
}
- return HADD_UH_U32(sad);
+ res = hadd_uh_u32(sad);
+ return res;
}
static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
@@ -170,9 +180,9 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
ref3);
ref += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad0 = __lsx_vadd_h(sad0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
sad1 = __lsx_vadd_h(sad1, sad_tmp);
DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
@@ -181,14 +191,14 @@ static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
ref3);
ref += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad0 = __lsx_vadd_h(sad0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
sad1 = __lsx_vadd_h(sad1, sad_tmp);
}
- sad = HADD_UH_U32(sad0);
- sad += HADD_UH_U32(sad1);
+ sad = hadd_uh_u32(sad0);
+ sad += hadd_uh_u32(sad1);
return sad;
}
@@ -247,25 +257,25 @@ static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad0 = __lsx_vadd_h(sad0, sad_tmp);
DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad1 = __lsx_vadd_h(sad1, sad_tmp);
DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad2 = __lsx_vadd_h(sad2, sad_tmp);
DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad3 = __lsx_vadd_h(sad3, sad_tmp);
}
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
}
static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
@@ -334,10 +344,10 @@ static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
sad3 = __lsx_vadd_h(sad3, sad_tmp);
}
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
}
static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
@@ -363,28 +373,28 @@ static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
ref0_ptr += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad0 = __lsx_vadd_h(sad0, sad_tmp);
DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
ref1_ptr += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad1 = __lsx_vadd_h(sad1, sad_tmp);
DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
ref2_ptr += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad2 = __lsx_vadd_h(sad2, sad_tmp);
DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
ref3_ptr += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad3 = __lsx_vadd_h(sad3, sad_tmp);
}
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
+ sad_array[0] = hadd_uh_u32(sad0);
+ sad_array[1] = hadd_uh_u32(sad1);
+ sad_array[2] = hadd_uh_u32(sad2);
+ sad_array[3] = hadd_uh_u32(sad3);
}
static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
@@ -419,60 +429,60 @@ static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
ref0, ref1, ref2, ref3);
ref0_ptr += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);
DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
ref0, ref1, ref2, ref3);
ref1_ptr += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);
DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
ref0, ref1, ref2, ref3);
ref2_ptr += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);
DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
ref0, ref1, ref2, ref3);
ref3_ptr += ref_stride;
- sad_tmp = SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, ref2, ref3);
+ sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
}
sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
sad = __lsx_vadd_w(sad, sad_tmp);
- sad_array[0] = HADD_UW_U32(sad);
+ sad_array[0] = hadd_uw_u32(sad);
sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
sad = __lsx_vadd_w(sad, sad_tmp);
- sad_array[1] = HADD_UW_U32(sad);
+ sad_array[1] = hadd_uw_u32(sad);
sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
sad = __lsx_vadd_w(sad, sad_tmp);
- sad_array[2] = HADD_UW_U32(sad);
+ sad_array[2] = hadd_uw_u32(sad);
sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
sad = __lsx_vadd_w(sad, sad_tmp);
- sad_array[3] = HADD_UW_U32(sad);
+ sad_array[3] = hadd_uw_u32(sad);
}
static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride,
int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt = (height >> 2);
+ int32_t res, ht_cnt = (height >> 2);
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
__m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
__m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
@@ -514,26 +524,26 @@ static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
sec_pred += 128;
DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
- sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
sad = __lsx_vadd_h(sad, sad_tmp);
DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
- sad_tmp = SAD_UB2_UH(src2, src3, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
sad = __lsx_vadd_h(sad, sad_tmp);
DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
- sad_tmp = SAD_UB2_UH(src4, src5, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
sad = __lsx_vadd_h(sad, sad_tmp);
DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
- sad_tmp = SAD_UB2_UH(src6, src7, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
sad = __lsx_vadd_h(sad, sad_tmp);
}
-
- return HADD_UH_U32(sad);
+ res = hadd_uh_u32(sad);
+ return res;
}
static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
const uint8_t *ref, int32_t ref_stride,
int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt = (height >> 2);
+ int32_t res, ht_cnt = (height >> 2);
__m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
__m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
__m128i sad, sad_tmp;
@@ -552,9 +562,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
sec_pred += 64;
DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
ref3, comp0, comp1, comp2, comp3);
- sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
sad0 = __lsx_vadd_h(sad0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
sad1 = __lsx_vadd_h(sad1, sad_tmp);
DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
@@ -568,9 +578,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
sec_pred += 64;
DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
ref3, comp0, comp1, comp2, comp3);
- sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
sad0 = __lsx_vadd_h(sad0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
sad1 = __lsx_vadd_h(sad1, sad_tmp);
DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
@@ -584,9 +594,9 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
sec_pred += 64;
DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
ref3, comp0, comp1, comp2, comp3);
- sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
sad0 = __lsx_vadd_h(sad0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
sad1 = __lsx_vadd_h(sad1, sad_tmp);
DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
@@ -600,16 +610,17 @@ static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
sec_pred += 64;
DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
ref3, comp0, comp1, comp2, comp3);
- sad_tmp = SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
sad0 = __lsx_vadd_h(sad0, sad_tmp);
- sad_tmp = SAD_UB2_UH(src2, src3, comp2, comp3);
+ sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
sad1 = __lsx_vadd_h(sad1, sad_tmp);
}
sad = __lsx_vhaddw_wu_hu(sad0, sad0);
sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
sad = __lsx_vadd_w(sad, sad_tmp);
- return HADD_SW_S32(sad);
+ res = hadd_sw_s32(sad);
+ return res;
}
#define VPX_SAD_8xHT_LSX(height) \
diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
index 54fcd6c57..d1abf622a 100644
--- a/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
@@ -57,13 +57,13 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(
DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
src6 = __lsx_vxori_b(src6, 128);
- tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
@@ -87,17 +87,17 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(
src2 = __lsx_vilvl_d(src3, src2);
DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
src8, src9, src10);
- tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
tmp4 = __lsx_vpackev_b(tmp3, tmp4);
- out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
src0 = __lsx_vshuf_b(src1, tmp3, shuff);
src0 = __lsx_vpackev_b(src1, src0);
- out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+ out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS);
out0 = __lsx_vxori_b(out0, 128);
@@ -152,19 +152,19 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(
DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
src6 = __lsx_vxori_b(src6, 128);
- src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
@@ -181,25 +181,25 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(
DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
src8, src9, src10);
- src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
tmp3 = __lsx_vpackev_b(src7, src6);
- out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
src0 = __lsx_vpackev_b(src8, src7);
- out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+ out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
src1 = __lsx_vpackev_b(src9, src8);
- src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+ src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
src2 = __lsx_vpackev_b(src10, src9);
- src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+ src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3,
FILTER_BITS, out0, out1);
@@ -296,9 +296,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx(
DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
src, src_stride4, src1, src2, src3, src4);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
@@ -348,11 +348,11 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx(
src, src_stride4, src5, src6, src7, src8);
src += src_stride4;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
- hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
- hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+ hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+ hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
hz_out1, hz_out3);
hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
@@ -449,20 +449,20 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx(
dst_tmp += dst_stride;
dst3 = __lsx_vldrepl_d(dst_tmp, 0);
DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
@@ -494,7 +494,7 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
src0 = __lsx_vld(src, 0);
src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
for (; loop_cnt--;) {
src1 = __lsx_vld(src, 0);
@@ -502,19 +502,19 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
src4 = __lsx_vldx(src, src_stride3);
src += src_stride4;
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
@@ -571,8 +571,8 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
for (; loop_cnt--;) {
src0 = __lsx_vld(src, 0);
@@ -588,32 +588,32 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
dst3 = __lsx_vldx(dst, dst_stride3);
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
tmp3 = __lsx_vavgr_bu(tmp3, dst0);
__lsx_vst(tmp3, dst, 0);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
tmp3 = __lsx_vavgr_bu(tmp3, dst1);
__lsx_vstx(tmp3, dst, dst_stride);
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
tmp3 = __lsx_vavgr_bu(tmp3, dst2);
__lsx_vstx(tmp3, dst, dst_stride2);
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
diff --git a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
index 584f24183..5c6413df4 100644
--- a/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
@@ -68,9 +68,9 @@ static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
tmp0, tmp1, tmp2, tmp3);
DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
- out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1,
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
filter2, filter3);
- out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1,
+ out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
filter2, filter3);
out0 = __lsx_vssrarni_b_h(out1, out0, 7);
out0 = __lsx_vxori_b(out0, 128);
@@ -146,13 +146,13 @@ static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
src8, src9, src10);
DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
tmp0, tmp1, tmp2, tmp3);
- out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1,
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
filter2, filter3);
- out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1,
+ out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
filter2, filter3);
- out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1,
+ out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
filter2, filter3);
- out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1,
+ out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
filter2, filter3);
DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
@@ -231,13 +231,13 @@ static void common_vt_8t_and_aver_dst_16w_mult_lsx(
src0, src1, src2, src3);
DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
src4, src5, src7, src8);
- tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1,
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
filter2, filter3);
- tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1,
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
filter2, filter3);
- tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1,
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
filter2, filter3);
- tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1,
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
filter2, filter3);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
@@ -246,13 +246,13 @@ static void common_vt_8t_and_aver_dst_16w_mult_lsx(
DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
__lsx_vst(tmp0, dst_reg, 0);
__lsx_vstx(tmp1, dst_reg, dst_stride);
- tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1,
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
filter2, filter3);
- tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1,
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
filter2, filter3);
- tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1,
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
filter2, filter3);
- tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1,
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
filter2, filter3);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
diff --git a/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_lsx.c
index 73583abb9..9f5cd6cfe 100644
--- a/vpx_dsp/loongarch/vpx_convolve8_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve8_lsx.c
@@ -54,13 +54,13 @@ static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
src6 = __lsx_vxori_b(src6, 128);
- tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
@@ -73,17 +73,17 @@ static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
src += src_stride;
DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
src8, src9, src10);
- tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
tmp4 = __lsx_vpackev_b(tmp3, tmp4);
- out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
src0 = __lsx_vshuf_b(src1, tmp3, shuff);
src0 = __lsx_vpackev_b(src1, src0);
- out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+ out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
out0 = __lsx_vssrarni_b_h(out1, out0, 7);
out0 = __lsx_vxori_b(out0, 128);
@@ -135,19 +135,19 @@ static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
src6 = __lsx_vxori_b(src6, 128);
- src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+ src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+ src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+ src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+ src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+ src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+ src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
- src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+ src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
@@ -161,25 +161,25 @@ static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
src += src_stride;
DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
src8, src9, src10);
- src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+ src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
tmp3 = __lsx_vpackev_b(src7, src6);
- out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+ out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+ src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
src0 = __lsx_vpackev_b(src8, src7);
- out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+ out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+ src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
src1 = __lsx_vpackev_b(src9, src8);
- src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+ src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+ src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
src2 = __lsx_vpackev_b(src10, src9);
- src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+ src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
@@ -267,9 +267,9 @@ static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
src0 = __lsx_vld(src, 0);
DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
src, src_stride4, src1, src2, src3, src4);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
@@ -316,11 +316,11 @@ static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
src, src_stride4, src5, src6, src7, src8);
src += src_stride4;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
- hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
- hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+ hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+ hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+ hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
hz_out1, hz_out3);
@@ -382,20 +382,20 @@ static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
src, src_stride4, src1, src2, src3, src4);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
@@ -430,7 +430,7 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
src0 = __lsx_vld(src, 0);
src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
for (; loop_cnt--;) {
src1 = __lsx_vld(src, 0);
@@ -438,19 +438,19 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
src4 = __lsx_vldx(src, src_stride3);
src += src_stride4;
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
src1 = __lsx_vld(src, 0);
DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
src4 = __lsx_vldx(src, src_stride3);
@@ -470,19 +470,19 @@ static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
__lsx_vstelm_d(tmp2, dst, 0, 1);
dst += dst_stride;
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
@@ -534,8 +534,8 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
for (; loop_cnt--;) {
uint8_t *src_tmp0 = src + 8;
@@ -546,32 +546,32 @@ static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
src += src_stride4;
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+ hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
__lsx_vst(tmp, dst, 0);
dst += dst_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
__lsx_vst(tmp, dst, 0);
dst += dst_stride;
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+ hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
__lsx_vst(tmp, dst, 0);
dst += dst_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+ hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
diff --git a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
index 7e3a95b2f..6022e43c8 100644
--- a/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
+++ b/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
@@ -52,9 +52,9 @@ static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
tmp0, tmp1, tmp2, tmp3);
DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
- out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, reg3, filter0, filter1,
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
filter2, filter3);
- out1 = FILT_8TAP_DPADD_S_H(reg1, reg2, reg3, reg4, filter0, filter1,
+ out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
filter2, filter3);
out0 = __lsx_vssrarni_b_h(out1, out0, 7);
out0 = __lsx_vxori_b(out0, 128);
@@ -116,13 +116,13 @@ static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
src8, src9, src10);
DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
tmp0, tmp1, tmp2, tmp3);
- out0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, tmp0, filter0, filter1,
+ out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
filter2, filter3);
- out1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, tmp1, filter0, filter1,
+ out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
filter2, filter3);
- out2 = FILT_8TAP_DPADD_S_H(reg1, reg2, tmp0, tmp2, filter0, filter1,
+ out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
filter2, filter3);
- out3 = FILT_8TAP_DPADD_S_H(reg4, reg5, tmp1, tmp3, filter0, filter1,
+ out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
filter2, filter3);
DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
@@ -192,13 +192,13 @@ static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
src0, src1, src2, src3);
DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
src4, src5, src7, src8);
- tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1,
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
filter2, filter3);
- tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1,
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
filter2, filter3);
- tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1,
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
filter2, filter3);
- tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1,
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
filter2, filter3);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
@@ -206,13 +206,13 @@ static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
dst += dst_stride;
__lsx_vst(tmp1, dst, 0);
dst += dst_stride;
- tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1,
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
filter2, filter3);
- tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1,
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
filter2, filter3);
- tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1,
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
filter2, filter3);
- tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1,
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
filter2, filter3);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
@@ -298,25 +298,25 @@ static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
src0, src1, src2, src3);
DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
src4, src5, src7, src8);
- tmp0 = FILT_8TAP_DPADD_S_H(reg0, reg1, reg2, src0, filter0, filter1,
+ tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
filter2, filter3);
- tmp1 = FILT_8TAP_DPADD_S_H(reg3, reg4, reg5, src1, filter0, filter1,
+ tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
filter2, filter3);
- tmp2 = FILT_8TAP_DPADD_S_H(reg6, reg7, reg8, src4, filter0, filter1,
+ tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
filter2, filter3);
- tmp3 = FILT_8TAP_DPADD_S_H(reg9, reg10, reg11, src5, filter0, filter1,
+ tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
filter2, filter3);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
__lsx_vst(tmp0, dst_tmp, 0);
__lsx_vstx(tmp1, dst_tmp, dst_stride);
- tmp0 = FILT_8TAP_DPADD_S_H(reg1, reg2, src0, src2, filter0, filter1,
+ tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
filter2, filter3);
- tmp1 = FILT_8TAP_DPADD_S_H(reg4, reg5, src1, src3, filter0, filter1,
+ tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
filter2, filter3);
- tmp2 = FILT_8TAP_DPADD_S_H(reg7, reg8, src4, src7, filter0, filter1,
+ tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
filter2, filter3);
- tmp3 = FILT_8TAP_DPADD_S_H(reg10, reg11, src5, src8, filter0, filter1,
+ tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
filter2, filter3);
DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
diff --git a/vpx_dsp/loongarch/vpx_convolve_lsx.h b/vpx_dsp/loongarch/vpx_convolve_lsx.h
index 2428407f2..d886b0019 100644
--- a/vpx_dsp/loongarch/vpx_convolve_lsx.h
+++ b/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -11,11 +11,50 @@
#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
-#include "vpx_util/loongson_intrinsics.h"
+#include "./vpx_config.h"
#include "vpx_dsp/vpx_filter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1,
+ __m128i _reg2, __m128i _reg3,
+ __m128i _filter0, __m128i _filter1,
+ __m128i _filter2, __m128i _filter3) {
+ __m128i _vec0, _vec1;
+
+ _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);
+ _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);
+ _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);
+ _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);
+ return __lsx_vsadd_h(_vec0, _vec1);
+}
+
+static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1,
+ __m128i _mask0, __m128i _mask1,
+ __m128i _mask2, __m128i _mask3,
+ __m128i _filt_h0, __m128i _filt_h1,
+ __m128i _filt_h2, __m128i _filt_h3) {
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3;
+ __m128i _out;
+
+ DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,
+ _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);
+ _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,
+ _filt_h2, _filt_h3);
+ _out = __lsx_vsrari_h(_out, FILTER_BITS);
+ return __lsx_vsat_h(_out, 7);
+}
+
+static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask,
+ __m128i coeff) {
+ __m128i tmp0_m, tmp1_m;
+
+ tmp0_m = __lsx_vshuf_b(in1, in0, mask);
+ tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);
+ return __lsx_vsrari_h(tmp1_m, FILTER_BITS);
+}
#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
- { \
+ do { \
_src0 = __lsx_vld(_src, 0); \
_src += _stride; \
_src1 = __lsx_vld(_src, 0); \
@@ -23,43 +62,12 @@
_src2 = __lsx_vld(_src, 0); \
_src += _stride; \
_src3 = __lsx_vld(_src, 0); \
- }
-
-#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3, _filter0, _filter1, \
- _filter2, _filter3) \
- ({ \
- __m128i _vec0, _vec1; \
- \
- _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); \
- _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); \
- _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); \
- _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); \
- _vec0 = __lsx_vsadd_h(_vec0, _vec1); \
- \
- _vec0; \
- })
-
-#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3, \
- _filt_h0, _filt_h1, _filt_h2, _filt_h3) \
- ({ \
- __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
- __m128i _out; \
- \
- DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, \
- _src1, _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, \
- _tmp3); \
- _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, \
- _filt_h2, _filt_h3); \
- _out = __lsx_vsrari_h(_out, FILTER_BITS); \
- _out = __lsx_vsat_h(_out, 7); \
- \
- _out; \
- })
+ } while (0)
#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \
_mask2, _mask3, _filter0, _filter1, \
_filter2, _filter3, _out0, _out1) \
- { \
+ do { \
__m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
__m128i _reg0, _reg1, _reg2, _reg3; \
\
@@ -78,12 +86,12 @@
DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \
_filter3, _reg2, _reg3); \
DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \
- }
+ } while (0)
#define HORIZ_8TAP_8WID_4VECS_FILT( \
_src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \
_filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \
- { \
+ do { \
__m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \
__m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \
\
@@ -111,22 +119,10 @@
_reg5, _reg6, _reg7); \
DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \
_reg7, _out0, _out1, _out2, _out3); \
- }
-
-#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
- ({ \
- __m128i tmp0_m; \
- __m128i tmp1_m; \
- \
- tmp0_m = __lsx_vshuf_b(in1, in0, mask); \
- tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \
- tmp1_m = __lsx_vsrari_h(tmp1_m, shift); \
- \
- tmp1_m; \
- })
+ } while (0)
#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \
- { \
+ do { \
__m128i tmp0_m, tmp1_m; \
\
DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
@@ -137,6 +133,6 @@
__lsx_vstelm_d(tmp1_m, pdst, 0, 0); \
pdst += stride; \
__lsx_vstelm_d(tmp1_m, pdst, 0, 1); \
- }
+ } while (0)
#endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_