diff options
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm')
-rw-r--r-- | chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm | 298 |
1 files changed, 183 insertions, 115 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm index 7d01bccb4f5..f30f4909287 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm @@ -29,7 +29,8 @@ %if ARCH_X86_64 SECTION_RODATA 64 -int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 +const \ +int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 @@ -84,7 +85,7 @@ pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 -gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10 +gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16 int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 @@ -845,7 +846,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 punpcklwd m3, m5 ; dct8 in3 in5 punpckhwd m5, m2 ; dct16 in11 in5 punpcklwd m6, m2 ; dct4 in3 in1 -.main2: +cglobal_label .main2 vpbroadcastd m10, [o(pd_2048)] .main3: vpbroadcastq m13, [o(int_mshift)] @@ -1355,7 +1356,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m3, m3, q2031 jmp m(iadst_8x8_internal_8bpc).end2 ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -1422,7 +1423,7 @@ ALIGN function_align punpckhqdq m0, m4 ; out0 -out1 ret ALIGN function_align -.main_pass2: +cglobal_label .main_pass2 IADST8_1D_PACKED 2 ret @@ -1499,8 +1500,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128 sar r6d, 8 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly @@ -1608,7 +1609,54 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpscatterdq [r3+ym8]{k2}, m2 RET ALIGN function_align -.main: +cglobal_label .main_fast2 ; bottom three-quarters are zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + vpbroadcastd ym3, [o(pw_401_4076x8)] + vpbroadcastd ym5, [o(pw_799_4017x8)] + vpbroadcastd ym4, [o(pw_m1189_3920x8)] + pxor ym6, ym6 + punpckhwd ym2, ym0, ym0 + pmulhrsw ym2, ym3 ; t8a t15a + punpcklwd ym7, ym1, ym1 + pmulhrsw ym7, ym5 ; t4a t7a + punpckhwd ym1, ym1 + pmulhrsw ym4, ym1 ; t11a t12a + vpcmpub k7, ym13, ym10, 6 + punpcklwd ym9, ym6, ym0 + psubsw ym0, ym2, ym4 ; t11a t12a + paddsw ym8, ym2, ym4 ; t8a t15a + mova ym1, ym7 + jmp .main5 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + pxor ym6, ym6 + punpckhwd ym8, ym0, ym0 + punpckhwd ym4, ym3, ym3 + punpckhwd ym5, ym2, ym2 + punpcklwd ym7, ym1, ym1 + punpckhwd ym1, ym1 + punpcklwd ym3, ym3 + punpcklwd ym9, ym6, ym0 + punpcklwd ym6, ym2 + vpbroadcastd ym2, [o(pw_401_4076x8)] + vpbroadcastd ym0, [o(pw_m2598_3166x8)] + vpbroadcastd ym11, [o(pw_1931_3612x8)] + vpbroadcastd ym12, [o(pw_m1189_3920x8)] + pmulhrsw ym8, ym2 ; t8a t15a + vpbroadcastd ym2, [o(pw_799_4017x8)] + pmulhrsw ym0, ym4 ; t9a t14a + vpbroadcastd ym4, [o(pw_m2276_3406x8)] + pmulhrsw ym5, ym11 ; t10a t13a + pmulhrsw ym1, ym12 ; t11a t12a + pmulhrsw ym7, ym2 ; t4a t7a + pmulhrsw ym3, ym4 ; t5a t6a + vpcmpub k7, ym13, ym10, 6 + jmp .main4 +ALIGN function_align +cglobal_label .main WRAP_YMM IDCT16_1D_PACKED ret @@ -1685,13 +1733,14 @@ ALIGN function_align vpermi2q m6, m0, m2 ; in4 in8 in6 in10 vpermt2q m1, m10, m3 ; in11 in7 in9 in5 .main: - vpbroadcastd m9, [o(pd_2048)] - vpbroadcastq m13, [o(int_mshift)] - kxnorb k1, k1, k1 punpcklwd m0, m4, m5 ; in0 in15 in2 in13 punpckhwd m4, m5 ; in12 in3 in14 in1 punpcklwd m5, m6, m1 ; in4 in11 in6 in9 punpckhwd m6, m1 ; in8 in7 in10 in5 +cglobal_label .main2 + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + kxnorb k1, k1, k1 vpcmpub k7, m13, m9, 6 ; 0x33... pxor m8, m8 ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 @@ -1976,7 +2025,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd - mov r3d, 8 + or r3d, 8 .dconly: imul r6d, 181 add r6d, 128 @@ -2114,7 +2163,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vextracti32x4 [r3 +r4 ], m1, 3 RET ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -2168,6 +2217,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 pshufd m4, m0, q1032 ; 1 0 pshufd m5, m1, q1032 ; 3 2 call .main_pass2 + movshdup m4, [o(permC)] pmulhrsw m0, m6 pmulhrsw m1, m6 psrlq m6, m4, 4 @@ -2194,9 +2244,8 @@ ALIGN function_align IADST8_1D_PACKED 1 ret ALIGN function_align -.main_pass2: +cglobal_label .main_pass2 IADST8_1D_PACKED 2 - movshdup m4, [o(permC)] pxor m5, m5 psubd m5, m6 packssdw m6, m5 @@ -2222,6 +2271,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 pshufd m4, m0, q1032 ; 1 0 pshufd m5, m1, q1032 ; 3 2 call m(iadst_16x8_internal_8bpc).main_pass2 + movshdup m4, [o(permC)] pmulhrsw m5, m6, m0 pmulhrsw m0, m6, m1 psrlq m1, m4, 12 @@ -2276,8 +2326,8 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 @@ -2456,7 +2506,7 @@ ALIGN function_align pmulhrsw m3, m4 ; t5a t6a jmp .main4 ALIGN function_align -.main: +cglobal_label .main IDCT16_1D_PACKED ret @@ -2562,6 +2612,7 @@ ALIGN function_align vshufi32x4 m1, m5, q2020 ; 2 3 vshufi32x4 m5, m7, m9, q2020 ; 10 11 vshufi32x4 m7, m9, q3131 ; 14 15 +cglobal_label .main_pass2b REPX {pshufd x, x, q1032}, m1, m3, m5, m7 call .main vpbroadcastd m8, [o(pw_2896x8)] @@ -2770,13 +2821,13 @@ ALIGN function_align vpermt2q m9, m12, m7 jmp m(idct_16x16_internal_8bpc).end -%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] - vpbroadcastd m%3, [o(pw_%4_%5x8)] - punpcklwd m%1, m%2, m%2 - pmulhrsw m%1, m%3 - vpbroadcastd m%3, [o(pw_%6_%7x8)] - punpckhwd m%2, m%2 - pmulhrsw m%2, m%3 +%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] + vpbroadcastd m%4, [o(pw_%5_%6x8)] + punpcklwd m%1, m%3, m%3 + pmulhrsw m%1, m%4 + vpbroadcastd m%4, [o(pw_%7_%8x8)] + punpckhwd m%2, m%3, m%3 + pmulhrsw m%2, m%4 %endmacro cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob @@ -2864,82 +2915,86 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 - pxor ym4, ym4 vpermt2q m2, m5, m6 ; 8 10 vpermt2q m16, m5, m17 ; 9 11 - mova ym5, ym4 - mova ym6, ym4 - mova ym7, ym4 vextracti32x8 ym3, m2, 1 ; 12 14 vextracti32x8 ym17, m16, 1 ; 13 15 - call m(idct_8x16_internal_8bpc).main + call m(idct_8x16_internal_8bpc).main_fast call .main_fast .end: - vpbroadcastd ym12, strided - vpbroadcastd m13, [o(pw_2048)] - pmulld ym7, ym12, [o(gather8d)] - REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11 + vpbroadcastd ym8, strided + pmulld ym8, [o(gather8d)] + call .main_end lea r3, [dstq+strideq*4] - shl strideq, 4 - lea r4, [dstq+strideq] - add r1, r3 kxnorb k1, k1, k1 - pxor m6, m6 + lea r4, [dstq+strideq*8] + pxor m9, m9 + lea r1, [r3+strideq*8] kmovb k2, k1 - vpgatherdq m12{k1}, [r0+ym7] + vpgatherdq m12{k1}, [r0+ym8] kmovb k1, k2 - vpgatherdq m13{k2}, [r3+ym7] + vpgatherdq m13{k2}, [r3+ym8] kmovb k2, k1 - vpgatherdq m14{k1}, [r4+ym7] + vpgatherdq m14{k1}, [r4+ym8] kmovb k1, k2 - vpgatherdq m15{k2}, [r1+ym7] - REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 - punpcklbw m4, m12, m6 - punpckhbw m12, m6 - paddw m0, m4 + vpgatherdq m15{k2}, [r1+ym8] + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m11, m12, m9 + punpckhbw m12, m9 + paddw m0, m11 paddw m1, m12 packuswb m0, m1 kmovb k2, k1 - vpscatterdq [r0+ym7]{k1}, m0 - punpcklbw m4, m13, m6 - punpckhbw m13, m6 - paddw m2, m4 + vpscatterdq [r0+ym8]{k1}, m0 + punpcklbw m12, m13, m9 + punpckhbw m13, m9 + paddw m2, m12 paddw m3, m13 packuswb m2, m3 kmovb k1, k2 - vpscatterdq [r3+ym7]{k2}, m2 - punpcklbw m4, m14, m6 - punpckhbw m14, m6 - paddw m8, m4 - paddw m9, m14 - packuswb m8, m9 + vpscatterdq [r3+ym8]{k2}, m2 + punpcklbw m13, m14, m9 + punpckhbw m14, m9 + paddw m4, m13 + paddw m5, m14 + packuswb m4, m5 kmovb k2, k1 - vpscatterdq [r4+ym7]{k1}, m8 - punpcklbw m4, m15, m6 - punpckhbw m15, m6 - paddw m10, m4 - paddw m11, m15 - packuswb m10, m11 - vpscatterdq [r1+ym7]{k2}, m10 + vpscatterdq [r4+ym8]{k1}, m4 + punpcklbw m14, m15, m9 + punpckhbw m15, m9 + paddw m6, m14 + paddw m7, m15 + packuswb m6, m7 + vpscatterdq [r1+ym8]{k2}, m6 RET .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 INIT_YMM avx512icl ALIGN function_align -.main_fast: ; bottom half is zero - ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a - ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a - ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a - ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a +cglobal_label .main_fast2 ; bottom three-quarters are zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + mova m11, m12 + mova m17, m20 + mova m15, m21 + mova m16, m14 + jmp .main4 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main3 ALIGN function_align -.main: +cglobal_label .main punpcklwd m12, m21, m14 ; in31 in1 punpckhwd m14, m21 ; in3 in29 punpcklwd m21, m20, m15 ; in27 in5 @@ -2966,6 +3021,7 @@ ALIGN function_align paddsw m21, m16 ; t20 t27 psubsw m16, m14, m19 ; t22 t25 paddsw m14, m19 ; t23 t24 +.main4: ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a @@ -2997,8 +3053,8 @@ ALIGN function_align REPX {pshufb x, m18}, m20, m11, m21, m19 ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 - packssdw m18, m13 ; t23a t22 - packssdw m12, m15 ; t24a t25 + packssdw m18, m13 ; t23a t22 + packssdw m12, m15 ; t24a t25 ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 packssdw m16, m13 ; t20 t21a @@ -3007,32 +3063,27 @@ ALIGN function_align punpckhqdq m19, m21 ; t28a t29 punpcklqdq m21, m20, m11 ; t16 t17a punpckhqdq m20, m11 ; t31 t30a - psubsw m15, m1, m19 ; out28 out29 - paddsw m1, m19 ; out3 out2 - psubsw m9, m6, m13 ; out19 out18 - paddsw m6, m13 ; out12 out13 - psubsw m10, m5, m16 ; out20 out21 - paddsw m5, m16 ; out11 out10 - psubsw m19, m3, m12 ; out24 out25 - paddsw m3, m12 ; out7 out6 - psubsw m8, m7, m21 ; out16 out17 - paddsw m7, m21 ; out15 out14 - psubsw m21, m0, m20 ; out31 out30 - paddsw m0, m20 ; out0 out1 - psubsw m11, m4, m18 ; out23 out22 - paddsw m4, m18 ; out8 out9 - psubsw m18, m2, m14 ; out27 out26 - paddsw m2, m14 ; out4 out5 INIT_ZMM avx512icl - movu m16, [o(permD+3)] - vpermt2q m0, m16, m4 ; 0 1 8 9 - vpermt2q m8, m16, m19 ; 16 17 24 25 - vpermt2q m1, m16, m5 ; 3 2 11 10 - vpermt2q m9, m16, m18 ; 19 18 27 26 - vpermt2q m2, m16, m6 ; 4 5 12 13 - vpermt2q m10, m16, m15 ; 20 21 28 29 - vpermt2q m3, m16, m7 ; 7 6 15 14 - vpermt2q m11, m16, m21 ; 23 22 31 30 + mova m15, [o(permA)] + ret +cglobal_label .main_end + vpbroadcastd m10, [o(pw_2048)] + vpermt2q m0, m15, m1 ; t0 t1 t2 t3 + vpermt2q m20, m15, m19 ; t31 t30a t29 t28a + vpermt2q m2, m15, m3 ; t4 t5 t6 t7 + vpermt2q m14, m15, m12 ; t27 t26a t25 t24a + vpermt2q m4, m15, m5 ; t8 t9 t10 t11 + vpermt2q m18, m15, m16 ; t23a t22 t21a t20 + vpermt2q m6, m15, m7 ; t12 t13 t14 t15 + vpermt2q m13, m15, m21 ; t19a t18 t17a t16 + psubsw m7, m0, m20 ; out31 out30 out29 out28 + paddsw m0, m20 ; out0 out1 out2 out3 + psubsw m5, m2, m14 ; out27 out26 out25 out24 + paddsw m2, m14 ; out4 out5 out6 out7 + psubsw m3, m4, m18 ; out23 out22 out21 out20 + paddsw m4, m18 ; out8 out9 out10 out11 + psubsw m1, m6, m13 ; out19 out18 out17 out16 + paddsw m6, m13 ; out12 out13 out14 out15 vzeroupper ret @@ -3079,16 +3130,33 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob call m(idct_8x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast .pass2: - vpbroadcastd m12, [o(pw_8192)] - vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31 - vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30 - vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29 - vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28 - vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27 - vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26 - vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15 - vshufi32x4 m0, m8, q2020 ; 0 8 16 24 - REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m10, [o(pw_8192)] + vpermt2q m0, m15, m4 ; t0 t1 t9 t8 + vpermt2q m20, m15, m18 ; t31 t30a t23a t22 + vpermt2q m3, m15, m7 ; t7 t6 t14 t15 + vpermt2q m12, m15, m21 ; t25 t24a t17a t16 + vpermt2q m2, m15, m6 ; t4 t5 t13 t12 + vpermt2q m14, m15, m13 ; t23a t22 t21a t20 + vpermt2q m1, m15, m5 ; t3 t2 t10 t11 + vpermt2q m19, m15, m16 ; t27 t26a t19a t18 + psubsw m8, m0, m20 ; out31 out30 out22 out23 + paddsw m0, m20 ; out0 out1 out9 out8 + paddsw m6, m3, m12 ; out7 out6 out14 out15 + psubsw m3, m12 ; out24 out25 out17 out16 + psubsw m5, m2, m14 ; out27 out26 out18 out19 + paddsw m4, m2, m14 ; out4 out5 out13 out12 + psubsw m7, m1, m19 ; out28 out29 out21 out20 + paddsw m2, m1, m19 ; out3 out2 out10 out11 + vzeroupper + vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 + vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 + vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27 + vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 + vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 + vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 + vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 + vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 call .main vpbroadcastd m8, [o(pw_2048)] @@ -3132,7 +3200,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 8 + or r3d, 8 .dconly2: imul r6d, 181 add r6d, 128+512 @@ -3158,7 +3226,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob jg .dconly_loop RET ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m10, [o(pd_2048)] .main2: ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a @@ -3535,7 +3603,7 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly ALIGN function_align .main_oddhalf_fast2: ; bottom three-quarters are zero @@ -3821,8 +3889,8 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -4603,7 +4671,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 ALIGN function_align .main_oddhalf_fast2: ; bottom three-quarters are zero @@ -5068,8 +5136,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 64 imul r6d, 181 - mov r3d, 64 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 @@ -5282,7 +5350,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob jnz .normal movsx r6d, word [cq] mov [cq], eobd - mov r3d, 16 + or r3d, 16 .dconly: imul r6d, 181 add r6d, 128+512 @@ -6012,8 +6080,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 64 imul r6d, 181 - mov r3d, 64 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -6674,8 +6742,8 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 32 imul r6d, 181 - mov r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -7117,7 +7185,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly ALIGN function_align .pass2_end: |