diff options
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm')
-rw-r--r-- | chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm | 231 |
1 files changed, 123 insertions, 108 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm index 4fb30ef4e7a..3833e17c99f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm @@ -361,18 +361,32 @@ ALIGN function_align %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 0, 4x4 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 - movd m1, [o(pw_2896x8)] + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 + mov r3d, 4 +.dconly: + add r5d, 128 + sar r5d, 8 +.dconly2: + imul r5d, 2896 + mova m2, [o(pixel_10bpc_max)] + add r5d, 34816 movd m0, r5d - packssdw m0, m0 - pmulhrsw m0, m1 - pshuflw m0, m0, q0000 + pshuflw m0, m0, q1111 + pxor m3, m3 punpcklqdq m0, m0 - mova m1, m0 - TAIL_CALL m(iadst_4x4_internal_16bpc).end +.dconly_loop: + movq m1, [dstq+strideq*0] + movhps m1, [dstq+strideq*1] + paddw m1, m0 + pminsw m1, m2 + pmaxsw m1, m3 + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET %endif %endmacro @@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 4x8 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 2 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 2048 - sar r5d, 12 -.end: - imul r5d, 2896 - add r5d, 34816 - movd m0, r5d - pshuflw m0, m0, q1111 - punpcklqdq m0, m0 - pxor m4, m4 - mova m3, [o(pixel_10bpc_max)] - lea r2, [strideq*3] -.loop: - movq m1, [dstq+strideq*0] - movq m2, [dstq+strideq*2] - movhps m1, [dstq+strideq*1] - movhps m2, [dstq+r2] - paddw m1, m0 - paddw m2, m0 - REPX {pminsw x, m3}, m1, m2 - REPX {pmaxsw x, m4}, m1, m2 - movq [dstq+strideq*0], m1 - movhps [dstq+strideq*1], m1 - movq [dstq+strideq*2], m2 - movhps [dstq+r2 ], m2 - lea dstq, [dstq+strideq*4] - dec r3d - jg .loop - RET + mov r3d, 8 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly %endif %endmacro @@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 4 - add r5d, 6144 - sar r5d, 13 - jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end + mov r3d, 16 + add r5d, 384 + sar r5d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2 %endif %endmacro @@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 2048 - sar r5d, 12 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 128 + sar r5d, 8 imul r5d, 2896 add r5d, 34816 movd m0, r5d @@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 2 .end: - add r5d, 6144 - sar r5d, 13 + add r5d, 384 + sar r5d, 9 .end2: imul r5d, 2896 add r5d, 34816 @@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 mov r3d, 4 %if stack_size_padded > 0 ; adjust to caller's stack allocation @@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 4 .dconly: - add r5d, 6144 - sar r5d, 13 + add r5d, 384 + sar r5d, 9 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -2755,6 +2742,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ret .round: %if ARCH_X86_64 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 pcmpeqd m8, m8 REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 mova m8, [r3+1*16] @@ -2785,6 +2774,14 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 + mova m0, [o(clip_18b_min)] + REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 + pmaxsd m0, [r3+ 0*16] + mova [r3+ 0*16], m7 + mova m7, [o(clip_18b_max)] + REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsd m7, [r3+ 0*16] + mova [r3+ 0*16], m0 pcmpeqd m0, m0 REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7 mova [r3+ 1*16], m1 @@ -3472,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 %if ARCH_X86_32 add rsp, 1*16 %endif @@ -3939,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (5+ARCH_X86_64*3+WIN64)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 %endif @@ -4057,6 +4054,8 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ret .round: %if ARCH_X86_64 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 psrld m8, m11, 10 ; 2 REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 mova m8, [r3+1*16] @@ -4087,6 +4086,14 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 + mova m0, [o(clip_18b_min)] + REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 + pmaxsd m0, [r3+ 0*16] + mova [r3+ 0*16], m7 + mova m7, [o(clip_18b_max)] + REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsd m7, [r3+ 0*16] + mova [r3+ 0*16], m0 mova m0, [o(pd_2)] REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7 paddd m0, [r3+ 0*16] @@ -5162,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \ call m(idct_8x8_internal_16bpc).round1_and_write_8x8 ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (31+2*ARCH_X86_64)*16 jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2 @@ -5339,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \ %endif RET .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 add rsp, (65+4*ARCH_X86_64)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly @@ -5944,6 +5951,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ ; final sumsub for idct16 as well as idct32, plus final downshift %macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx mova m%4, [r3+16*(23-%1)] + pmaxsd m%1, m12 + pminsd m%1, m13 psubd m%3, m%1, m%4 ; idct16 out15 - n paddd m%1, m%4 ; idct16 out0 + n pmaxsd m%1, m12 @@ -6019,6 +6028,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ .loop_dct32_end: mova m0, [r3+16*16] mova m6, [r3+16*24] + pmaxsd m0, m2 + pminsd m0, m3 psubd m5, m0, m6 ; idct16 out15 - n paddd m0, m6 ; idct16 out0 + n pmaxsd m0, m2 @@ -6045,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ %endif .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 .dconly1: - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -6344,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ %endif .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ @@ -6565,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 add rsp, (5*32+1-(24+8*ARCH_X86_32))*16 @@ -6838,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 @@ -7098,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 @@ -7537,6 +7548,8 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ mova m5, [r3-16* 4] ; idct64 48 + n mova m6, [r4-16*20] ; idct64 47 - n mova m7, [r3-16*20] ; idct64 32 + n + pmaxsd m0, m12 + pminsd m0, m13 paddd m8, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m12}, m8, m0 @@ -7565,11 +7578,13 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ mova [r4-16* 4], m6 mova [r3+16*12], m8 %else + mova m5, [o(clip_18b_min)] + mova m6, [o(clip_18b_max)] mova m1, [r3+16*44] ; idct16 15 - n + pmaxsd m0, m5 + pminsd m0, m6 paddd m4, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n - mova m5, [o(clip_18b_min)] - mova m6, [o(clip_18b_max)] REPX {pmaxsd x, m5}, m4, m0 REPX {pminsd x, m6}, m4, m0 paddd m1, m4, m3 ; idct32 out0 + n @@ -7632,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 .dconly1: - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -7876,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 add rsp, (1+8*32+1*WIN64)*16 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 @@ -8112,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \ |