diff options
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86')
25 files changed, 7282 insertions, 1531 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h b/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h new file mode 100644 index 00000000000..553d6507412 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h @@ -0,0 +1,87 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/cdef.h" + +#define decl_cdef_fns(ext) \ + decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \ + decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \ + decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext)) + +decl_cdef_fns(avx512icl); +decl_cdef_fns(avx2); +decl_cdef_fns(sse4); +decl_cdef_fns(ssse3); +decl_cdef_fns(sse2); + +decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2)); +decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4)); +decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3)); + +static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + +#if BITDEPTH == 8 + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + + c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2); + c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2); + c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->dir = BF(dav1d_cdef_dir, ssse3); + c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3); + c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3); + c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + + c->dir = BF(dav1d_cdef_dir, sse4); +#if BITDEPTH == 8 + c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4); + c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4); + c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4); +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->dir = BF(dav1d_cdef_dir, avx2); + c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2); + c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2); + c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl); + c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl); + c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm new file mode 100644 index 00000000000..6d625a02a0c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm @@ -0,0 +1,622 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21 + db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29 + db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37 + db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45 +end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 +edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111 + dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011 + dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111 +pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4 +cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6 + dw 1, 2, 1, 10, 9, 18, 8, 17 + dw 8, 16, 8, 15, -7,-14, 1, -6 +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28 + db 2, 4, 2, 36, 34, 68, 32, 66 + db 32, 64, 32, 62,-30,-60, 2,-28 +pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3 +sec_taps4: dw 32, 16 +pw_m16384: times 2 dw -16384 +pw_2048: times 2 dw 2048 +pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4) +edge_mask8: dw 0x2121, 0x2020, 0x0101 + +SECTION .text + +%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp + psubw %1, %2, %3 + pabsw %1, %1 + vpcmpgtw k1, %3, %2 + vpsrlvw %7, %1, %6 + psubusw %7, %5, %7 + pminsw %1, %7 + vpsubw %1{k1}, %4, %1 +%endmacro + +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7 +; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7 + +INIT_ZMM avx512icl +cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r6-cdef_dirs4 + lea r6, [cdef_dirs4] + movu xm3, [dstq+strideq*0] + vinserti32x4 ym3, [dstq+strideq*1], 1 + mova xm2, [leftq] + lea r2, [dstq+strideq*2] + vinserti32x4 m3, [r2+strideq*0], 2 + mova m5, [base+cdef_perm] + vinserti32x4 m3, [r2+strideq*1], 3 + vpermt2d m2, m5, m3 + vinserti32x4 m1, m2, [topq+strideq*0-4], 0 + vinserti32x4 m1, [topq+strideq*1-4], 1 + mov r3d, edgem + movifnidn prid, prim + punpcklwd m3, m3 ; px + psrlw m5, 8 + vpbroadcastd m0, [base+pd_268435568] + pxor m12, m12 + cmp r3d, 0x0f + jne .mask_edges + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 +.main: + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + vpbroadcastd m15, [base+pri_taps4+priq] + xor prid, prid + add r4d, r3d + cmovns prid, r4d ; pri_shift + mov r4d, dirm + vpbroadcastw m14, prid + mov r5d, secm + vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] + call .constrain + test r5d, r5d + jz .end_no_clip + lzcnt r5d, r5d + vpbroadcastw m13, secm + add r3d, r5d + pminuw m6, m3, m8 + pmaxsw m7, m3, m8 + pminuw m6, m9 + pmaxsw m7, m9 + call .constrain_sec + pminuw m6, m8 + pmaxsw m7, m8 + pminuw m6, m9 + pmaxsw m7, m9 + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain + pminuw m6, m8 + pmaxsw m7, m8 + pminuw m6, m9 + pmaxsw m7, m9 + psrldq m8, m6, 2 + vpshldd m3, m0, 8 + psrldq m9, m7, 2 + paddd m0, m3 + pminuw m6, m8 + psrldq m0, 1 + pmaxsw m7, m9 + pmaxsw m0, m6 + pminsw m0, m7 + vpmovdw ym0, m0 + jmp .end +.sec_only: + tzcnt r5d, secm + mov r3d, dampingm + vpbroadcastw m13, secm + mov r4d, dirm + sub r3d, r5d ; sec_shift + call .constrain_sec + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain +.end_no_clip: + mova ym1, [base+end_perm4] + vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4) + paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + vpermb m0, m1, m0 +.end: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm0, ym0, 1 + movq [r2+strideq*0], xm0 + movhps [r2+strideq*1], xm0 + RET +.mask_edges: + vpbroadcastd m6, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 + kmovw k1, [base+edge_mask4-8+r3*2] + jmp .mask_edges_main +.mask_edges_no_bottom: + kmovw k1, [base+edge_mask4+8+r3*2] +.mask_edges_main: + or r3d, 0x04 + vmovdqa32 m1{k1}, m6 ; edge pixels = -16384 + kmovw k1, [base+edge_mask4-8+r3*2] + vmovdqa32 m2{k1}, m6 + jmp .main +.constrain_sec: + vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] + vpbroadcastw m14, r3d + vpbroadcastd m15, [base+sec_taps4] +.constrain: + paddw m8, m5, m9 + vpermi2w m8, m1, m2 ; k0p0 k1p0 + psubw m9, m5, m9 + vpermi2w m9, m1, m2 ; k0p1 k1p1 + CONSTRAIN m10, m8, m3, m12, m13, m14, m11 + vpdpwssd m0, m10, m15 + CONSTRAIN m10, m9, m3, m12, m13, m14, m11 + vpdpwssd m0, m10, m15 + ret + +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 + +cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge + lea r6, [cdef_dirs4] + movu xm18, [dstq+strideq*0] + vinserti128 ym18, [dstq+strideq*1], 1 + mova xm1, [leftq+16*0] + mova xm2, [leftq+16*1] + lea r2, [strideq*3] + vinserti32x4 m18, [dstq+strideq*2], 2 + mova m5, [base+cdef_perm] + vinserti32x4 m18, [dstq+r2 ], 3 + vpermt2d m1, m5, m18 + vinserti32x4 m0, m1, [topq+strideq*0-4], 0 + vinserti32x4 m0, [topq+strideq*1-4], 1 + lea r3, [dstq+strideq*4] + movu xm19, [r3+strideq*0] + vinserti128 ym19, [r3+strideq*1], 1 + vinserti32x4 m19, [r3+strideq*2], 2 + vinserti32x4 m19, [r3+r2 ], 3 + mov r3d, edgem + movifnidn prid, prim + vpermt2d m2, m5, m19 + vpbroadcastd m16, [base+pd_268435568] + pxor m12, m12 + punpcklwd m18, m18 ; px (top) + psrlw m5, 8 + punpcklwd m19, m19 ; px (bottom) + mova m17, m16 + vshufi32x4 m1, m2, q3210 + cmp r3d, 0x0f + jne .mask_edges + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 +.main: + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + vpbroadcastd m15, [base+pri_taps4+priq] + xor prid, prid + add r4d, r3d + cmovns prid, r4d ; pri_shift + mov r4d, dirm + vpbroadcastw m14, prid + mov r5d, secm + vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] + call .constrain + test r5d, r5d + jz .end_no_clip + lzcnt r5d, r5d + vpbroadcastw m13, secm + add r3d, r5d + pminuw m3, m18, m6 + pmaxsw m4, m18, m6 + pminuw m20, m19, m7 + pmaxsw m21, m19, m7 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + call .constrain_sec + pminuw m3, m6 + pmaxsw m4, m6 + pminuw m20, m7 + pmaxsw m21, m7 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain + pminuw m3, m6 + pmaxsw m4, m6 + mov r3, 0xcccccccccccccccc + pminuw m20, m7 + pmaxsw m21, m7 + kmovq k1, r3 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + vbroadcasti32x4 m0, [base+deint_shuf] + vpshldd m6, m20, m3, 16 + vmovdqu8 m3{k1}, m20 + vpshldd m18, m16, 8 + vpshldd m7, m21, m4, 16 + vmovdqu8 m4{k1}, m21 + vpshldd m19, m17, 8 + pminuw m3, m6 + paddd m16, m18 + pmaxsw m4, m7 + paddd m17, m19 + psrldq m16, 1 + palignr m16{k1}, m17, m17, 15 + lea r6, [dstq+strideq*4] + pmaxsw m16, m3 + pminsw m16, m4 + pshufb m16, m0 + movq [dstq+strideq*0], xm16 + movhps [r6 +strideq*0], xm16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*1], xm17 + movhps [r6 +strideq*1], xm17 + vextracti32x4 xm17, m16, 2 + movq [dstq+strideq*2], xm17 + movhps [r6 +strideq*2], xm17 + vextracti32x4 xm16, m16, 3 + movq [dstq+r2 ], xm16 + movhps [r6 +r2 ], xm16 + RET +.sec_only: + mov r4d, dirm + tzcnt r5d, secm + mov r3d, dampingm + vpbroadcastw m13, secm + sub r3d, r5d ; sec_shift + call .constrain_sec + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain +.end_no_clip: + mova ym20, [base+end_perm4] + vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m19, m17, 8 + paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddd m17, m19 + vpermb m16, m20, m16 + vpermb m17, m20, m17 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + vextracti128 xm16, ym16, 1 + movq [dstq+strideq*2], xm16 + movhps [dstq+r2 ], xm16 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm17 + movhps [dstq+strideq*1], xm17 + vextracti128 xm17, ym17, 1 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + RET +.mask_edges: + vpbroadcastd m6, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 + kmovw k1, [base+edge_mask4-8+r3*2] + jmp .mask_edges_main +.mask_edges_no_bottom: + kmovw k1, [base+edge_mask4+8+r3*2] +.mask_edges_main: + mov r4d, r3d + or r3d, 0x0c + vmovdqa32 m0{k1}, m6 ; edge pixels = -16384 + kmovw k1, [base+edge_mask4-8+r3*2] + or r4d, 0x04 + vmovdqa32 m1{k1}, m6 + kmovw k1, [base+edge_mask4-8+r4*2] + vmovdqa32 m2{k1}, m6 + jmp .main +.constrain_sec: + vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] + vpbroadcastw m14, r3d + vpbroadcastd m15, [base+sec_taps4] +.constrain: + paddw m7, m5, m9 + mova m6, m0 + vpermt2w m6, m7, m1 ; k0p0 k1p0 (top) + psubw m9, m5, m9 + mova m8, m0 + vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom) + CONSTRAIN m10, m6, m18, m12, m13, m14, m11 + vpermt2w m8, m9, m1 ; k0p1 k1p1 (top) + vpdpwssd m16, m10, m15 + CONSTRAIN m10, m7, m19, m12, m13, m14, m11 + vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom) + vpdpwssd m17, m10, m15 + CONSTRAIN m10, m8, m18, m12, m13, m14, m11 + vpdpwssd m16, m10, m15 + CONSTRAIN m10, m9, m19, m12, m13, m14, m11 + vpdpwssd m17, m10, m15 + ret + +cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r6-cdef_dirs8 + lea r6, [cdef_dirs8] + movu ym17, [dstq+strideq*0] + vinserti32x8 m17, [dstq+strideq*1], 1 + movq xm4, [leftq+8*0] + movq xm5, [leftq+8*1] + psrld m2, [base+cdef_perm], 16 + movq xm6, [leftq+8*2] + movq xm7, [leftq+8*3] + lea r2, [strideq*3] + movu ym16, [topq+strideq*0-4] + vinserti32x8 m16, [topq+strideq*1-4], 1 + lea r3, [dstq+strideq*4] + movu ym18, [dstq+strideq*2] + vinserti32x8 m18, [dstq+r2 ], 1 + movu ym19, [r3+strideq*0] + vinserti32x8 m19, [r3+strideq*1], 1 + movu ym20, [r3+strideq*2] + vinserti32x8 m20, [r3+r2 ], 1 + vshufi32x4 m0, m17, m18, q2020 ; px (top) + mov r3d, edgem + vshufi32x4 m1, m19, m20, q2020 ; px (bottom) + movifnidn prid, prim + vpermt2d m17, m2, m4 + vpermt2d m18, m2, m5 + pxor m12, m12 + vpermt2d m19, m2, m6 + vpermt2d m20, m2, m7 + cmp r3d, 0x0f + jne .mask_edges + movu ym21, [botq+strideq*0-4] + vinserti32x8 m21, [botq+strideq*1-4], 1 +.main: + mova [rsp+64*0], m16 ; top + mova [rsp+64*1], m17 ; 0 1 + mova [rsp+64*2], m18 ; 2 3 + mova [rsp+64*3], m19 ; 4 5 + mova [rsp+64*4], m20 ; 6 7 + mova [rsp+64*5], m21 ; bottom + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + add r4d, r3d ; pri_shift + vpbroadcastw m14, r4d + mov r4d, dirm + vpbroadcastd m2, [base+pri_taps8+priq*2+0] + vpbroadcastd m3, [base+pri_taps8+priq*2+4] + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1 + pmaxsw m14, m12 + call .constrain + mov r5d, secm + pmullw m16, m8, m2 + pmullw m17, m9, m2 + test r5d, r5d + jnz .pri_sec + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 + call .constrain + pmullw m8, m3 + pmullw m9, m3 + jmp .end_no_clip +.pri_sec: + lzcnt r5d, r5d + add r3d, r5d ; sec_shift + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 + pminuw m18, m0, m4 + pmaxsw m19, m0, m4 + pminuw m20, m1, m5 + pmaxsw m21, m1, m5 + call .min_max_constrain2 + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2 + pmullw m8, m3 + pmullw m9, m3 + vpbroadcastw m13, secm + vpbroadcastw m14, r3d + paddw m16, m8 + paddw m17, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3 + mova m2, m8 + mova m3, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2 + paddw m2, m8 + paddw m3, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3 + paddw m2, m2 + paddw m3, m3 + paddw m16, m8 + paddw m17, m9 + call .min_max_constrain + vpbroadcastd m10, [base+pw_2048] + paddw m16, m2 + paddw m17, m3 + paddw m16, m8 + paddw m17, m9 + psraw m8, m16, 15 + psraw m9, m17, 15 + paddw m16, m8 + paddw m17, m9 + pmulhrsw m16, m10 + pmulhrsw m17, m10 + pminuw m18, m4 + pmaxsw m19, m4 + pminuw m20, m5 + pmaxsw m21, m5 + pminuw m18, m6 + pmaxsw m19, m6 + pminuw m20, m7 + pmaxsw m21, m7 + paddw m16, m0 + paddw m17, m1 + pmaxsw m16, m18 + pmaxsw m17, m20 + pminsw m16, m19 + pminsw m17, m21 + jmp .end +.sec_only: + tzcnt r5d, secm + mov r4d, dirm + mov r3d, dampingm + vpbroadcastw m13, secm + sub r3d, r5d + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] + vpbroadcastw m14, r3d + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] + mova m16, m8 + mova m17, m9 + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] + paddw m16, m8 + paddw m17, m9 + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] + paddw m16, m16 + paddw m17, m17 + paddw m16, m8 + paddw m17, m9 + call .constrain +.end_no_clip: + vpbroadcastd m10, [base+pw_2048] + paddw m16, m8 + paddw m17, m9 + psraw m8, m16, 15 + psraw m9, m17, 15 + paddw m16, m8 + paddw m17, m9 + pmulhrsw m16, m10 + pmulhrsw m17, m10 + paddw m16, m0 + paddw m17, m1 +.end: + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm17 + vextracti128 [dstq+strideq*1], ym17, 1 + vextracti32x4 [dstq+strideq*2], m17, 2 + vextracti32x4 [dstq+r2 ], m17, 3 + RET +.mask_edges: + vpbroadcastd m2, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + movu ym21, [botq+strideq*0-4] + vinserti32x8 m21, [botq+strideq*1-4], 1 + jmp .mask_edges_top +.mask_edges_no_bottom: + mova m21, m2 +.mask_edges_top: + test r3b, 0x04 + jnz .mask_edges_main + mova m16, m2 +.mask_edges_main: + and r3d, 0x03 + cmp r3d, 0x03 + je .main + kmovw k1, [base+edge_mask8+r3*2] + vmovdqa32 m16{k1}, m2 ; edge pixels = -16384 + vmovdqa32 m17{k1}, m2 + vmovdqa32 m18{k1}, m2 + vmovdqa32 m19{k1}, m2 + vmovdqa32 m20{k1}, m2 + vmovdqa32 m21{k1}, m2 + jmp .main +ALIGN function_align +.min_max_constrain: + pminuw m18, m4 + pmaxsw m19, m4 + pminuw m20, m5 + pmaxsw m21, m5 +.min_max_constrain2: + pminuw m18, m6 + pmaxsw m19, m6 + pminuw m20, m7 + pmaxsw m21, m7 +.constrain: + %define tmp rsp+gprsize+68 + movu m4, [tmp+r5+64*0] + vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top) + movu m5, [tmp+r5+64*2] + vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom) + neg r5 + movu m6, [tmp+r5+64*0] + vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top) + movu m7, [tmp+r5+64*2] + vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom) + CONSTRAIN m8, m4, m0, m12, m13, m14, m15 + CONSTRAIN m9, m5, m1, m12, m13, m14, m15 + CONSTRAIN m10, m6, m0, m12, m13, m14, m15 + CONSTRAIN m11, m7, m1, m12, m13, m14, m15 + paddw m8, m10 + paddw m9, m11 + ret + +%endif ; ARCH_X86_64 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h new file mode 100644 index 00000000000..eeaa328d1e1 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h @@ -0,0 +1,81 @@ +/* + * Copyright © 2018-2022, VideoLAN and dav1d authors + * Copyright © 2018-2022, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/filmgrain.h" + +#define decl_fg_fns(ext) \ +decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \ +decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext)) + +decl_fg_fns(ssse3); +decl_fg_fns(avx2); +decl_fg_fns(avx512icl); + +static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3); + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) { + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2); + } + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h new file mode 100644 index 00000000000..7df563fee1c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h @@ -0,0 +1,146 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/ipred.h" + +#define decl_fn(type, name) \ + decl_##type##_fn(BF(dav1d_##name, ssse3)); \ + decl_##type##_fn(BF(dav1d_##name, avx2)); \ + decl_##type##_fn(BF(dav1d_##name, avx512icl)) +#define init_fn(type0, type1, name, suffix) \ + c->type0[type1] = BF(dav1d_##name, suffix) + +#define init_angular_ipred_fn(type, name, suffix) \ + init_fn(intra_pred, type, name, suffix) +#define init_cfl_pred_fn(type, name, suffix) \ + init_fn(cfl_pred, type, name, suffix) +#define init_cfl_ac_fn(type, name, suffix) \ + init_fn(cfl_ac, type, name, suffix) + +decl_fn(angular_ipred, ipred_dc); +decl_fn(angular_ipred, ipred_dc_128); +decl_fn(angular_ipred, ipred_dc_top); +decl_fn(angular_ipred, ipred_dc_left); +decl_fn(angular_ipred, ipred_h); +decl_fn(angular_ipred, ipred_v); +decl_fn(angular_ipred, ipred_paeth); +decl_fn(angular_ipred, ipred_smooth); +decl_fn(angular_ipred, ipred_smooth_h); +decl_fn(angular_ipred, ipred_smooth_v); +decl_fn(angular_ipred, ipred_z1); +decl_fn(angular_ipred, ipred_z2); +decl_fn(angular_ipred, ipred_z3); +decl_fn(angular_ipred, ipred_filter); + +decl_fn(cfl_pred, ipred_cfl); +decl_fn(cfl_pred, ipred_cfl_128); +decl_fn(cfl_pred, ipred_cfl_top); +decl_fn(cfl_pred, ipred_cfl_left); + +decl_fn(cfl_ac, ipred_cfl_ac_420); +decl_fn(cfl_ac, ipred_cfl_ac_422); +decl_fn(cfl_ac, ipred_cfl_ac_444); + +decl_fn(pal_pred, pal_pred); + +static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3); + init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3); + init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3); + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3); + + init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3); + init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3); + init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3); + init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3); + + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3); + + c->pal_pred = BF(dav1d_pal_pred, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + init_angular_ipred_fn(DC_PRED, ipred_dc, avx2); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2); + init_angular_ipred_fn(HOR_PRED, ipred_h, avx2); + init_angular_ipred_fn(VERT_PRED, ipred_v, avx2); + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2); + init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2); + init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2); + init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2); + + init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2); + init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2); + init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2); + init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2); + + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2); + + c->pal_pred = BF(dav1d_pal_pred, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl); + init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); + init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); +#endif + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl); + + c->pal_pred = BF(dav1d_pal_pred, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm index 4a1b060bd5f..1a307adc985 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm @@ -114,20 +114,20 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h vbroadcasti32x4 m2, [tlq] pshufb m2, m7 ; left PAETH 4, 5, 6 - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm8, ym0, 1 + vextracti32x4 xm9, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+r6 ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm8 + movq [dstq+r6 ], xm9 sub hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+r6 ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm8 + movhps [dstq+r6 ], xm9 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_end: @@ -220,19 +220,19 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pshufb m3, m4 pmulhrsw m3, m5 paddw m3, m6 - vextracti32x4 xmm0, m3, 3 - vextracti32x4 xmm1, ym3, 1 - vextracti32x4 xmm2, m3, 2 - movhps [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 + vextracti32x4 xm0, m3, 3 + vextracti32x4 xm1, ym3, 1 + vextracti32x4 xm2, m3, 2 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 add hq, 8 jg .end lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jl .w4_loop @@ -337,20 +337,20 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 psubw m0, m6 ; left - right pmulhrsw m0, m5 paddw m0, m6 - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 sub hd, 8*2 jl .end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w4_loop .end: @@ -472,11 +472,11 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 vpdpwssd m0, m1, m6 vpermb m0, m14, m0 pavgw ym0, ym15 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add v_weightsq, 4*4 sub hd, 4*2 @@ -624,11 +624,11 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 pmovzxbw ym0, [idxq] add idxq, 16 vpermw ym0, ym0, ym3 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - movq [dstq+strideq*2], xmm1 - movhps [dstq+stride3q ], xmm1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm index 050ec9bb253..38c86b54f5c 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm @@ -242,9 +242,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 jmp wq .w8: movq xmm1, [tlq+1] - vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 - paddd xmm2, xm0 + paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 @@ -275,9 +275,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 jmp wq .w16: movu xmm1, [tlq+1] - vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 - paddd xmm2, xm0 + paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 @@ -309,8 +309,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 .w32: movu ym1, [tlq+1] vpdpbusd ym0, ym1, ym3 - vextracti32x4 xmm1, ym0, 1 - paddd xmm1, xm0 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 @@ -345,8 +345,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 movu ym2, [tlq+33] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 - vextracti32x4 xmm1, ym0, 1 - paddd xmm1, xm0 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 @@ -524,12 +524,12 @@ INIT_YMM avx512icl pextrd [dstq+stride3q ], xm0, 3 sub hd, 8 jl .w4_ret - vextracti32x4 xmm0, m0, 1 + vextracti32x4 xm0, m0, 1 lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0], xmm0 - pextrd [dstq+strideq*1], xmm0, 1 - pextrd [dstq+strideq*2], xmm0, 2 - pextrd [dstq+stride3q ], xmm0, 3 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm0, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_ret: @@ -545,20 +545,20 @@ INIT_ZMM avx512icl vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 PAETH - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 sub hd, 8 jl .w8_ret lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w8_loop .w8_ret: @@ -639,18 +639,18 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 add hq, 8 jg .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jl .w4_loop .ret: @@ -669,11 +669,11 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop @@ -785,18 +785,18 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: @@ -815,11 +815,11 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop @@ -937,18 +937,18 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: @@ -978,11 +978,11 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx.h b/chromium/third_party/dav1d/libdav1d/src/x86/itx.h new file mode 100644 index 00000000000..46cfdb75d1d --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx.h @@ -0,0 +1,356 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/itx.h" + +#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix + +#define decl_itx2_fns(w, h, opt) \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) + +#define decl_itx12_fns(w, h, opt) \ +decl_itx2_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) + +#define decl_itx16_fns(w, h, opt) \ +decl_itx12_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) + +#define decl_itx17_fns(w, h, opt) \ +decl_itx16_fns(w, h, opt); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) + +#define decl_itx_fns(ext) \ +decl_itx17_fns( 4, 4, ext); \ +decl_itx16_fns( 4, 8, ext); \ +decl_itx16_fns( 4, 16, ext); \ +decl_itx16_fns( 8, 4, ext); \ +decl_itx16_fns( 8, 8, ext); \ +decl_itx16_fns( 8, 16, ext); \ +decl_itx2_fns ( 8, 32, ext); \ +decl_itx16_fns(16, 4, ext); \ +decl_itx16_fns(16, 8, ext); \ +decl_itx12_fns(16, 16, ext); \ +decl_itx2_fns (16, 32, ext); \ +decl_itx2_fns (32, 8, ext); \ +decl_itx2_fns (32, 16, ext); \ +decl_itx2_fns (32, 32, ext); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext)) + + +#define decl_itx2_bpc_fns(w, h, bpc, opt) \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt)) + +#define decl_itx12_bpc_fns(w, h, bpc, opt) \ +decl_itx2_bpc_fns(w, h, bpc, opt); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt)) + +#define decl_itx16_bpc_fns(w, h, bpc, opt) \ +decl_itx12_bpc_fns(w, h, bpc, opt); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt)) + +#define decl_itx_bpc_fns(bpc, ext) \ +decl_itx16_bpc_fns( 4, 4, bpc, ext); \ +decl_itx16_bpc_fns( 4, 8, bpc, ext); \ +decl_itx16_bpc_fns( 4, 16, bpc, ext); \ +decl_itx16_bpc_fns( 8, 4, bpc, ext); \ +decl_itx16_bpc_fns( 8, 8, bpc, ext); \ +decl_itx16_bpc_fns( 8, 16, bpc, ext); \ +decl_itx2_bpc_fns ( 8, 32, bpc, ext); \ +decl_itx16_bpc_fns(16, 4, bpc, ext); \ +decl_itx16_bpc_fns(16, 8, bpc, ext); \ +decl_itx12_bpc_fns(16, 16, bpc, ext); \ +decl_itx2_bpc_fns (16, 32, bpc, ext); \ +decl_itx2_bpc_fns (32, 8, bpc, ext); \ +decl_itx2_bpc_fns (32, 16, bpc, ext); \ +decl_itx2_bpc_fns (32, 32, bpc, ext); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \ +decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext)) + +decl_itx_fns(avx512icl); +decl_itx_bpc_fns(10, avx512icl); +decl_itx_fns(avx2); +decl_itx_bpc_fns(10, avx2); +decl_itx_bpc_fns(12, avx2); +decl_itx_fns(sse4); +decl_itx_fns(ssse3); +decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); + +static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) { +#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) + +#define assign_itx1_fn(pfx, w, h, ext) \ + assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) + +#define assign_itx2_fn(pfx, w, h, ext) \ + assign_itx1_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext) + +#define assign_itx12_fn(pfx, w, h, ext) \ + assign_itx2_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \ + assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \ + assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext) + +#define assign_itx16_fn(pfx, w, h, ext) \ + assign_itx12_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \ + assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \ + assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \ + assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext) + +#define assign_itx17_fn(pfx, w, h, ext) \ + assign_itx16_fn(pfx, w, h, ext); \ + assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) + + +#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \ + c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ + BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext) + +#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext) + +#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext) + +#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext) + +#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext) + + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + +#if BITDEPTH == 8 + assign_itx16_fn(, 4, 4, ssse3); + assign_itx16_fn(R, 4, 8, ssse3); + assign_itx16_fn(R, 8, 4, ssse3); + assign_itx16_fn(, 8, 8, ssse3); + assign_itx16_fn(R, 4, 16, ssse3); + assign_itx16_fn(R, 16, 4, ssse3); + assign_itx16_fn(R, 8, 16, ssse3); + assign_itx16_fn(R, 16, 8, ssse3); + assign_itx12_fn(, 16, 16, ssse3); + assign_itx2_fn (R, 8, 32, ssse3); + assign_itx2_fn (R, 32, 8, ssse3); + assign_itx2_fn (R, 16, 32, ssse3); + assign_itx2_fn (R, 32, 16, ssse3); + assign_itx2_fn (, 32, 32, ssse3); + assign_itx1_fn (R, 16, 64, ssse3); + assign_itx1_fn (R, 32, 64, ssse3); + assign_itx1_fn (R, 64, 16, ssse3); + assign_itx1_fn (R, 64, 32, ssse3); + assign_itx1_fn ( , 64, 64, ssse3); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + +#if BITDEPTH == 16 + if (bpc == 10) { + assign_itx16_fn(, 4, 4, sse4); + assign_itx16_fn(R, 4, 8, sse4); + assign_itx16_fn(R, 4, 16, sse4); + assign_itx16_fn(R, 8, 4, sse4); + assign_itx16_fn(, 8, 8, sse4); + assign_itx16_fn(R, 8, 16, sse4); + assign_itx16_fn(R, 16, 4, sse4); + assign_itx16_fn(R, 16, 8, sse4); + assign_itx12_fn(, 16, 16, sse4); + assign_itx2_fn (R, 8, 32, sse4); + assign_itx2_fn (R, 32, 8, sse4); + assign_itx2_fn (R, 16, 32, sse4); + assign_itx2_fn (R, 32, 16, sse4); + assign_itx2_fn (, 32, 32, sse4); + assign_itx1_fn (R, 16, 64, sse4); + assign_itx1_fn (R, 32, 64, sse4); + assign_itx1_fn (R, 64, 16, sse4); + assign_itx1_fn (R, 64, 32, sse4); + assign_itx1_fn (, 64, 64, sse4); + } +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2); + +#if BITDEPTH == 8 + assign_itx16_fn( , 4, 4, avx2); + assign_itx16_fn(R, 4, 8, avx2); + assign_itx16_fn(R, 4, 16, avx2); + assign_itx16_fn(R, 8, 4, avx2); + assign_itx16_fn( , 8, 8, avx2); + assign_itx16_fn(R, 8, 16, avx2); + assign_itx2_fn (R, 8, 32, avx2); + assign_itx16_fn(R, 16, 4, avx2); + assign_itx16_fn(R, 16, 8, avx2); + assign_itx12_fn( , 16, 16, avx2); + assign_itx2_fn (R, 16, 32, avx2); + assign_itx1_fn (R, 16, 64, avx2); + assign_itx2_fn (R, 32, 8, avx2); + assign_itx2_fn (R, 32, 16, avx2); + assign_itx2_fn ( , 32, 32, avx2); + assign_itx1_fn (R, 32, 64, avx2); + assign_itx1_fn (R, 64, 16, avx2); + assign_itx1_fn (R, 64, 32, avx2); + assign_itx1_fn ( , 64, 64, avx2); +#else + if (bpc == 10) { + assign_itx16_bpc_fn( , 4, 4, 10, avx2); + assign_itx16_bpc_fn(R, 4, 8, 10, avx2); + assign_itx16_bpc_fn(R, 4, 16, 10, avx2); + assign_itx16_bpc_fn(R, 8, 4, 10, avx2); + assign_itx16_bpc_fn( , 8, 8, 10, avx2); + assign_itx16_bpc_fn(R, 8, 16, 10, avx2); + assign_itx2_bpc_fn (R, 8, 32, 10, avx2); + assign_itx16_bpc_fn(R, 16, 4, 10, avx2); + assign_itx16_bpc_fn(R, 16, 8, 10, avx2); + assign_itx12_bpc_fn( , 16, 16, 10, avx2); + assign_itx2_bpc_fn (R, 16, 32, 10, avx2); + assign_itx1_bpc_fn (R, 16, 64, 10, avx2); + assign_itx2_bpc_fn (R, 32, 8, 10, avx2); + assign_itx2_bpc_fn (R, 32, 16, 10, avx2); + assign_itx2_bpc_fn ( , 32, 32, 10, avx2); + assign_itx1_bpc_fn (R, 32, 64, 10, avx2); + assign_itx1_bpc_fn (R, 64, 16, 10, avx2); + assign_itx1_bpc_fn (R, 64, 32, 10, avx2); + assign_itx1_bpc_fn ( , 64, 64, 10, avx2); + } else { + assign_itx16_bpc_fn( , 4, 4, 12, avx2); + assign_itx16_bpc_fn(R, 4, 8, 12, avx2); + assign_itx16_bpc_fn(R, 4, 16, 12, avx2); + assign_itx16_bpc_fn(R, 8, 4, 12, avx2); + assign_itx16_bpc_fn( , 8, 8, 12, avx2); + assign_itx16_bpc_fn(R, 8, 16, 12, avx2); + assign_itx2_bpc_fn (R, 8, 32, 12, avx2); + assign_itx16_bpc_fn(R, 16, 4, 12, avx2); + assign_itx16_bpc_fn(R, 16, 8, 12, avx2); + assign_itx12_bpc_fn( , 16, 16, 12, avx2); + assign_itx2_bpc_fn (R, 32, 8, 12, avx2); + } +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + assign_itx16_fn( , 4, 4, avx512icl); // no wht + assign_itx16_fn(R, 4, 8, avx512icl); + assign_itx16_fn(R, 4, 16, avx512icl); + assign_itx16_fn(R, 8, 4, avx512icl); + assign_itx16_fn( , 8, 8, avx512icl); + assign_itx16_fn(R, 8, 16, avx512icl); + assign_itx2_fn (R, 8, 32, avx512icl); + assign_itx16_fn(R, 16, 4, avx512icl); + assign_itx16_fn(R, 16, 8, avx512icl); + assign_itx12_fn( , 16, 16, avx512icl); + assign_itx2_fn (R, 16, 32, avx512icl); + assign_itx1_fn (R, 16, 64, avx512icl); + assign_itx2_fn (R, 32, 8, avx512icl); + assign_itx2_fn (R, 32, 16, avx512icl); + assign_itx2_fn ( , 32, 32, avx512icl); + assign_itx1_fn (R, 32, 64, avx512icl); + assign_itx1_fn (R, 64, 16, avx512icl); + assign_itx1_fn (R, 64, 32, avx512icl); + assign_itx1_fn ( , 64, 64, avx512icl); +#else + if (bpc == 10) { + assign_itx16_bpc_fn( , 8, 8, 10, avx512icl); + assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl); + assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl); + assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl); + assign_itx12_bpc_fn( , 16, 16, 10, avx512icl); + assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl); + } +#endif +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm index c580944c7bb..811f711540f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm @@ -30,7 +30,6 @@ %if ARCH_X86_64 SECTION_RODATA 32 -pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482 itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 @@ -39,14 +38,17 @@ iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 -iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 -%macro COEF_PAIR 2 +%macro COEF_PAIR 2-3 0 pd_%1_%2: dd %1, %1, %2, %2 %define pd_%1 (pd_%1_%2 + 4*0) %define pd_%2 (pd_%1_%2 + 4*2) +%if %3 +dd -%2, -%2 +%define pd_%2_m%2 pd_%2 +%endif %endmacro COEF_PAIR 201, 995 @@ -56,8 +58,8 @@ COEF_PAIR 1380, 601 COEF_PAIR 1751, 2440 COEF_PAIR 2598, 1189 COEF_PAIR 2751, 2106 -COEF_PAIR 2896, 1567 -COEF_PAIR 2896, 3784 +COEF_PAIR 2896, 1567, 1 +COEF_PAIR 2896, 3784, 1 COEF_PAIR 3035, 3513 COEF_PAIR 3166, 3920 COEF_PAIR 3703, 3290 @@ -66,9 +68,6 @@ COEF_PAIR 4017, 2276 COEF_PAIR 4076, 3612 COEF_PAIR 4091, 3973 -%define pd_1321 (pd_1321_2482 + 4*0) -%define pd_2482 (pd_1321_2482 + 4*4) - pd_8: dd 8 pd_m601: dd -601 pd_m1189: dd -1189 @@ -77,17 +76,23 @@ pd_m2106: dd -2106 pd_m2598: dd -2598 pd_m2751: dd -2751 pd_m3344: dd -3344 +pd_1024: dd 1024 +pd_1321: dd 1321 +pd_1448: dd 1448 +pd_1697: dd 1697 +pd_2482: dd 2482 +pd_3072: dd 3072 ; 1024 + 2048 pd_3803: dd 3803 +pd_5119: dd 5119 ; 1024 + 4096 - 1 +pd_5120: dd 5120 ; 1024 + 4096 pd_5793: dd 5793 pd_6144: dd 6144 ; 2048 + 4096 -pd_10239: dd 10239 ; 2048 + 8192 - 1 -pd_10240: dd 10240 ; 2048 + 8192 -pd_11586: dd 11586 ; 5793 * 2 -pd_34816: dd 34816 ; 2048 + 32768 -pd_38912: dd 38912 ; 2048 + 4096 + 32768 +pd_17408: dd 17408 ; 1024 + 16384 pixel_10bpc_max: times 2 dw 0x03ff pixel_12bpc_max: times 2 dw 0x0fff +dconly_10bpc: times 2 dw 0x7c00 +dconly_12bpc: times 2 dw 0x7000 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff clip_20b_min: dd -0x80000 @@ -214,7 +219,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 -; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2 +; flags: 1 = packed, 2 = inv_dst2 ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags %if %8 < 32 @@ -241,7 +246,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax pmulld m%1, m%5 pmulld m%2, m%5 %endif -%if %9 & 4 +%if %9 & 2 psubd m%4, m%6, m%4 psubd m%2, m%4, m%2 %else @@ -250,17 +255,10 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax %endif paddd m%2, m%4 %endif -%if %9 & 2 ; invert the upper half of dst1 before rounding - vbroadcasti128 m%4, [pw_2048_m2048] - psubd m%1, m%3 - psignd m%1, m%4 - paddd m%1, m%6 -%else %ifnum %6 paddd m%1, m%6 %endif psubd m%1, m%3 -%endif %ifnum %6 psrad m%2, 12 psrad m%1, 12 @@ -287,37 +285,39 @@ ALIGN function_align %endif %endmacro -%macro INV_TXFM_4X4_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x4 -%ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - movd xm1, [pw_2896x8] - mov [cq], eobd ; 0 - add r6d, 2048 - sar r6d, 12 - movd xm0, r6d - packssdw xm0, xm0 - pmulhrsw xm0, xm1 - vpbroadcastw xm0, xm0 - mova xm1, xm0 - jmp m(iadst_4x4_internal_10bpc).end -%endif -%endmacro - -%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2 - INV_TXFM_FN %1, %2, 0, 4x4, 12 +%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x4, %3 %ifidn %1_%2, dct_dct + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 + or r3d, 4 +.dconly2: add r6d, 128 sar r6d, 8 +.dconly3: imul r6d, 181 - add r6d, 128 - sar r6d, 8 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d - vpbroadcastd m0, xm0 - mova m1, m0 - jmp m(iadst_4x4_internal_12bpc).end + paddsw xm0, xm2 + vpbroadcastw xm0, xm0 +.dconly_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddsw xm1, xm0 + psubusw xm1, xm2 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + WRAP_XMM RET +%else + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly +%endif %endif %endmacro @@ -399,12 +399,50 @@ INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity +%macro IADST4_1D 0 + vpbroadcastd m5, [pd_1321] + vpbroadcastd m7, [pd_2482] + pmulld m4, m0, m5 ; 1321*in0 + pmulld m6, m3, m7 ; 2482*in3 + paddd m4, m6 ; 1321*in0 + 2482*in3 + pmulld m6, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m5 ; pd_3803 + pmulld m5, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [pd_m3344] + pmulld m1, m0 ; -t3 + pmulld m2, m0 ; out2 (unrounded) + psubd m6, m5 ; 2482*in0 - 1321*in2 + paddd m4, m7 ; t0 + psubd m6, m3 ; t1 + paddd m3, m4, m6 + psubd m4, m1 ; out0 (unrounded) + psubd m6, m1 ; out1 (unrounded) + paddd m3, m1 ; out3 (unrounded) +%endmacro + cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call .main + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass1_end: + vpbroadcastd m5, [pd_2048] + mova m2, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 packssdw m0, m1 - vpermd m0, m4, m0 - psrld m4, 4 - pshufb m0, m4 + vpermd m0, m2, m0 + psrld m2, 4 + pshufb m0, m2 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif jmp tx2q .pass2: lea r6, [deint_shuf+128] @@ -436,35 +474,16 @@ cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 RET ALIGN function_align .main: - mova m2, [cq+16*2] - vbroadcasti128 m5, [cq+16*0] + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] +%if WIN64 + movaps [rsp+16], xmm6 + movaps [rsp+32], xmm7 +%endif .main2: - mova m0, [pd_1321_2482] - vpbroadcastd m3, [pd_3803] - vpbroadcastd m1, [pd_m3344] - pmulld m4, m0, m2 - pmulld m3, m2 - pmulld m0, m5 - vpbroadcastd m5, [pd_2048] - psubd xm2, [cq+16*3] - psubd m2, [cq+16*0] - pmulld m2, m1 ; t2 t3 - vpermq m4, m4, q1032 - paddd m4, m3 - psubd m0, m4 - paddd xm4, xm4 - paddd m4, m0 ; t0 t1 - vinserti128 m3, m2, xm4, 1 ; t2 t0 - paddd m0, m4, m5 - psubd xm4, xm2 - psubd m1, m0, m2 - vpermq m2, m2, q3232 ; t3 t3 - psubd m1, m4 - mova m4, [itx4_shuf] - paddd m0, m2 ; out0 out1 - paddd m1, m3 ; out2 out3 - psrad m0, 12 - psrad m1, 12 + WRAP_XMM IADST4_1D ret INV_TXFM_4X4_FN flipadst, dct @@ -474,12 +493,9 @@ INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - packssdw m0, m1 - psrld m1, m4, 8 - vpermd m0, m1, m0 - psrld m4, 4 - pshufb m0, m4 - jmp tx2q + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_10bpc).pass1_end .pass2: lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 @@ -556,19 +572,20 @@ cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 movhps [r6 +strideq*1], xm1 RET -INV_TXFM_4X4_12BPC_FN dct, dct -INV_TXFM_4X4_12BPC_FN dct, identity -INV_TXFM_4X4_12BPC_FN dct, adst -INV_TXFM_4X4_12BPC_FN dct, flipadst +INV_TXFM_4X4_FN dct, dct, 12 +INV_TXFM_4X4_FN dct, identity, 12 +INV_TXFM_4X4_FN dct, adst, 12 +INV_TXFM_4X4_FN dct, flipadst, 12 -cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(idct_4x4_internal_10bpc).main mova m3, [idct4_12_shuf] mova m4, [idct4_12_shuf2] - vpermd m2, m3, m0 - vpermd m1, m4, m1 - jmp m(iadst_4x4_internal_12bpc).pass1_end + vpermd m2, m4, m1 + vpermd m1, m3, m0 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: + vpbroadcastd m5, [pd_2048] vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(idct_4x4_internal_10bpc).main2 @@ -576,33 +593,52 @@ cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 vpermq m1, m1, q2031 jmp m(iadst_4x4_internal_12bpc).end -INV_TXFM_4X4_12BPC_FN adst, dct -INV_TXFM_4X4_12BPC_FN adst, adst -INV_TXFM_4X4_12BPC_FN adst, flipadst -INV_TXFM_4X4_12BPC_FN adst, identity +INV_TXFM_4X4_FN adst, dct, 12 +INV_TXFM_4X4_FN adst, adst, 12 +INV_TXFM_4X4_FN adst, flipadst, 12 +INV_TXFM_4X4_FN adst, identity, 12 -cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - vpermd m2, m4, m0 - vpermd m1, m4, m1 + vinserti128 m1, m4, xm6, 1 + vinserti128 m2, xm3, 1 .pass1_end: - punpcklqdq m0, m2, m1 - punpckhqdq m1, m2, m1 + mova m3, [itx4_shuf] + vpbroadcastd m5, [pd_1024] + psrad m1, 1 + psrad m2, 1 + vpermd m1, m3, m1 + vpermd m2, m3, m2 + paddd m1, m5 + paddd m2, m5 + psrad m1, 11 + psrad m2, 11 .pass1_end2: vpbroadcastd m3, [clip_18b_min] vpbroadcastd m4, [clip_18b_max] + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 pmaxsd m0, m3 pmaxsd m1, m3 pminsd m0, m4 pminsd m1, m4 jmp tx2q .pass2: - mova [cq+16*0], m0 - vextracti128 [cq+16*3], m1, 1 - mova m2, m1 - vpermq m5, m0, q1010 - call m(iadst_4x4_internal_10bpc).main2 + call .main_pass2 + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass2_end: + vpbroadcastd m5, [pd_2048] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 .end: +%if WIN64 + WIN64_RESTORE_XMM_INTERNAL + %assign xmm_regs_used 6 +%endif +.end2: vpbroadcastd m4, [pw_16384] movq xm2, [dstq+strideq*0] movq xm3, [dstq+strideq*1] @@ -627,53 +663,53 @@ cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 movhps [r6 +strideq*0], xm0 movhps [r6 +strideq*1], xm1 RET +.main_pass2: + vextracti128 xm3, m1, 1 + mova xm2, xm1 + vextracti128 xm1, m0, 1 + jmp m(iadst_4x4_internal_10bpc).main2 -INV_TXFM_4X4_12BPC_FN flipadst, dct -INV_TXFM_4X4_12BPC_FN flipadst, adst -INV_TXFM_4X4_12BPC_FN flipadst, flipadst -INV_TXFM_4X4_12BPC_FN flipadst, identity +INV_TXFM_4X4_FN flipadst, dct, 12 +INV_TXFM_4X4_FN flipadst, adst, 12 +INV_TXFM_4X4_FN flipadst, flipadst, 12 +INV_TXFM_4X4_FN flipadst, identity, 12 -cglobal iflipadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 +cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main - psrld m4, 8 - vpermd m2, m4, m0 - vpermd m1, m4, m1 - punpckhqdq m0, m1, m2 - punpcklqdq m1, m2 - jmp m(iadst_4x4_internal_12bpc).pass1_end2 + vinserti128 m1, m3, xm2, 1 + vinserti128 m2, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass1_end .pass2: - mova [cq+16*0], m0 - vextracti128 [cq+16*3], m1, 1 - mova m2, m1 - vpermq m5, m0, q1010 - call m(iadst_4x4_internal_10bpc).main2 - vpermq m2, m0, q1032 - vpermq m0, m1, q1032 - mova m1, m2 - jmp m(iadst_4x4_internal_12bpc).end - -INV_TXFM_4X4_12BPC_FN identity, dct -INV_TXFM_4X4_12BPC_FN identity, adst -INV_TXFM_4X4_12BPC_FN identity, flipadst -INV_TXFM_4X4_12BPC_FN identity, identity - -cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 - vpbroadcastd m1, [pd_5793] - pmulld m0, m1, [cq+32*0] - pmulld m1, [cq+32*1] + call m(iadst_4x4_internal_12bpc).main_pass2 + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass2_end + +INV_TXFM_4X4_FN identity, dct, 12 +INV_TXFM_4X4_FN identity, adst, 12 +INV_TXFM_4X4_FN identity, flipadst, 12 +INV_TXFM_4X4_FN identity, identity, 12 + +cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + mova m2, [itx4_shuf] + vpbroadcastd m3, [pd_1697] + vpermd m0, m2, [cq+32*0] + vpermd m2, m2, [cq+32*1] vpbroadcastd m5, [pd_2048] - mova m3, [itx4_shuf] - paddd m0, m5 + pmulld m1, m3, m0 + pmulld m3, m2 paddd m1, m5 - psrad m0, 12 + paddd m3, m5 psrad m1, 12 - vpermd m2, m3, m0 - vpermd m1, m3, m1 - jmp m(iadst_4x4_internal_12bpc).pass1_end + psrad m3, 12 + paddd m1, m0 + paddd m2, m3 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: ; m0 = in0 in1 ; m1 = in2 in3 vpbroadcastd m3, [pd_5793] + vpbroadcastd m5, [pd_2048] pmulld m0, m3 pmulld m1, m3 paddd m0, m5 ; 2048 @@ -685,34 +721,19 @@ cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2 %macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 8 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 2048 - sar r6d, 12 -.end: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 - movd xm0, r6d - vpbroadcastw xm0, xm0 - vpbroadcastd xm3, [pixel_%3bpc_max] - pxor xm2, xm2 -.end_loop: - movq xm1, [dstq+strideq*0] - movhps xm1, [dstq+strideq*1] - paddw xm1, xm0 - pmaxsw xm1, xm2 - pminsw xm1, xm3 - movq [dstq+strideq*0], xm1 - movhps [dstq+strideq*1], xm1 - lea dstq, [dstq+strideq*2] - sub r3d, 2 - jg .end_loop - WRAP_XMM RET + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2 +%else + jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly +%endif %endif %endmacro @@ -797,12 +818,14 @@ INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 -.pass1: call m(iadst_8x4_internal_10bpc).main - psrad m0, m4, 12 - psrad m1, m5, 12 - psrad m2, 12 - psrad m3, 12 + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass1_end: + REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: call .pass2_main @@ -918,13 +941,13 @@ INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 -.pass1: call m(iadst_8x4_internal_10bpc).main - psrad m0, m3, 12 - psrad m1, m2, 12 - psrad m2, m5, 12 - psrad m3, m4, 12 - jmp tx2q + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m2, m5, m6 + paddd m3, m5, m4 + jmp m(iadst_4x8_internal_10bpc).pass1_end .pass2: call m(iadst_4x8_internal_10bpc).pass2_main mova xm4, [pw_2048_m2048] @@ -1070,7 +1093,16 @@ INV_TXFM_4X8_FN adst, flipadst, 12 INV_TXFM_4X8_FN adst, identity, 12 cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 - jmp m(iadst_4x8_internal_10bpc).pass1 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m4, 1 + psrad m1, m6, 1 + psrad m2, 1 + psrad m3, 1 +.pass1_end: + vpbroadcastd m5, [pd_1024] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 11}, m0, m1, m2, m3 + jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] @@ -1146,7 +1178,12 @@ INV_TXFM_4X8_FN flipadst, flipadst, 12 INV_TXFM_4X8_FN flipadst, identity, 12 cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 - jmp m(iflipadst_4x8_internal_10bpc).pass1 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m3, 1 + psrad m1, m2, 1 + psrad m2, m6, 1 + psrad m3, m4, 1 + jmp m(iadst_4x8_internal_12bpc).pass1_end .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] @@ -1180,12 +1217,13 @@ cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 %macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x16, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd xm2, [dconly_%3bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 6144 - sar r6d, 13 - jmp m(inv_txfm_add_dct_dct_4x8_%3bpc).end + or r3d, 16 + add r6d, 384 + sar r6d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 %endif %endmacro @@ -1196,7 +1234,7 @@ INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 .pass1: - vpbroadcastd m10, [pd_6144] + vpbroadcastd m10, [pd_3072] mova m1, [cq+32*2] mova m3, [cq+32*6] mova m5, [cq+32*3] @@ -1241,7 +1279,7 @@ ALIGN function_align vpbroadcastd m4, [pd_3784] vpbroadcastd m8, [pd_1567] vpbroadcastd m9, [pd_2048] - vpbroadcastd m6, [pd_2896] + vpbroadcastd m6, [pd_1448] ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h ret @@ -1253,7 +1291,7 @@ ALIGN function_align psubd m0, m2 paddd m9, m4, m6 psubd m4, m6 - REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h psubd m2, m0, m1 paddd m1, m0 psubd m6, m4, m5 @@ -1304,7 +1342,6 @@ INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 -.pass1: call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_6144] call m(iadst_16x4_internal_10bpc).main_end @@ -1545,7 +1582,6 @@ INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 -.pass1: vpbroadcastd m7, [pd_5793] pmulld m0, m7, [cq+32*0] pmulld m4, m7, [cq+32*1] @@ -1678,7 +1714,16 @@ INV_TXFM_4X16_FN adst, flipadst, 12 INV_TXFM_4X16_FN adst, identity, 12 cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iadst_4x16_internal_10bpc).pass1 + call .main_pass1 + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + psrad m6, 12 + psrad m7, 12 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -1740,6 +1785,22 @@ ALIGN function_align vperm2i128 m4, m8, m9, 0x20 ; 8 10 vperm2i128 m6, m8, m9, 0x31 ; 12 14 ret +ALIGN function_align +.main_pass1: + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_3072] + paddd m10, m4, m5 + psubd m4, m3 + psubd m5, m3 + paddd m3, m10 + psubd m8, m7, m1 + paddd m7, m9 + psubd m9, m1 + paddd m7, m1 + REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 + REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 + paddd m6, m0 + ret INV_TXFM_4X16_FN flipadst, dct, 12 INV_TXFM_4X16_FN flipadst, adst, 12 @@ -1747,7 +1808,16 @@ INV_TXFM_4X16_FN flipadst, flipadst, 12 INV_TXFM_4X16_FN flipadst, identity, 12 cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iflipadst_4x16_internal_10bpc).pass1 + call m(iadst_4x16_internal_12bpc).main_pass1 + psrad m0, m3, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + psrad m3, m4, 12 + psrad m4, m7, 12 + psrad m5, m6, 12 + psrad m6, m9, 12 + psrad m7, m8, 12 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -1772,17 +1842,49 @@ INV_TXFM_4X16_FN identity, flipadst, 12 INV_TXFM_4X16_FN identity, identity, 12 cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iidentity_4x16_internal_10bpc).pass1 + vpbroadcastd m8, [pd_1697] + mova m0, [cq+32*0] + mova m4, [cq+32*1] + mova m1, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m9, [pd_6144] + pmulld m2, m8, m0 + pmulld m6, m8, m4 + pmulld m3, m8, m1 + pmulld m7, m8, m5 + mova m10, [cq+32*4] + mova m11, [cq+32*5] + mova m12, [cq+32*6] + mova m13, [cq+32*7] + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m0, m2 + pmulld m2, m8, m10 + paddd m4, m6 + pmulld m6, m8, m11 + paddd m1, m3 + pmulld m3, m8, m12 + paddd m5, m7 + pmulld m7, m8, m13 + REPX {psrad x, 1 }, m0, m4, m1, m5 + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m2, m10 + paddd m6, m11 + paddd m3, m12 + paddd m7, m13 + REPX {psrad x, 1 }, m2, m6, m3, m7 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 - vpbroadcastd m8, [pd_11586] - vpbroadcastd m9, [pd_2048] + vpbroadcastd m8, [pd_5793] + vpbroadcastd m9, [pd_1024] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 @@ -1795,37 +1897,21 @@ cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x4, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + vpbroadcastd m2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 mov [cq], eobd ; 0 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 - movd xm0, r6d - vpbroadcastw m0, xm0 -.end: - vpbroadcastd m4, [pixel_%3bpc_max] - pxor m3, m3 - mova xm1, [dstq+strideq*0] - vinserti128 m1, [dstq+strideq*1], 1 - lea r6, [dstq+strideq*2] - mova xm2, [r6 +strideq*0] - vinserti128 m2, [r6 +strideq*1], 1 - paddw m1, m0 - paddw m2, m0 - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 - mova [dstq+strideq*0], xm1 - vextracti128 [dstq+strideq*1], m1, 1 - mova [r6 +strideq*0], xm2 - vextracti128 [r6 +strideq*1], m2, 1 - RET + or r3d, 4 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +%else + jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly +%endif %endif %endmacro @@ -1960,32 +2046,7 @@ ALIGN function_align REPX {paddd x, m4}, m0, m3, m2, m1 REPX {psrad x, 12}, m0, m3, m2, m1 .main2: - vbroadcasti128 m6, [pd_1321] - vbroadcasti128 m7, [pd_2482] - pmulld m4, m0, m6 ; 1321*in0 - pmulld m5, m3, m7 ; 2482*in3 - paddd m4, m5 ; 1321*in0 + 2482*in3 - pmulld m5, m0, m7 ; 2482*in0 - paddd m0, m3 ; in0 + in3 - paddd m7, m6 ; pd_3803 - pmulld m6, m2 ; 1321*in2 - pmulld m3, m7 ; 3803*in3 - pmulld m7, m2 ; 3803*in2 - psubd m2, m0 ; in2 - in0 - in3 - vpbroadcastd m0, [pd_m3344] - psubd m5, m6 ; 2482*in0 - 1321*in2 - vpbroadcastd m6, [pd_2048] - psubd m5, m3 ; t1 - pmulld m2, m0 ; t2 - pmulld m1, m0 ; -t3 - paddd m4, m7 ; t0 - paddd m5, m6 - paddd m3, m4, m5 - paddd m4, m6 - psubd m4, m1 ; out0 (unshifted) - psubd m5, m1 ; out1 (unshifted) - paddd m2, m6 ; out2 (unshifted) - paddd m3, m1 ; out3 (unshifted) + IADST4_1D ret INV_TXFM_8X4_FN flipadst, dct @@ -2103,10 +2164,13 @@ cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call .pass2_main - psrad m0, m4, 12 - psrad m1, m5, 12 - psrad m2, 12 - psrad m3, 12 + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass2_end: + REPX {psrad x, 12}, m0, m1, m2, m3 .end: vpbroadcastd m4, [pw_16384] REPX {psrad x, 3}, m0, m1, m2, m3 @@ -2162,11 +2226,12 @@ cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).pass2_main - psrad m0, m3, 12 - psrad m3, m4, 12 - psrad m1, m2, 12 - psrad m2, m5, 12 - jmp m(iadst_8x4_internal_12bpc).end + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m3, m5, m4 + paddd m2, m5, m6 + jmp m(iadst_8x4_internal_12bpc).pass2_end INV_TXFM_8X4_FN identity, dct, 12 INV_TXFM_8X4_FN identity, adst, 12 @@ -2197,32 +2262,36 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 %macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - mov [cq], eobd ; 0 - mov r3d, 8 + vpbroadcastd m2, [dconly_%3bpc] +%if %3 = 10 .dconly: - add r6d, 6144 - sar r6d, 13 + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm2 vpbroadcastw m0, xm0 - vpbroadcastd m3, [pixel_%3bpc_max] - pxor m2, m2 .dconly_loop: mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 - paddw m1, m0 - pmaxsw m1, m2 - pminsw m1, m3 + paddsw m1, m0 + psubusw m1, m2 mova [dstq+strideq*0], xm1 vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET +%else + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly +%endif %endif %endmacro @@ -2245,7 +2314,7 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a psubd m%10, m%7, m%9 ; t7 paddd m%7, m%9 ; out6 - vpbroadcastd m%9, [pd_2896] + vpbroadcastd m%9, [pd_1448] psubd m%4, m%8, m%6 ; t3 paddd m%8, m%6 ; -out7 psubd m%6, m%1, m%3 ; t2 @@ -2255,10 +2324,10 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 - psubd m%5, m%6, m%4 ; (t2 - t3) * 2896 - paddd m%4, m%6 ; (t2 + t3) * 2896 - psubd m%6, m%3, m%10 ; (t6 - t7) * 2896 - paddd m%3, m%10 ; (t6 + t7) * 2896 + psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 + paddd m%4, m%6 ; (t2 + t3) * 1448 + psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 + paddd m%3, m%10 ; (t6 + t7) * 1448 %endmacro INV_TXFM_8X8_FN dct, dct @@ -2430,8 +2499,8 @@ ALIGN function_align vpbroadcastd m11, [pd_2048] .main2: IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 - psrld m8, 11 ; pd_1 - vpbroadcastd m9, [pd_6144] + psrld m8, 10 ; pd_1 + vpbroadcastd m9, [pd_3072] ret ALIGN function_align .main_end: @@ -2440,14 +2509,14 @@ ALIGN function_align paddd m6, m8 psubd m7, m8, m7 REPX {psrad x, 1 }, m0, m1, m6, m7 - ; (1 + ((x + 2048) >> 12)) >> 1 = (6144 + x) >> 13 - ; (1 - ((x + 2048) >> 12)) >> 1 = (6143 - x) >> 13 - psubd m8, m9, m8 ; pd_6143 + ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12 + ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12 + psubd m8, m9, m8 ; pd_3071 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 - REPX {psrad x, 13}, m2, m3, m4, m5 + REPX {psrad x, 12}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct @@ -2496,10 +2565,10 @@ ALIGN function_align paddd m5, m9, m2 psubd m2, m8, m3 paddd m3, m9, m4 - psrad m4, m2, 13 - psrad m2, m10, 13 - psrad m3, 13 - psrad m5, 13 + psrad m4, m2, 12 + psrad m2, m10, 12 + psrad m3, 12 + psrad m5, 12 ret INV_TXFM_8X8_FN identity, dct @@ -2681,13 +2750,13 @@ ALIGN function_align paddd m6, m9 psubd m7, m9, m7 REPX {psrad x, 4}, m0, m1, m6, m7 - vpbroadcastd m9, [pd_34816] - psubd m8, m9, m8 ; 34815 + vpbroadcastd m9, [pd_17408] + psubd m8, m9, m8 ; 17407 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 - REPX {psrad x, 16}, m2, m3, m4, m5 + REPX {psrad x, 15}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct, 12 @@ -2729,13 +2798,14 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 8x16, %4 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_%4bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 %endif %endmacro @@ -2904,7 +2974,7 @@ ALIGN function_align vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 - ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4 + ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2 psubd m3, m1, m4 ; t10 paddd m1, m4 ; t9 psubd m4, m0, m2 ; t11a @@ -3269,7 +3339,7 @@ cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_end: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 jmp m(idct_8x16_internal_12bpc).end ALIGN function_align .pass2_main: @@ -3302,9 +3372,9 @@ ALIGN function_align pmaxsd m7, m13, [cq+32* 3] ; 3 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_16x8_internal_10bpc).main_part2 - vpbroadcastd m14, [pd_34816] + vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_34815 + psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret @@ -3357,49 +3427,52 @@ ALIGN function_align m8, m9, m10, m11, m12, m13, m14 pminsd m15, [cq] mova [cq], m7 - vpbroadcastd m7, [pd_11586] + vpbroadcastd m7, [pd_5793] REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 pmulld m7, [cq] mova [cq], m15 - vpbroadcastd m15, [pd_2048] + vpbroadcastd m15, [pd_1024] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [cq] - REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 ret %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x4, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 - mov [cq], eobd ; 0 - mov r3d, 4 + vpbroadcastd m3, [dconly_%3bpc] +%if %3 = 10 .dconly: - add r6d, 6144 - sar r6d, 13 + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm3 vpbroadcastw m0, xm0 - vpbroadcastd m4, [pixel_%3bpc_max] - pxor m3, m3 .dconly_loop: - paddw m1, m0, [dstq+strideq*0] - paddw m2, m0, [dstq+strideq*1] - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 + paddsw m1, m0, [dstq+strideq*0] + paddsw m2, m0, [dstq+strideq*1] + psubusw m1, m3 + psubusw m2, m3 mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET +%else + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly +%endif %endif %endmacro @@ -3480,13 +3553,30 @@ ALIGN function_align .pass1_main2: ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 - psubd m4, m10, m5 ; t9 -t10 + vbroadcasti128 m12, [pd_3784_m3784] + psubd m4, m10, m5 paddd m10, m5 ; t8 t11 - psubd m5, m11, m6 ; t14 -t13 + psignd m4, m12 ; t9 t10 + psubd m5, m11, m6 paddd m11, m6 ; t15 t12 - REPX {pmaxsd x, m8}, m4, m5, m10, m11 - REPX {pminsd x, m9}, m4, m5, m10, m11 - ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2 + psignd m5, m12 ; t14 t13 + vpbroadcastd m6, [pd_1567] + vpbroadcastd m13, [pd_3784] + REPX {pmaxsd x, m8}, m5, m4 + REPX {pminsd x, m9}, m5, m4 + pmulld m12, m5 + pmulld m5, m6 + vbroadcasti128 m6, [pd_1567_m1567] + pmulld m13, m4 + pmulld m4, m6 + REPX {pmaxsd x, m8}, m10, m11, m0, m1 + REPX {pminsd x, m9}, m10, m11, m0, m1 + paddd m12, m7 + paddd m5, m7 + paddd m4, m12 + psubd m5, m13 + psrad m4, 12 ; t14a t10a + psrad m5, 12 ; t9a t13a vpbroadcastd m12, [pd_2896] punpckhqdq m6, m11, m5 punpcklqdq m11, m4 @@ -3500,8 +3590,8 @@ ALIGN function_align REPX {pminsd x, m9}, m5, m6 pmulld m5, m12 pmulld m6, m12 - REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10 - REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10 + REPX {pmaxsd x, m8}, m2, m3, m11, m10 + REPX {pminsd x, m9}, m2, m3, m11, m10 ret ALIGN function_align .pass1_main3: @@ -3565,10 +3655,10 @@ cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(idct_16x4_internal_10bpc).end ALIGN function_align .main: - vbroadcasti128 m6, [pd_1321] + vpbroadcastd m6, [pd_1321] mova m0, [cq+32*0] mova m1, [cq+32*1] - vbroadcasti128 m7, [pd_2482] + vpbroadcastd m7, [pd_2482] mova m2, [cq+32*6] mova m3, [cq+32*7] pmulld m4, m0, m6 @@ -3663,8 +3753,7 @@ INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 -.pass1: - vpbroadcastd m8, [pd_11586] + vpbroadcastd m8, [pd_5793] vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m1, [cq+32*1], q3120 ; 2 3 vpermq m2, [cq+32*2], q3120 ; 4 5 @@ -3673,10 +3762,10 @@ cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpermq m5, [cq+32*5], q3120 ; a b vpermq m6, [cq+32*6], q3120 ; c d vpermq m7, [cq+32*7], q3120 ; e f - vpbroadcastd m9, [pd_6144] + vpbroadcastd m9, [pd_3072] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed @@ -3729,17 +3818,15 @@ cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 pmulld m2, m6, m11 pmulld m4, m6, m12 pmulld m6, m13 - vpbroadcastd m10, [pd_2048] + vpbroadcastd m10, [pd_17408] call m(idct_4x16_internal_10bpc).pass1_main2 - REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 - vpbroadcastd m4, [pw_16384] vpbroadcastd m5, [pixel_12bpc_max] REPX {vpermq x, x, q3120}, m0, m1, m2, m3 - REPX {pmulhrsw x, m4}, m0, m1, m2, m3 jmp m(idct_16x4_internal_10bpc).end2 INV_TXFM_16X4_FN adst, dct, 12 @@ -3824,7 +3911,37 @@ INV_TXFM_16X4_FN identity, flipadst, 12 INV_TXFM_16X4_FN identity, identity, 12 cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 - jmp m(iidentity_16x4_internal_10bpc).pass1 + vpbroadcastd m8, [pd_1697] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpbroadcastd m9, [pd_3072] + pmulld m4, m8, m0 + pmulld m5, m8, m1 + pmulld m6, m8, m2 + pmulld m7, m8, m3 + vpermq m10, [cq+32*4], q3120 ; 8 9 + vpermq m11, [cq+32*5], q3120 ; a b + vpermq m12, [cq+32*6], q3120 ; c d + vpermq m13, [cq+32*7], q3120 ; e f + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m0, m4 + pmulld m4, m8, m10 + paddd m1, m5 + pmulld m5, m8, m11 + paddd m2, m6 + pmulld m6, m8, m12 + paddd m3, m7 + pmulld m7, m8, m13 + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m4, m10 + paddd m5, m11 + paddd m6, m12 + paddd m7, m13 + jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -3844,13 +3961,14 @@ cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x8, %3 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%3bpc] mov [cq], eobd ; 0 - mov r3d, 8 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_16x4_%3bpc).dconly + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 %endif %endmacro @@ -4013,13 +4131,13 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 .pass1: lea r6, [rsp+32*4] call .main - vpbroadcastd m14, [pd_6144] + vpbroadcastd m14, [pd_3072] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_6143 + psubd m13, m14, m15 ; pd_3071 call .pass1_rotations .pass1_end: REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 jmp tx2q .pass2: call m(idct_16x8_internal_10bpc).transpose @@ -4127,8 +4245,6 @@ ALIGN function_align pmaxsd m10, m13 pminsd m9, m14 pminsd m10, m14 - pmulld m9, m15 - pmulld m10, m15 mova [r6-32*4], m1 mova m11, [r6-32*1] ; t7a mova m1, [r6-32*2] ; t6a @@ -4140,7 +4256,6 @@ ALIGN function_align pmaxsd m2, m13 pminsd m8, m14 pminsd m2, m14 - pmulld m8, m15 mova [r6-32*1], m11 mova [r6-32*3], m2 mova m1, [r6+32*3] ; t15 @@ -4153,8 +4268,6 @@ ALIGN function_align pmaxsd m11, m13 pminsd m7, m14 pminsd m11, m14 - pmulld m7, m15 - pmulld m11, m15 mova [r6-32*2], m12 pminsd m1, m14, [r6+32*0] ; t10a pminsd m12, m14, [r6+32*1] ; t11a @@ -4162,13 +4275,13 @@ ALIGN function_align paddd m1, m4 ; -out1 psubd m4, m5, m12 ; t11 paddd m5, m12 ; out14 - pmulld m12, m15, [r6-32*3] ; t6 + vpbroadcastd m12, [pd_1448] pmaxsd m6, m13 pmaxsd m4, m13 pminsd m6, m14 pminsd m4, m14 - pmulld m6, m15 - pmulld m4, m15 + REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 + pmulld m12, [r6-32*3] ; t6 mova [r6-32*3], m5 paddd m5, m11, m7 ; -out5 (unshifted) psubd m11, m7 ; out10 (unshifted) @@ -4233,7 +4346,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 .pass1: lea r6, [rsp+32*4] call m(iadst_16x8_internal_10bpc).main - vpbroadcastd m14, [pd_6144] + vpbroadcastd m14, [pd_3072] psrld m15, 11 psubd m13, m14, m15 call .pass1_rotations @@ -4313,16 +4426,16 @@ cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 mova [rsp], m15 - vpbroadcastd m15, [pd_11586] + vpbroadcastd m15, [pd_5793] REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 pmulld m15, [rsp] mova [rsp], m7 - vpbroadcastd m7, [pd_6144] + vpbroadcastd m7, [pd_3072] REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [rsp] - REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: @@ -4340,6 +4453,10 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_20b_max] jmp m(idct_16x8_internal_10bpc).pass1 .pass2: + call .pass2_main + RET +ALIGN function_align +.pass2_main: call m(idct_8x16_internal_12bpc).transpose vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -4383,8 +4500,7 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpermq m1, m5, q3120 vpermq m2, m6, q3120 vpermq m3, m7, q3120 - call m(idct_16x8_internal_10bpc).write_16x4_zero - RET + jmp m(idct_16x8_internal_10bpc).write_16x4_zero ALIGN function_align .write_16x4_start: vpbroadcastd m9, [pixel_12bpc_max] @@ -4403,7 +4519,8 @@ cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iadst_16x8_internal_10bpc).pass1 .pass2: call .pass2_main - jmp m(idct_16x8_internal_12bpc).end + call m(idct_16x8_internal_12bpc).end + RET ALIGN function_align .pass2_main: call m(idct_8x16_internal_12bpc).transpose @@ -4483,12 +4600,13 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 %macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 16x16, %4 %ifidn %1_%2, dct_dct - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%4bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 %endif %endmacro @@ -4756,17 +4874,17 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 add cq, 32 call .main sub cq, 32 - vpbroadcastd m8, [pd_10240] + vpbroadcastd m8, [pd_5120] paddd m4, m8 paddd m6, m8 paddd m9, m8 paddd m11, m8 - vpbroadcastd m8, [pd_10239] + vpbroadcastd m8, [pd_5119] psubd m5, m8, m5 psubd m7, m8, m7 psubd m10, m8, m10 psubd m12, m8, m12 - REPX {psrad x, 14}, m4, m5, m6, m7, m9, m10, m11, m12 + REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 @@ -4797,8 +4915,8 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 .fast: add r6, 32*8 call .main - vpbroadcastd m14, [pd_10240] - vpbroadcastd m13, [pd_10239] + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] psrld m15, 10 ; pd_2 paddd m0, m15 psubd m1, m15, m1 @@ -4818,7 +4936,7 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 psubd m15, [r6-32*4] .pass1_end: REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 sub r6, 32*8 jmp tx2q .pass2: @@ -4892,17 +5010,17 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx add cq, 32 call m(iadst_16x16_internal_10bpc).main sub cq, 32 - vpbroadcastd m8, [pd_10240] + vpbroadcastd m8, [pd_5120] paddd m11, m8 paddd m9, m8 paddd m6, m8 paddd m4, m8 - vpbroadcastd m8, [pd_10239] + vpbroadcastd m8, [pd_5119] psubd m12, m8, m12 psubd m10, m8, m10 psubd m7, m8, m7 psubd m5, m8, m5 - REPX {psrad x, 14}, m12, m11, m10, m9, m7, m6, m5, m4 + REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 mova [r6+32*0], m12 mova [r6+32*1], m11 mova [r6+32*2], m10 @@ -4933,8 +5051,8 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx .fast: add r6, 32*8 call m(iadst_16x16_internal_10bpc).main - vpbroadcastd m14, [pd_10240] - vpbroadcastd m13, [pd_10239] + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] psrld m15, 10 ; pd_2 psubd m8, m13, m7 paddd m7, m14, m9 @@ -4996,9 +5114,8 @@ INV_TXFM_16X16_FN identity, dct, -92 INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 -.pass1: - vpbroadcastd m15, [pd_11586] - vpbroadcastd m7, [pd_10240] + vpbroadcastd m15, [pd_5793] + vpbroadcastd m7, [pd_5120] lea r6, [rsp+32*4] sub eobd, 36 jl .fast @@ -5010,7 +5127,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx pmulld m3, m15, [cq+r3+32*39] add r6, 32*4 REPX {paddd x, m7}, m0, m1, m2, m3 - REPX {psrad x, 14}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 @@ -5038,7 +5155,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [cq] - REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: @@ -5203,7 +5320,7 @@ cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_part3: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 .end: packssdw m15, m14 packssdw m14, m13, m12 @@ -5320,15 +5437,15 @@ ALIGN function_align REPX {pminsd x, m14}, m1, m3, m4, m6 .pass2_fast2: call m(iadst_16x8_internal_10bpc).main_part2 - vpbroadcastd m14, [pd_34816] + vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 - psubd m13, m14, m15 ; pd_34815 + psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret ALIGN function_align .pass2_part2: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 - REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 @@ -5375,8 +5492,73 @@ cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx INV_TXFM_16X16_FN identity, dct, -92, 12 INV_TXFM_16X16_FN identity, identity, 0, 12 +%macro IDTX16_12BPC 1 ; src + pmulld m6, m7, m%1 + paddd m6, m15 + psrad m6, 12 + paddd m6, m%1 + psrad m%1, m6, 1 +%endmacro + cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 - jmp m(iidentity_16x16_internal_10bpc).pass1 + vpbroadcastd m7, [pd_1697] + vpbroadcastd m15, [pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + mova m10, [cq+r3+32*33] + mova m11, [cq+r3+32*35] + mova m12, [cq+r3+32*37] + mova m13, [cq+r3+32*39] + add r6, 32*4 + pmulld m0, m7, m10 + pmulld m1, m7, m11 + pmulld m2, m7, m12 + pmulld m3, m7, m13 + REPX {paddd x, m15}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + paddd m0, m10 + paddd m1, m11 + paddd m2, m12 + paddd m3, m13 + REPX {psrad x, 1 }, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m8, [cq+64* 6] + mova m9, [cq+64* 7] + REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9 + mova [cq+64*0], m8 + mova [cq+64*1], m9 + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14 + mova m6, [cq+64*15] + pmulld m7, m6 + paddd m7, m15 + psrad m7, 12 + paddd m7, m6 + mova m6, [cq+64*0] + psrad m15, m7, 1 + mova m7, [cq+64*1] + jmp tx2q .pass2: call m(iidentity_8x16_internal_12bpc).pass2_main call m(idct_16x16_internal_10bpc).transpose_fast @@ -5429,7 +5611,7 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx call m(idct_16x16_internal_12bpc).write_16x16 RET -%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift +%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack mova m%4, [r6+32*(%1-4)] mova m%2, [r5+32*(3-%1)] mova m%5, [r4+32*(%1-4)] @@ -5446,8 +5628,10 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx paddd m%2, m%3, m%5 ; out15 - n psubd m%3, m%5 ; out16 + n REPX {psrad x, %6}, m%1, m%3, m%2, m%4 +%if %7 & 1 packssdw m%1, m%3 ; out0 + n, out16 + n packssdw m%2, m%4 ; out15 - n, out31 - n +%endif %endmacro cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob @@ -5574,14 +5758,15 @@ cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob call m(idct_8x8_internal_10bpc).write_8x4 RET .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 ALIGN function_align -.pass1_main: +.pass1_main_part1: mova m0, [cq+128*0] mova m1, [cq+128*1] mova m2, [cq+128*2] @@ -5590,7 +5775,6 @@ ALIGN function_align mova m5, [cq+128*5] mova m6, [cq+128*6] mova m7, [cq+128*7] - add cq, 32 call m(idct_8x8_internal_10bpc).main psrld m1, m11, 10 ; pd_2 REPX {paddd x, m1}, m0, m6, m5, m3 @@ -5603,6 +5787,11 @@ ALIGN function_align psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.pass1_main: + call .pass1_main_part1 + add cq, 32 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 @@ -5665,7 +5854,7 @@ ALIGN function_align vpbroadcastd m15, [pd_4017] vpbroadcastd m10, [pd_799] ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a - ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a psubd m3, m0, m6 ; t19a paddd m0, m6 ; t16a psubd m6, m7, m1 ; t28a @@ -5734,7 +5923,7 @@ ALIGN function_align vpbroadcastd m15, [pd_2276] vpbroadcastd m10, [pd_3406] ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a - ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a psubd m3, m0, m6 ; t27a paddd m0, m6 ; t24a psubd m6, m7, m1 ; t20a @@ -5747,8 +5936,8 @@ ALIGN function_align REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] - ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a - ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20 mova m9, [r6-32*4] ; t16a mova m10, [r6-32*3] ; t17 psubd m2, m9, m7 ; t23 @@ -5881,8 +6070,9 @@ ALIGN function_align ret cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob - vpbroadcastd m5, [pw_5] vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_5] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -5947,30 +6137,262 @@ ALIGN function_align vextracti128 [dstq+r4 ], m3, 1 ret +cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + mov r4, cq + lea r6, [rsp+32*4] + call .pass1_main + cmp eobd, 43 + jge .eob43 + jmp .pass2_fast +.eob43: + call .pass1_main + cmp eobd, 107 + jge .eob107 +.pass2_fast: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(idct_8x16_internal_10bpc).main_oddhalf_fast + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + jmp .pass2_end +.eob107: + call .pass1_main + cmp eobd, 171 + jge .eob171 + jmp .pass2 +.eob171: + call .pass1_main +.pass2: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + pmaxsd m4, m12, [cq+128*1+64] + pmaxsd m5, m12, [cq+128*7+64] + pmaxsd m6, m12, [cq+128*1+96] + pmaxsd m7, m12, [cq+128*7+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + pmaxsd m4, m12, [cq+128*3+64] + pmaxsd m5, m12, [cq+128*5+64] + pmaxsd m6, m12, [cq+128*3+96] + pmaxsd m7, m12, [cq+128*5+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + pmaxsd m4, m12, [cq+128*2+64] + pmaxsd m5, m12, [cq+128*6+64] + pmaxsd m6, m12, [cq+128*2+96] + pmaxsd m7, m12, [cq+128*6+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + pmaxsd m4, m12, [cq+128*0+64] + pmaxsd m5, m12, [cq+128*4+64] + pmaxsd m6, m12, [cq+128*0+96] + pmaxsd m7, m12, [cq+128*4+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf +.pass2_end: + psrld m11, 8 ; pd_8 + IDCT32_END 0, 15, 8, 9, 10, 4 + IDCT32_END 1, 14, 8, 9, 10, 4 + punpckhqdq m8, m0, m1 ; 16 17 (interleaved) + punpcklqdq m0, m1 ; 0 1 (interleaved) + punpcklqdq m1, m14, m15 ; 14 15 (interleaved) + punpckhqdq m14, m15 ; 30 31 (interleaved) + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 4 + IDCT32_END 3, 14, 8, 9, 10, 4 + punpckhqdq m8, m2, m3 ; 18 19 (interleaved) + punpcklqdq m2, m3 ; 2 3 (interleaved) + punpcklqdq m3, m14, m15 ; 12 13 (interleaved) + punpckhqdq m14, m15 ; 28 29 (interleaved) + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 4 + IDCT32_END 5, 14, 8, 9, 10, 4 + punpckhqdq m8, m4, m5 ; 20 21 (interleaved) + punpcklqdq m4, m5 ; 4 5 (interleaved) + punpcklqdq m5, m14, m15 ; 10 11 (interleaved) + punpckhqdq m14, m15 ; 26 27 (interleaved) + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 4 + IDCT32_END 7, 14, 8, 9, 10, 4 + punpckhqdq m8, m6, m7 ; 22 23 (interleaved) + punpcklqdq m6, m7 ; 6 7 (interleaved) + punpcklqdq m7, m14, m15 ; 8 9 (interleaved) + punpckhqdq m14, m15 ; 24 25 (interleaved) + mova [r5-32*3], m8 + mova [r5-32*4], m14 + mova m15, m1 +.end: + vpermq m0, m0, q3120 + vpermq m1, m2, q3120 + call m(idct_8x8_internal_12bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m6, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m7, q3120 + vpermq m1, m5, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m3, q3120 + vpermq m1, m15, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*3], q3120 + vpermq m1, [r5+32*1], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*1], q3120 + vpermq m1, [r5-32*3], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*4], q3120 + vpermq m1, [r5-32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*0], q3120 + vpermq m1, [r5+32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +ALIGN function_align +.pass1_main: + call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1 + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 32 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2, 0 + mova [cq+32*16], m8 + mova [cq+32*31], m9 + IDCT32_END 1, 14, 8, 9, 10, 2, 0 + mova [cq+32*17], m8 + mova [cq+32*30], m9 + mova [cq+32*14], m14 + IDCT32_END 2, 14, 8, 9, 10, 2, 0 + mova [cq+32*18], m8 + mova [cq+32*29], m9 + mova [cq+32*13], m14 + IDCT32_END 3, 14, 8, 9, 10, 2, 0 + mova [cq+32*19], m8 + mova [cq+32*28], m9 + mova [cq+32*12], m14 + IDCT32_END 4, 14, 8, 9, 10, 2, 0 + mova [cq+32*20], m8 + mova [cq+32*27], m9 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + IDCT32_END 5, 10, 0, 1, 2, 2, 0 + mova [cq+32*21], m0 + mova [cq+32*26], m1 + IDCT32_END 6, 9, 0, 1, 2, 2, 0 + mova [cq+32*22], m0 + mova [cq+32*25], m1 + IDCT32_END 7, 8, 0, 1, 2, 2, 0 + mova [cq+32*23], m0 + mova [cq+32*24], m1 + mova m0, [cq+32* 0] + mova m1, [cq+32* 1] + mova m2, [cq+32* 2] + mova m11, m14 + mova m12, [cq+32*12] + mova m13, [cq+32*13] + mova m14, [cq+32*14] + ret + +cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1 + cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 8 + or r3d, 8 .dconly: - add r6d, 10240 - sar r6d, 14 + add r6d, 640 + sar r6d, 10 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d + paddsw xm0, xm3 vpbroadcastw m0, xm0 - vpbroadcastd m4, [pixel_10bpc_max] - pxor m3, m3 .dconly_loop: - paddw m1, m0, [dstq+32*0] - paddw m2, m0, [dstq+32*1] - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + psubusw m1, m3 + psubusw m2, m3 mova [dstq+32*0], m1 mova [dstq+32*1], m2 add dstq, strideq @@ -5979,6 +6401,39 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob RET .full: PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + call .pass1 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end + lea r6, [deint_shuf+128] + vpbroadcastd m11, [pw_2048] + mov r4, dstq + call .pass2 + mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + lea dstq, [r4+32] + call .pass2 + RET +ALIGN function_align +.pass2: + call m(idct_16x8_internal_8bpc).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.pass1: mova m0, [cq+32* 1] mova m1, [cq+32* 7] mova m2, [cq+32* 9] @@ -5988,10 +6443,7 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob mova m6, [cq+32*25] mova m7, [cq+32*31] vpbroadcastd m11, [pd_2048] - vpbroadcastd m12, [clip_18b_min] - vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] - lea r6, [rsp+32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+32* 3] mova m1, [cq+32* 5] @@ -6021,37 +6473,12 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob mova m7, [cq+32*28] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf - call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end - lea r6, [deint_shuf+128] - vpbroadcastd m11, [pw_2048] - mov r4, dstq - call .pass2 - mova m0, [r5+32*3] ; 16 17 - mova m1, [r5+32*2] ; 30 31 - mova m2, [r5+32*1] ; 18 19 - mova m3, [r5+32*0] ; 28 29 - mova m4, [r5-32*1] ; 20 21 - mova m5, [r5-32*2] ; 26 27 - mova m6, [r5-32*3] ; 22 23 - mova m7, [r5-32*4] ; 24 25 - call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose - lea dstq, [r4+32] - call .pass2 - RET -ALIGN function_align -.pass2: - call m(idct_16x8_internal_8bpc).main - REPX {pmulhrsw x, m11}, m0, m1, m2, m3 - call m(idct_16x8_internal_10bpc).write_16x4_start - pmulhrsw m0, m11, m4 - pmulhrsw m1, m11, m5 - pmulhrsw m2, m11, m6 - pmulhrsw m3, m11, m7 - jmp m(idct_16x8_internal_10bpc).write_16x4_zero + ret cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob - vpbroadcastd m5, [pw_4096] vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_4096] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -6078,6 +6505,47 @@ cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob jge .loop RET +cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 8 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 + call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end + mov r4, dstq + call m(idct_16x8_internal_12bpc).pass2_main + mova m0, [cq+32* 0] ; 16 + mova m1, [cq+32* 1] ; 17 + mova m2, [cq+32* 2] ; 18 + mova m3, [cq+32* 3] ; 19 + mova m4, [cq+32* 4] ; 20 + mova m5, [cq+32* 5] ; 21 + mova m6, [cq+32* 6] ; 22 + mova m7, [cq+32* 7] ; 23 + mova m8, [cq+32* 8] ; 24 + mova m9, [cq+32* 9] ; 25 + mova m10, [cq+32*10] ; 26 + mova m11, [cq+32*11] ; 27 + mova m12, [cq+32*12] ; 28 + mova m13, [cq+32*13] ; 29 + mova m14, [cq+32*14] ; 30 + mova m15, [cq+32*15] ; 31 + lea dstq, [r4+32] + call m(idct_16x8_internal_12bpc).pass2_main + RET + +cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 + %macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] mova m%4, [%2] paddsw m%3, m%1, m%4 @@ -6121,13 +6589,14 @@ cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 jmp .fast .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 .eob44: mova [r4+16*0], xm0 mova [r4+16*1], xm3 @@ -6472,14 +6941,15 @@ cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] jmp .end .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 16 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .full: add cq, 32 @@ -6742,9 +7212,10 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .fast: lea r4, [rsp+32*71] @@ -7019,12 +7490,13 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 64 - add r6d, 10240 - sar r6d, 14 - jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 .fast: lea r4, [rsp+32*38] pxor m0, m0 @@ -7246,7 +7718,7 @@ ALIGN function_align REPX {pmaxsd x, m12}, m8, m1, m6, m2 REPX {pminsd x, m13}, m8, m1, m6, m2 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a - ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a REPX {pmaxsd x, m12}, m0, m3, m7, m4 REPX {pminsd x, m13}, m0, m3, m7, m4 vpbroadcastd m10, [r5+4*10] @@ -7301,7 +7773,7 @@ ALIGN function_align REPX {pmaxsd x, m12}, m8, m1, m3, m4 REPX {pminsd x, m13}, m8, m1, m3, m4 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a - ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a REPX {pmaxsd x, m12}, m0, m2, m5, m7 REPX {pminsd x, m13}, m0, m5, m2, m7 psubd m6, m2, m7 ; t48a @@ -7358,14 +7830,15 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 64 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 64 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .fast: lea r4, [rsp+32*70] @@ -7540,30 +8013,26 @@ ALIGN function_align cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .normal - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 .dconly: - add r6d, 10240 - sar r6d, 14 + add r6d, 640 + sar r6d, 10 .dconly2: - imul r6d, 2896 - add r6d, 34816 - sar r6d, 16 + vpbroadcastd m5, [dconly_10bpc] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 movd xm0, r6d -%if WIN64 - movaps [rsp+8], xmm6 -%endif + paddsw xm0, xm5 vpbroadcastw m0, xm0 - vpbroadcastd m6, [pixel_10bpc_max] - pxor m5, m5 .dconly_loop: - paddw m1, m0, [dstq+32*0] - paddw m2, m0, [dstq+32*1] - paddw m3, m0, [dstq+32*2] - paddw m4, m0, [dstq+32*3] - REPX {pmaxsw x, m5}, m1, m2, m3, m4 - REPX {pminsw x, m6}, m1, m2, m3, m4 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + paddsw m3, m0, [dstq+32*2] + paddsw m4, m0, [dstq+32*3] + REPX {psubusw x, m5}, m1, m2, m3, m4 mova [dstq+32*0], m1 mova [dstq+32*1], m2 mova [dstq+32*2], m3 @@ -7571,9 +8040,6 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob add dstq, strideq dec r3d jg .dconly_loop -%if WIN64 - movaps xmm6, [rsp+8] -%endif RET .normal: PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob @@ -7814,14 +8280,14 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 32 - add r6d, 2048 - sar r6d, 12 - imul r6d, 2896 - add r6d, 6144 - sar r6d, 13 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 .fast: pxor m0, m0 @@ -7963,9 +8429,9 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp .pass2 .dconly: - imul r6d, [cq], 2896 + imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly .fast: pxor m0, m0 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm new file mode 100644 index 00000000000..b05fde54dc8 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm @@ -0,0 +1,2599 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23 + db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31 + db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55 + db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63 +idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 + db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 + db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 + db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 +idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51 + db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59 + db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17 + db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25 +iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23 + db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31 + db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19 + db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27 +permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13 + db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29 + db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15 + db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31 +permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2 + db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6 + db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7 + db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3 +permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6 + db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14 + db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7 + db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15 +idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 + db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 + db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 + db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 +idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25 + db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57 + db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29 + db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61 +idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30 + db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62 + db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31 + db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63 + +pw_2048_m2048: times 16 dw 2048 +pw_m2048_2048: times 16 dw -2048 +pw_2048: times 16 dw 2048 + +; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++- +%macro COEF_PAIR 2-3 0 ; a, b, flags +%if %3 == 1 +pd_%1_m%2: dd %1, %1, -%2, -%2 +%define pd_%1 (pd_%1_m%2 + 4*0) +%define pd_m%2 (pd_%1_m%2 + 4*2) +%elif %3 == 2 +pd_m%1_%2: dd -%1, -%1, %2, %2 +%define pd_m%1 (pd_m%1_%2 + 4*0) +%define pd_%2 (pd_m%1_%2 + 4*2) +%else +pd_%1_%2: dd %1, %1, %2, %2 +%define pd_%1 (pd_%1_%2 + 4*0) +%define pd_%2 (pd_%1_%2 + 4*2) +%if %3 == 3 +%define pd_%2_m%2 pd_%2 +dd -%2, -%2 +%endif +%endif +%endmacro + +COEF_PAIR 201, 995 +COEF_PAIR 401, 1189, 1 +COEF_PAIR 401, 1931 +COEF_PAIR 401, 3920 +COEF_PAIR 799, 2276, 1 +COEF_PAIR 799, 3406 +COEF_PAIR 799, 4017 +COEF_PAIR 1380, 601 +COEF_PAIR 1751, 2440 +COEF_PAIR 2598, 1189 +COEF_PAIR 2598, 1931, 2 +COEF_PAIR 2598, 3612 +COEF_PAIR 2751, 2106 +COEF_PAIR 2896, 1567, 3 +COEF_PAIR 2896, 3784, 3 +COEF_PAIR 3035, 3513 +COEF_PAIR 3166, 1931 +COEF_PAIR 3166, 3612 +COEF_PAIR 3166, 3920 +COEF_PAIR 3703, 3290 +COEF_PAIR 3857, 4052 +COEF_PAIR 4017, 2276 +COEF_PAIR 4017, 3406 +COEF_PAIR 4076, 1189 +COEF_PAIR 4076, 3612 +COEF_PAIR 4076, 3920 +COEF_PAIR 4091, 3973 + +pw_5: times 2 dw 5 +pw_4096 times 2 dw 4096 +pw_1697x16: times 2 dw 1697*16 +pw_2896x8: times 2 dw 2896*8 +pixel_10bpc_max: times 2 dw 0x03ff +dconly_10bpc: times 2 dw 0x7c00 +clip_18b_min: dd -0x20000 +clip_18b_max: dd 0x1ffff +pd_1: dd 1 +pd_2: dd 2 +pd_1448: dd 1448 +pd_2048: dd 2048 +pd_3071: dd 3071 ; 1024 + 2048 - 1 +pd_3072: dd 3072 ; 1024 + 2048 +pd_5119: dd 5119 ; 1024 + 4096 - 1 +pd_5120: dd 5120 ; 1024 + 4096 +pd_5793: dd 5793 + +cextern int8_permA +cextern idct_8x8_internal_8bpc_avx512icl.main +cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2 +cextern idct_8x16_internal_8bpc_avx512icl.main +cextern idct_8x16_internal_8bpc_avx512icl.main2 +cextern idct_8x16_internal_8bpc_avx512icl.main_fast +cextern idct_8x16_internal_8bpc_avx512icl.main_fast2 +cextern iadst_8x16_internal_8bpc_avx512icl.main2 +cextern idct_16x8_internal_8bpc_avx512icl.main +cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2 +cextern idct_16x16_internal_8bpc_avx512icl.main +cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2 +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end +cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main + +SECTION .text + +%define o_base (pw_2048+4*128) +%define o_base_8bpc (int8_permA+64*18) +%define o(x) (r5 - o_base + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +INIT_ZMM avx512icl + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 1 = inv_dst1, 2 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %8 < 4096 + vpbroadcastd m%3, [o(pd_%8)] +%else + vbroadcasti32x4 m%3, [o(pd_%8)] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %7 < 4096 + vpbroadcastd m%5, [o(pd_%7)] +%else + vbroadcasti32x4 m%5, [o(pd_%7)] +%endif + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 2 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%ifnum %6 + paddd m%1, m%6 +%endif +%if %9 & 1 + psubd m%1, m%3, m%1 +%else + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size +cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_10bpc) + lea r5, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%4_internal_10bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 8x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 +.dconly: + add r6d, 384 + sar r6d, 9 +.dconly2: + vpbroadcastd ym2, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw ym1, r6d + paddsw ym1, ym2 +.dconly_loop: + mova xm0, [dstq+strideq*0] + vinserti32x4 ym0, [dstq+strideq*1], 1 + paddsw ym0, ym1 + psubusw ym0, ym2 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call .load + vpermi2q m1, m0, m2 ; 1 5 + vpermi2q m3, m6, m4 ; 7 3 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call .main + call .main_end + mova m4, [o(idct8x8p)] + packssdw m0, m2 ; 0 1 4 5 + packssdw m1, m3 ; 3 2 7 6 + vpermb m0, m4, m0 + vprolq m1, 32 + vpermb m2, m4, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + vextracti32x8 ym2, m0, 1 + vextracti32x8 ym3, m1, 1 + call m(idct_8x8_internal_8bpc).main + mova m10, [permC] + vpbroadcastd m12, [pw_2048] +.end: + vpermt2q m0, m10, m1 + vpermt2q m2, m10, m3 +.end2: + vpbroadcastd m11, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m10, m10 + pmulhrsw m8, m12, m0 + call .write_8x4_start + pmulhrsw m8, m12, m2 +.write_8x4: + lea dstq, [dstq+strideq*4] + add cq, 64*2 +.write_8x4_start: + mova xm9, [dstq+strideq*0] + vinserti32x4 ym9, [dstq+strideq*1], 1 + vinserti32x4 m9, [dstq+strideq*2], 2 + vinserti32x4 m9, [dstq+r6 ], 3 + mova [cq+64*0], m10 + mova [cq+64*1], m10 + paddw m9, m8 + pmaxsw m9, m10 + pminsw m9, m11 + mova [dstq+strideq*0], xm9 + vextracti32x4 [dstq+strideq*1], ym9, 1 + vextracti32x4 [dstq+strideq*2], m9, 2 + vextracti32x4 [dstq+r6 ], m9, 3 + ret +ALIGN function_align +.load: + mova m0, [cq+64*0] ; 0 1 + mova m4, [cq+64*1] ; 2 3 + mova m1, [o(permB)] + mova m2, [cq+64*2] ; 4 5 + mova m6, [cq+64*3] ; 6 7 + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psrlq m5, m1, 32 + vpbroadcastd m12, [o(pd_2896)] + mova m3, m1 + vpbroadcastd m11, [o(pd_1)] + ret +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vbroadcasti32x4 m8, [o(pd_799_4017)] + pmulld m8, m1 ; t4 t7 + vpmulld m0, [o(pd_2896)] {1to16} ; dct4 out0 out1 + REPX {paddd x, m13}, m8, m0 + REPX {psrad x, 12 }, m8, m0 + pmulld m3, m8, m12 + mova m2, m0 ; dct4 out3 out2 + jmp .main3 +.main_fast: ; bottom half is zero + vbroadcasti32x4 m3, [o(pd_4017_3406)] + vbroadcasti32x4 m8, [o(pd_799_m2276)] + vbroadcasti32x4 m2, [o(pd_2896_3784)] + vbroadcasti32x4 m9, [o(pd_2896_1567)] + pmulld m3, m1 ; t4a t5a + pmulld m1, m8 ; t7a t6a + pmulld m2, m0 ; t0 t3 + pmulld m0, m9 ; t1 t2 + jmp .main2 +.main: + ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276 + ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784 +.main2: + REPX {paddd x, m13}, m1, m3, m0, m2 + REPX {psrad x, 12 }, m1, m3, m0, m2 + punpcklqdq m8, m1, m3 ; t4a t7a + punpckhqdq m1, m3 ; t5a t6a + psubd m3, m8, m1 ; t5a t6a + paddd m8, m1 ; t4 t7 + pmaxsd m3, m14 + punpckhqdq m1, m2, m0 ; t3 t2 + pminsd m3, m15 + punpcklqdq m2, m0 ; t0 t1 + pmulld m3, m12 + paddd m0, m2, m1 ; dct4 out0 out1 + psubd m2, m1 ; dct4 out3 out2 + REPX {pmaxsd x, m14}, m8, m0, m2 + REPX {pminsd x, m15}, m8, m0, m2 +.main3: + pshufd m1, m3, q1032 + paddd m3, m13 + psubd m9, m3, m1 + paddd m3, m1 + psrad m9, 12 + psrad m3, 12 + punpckhqdq m1, m8, m3 ; t7 t6 + shufpd m8, m9, 0xaa ; t4 t5 + ret +.main_end: + paddd m0, m11 + paddd m2, m11 + psubd m3, m0, m1 ; out7 out6 + paddd m0, m1 ; out0 out1 + paddd m1, m2, m8 ; out3 out2 + psubd m2, m8 ; out4 out5 + REPX {vpsravd x, m11}, m0, m2, m3, m1 + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity +INV_TXFM_8X8_FN adst, adst + +cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x8_internal_10bpc).load + vpermi2q m1, m6, m2 ; 7 5 + vpermi2q m3, m4, m0 ; 3 1 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call .main + punpckldq m1, m2, m4 ; out4 out6 + punpckhdq m2, m0 ; -out5 -out7 + punpckldq m0, m3 ; out0 out2 + punpckhdq m4, m3 ; -out1 -out3 + paddd m1, m11 + psubd m3, m11, m2 + paddd m0, m11 + psubd m4, m11, m4 +.pass1_end: + REPX {psrad x, 1}, m1, m0, m3, m4 + packssdw m0, m1 ; 0 2 4 6 + packssdw m4, m3 ; 1 3 5 7 + psrlq m1, [o(permB)], 8 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + psrlq m2, m1, 32 + vpermi2q m1, m0, m3 + vpermt2q m0, m2, m3 + jmp tx2q +.pass2: + call .main_pass2 + movu m10, [permC+2] + vbroadcasti32x8 m12, [pw_2048_m2048+16] + jmp m(idct_8x8_internal_10bpc).end +.main_pass2: + vextracti32x8 ym2, m0, 1 + vextracti32x8 ym3, m1, 1 + lea r5, [o_base_8bpc] + pshufd ym4, ym0, q1032 + pshufd ym5, ym1, q1032 + jmp m(iadst_8x8_internal_8bpc).main_pass2 +ALIGN function_align +.main: + ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m14}, m4, m2, m0, m1 + REPX {pminsd x, m15}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + shufpd m4, m2, 0xaa ; t4 t7 + shufpd m2, m5, 0xaa ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784 + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m14}, m1, m2 + REPX {pminsd x, m15}, m1, m2 + shufpd m3, m1, m2, 0xaa + shufpd m1, m2, 0x55 + pmulld m3, m12 + pmulld m1, m12 + paddd m3, m13 + psubd m2, m3, m1 + paddd m3, m1 + psrad m2, 12 ; out4 -out5 + pshufd m3, m3, q1032 + psrad m3, 12 ; out2 -out3 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, identity +INV_TXFM_8X8_FN flipadst, flipadst + +cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x8_internal_10bpc).load + vpermi2q m1, m6, m2 ; 7 5 + vpermi2q m3, m4, m0 ; 3 1 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call m(iadst_8x8_internal_10bpc).main + punpckhdq m1, m3, m4 ; -out3 -out1 + punpckldq m3, m0 ; out2 out0 + punpckhdq m0, m2 ; -out7 -out5 + punpckldq m4, m2 ; out6 out4 + psubd m1, m11, m1 + paddd m3, m11 + psubd m0, m11, m0 + paddd m4, m11 + jmp m(iadst_8x8_internal_10bpc).pass1_end +.pass2: + call m(iadst_8x8_internal_10bpc).main_pass2 + movu m10, [permC+1] + vbroadcasti32x8 m12, [pw_m2048_2048+16] + lea r6, [strideq*3] + vpermt2q m0, m10, m1 ; 7 6 5 4 + vpbroadcastd m11, [pixel_10bpc_max] + vpermt2q m2, m10, m3 ; 3 2 1 0 + pxor m10, m10 + pmulhrsw m8, m12, m2 + call m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m0 + jmp m(idct_8x8_internal_10bpc).write_8x4 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + mova m1, [cq+64*0] + packssdw m1, [cq+64*2] ; 0 4 1 5 + mova m2, [cq+64*1] ; 2 6 3 7 + packssdw m2, [cq+64*3] + mova m0, [o(idtx8x8p)] + vpermb m1, m0, m1 + vpermb m2, m0, m2 + punpckldq m0, m1, m2 ; 0 1 4 5 + punpckhdq m1, m2 ; 2 3 6 7 + jmp tx2q +.pass2: + movu m3, [o(permC+2)] + vpbroadcastd m12, [o(pw_4096)] + psrlq m2, m3, 32 + vpermi2q m2, m0, m1 + vpermt2q m0, m3, m1 + jmp m(idct_8x8_internal_10bpc).end2 + +%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 8x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, 35 +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, adst + +cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call .load + call .main + call .main_end +.pass1_end: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + jmp tx2q +.pass2: + mova m8, [o(idct8x16p)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3 + punpckhdq m5, m0, m1 + punpckldq m0, m1 + punpckhdq m4, m2, m3 + punpckldq m2, m3 + punpcklqdq m8, m0, m2 ; 15 1 + punpckhqdq m0, m2 ; 7 9 + punpckhqdq m1, m5, m4 ; 3 13 + punpcklqdq m5, m4 ; 11 5 + lea r5, [o_base_8bpc] + vextracti32x8 ym7, m8, 1 ; 14 2 + vextracti32x8 ym3, m0, 1 ; 6 10 + vextracti32x8 ym6, m1, 1 ; 12 4 + vextracti32x8 ym9, m5, 1 ; 8 0 + call m(idct_8x16_internal_8bpc).main2 + mova m8, [permC] + vpbroadcastd m12, [pw_2048] + vpermt2q m0, m8, m1 + lea r6, [strideq*3] + vpermt2q m2, m8, m3 + vpbroadcastd m11, [pixel_10bpc_max] + vpermt2q m4, m8, m5 + pxor m10, m10 + vpermt2q m6, m8, m7 + pmulhrsw m8, m12, m0 + call m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m2 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m4 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m6 + jmp m(idct_8x8_internal_10bpc).write_8x4 +.fast: + mova ym0, [cq+64*0] + mova ym4, [cq+64*2] + mova ym1, [cq+64*1] + mova ym5, [cq+64*5] + mova ym2, [cq+64*4] + mova ym6, [cq+64*6] + mova ym3, [cq+64*7] + mova ym7, [cq+64*3] + call .round_input_fast + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_10bpc).main_end + movu m6, [o(permC+3)] + packssdw m3, m1, m3 + packssdw m1, m0, m2 + vprolq m3, 32 + vpermd m1, m6, m1 + vpermd m3, m6, m3 + mova ym0, ym1 ; 0 4 + vextracti32x8 ym1, m1, 1 ; 1 5 + mova ym2, ym3 ; 2 6 + vextracti32x8 ym3, m3, 1 ; 3 7 + jmp tx2q +ALIGN function_align +.round_input_fast: + movshdup m8, [o(permB)] + vpbroadcastd m12, [o(pd_2896)] + vpermt2q m0, m8, m4 + vpermt2q m1, m8, m5 + vpermt2q m2, m8, m6 + vpermt2q m3, m8, m7 + vpbroadcastd m13, [o(pd_2048)] + REPX {pmulld x, m12}, m0, m1, m2, m3 + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + REPX {paddd x, m13}, m0, m1, m2, m3 + vpbroadcastd m11, [o(pd_1)] + REPX {psrad x, 12 }, m0, m1, m2, m3 + ret +ALIGN function_align +.load: + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] +.load2: + vpbroadcastd m12, [o(pd_2896)] + pmulld m0, m12, [cq+64*0] + pmulld m1, m12, [cq+64*1] + pmulld m2, m12, [cq+64*2] + pmulld m3, m12, [cq+64*3] + vpbroadcastd m13, [o(pd_2048)] + pmulld m4, m12, [cq+64*4] + pmulld m5, m12, [cq+64*5] + pmulld m6, m12, [cq+64*6] + pmulld m7, m12, [cq+64*7] + REPX {paddd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.main: + ITX_MULSUB_2D 5, 3, 8, 9, 10, 13, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 13, 799, 4017 ; t4a t7a + pmulld m0, m12 + pmulld m4, m12 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + psubd m5, m7, m3 ; t6a + paddd m7, m3 ; t7 + pmaxsd m5, m14 + pmaxsd m1, m14 + pminsd m5, m15 + pminsd m1, m15 + pmulld m5, m12 + pmulld m1, m12 + ITX_MULSUB_2D 2, 6, 3, 9, 10, 13, 1567, 3784 ; t2 t3 + pmaxsd m8, m14 + pmaxsd m7, m14 + paddd m0, m13 + pminsd m8, m15 + psubd m3, m0, m4 + paddd m5, m13 + paddd m0, m4 + psubd m4, m5, m1 + paddd m5, m1 + REPX {psrad x, 12 }, m3, m5, m0, m4 + paddd m1, m3, m2 ; dct4 out1 + psubd m2, m3, m2 ; dct4 out2 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + pminsd m6, m15, m7 + REPX {pmaxsd x, m14}, m0, m1, m2, m3 + REPX {pminsd x, m15}, m0, m1, m2, m3 + ret +.main_end: + vpbroadcastd m11, [o(pd_1)] +.main_end2: + REPX {paddd x, m11}, m0, m1, m2, m3 + psubd m7, m0, m6 ; out7 + paddd m0, m6 ; out0 + psubd m6, m1, m5 ; out6 + paddd m1, m5 ; out1 + psubd m5, m2, m4 ; out5 + paddd m2, m4 ; out2 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, identity, 35 +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, adst + +cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call m(idct_8x16_internal_10bpc).load + call .main + psrad m0, 1 + psrad m1, 1 + psrad m6, m10, 1 + psrad m7, m11, 1 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + jmp m(idct_8x16_internal_10bpc).pass1_end +.fast: + call .fast_main + punpcklqdq m1, m2, m4 ; out4 out6 + punpckhqdq m2, m0 ; -out5 -out7 + punpcklqdq m0, m3 ; out0 out2 + punpckhqdq m4, m3 ; -out1 -out3 + paddd m1, m11 + psubd m3, m11, m2 + paddd m0, m11 + psubd m4, m11, m4 +.fast_end: + movu m5, [o(permC+3)] + REPX {psrad x, 1}, m1, m0, m3, m4 + packssdw m2, m0, m1 ; 0 2 4 6 + packssdw m3, m4, m3 ; 1 3 5 7 + vpermd m2, m5, m2 + vpermd m3, m5, m3 + mova ym0, ym2 + vextracti32x8 ym2, m2, 1 + mova ym1, ym3 + vextracti32x8 ym3, m3, 1 + jmp tx2q +.pass2: + call .pass2_main + movu m4, [permB+2] + vbroadcasti32x8 m12, [pw_2048_m2048+16] + psrlq m7, m4, 8 + vpermi2q m4, m0, m3 ; 0 1 2 3 + psrlq m5, m7, 24 + vpermi2q m7, m0, m3 ; 12 13 14 15 + psrlq m6, m5, 8 + vpermq m5, m5, m1 ; 4 5 6 7 + vpermq m6, m6, m2 ; 8 9 10 11 +.pass2_end: + vpbroadcastd m11, [pixel_10bpc_max] + pxor m10, m10 + lea r6, [strideq*3] + pmulhrsw m8, m12, m4 + call m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m5 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m6 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m7 + jmp m(idct_8x8_internal_10bpc).write_8x4 +ALIGN function_align +.main: + ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a + psubd m8, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + psubd m4, m5, m1 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7 + REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7 + vpbroadcastd m10, [o(pd_1567)] + vpbroadcastd m11, [o(pd_3784)] + ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a + ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a + vpbroadcastd m12, [o(pd_1448)] + psubd m9, m6, m8 ; t7 + paddd m6, m8 ; out6 + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m2 ; t2 + paddd m0, m2 ; out0 + psubd m2, m1, m4 ; t6 + paddd m1, m4 ; -out1 + REPX {pmaxsd x, m14}, m5, m3, m2, m9 + REPX {pminsd x, m15}, m5, m3, m2, m9 + REPX {pmulld x, m12}, m5, m3, m2, m9 + vpbroadcastd m4, [o(pd_1)] + psubd m8, m5, m3 ; (t2 - t3) * 1448 + paddd m3, m5 ; (t2 + t3) * 1448 + psubd m5, m2, m9 ; (t6 - t7) * 1448 + paddd m2, m9 ; (t6 + t7) * 1448 + vpbroadcastd m9, [o(pd_3072)] + paddd m0, m4 + psubd m1, m4, m1 + paddd m10, m6, m4 + psubd m11, m4, m7 + paddd m2, m9 + paddd m8, m9 + vpbroadcastd m9, [o(pd_3071)] + psubd m3, m9, m3 + psubd m9, m5 + ret +ALIGN function_align +.fast_main: + mova ym0, [cq+64*0] + mova ym4, [cq+64*2] + mova ym1, [cq+64*7] + mova ym5, [cq+64*5] + mova ym2, [cq+64*4] + mova ym6, [cq+64*6] + mova ym3, [cq+64*3] + mova ym7, [cq+64*1] + call m(idct_8x16_internal_10bpc).round_input_fast + jmp m(iadst_8x8_internal_10bpc).main +ALIGN function_align +.pass2_main: + mova m8, [o(iadst8x16p)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3 + vpbroadcastd m10, [o(pw_2896x8)] + punpckhdq m5, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m2, m3 + punpckldq m2, m3 + lea r5, [o_base_8bpc] + punpckhqdq m4, m0, m2 ; 12 3 14 1 + punpcklqdq m0, m2 ; 0 15 2 13 + punpckhqdq m6, m5, m1 ; 8 7 10 5 + punpcklqdq m5, m1 ; 4 11 6 9 + call m(iadst_8x16_internal_8bpc).main2 + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m10 ; -out7 out4 out6 -out5 + pmulhrsw m2, m10 ; out8 -out11 -out9 out10 + ret + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, identity, 35 +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst + +cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call m(idct_8x16_internal_10bpc).load + call m(iadst_8x16_internal_10bpc).main + psrad m7, m0, 1 + psrad m0, m11, 1 + psrad m6, m1, 1 + psrad m1, m10, 1 + psrad m5, m2, 12 + psrad m2, m9, 12 + psrad m4, m3, 12 + psrad m3, m8, 12 + jmp m(idct_8x16_internal_10bpc).pass1_end +.fast: + call m(iadst_8x16_internal_10bpc).fast_main + punpckhqdq m1, m3, m4 ; -out3 -out1 + punpcklqdq m3, m0 ; out2 out0 + punpckhqdq m0, m2 ; -out7 -out5 + punpcklqdq m4, m2 ; out6 out4 + psubd m1, m11, m1 + paddd m3, m11 + psubd m0, m11, m0 + paddd m4, m11 + jmp m(iadst_8x16_internal_10bpc).fast_end +.pass2: + call m(iadst_8x16_internal_10bpc).pass2_main + movu m7, [permB+2] + vbroadcasti32x8 m12, [pw_m2048_2048+16] + psrlq m4, m7, 8 + vpermi2q m7, m3, m0 ; 3 2 1 0 + psrlq m5, m4, 24 + vpermi2q m4, m3, m0 ; 15 14 13 12 + psrlq m6, m5, 8 + vpermq m5, m5, m2 ; 11 10 9 8 + vpermq m6, m6, m1 ; 7 6 5 4 + jmp m(iadst_8x16_internal_10bpc).pass2_end + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x16_internal_10bpc).load2 + jmp m(idct_8x16_internal_10bpc).pass1_end +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + pmulhrsw m4, m8, m0 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m7, [o(pw_2048)] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + vpbroadcastd m6, [o(pixel_10bpc_max)] + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m1 + punpckhdq m4, m1 + pxor m5, m5 + punpckhqdq m1, m0, m2 ; 1 5 9 13 + punpcklqdq m0, m2 ; 0 4 8 12 + punpcklqdq m2, m3, m4 ; 2 6 10 14 + punpckhqdq m3, m4 ; 3 7 11 15 + lea r6, [strideq*3] + pmulhrsw m0, m7 + call .write_8x4_start + pmulhrsw m0, m7, m1 + call .write_8x4 + pmulhrsw m0, m7, m2 + call .write_8x4 + pmulhrsw m0, m7, m3 +.write_8x4: + add dstq, strideq + add cq, 64*2 +.write_8x4_start: + mova xm4, [dstq+strideq*0] + vinserti32x4 ym4, [dstq+strideq*4], 1 + vinserti32x4 m4, [dstq+strideq*8], 2 + vinserti32x4 m4, [dstq+r6*4 ], 3 + mova [cq+64*0], m5 + mova [cq+64*1], m5 + paddw m4, m0 + pmaxsw m4, m5 + pminsw m4, m6 + mova [dstq+strideq*0], xm4 + vextracti32x4 [dstq+strideq*4], ym4, 1 + vextracti32x4 [dstq+strideq*8], m4, 2 + vextracti32x4 [dstq+r6*4 ], m4, 3 + ret + +%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 16x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 +.dconly: + vpbroadcastd m2, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw m1, r6d + paddsw m1, m2 +.dconly_loop: + mova ym0, [dstq+strideq*0] + vinserti32x8 m0, [dstq+strideq*1], 1 + paddsw m0, m1 + psubusw m0, m2 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, identity, -21 +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, adst + +cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + pmulld m4, m12, [cq+64*0] ; 0 1 + pmulld m9, m12, [cq+64*1] ; 2 3 + pmulld m8, m12, [cq+64*2] ; 4 5 + pmulld m7, m12, [cq+64*3] ; 6 7 + vpbroadcastd m13, [o(pd_2048)] + pxor m2, m2 + mova m15, [o(permB)] + REPX {mova [cq+64*x], m2}, 0, 1, 2, 3 + psrlq m0, m15, 32 + REPX {paddd x, m13}, m4, m9, m8, m7 + vpbroadcastd m14, [o(clip_18b_min)] + REPX {psrad x, 12 }, m4, m8, m9, m7 + mova m1, m0 + vpermi2q m0, m4, m8 ; 0 4 + cmp eobd, 43 + jl .fast + pmulld m5, m12, [cq+64*4] ; 8 9 + pmulld m10, m12, [cq+64*5] ; 10 11 + pmulld m11, m12, [cq+64*6] ; 12 13 + pmulld m6, m12, [cq+64*7] ; 14 15 + REPX {mova [cq+64*x], m2}, 4, 5, 6, 7 + REPX {paddd x, m13}, m5, m10, m11, m6 + REPX {psrad x, 12 }, m10, m5, m11, m6 + mova m2, m1 + vpermi2q m1, m9, m10 ; 2 10 + mova m3, m2 + vpermi2q m2, m5, m11 ; 8 12 + vpermi2q m3, m6, m7 ; 14 6 + vpermt2q m4, m15, m11 ; 1 13 + vpermt2q m6, m15, m9 ; 15 3 + vpermt2q m5, m15, m8 ; 9 5 + vpermt2q m7, m15, m10 ; 7 11 + vpbroadcastd m15, [o(clip_18b_max)] + call m(idct_8x8_internal_10bpc).main + call .main + jmp .pass1_end +.fast: + vpermi2q m1, m9, m7 ; 2 6 + vpermt2q m4, m15, m9 ; 1 3 + vpermt2q m7, m15, m8 ; 7 5 + vpbroadcastd m15, [o(clip_18b_max)] + call m(idct_8x8_internal_10bpc).main_fast + call .main_fast +.pass1_end: + call m(idct_8x16_internal_10bpc).main_end + mova m8, [o(permA)] + psrlq m9, m8, 8 +.pass1_end2: + mova m10, m9 + mova m11, m8 + call .transpose_16x8 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(idct_16x8_internal_8bpc).main + movshdup m4, [permC] + vpbroadcastd m13, [pw_2048] + psrlq m5, m4, 8 + vpermq m0, m4, m0 + vpermq m1, m5, m1 + vpermq m2, m4, m2 + vpermq m3, m5, m3 +.end: + vpbroadcastd m15, [pixel_10bpc_max] + pxor m14, m14 + pmulhrsw m8, m13, m0 + pmulhrsw m9, m13, m1 + lea r6, [strideq*3] + call .write_16x4 + pmulhrsw m8, m13, m2 + pmulhrsw m9, m13, m3 +.write_16x4: + mova ym10, [dstq+strideq*0] + vinserti32x8 m10, [dstq+strideq*1], 1 + paddw m8, m10 + mova ym10, [dstq+strideq*2] + vinserti32x8 m10, [dstq+r6 ], 1 + paddw m9, m10 + pmaxsw m8, m14 + pmaxsw m9, m14 + pminsw m8, m15 + pminsw m9, m15 + mova [dstq+strideq*0], ym8 + vextracti32x8 [dstq+strideq*1], m8, 1 + mova [dstq+strideq*2], ym9 + vextracti32x8 [dstq+r6 ], m9, 1 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vbroadcasti32x4 m6, [o(pd_4076_3920)] + vbroadcasti32x4 m3, [o(pd_401_m1189)] + pmulld m6, m4 ; t15 t12 + pmulld m4, m3 ; t9 t10 + REPX {paddd x, m13}, m6, m4 + REPX {psrad x, 12 }, m6, m4 + mova m5, m6 ; t14 t13 + mova m9, m4 ; t8 t11 + jmp .main3 +.main_fast: ; bottom half is zero + vbroadcasti32x4 m6, [o(pd_4076_3920)] + vbroadcasti32x4 m3, [o(pd_401_m1189)] + vbroadcasti32x4 m5, [o(pd_m2598_1931)] + vbroadcasti32x4 m9, [o(pd_3166_3612)] + pmulld m6, m4 ; t15a t12a + pmulld m4, m3 ; t8a t11a + pmulld m5, m7 ; t9a t10a + pmulld m7, m9 ; t14a t13a + jmp .main2 +.main: + ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189 + ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612 +.main2: + REPX {paddd x, m13}, m4, m6, m5, m7 + REPX {psrad x, 12 }, m4, m5, m6, m7 + paddd m9, m4, m5 ; t8 t11 + psubd m4, m5 ; t9 t10 + psubd m5, m6, m7 ; t14 t13 + paddd m6, m7 ; t15 t12 + REPX {pmaxsd x, m14}, m5, m4, m9, m6 + REPX {pminsd x, m15}, m5, m4, m9, m6 +.main3: + psubd m3, m0, m1 ; dct8 out7 out6 + paddd m0, m1 ; dct8 out0 out1 + vbroadcasti32x4 m7, [o(pd_3784_m3784)] + pmulld m7, m5 + vpmulld m5, [o(pd_1567)] {1to16} + paddd m1, m2, m8 ; dct8 out3 out2 + psubd m2, m8 ; dct8 out4 out5 + vbroadcasti32x4 m8, [o(pd_1567_m1567)] + pmulld m8, m4 + vpmulld m4, [o(pd_3784)] {1to16} + REPX {pmaxsd x, m14}, m0, m1 + REPX {pminsd x, m15}, m0, m1 + paddd m7, m13 + paddd m5, m13 + paddd m7, m8 + psubd m5, m4 + psrad m7, 12 ; t14a t10a + psrad m5, 12 ; t9a t13a + punpckhqdq m4, m9, m7 + punpcklqdq m8, m9, m5 + punpckhqdq m5, m6, m5 + punpcklqdq m6, m7 + psubd m7, m8, m4 ; t11a t10 + paddd m8, m4 ; t8a t9 + psubd m4, m6, m5 ; t12a t13 + paddd m6, m5 ; t15a t14 + REPX {pmaxsd x, m14}, m4, m7 + REPX {pminsd x, m15}, m4, m7 + pmulld m4, m12 + pmulld m7, m12 + REPX {pmaxsd x, m14}, m2, m3, m6, m8 + REPX {pminsd x, m15}, m2, m3, m6, m8 + paddd m4, m13 + paddd m5, m4, m7 + psubd m4, m7 + psrad m4, 12 ; t11 t10a + psrad m5, 12 ; t12 t13a + ret +ALIGN function_align +.transpose_16x8: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpermi2d m8, m0, m2 + vpermt2d m0, m9, m2 + vpermi2d m10, m1, m3 + vpermi2d m11, m1, m3 + punpckhwd m3, m8, m0 + punpcklwd m1, m8, m0 + punpckhwd m4, m10, m11 + punpcklwd m2, m10, m11 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, identity, -21 +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, adst + +cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + call .main_pass1 + vpbroadcastd m9, [o(pd_1)] + paddd m0, m9 + psubd m1, m9, m1 + paddd m2, m9 + psubd m3, m9, m3 + paddd m4, m9, m5 + psubd m5, m9, m6 + paddd m6, m9, m7 + psubd m7, m9, m8 +.pass1_end: + mova m9, [o(permA)] + psrlq m8, m9, 8 + REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp m(idct_16x8_internal_10bpc).pass1_end2 +.pass2: + call .main_pass2 + vpermq m8, m13, m0 + vpermq m9, m13, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m13, m2 + vpermq m9, m13, m3 + jmp m(idct_16x8_internal_10bpc).write_16x4 +ALIGN function_align +.main_pass1: + vpbroadcastd m12, [o(pd_2896)] + pmulld m2, m12, [cq+64*0] + pmulld m7, m12, [cq+64*1] + pmulld m1, m12, [cq+64*2] + pmulld m5, m12, [cq+64*3] + vpbroadcastd m13, [o(pd_2048)] + pxor m4, m4 + mova m10, [o(permB)] + REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 + REPX {paddd x, m13}, m2, m7, m1, m5 + psrlq m6, m10, 32 + REPX {psrad x, 12 }, m2, m7, m1, m5 + mova m0, m6 + vpermi2q m0, m2, m7 ; 0 2 + vpermt2q m7, m10, m2 ; 3 1 + mova m2, m6 + vpermi2q m2, m1, m5 ; 4 6 + vpermt2q m5, m10, m1 ; 7 5 + cmp eobd, 43 + jl .main_fast + pmulld m8, m12, [cq+64*4] + pmulld m3, m12, [cq+64*5] + pmulld m9, m12, [cq+64*6] + pmulld m1, m12, [cq+64*7] + REPX {mova [cq+64*x], m4}, 4, 5, 6, 7 + REPX {paddd x, m13}, m8, m3, m9, m1 + REPX {psrad x, 12 }, m8, m3, m9, m1 + mova m4, m6 + vpermi2q m4, m8, m3 ; 8 10 + vpermt2q m3, m10, m8 ; 11 9 + vpermi2q m6, m9, m1 ; 12 14 + vpermt2q m1, m10, m9 ; 15 13 +.main: + ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106 + ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601 + jmp .main2 +.main_fast: + vbroadcasti32x4 m1, [o(pd_4091_3973)] + vbroadcasti32x4 m8, [o(pd_201_995)] + vbroadcasti32x4 m3, [o(pd_3703_3290)] + vbroadcasti32x4 m9, [o(pd_1751_2440)] + vbroadcasti32x4 m4, [o(pd_2751_2106)] + vbroadcasti32x4 m10, [o(pd_3035_3513)] + vbroadcasti32x4 m6, [o(pd_1380_601)] + vbroadcasti32x4 m11, [o(pd_3857_4052)] + pmulld m1, m0 + pmulld m0, m8 + pmulld m3, m2 + pmulld m2, m9 + pmulld m4, m5 + pmulld m5, m10 + pmulld m6, m7 + pmulld m7, m11 +.main2: + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + REPX {psubd x, m13, x}, m1, m3 + REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7 + psubd m8, m0, m4 ; t8a t10a + paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + REPX {pmaxsd x, m14}, m8, m4, m5, m6 + REPX {pminsd x, m15}, m8, m4, m5, m6 + vbroadcasti32x4 m11, [o(pd_4017_2276)] + vbroadcasti32x4 m10, [o(pd_799_3406)] + ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11 + ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10 + REPX {pmaxsd x, m14}, m0, m2, m1, m3 + REPX {pminsd x, m15}, m0, m2, m1, m3 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + REPX {pmaxsd x, m14}, m7, m3, m2, m6 + REPX {pminsd x, m15}, m7, m3, m2, m6 + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + vpbroadcastd m11, [o(pd_1567)] + vpbroadcastd m10, [o(pd_3784)] + ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10 + REPX {pmaxsd x, m14}, m0, m4, m1, m8 + REPX {pminsd x, m15}, m0, m4, m1, m8 + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmaxsd x, m14}, m6, m5, m3, m4 + mov r6d, 0x3333 + REPX {pminsd x, m15}, m6, m5, m3, m4 + kmovw k1, r6d + REPX {pmulld x, m12}, m6, m5, m3, m4 + pxor m9, m9 + REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8 + paddd m6, m13 + paddd m4, m13 + paddd m2, m6, m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 12}, m2, m3, m5, m6 + REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6 + ret +ALIGN function_align +.main_pass2: + lea r5, [o_base_8bpc] + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_16x8_internal_8bpc).main_pass2 + movshdup m13, [permC] + pmulhrsw m0, m6 + pmulhrsw m1, m6 + vpbroadcastd m15, [pixel_10bpc_max] + pxor m14, m14 + lea r6, [strideq*3] + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, identity, -21 +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst + +cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_10bpc).main_pass1 + vpbroadcastd m9, [o(pd_1)] + psubd m4, m9, m3 + paddd m3, m9, m5 + paddd m5, m9, m2 + psubd m2, m9, m6 + psubd m6, m9, m1 + paddd m1, m9, m7 + paddd m7, m9, m0 + psubd m0, m9, m8 + jmp m(iadst_16x8_internal_10bpc).pass1_end +.pass2: + call m(iadst_16x8_internal_10bpc).main_pass2 + psrlq m13, 8 + vpermq m8, m13, m3 + vpermq m9, m13, m2 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m13, m1 + vpermq m9, m13, m0 + jmp m(idct_16x8_internal_10bpc).write_16x4 + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x16_internal_10bpc).load2 + vpbroadcastd m8, [o(pd_5793)] + vpbroadcastd m9, [o(pd_3072)] + pxor m10, m10 + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m10}, 0, 1, 2, 3 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m10}, 4, 5, 6, 7 + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + psrlq m8, [o(permA)], 16 + psrlq m9, m8, 8 + mova m10, m8 + mova m11, m9 + call m(idct_16x8_internal_10bpc).transpose_16x8 + jmp tx2q +.pass2: + movshdup m4, [o(permC)] + vpbroadcastd m13, [o(pw_4096)] + REPX {vpermq x, m4, x}, m0, m1, m2, m3 + jmp m(idct_16x8_internal_10bpc).end + +%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 16x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity, 28 +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, adst + +cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 36 + jl .fast + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + call m(idct_8x16_internal_10bpc).main + mova m16, [cq+64* 1] + mova m17, [cq+64* 3] + mova m18, [cq+64* 5] + mova m19, [cq+64* 7] + mova m20, [cq+64* 9] + mova m21, [cq+64*11] + mova m22, [cq+64*13] + mova m23, [cq+64*15] + call .main + call .main_end +.pass1_end: +%if WIN64 + movaps xmm6, [cq+16*0] + movaps xmm7, [cq+16*1] +%endif + vzeroupper +.pass1_end2: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + punpckhdq m7, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m3, m6 + punpckldq m3, m6 + vshufi32x4 m6, m0, m4, q3232 + vinserti32x8 m0, ym4, 1 + vinserti32x8 m4, m8, ym3, 1 + vshufi32x4 m8, m3, q3232 + vinserti32x8 m3, m7, ym1, 1 + vshufi32x4 m7, m1, q3232 + vshufi32x4 m1, m2, m5, q3232 + vinserti32x8 m2, ym5, 1 + vshufi32x4 m5, m7, m1, q2020 ; 10 11 + vshufi32x4 m7, m1, q3131 ; 14 15 + vshufi32x4 m1, m3, m2, q2020 ; 2 3 + vshufi32x4 m3, m2, q3131 ; 6 7 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 +.pass1_end3: + mov r6d, 64*12 + pxor m8, m8 +.zero_loop: + mova [cq+r6+64*3], m8 + mova [cq+r6+64*2], m8 + mova [cq+r6+64*1], m8 + mova [cq+r6+64*0], m8 + sub r6d, 64*4 + jge .zero_loop + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(idct_16x16_internal_8bpc).main + movshdup m10, [permC] + vpbroadcastd m13, [pw_2048] + psrlq m11, m10, 8 + vpermq m8, m10, m0 + vpermq m0, m11, m7 + vpermq m7, m11, m1 + vpermq m1, m10, m6 + vpermq m6, m10, m2 + vpermq m2, m11, m5 + vpermq m5, m11, m3 + vpermq m3, m10, m4 +.pass2_end: + lea r6, [strideq*3] + vpbroadcastd m15, [pixel_10bpc_max] + pxor m14, m14 + pmulhrsw m8, m13, m8 + pmulhrsw m9, m13, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m8, m13, m6 + pmulhrsw m9, m13, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m8, m13, m3 + pmulhrsw m9, m13, m2 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m8, m13, m1 + pmulhrsw m9, m13, m0 + jmp m(idct_16x8_internal_10bpc).write_16x4 +.fast: + mova ym0, [cq+64*0] + mova ym2, [cq+64*4] + movshdup m8, [o(permB)] + mova ym1, [cq+64*2] + mova ym3, [cq+64*6] + mova ym4, [cq+64*1] + mova ym5, [cq+64*3] + mova ym6, [cq+64*5] + mova ym7, [cq+64*7] + vpermt2q m0, m8, m2 ; 0 4 + vpermt2q m1, m8, m3 ; 2 6 + vpermt2q m4, m8, m5 ; 1 3 + vpermt2q m7, m8, m6 ; 7 5 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + vpbroadcastd m11, [o(pd_2)] + call m(idct_8x16_internal_10bpc).main_end2 + mova m8, [o(permA)] + psrlq m9, m8, 8 + jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2 +ALIGN function_align +.main: + ITX_MULSUB_2D 16, 23, 7, 9, 10, 13, 401, 4076 ; t8a, t15a + ITX_MULSUB_2D 20, 19, 7, 9, 10, 13, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2D 22, 17, 7, 9, 10, 13, 3920, 1189 ; t11a, t12a + ITX_MULSUB_2D 18, 21, 7, 9, 10, 13, 1931, 3612 ; t10a, t13a + paddd m9, m20, m16 ; t8 + psubd m20, m16, m20 ; t9 + psubd m16, m22, m18 ; t10 + paddd m18, m22 ; t11 + paddd m22, m23, m19 ; t15 + psubd m23, m19 ; t14 + psubd m19, m17, m21 ; t13 + paddd m17, m21 ; t12 + vpbroadcastd m11, [o(pd_3784)] + REPX {pmaxsd x, m14}, m20, m23, m16, m19 + vpbroadcastd m10, [o(pd_1567)] + REPX {pminsd x, m15}, m20, m23, m16, m19 + ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11 + ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m9, m18, m22, m17 + REPX {pminsd x, m15}, m9, m18, m22, m17 + paddd m21, m20, m19 ; t14 + psubd m20, m19 ; t13 + psubd m19, m9, m18 ; t11a + paddd m9, m18 ; t8a + psubd m18, m23, m16 ; t10 + paddd m16, m23 ; t9 + psubd m23, m22, m17 ; t12a + paddd m22, m17 ; t15a + REPX {pmaxsd x, m14}, m20, m23, m18, m19 + REPX {pminsd x, m15}, m20, m23, m18, m19 + REPX {pmulld x, m12}, m20, m23, m18, m19 + psubd m7, m0, m6 ; dct8 out7 + paddd m0, m6 ; dct8 out0 + psubd m6, m1, m5 ; dct8 out6 + paddd m1, m5 ; dct8 out1 + REPX {pmaxsd x, m14}, m7, m0, m6, m1 + psubd m5, m2, m4 ; dct8 out5 + paddd m2, m4 ; dct8 out2 + REPX {pminsd x, m15}, m7, m0, m6, m1 + psubd m4, m3, m8 ; dct8 out4 + paddd m3, m8 ; dct8 out3 + REPX {pmaxsd x, m14}, m5, m2, m4, m3 + paddd m20, m13 + paddd m23, m13 + REPX {pminsd x, m15}, m5, m2, m4, m3 + psubd m17, m20, m18 ; t10a + paddd m20, m18 ; t13a + REPX {pmaxsd x, m14}, m22, m21, m16, m9 + psubd m18, m23, m19 ; t11 + paddd m19, m23 ; t12 + REPX {pminsd x, m15}, m22, m21, m16, m9 + REPX {psrad x, 12 }, m20, m19, m18, m17 + ret +.main_end: + vpbroadcastd m11, [o(pd_2)] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m23, m0, m22 ; out15 + paddd m0, m22 ; out0 + psubd m22, m1, m21 ; out14 + paddd m1, m21 ; out1 + psubd m21, m2, m20 ; out13 + paddd m2, m20 ; out2 + psubd m20, m3, m19 ; out12 + paddd m3, m19 ; out3 + psubd m19, m4, m18 ; out11 + paddd m4, m18 ; out4 + psubd m18, m5, m17 ; out10 + paddd m5, m17 ; out5 + psubd m17, m6, m16 ; out9 + paddd m6, m16 ; out6 + psubd m16, m7, m9 ; out8 + paddd m7, m9 ; out7 + REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \ + m4, m20, m5, m21, m6, m22, m7, m23 + packssdw m0, m16 + packssdw m1, m17 + packssdw m2, m18 + packssdw m3, m19 + packssdw m4, m20 + packssdw m5, m21 + packssdw m6, m22 + packssdw m7, m23 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, flipadst +INV_TXFM_16X16_FN adst, adst + +cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 36 + jl .fast + call .main_pass1 + packssdw m0, m16 + packssdw m1, m17 + packssdw m2, m18 + packssdw m3, m19 + packssdw m4, m5, m20 + packssdw m5, m6, m21 + packssdw m6, m7, m22 + packssdw m7, m8, m23 + jmp m(idct_16x16_internal_10bpc).pass1_end +.fast: + call .main_pass1_fast + vpbroadcastd m9, [o(pd_2)] + paddd m0, m9 + psubd m1, m9, m1 + paddd m2, m9 + psubd m3, m9, m3 + paddd m4, m9, m5 + psubd m5, m9, m6 + paddd m6, m9, m7 + psubd m7, m9, m8 +.pass1_fast_end: + mova m9, [o(permA)] + psrlq m8, m9, 8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 +.pass1_fast_end2: + mova m10, m9 + mova m11, m8 + call m(idct_16x8_internal_10bpc).transpose_16x8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(iadst_16x16_internal_8bpc).main_pass2b + movshdup m10, [permC] + mova m13, [pw_2048_m2048] + psrlq m11, m10, 8 + vpermq m8, m11, m0 + vpermq m0, m10, m7 + vpermq m7, m11, m1 + vpermq m1, m10, m6 + vpermq m6, m11, m2 + vpermq m2, m10, m5 + vpermq m5, m11, m3 + vpermq m3, m10, m4 + jmp m(idct_16x16_internal_10bpc).pass2_end +ALIGN function_align +.main_pass1: + mova m0, [cq+64* 0] +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + mova m23, [cq+64*15] + vpbroadcastd m13, [o(pd_2048)] + ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0 + mova m7, [cq+64* 7] + mova m16, [cq+64* 8] + ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8 + mova m2, [cq+64* 2] + mova m21, [cq+64*13] + ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2 + mova m5, [cq+64* 5] + mova m18, [cq+64*10] + ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10 + mova m4, [cq+64* 4] + mova m19, [cq+64*11] + ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4 + mova m3, [cq+64* 3] + mova m20, [cq+64*12] + ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12 + mova m6, [cq+64* 6] + mova m17, [cq+64* 9] + ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6 + mova m1, [cq+64* 1] + mova m22, [cq+64*14] + ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14 + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psubd m9, m23, m7 ; t9a + paddd m23, m7 ; t1a + psubd m7, m2, m18 ; t10a + paddd m18, m2 ; t2a + REPX {pmaxsd x, m14}, m9, m23, m7, m18 + psubd m2, m17, m1 ; t15a + paddd m17, m1 ; t7a + REPX {pminsd x, m15}, m9, m23, m7, m18 + psubd m1, m21, m5 ; t11a + paddd m21, m5 ; t3a + REPX {pmaxsd x, m14}, m2, m17, m1, m21 + psubd m5, m4, m20 ; t12a + paddd m4, m20 ; t4a + REPX {pminsd x, m15}, m2, m17, m1, m21 + psubd m20, m19, m3 ; t13a + paddd m19, m3 ; t5a + REPX {pmaxsd x, m14}, m5, m4, m20, m19 + psubd m8, m6, m22 ; t14a + paddd m6, m22 ; t6a + REPX {pminsd x, m15}, m5, m4, m20, m19 + psubd m22, m0, m16 ; t8a + paddd m16, m0 ; t0a + REPX {pmaxsd x, m14}, m8, m6, m22, m16 + vpbroadcastd m11, [o(pd_4017)] + vpbroadcastd m10, [o(pd_799)] + REPX {pminsd x, m15}, m8, m6, m22, m16 + ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8 + ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13 + vpbroadcastd m11, [o(pd_2276)] + vpbroadcastd m10, [o(pd_3406)] + ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10 + ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15 + paddd m0, m16, m4 ; t0 + psubd m16, m4 ; t4 + psubd m3, m23, m19 ; t5 + paddd m23, m19 ; t1 + REPX {pmaxsd x, m14}, m0, m16, m3, m23 + psubd m19, m18, m6 ; t6 + paddd m18, m6 ; t2 + REPX {pminsd x, m15}, m0, m16, m3, m23 + psubd m6, m21, m17 ; t7 + paddd m21, m17 ; t3 + REPX {pmaxsd x, m14}, m19, m18, m6, m21 + paddd m17, m9, m20 ; t8a + psubd m9, m20 ; t12a + REPX {pminsd x, m15}, m19, m18, m6, m21 + psubd m20, m22, m5 ; t13a + paddd m22, m5 ; t9a + REPX {pmaxsd x, m14}, m17, m9, m20, m22 + psubd m5, m1, m2 ; t14a + paddd m1, m2 ; t10a + REPX {pminsd x, m15}, m17, m9, m20, m22 + psubd m2, m7, m8 ; t15a + paddd m7, m8 ; t11a + REPX {pmaxsd x, m14}, m5, m1, m2, m7 + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + REPX {pminsd x, m15}, m5, m1, m2, m7 + ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a + ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a + ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12 + ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15 + psubd m8, m0, m18 ; t2a + paddd m0, m18 ; out0 + psubd m18, m23, m21 ; t3a + paddd m23, m21 ; -out15 + paddd m21, m9, m5 ; -out13 + psubd m9, m5 ; t15a + psubd m5, m3, m6 ; t6 + paddd m3, m6 ; -out3 + REPX {pmaxsd x, m14}, m8, m18, m9, m5 + psubd m6, m20, m2 ; t14a + paddd m2, m20 ; out2 + paddd m20, m16, m19 ; out12 + psubd m16, m19 ; t7 + REPX {pminsd x, m15}, m8, m18, m9, m5 + psubd m19, m22, m7 ; t11 + paddd m22, m7 ; out14 + psubd m7, m17, m1 ; t10 + paddd m1, m17 ; -out1 + REPX {pmaxsd x, m14}, m6, m16, m19, m7 + vpbroadcastd m12, [o(pd_1448)] + vpbroadcastd m4, [o(pd_2)] + vpbroadcastd m10, [o(pd_5120)] + vpbroadcastd m11, [o(pd_5119)] + REPX {pminsd x, m15}, m6, m16, m19, m7 + psubd m17, m7, m19 ; -out9 + paddd m7, m19 ; out6 + psubd m19, m5, m16 ; -out11 + paddd m5, m16 ; out4 + REPX {pmulld x, m12}, m17, m7, m19, m5 + psubd m16, m8, m18 ; out8 + paddd m8, m18 ; -out7 + psubd m18, m6, m9 ; out10 + paddd m6, m9 ; -out5 + REPX {pmulld x, m12}, m16, m8, m18, m6 + REPX {paddd x, m4 }, m0, m2, m20, m22 + REPX {psubd x, m4, x}, m1, m3, m21, m23 + REPX {paddd x, m10 }, m7, m5, m16, m18 + REPX {psubd x, m11, x}, m17, m19, m8, m6 + REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3 + REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8 + ret +ALIGN function_align +.main_pass1_fast: + mova ym0, [cq+64*0] + mova ym1, [cq+64*2] + movshdup m8, [o(permB)] + mova ym6, [cq+64*1] + mova ym7, [cq+64*3] + mova ym2, [cq+64*4] + mova ym3, [cq+64*6] + mova ym4, [cq+64*5] + mova ym5, [cq+64*7] + vpermt2q m0, m8, m1 ; 0 2 + vpermt2q m7, m8, m6 ; 3 1 + vpermt2q m2, m8, m3 ; 4 6 + vpermt2q m5, m8, m4 ; 7 5 + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m12, [o(pd_2896)] + jmp m(iadst_16x8_internal_10bpc).main_fast + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 36 + jl .fast + call m(iadst_16x16_internal_10bpc).main_pass1 + packssdw m4, m19, m3 + packssdw m3, m20, m5 + packssdw m5, m18, m2 + packssdw m2, m21, m6 + packssdw m6, m17, m1 + packssdw m1, m22, m7 + packssdw m7, m16, m0 + packssdw m0, m23, m8 + jmp m(idct_16x16_internal_10bpc).pass1_end +.fast: + call m(iadst_16x16_internal_10bpc).main_pass1_fast + vpbroadcastd m9, [o(pd_2)] + psubd m4, m9, m3 + paddd m3, m9, m5 + paddd m5, m9, m2 + psubd m2, m9, m6 + psubd m6, m9, m1 + paddd m1, m9, m7 + paddd m7, m9, m0 + psubd m0, m9, m8 + jmp m(iadst_16x16_internal_10bpc).pass1_fast_end +.pass2: + lea r5, [o_base_8bpc] + call m(iadst_16x16_internal_8bpc).main_pass2b + movshdup m10, [permC] + movu m13, [pw_m2048_2048] + psrlq m11, m10, 8 + vpermq m8, m11, m7 + vpermq m7, m11, m6 + vpermq m6, m11, m5 + vpermq m5, m11, m4 + vpermq m3, m10, m3 + vpermq m2, m10, m2 + vpermq m1, m10, m1 + vpermq m0, m10, m0 + jmp m(idct_16x16_internal_10bpc).pass2_end + +INV_TXFM_16X16_FN identity, dct, -92 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m10, [o(pd_5793)] + vpbroadcastd m11, [o(pd_5120)] + mov r6, cq + cmp eobd, 36 + jl .fast + call .pass1_main + packssdw m0, m6, m8 + packssdw m1, m7, m9 + call .pass1_main + packssdw m2, m6, m8 + packssdw m3, m7, m9 + call .pass1_main + packssdw m4, m6, m8 + packssdw m5, m7, m9 + call .pass1_main + packssdw m6, m8 + packssdw m7, m9 + jmp m(idct_16x16_internal_10bpc).pass1_end2 +.fast: + call .pass1_main_fast + packssdw m0, m6, m7 + call .pass1_main_fast + packssdw m1, m6, m7 + call .pass1_main_fast + packssdw m2, m6, m7 + call .pass1_main_fast + packssdw m3, m6, m7 + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckldq m3, m4, m1 + punpckhdq m4, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + pxor m7, m7 + vshufi32x4 m2, m0, m3, q3131 + vshufi32x4 m0, m3, q2020 + vshufi32x4 m3, m1, m4, q3131 + vshufi32x4 m1, m4, q2020 + REPX {mova x, m7}, m4, m5, m6 + jmp m(idct_16x16_internal_10bpc).pass1_end3 +.pass2: + movshdup m11, [o(permC)] + vpbroadcastd m12, [o(pw_1697x16)] + lea r6, [strideq*3] + vpbroadcastd m13, [o(pw_2048)] + pxor m14, m14 + vpbroadcastd m15, [pixel_10bpc_max] + vpermq m8, m11, m0 + vpermq m9, m11, m1 + call .pass2_main + vpermq m8, m11, m2 + vpermq m9, m11, m3 + call .pass2_main + vpermq m8, m11, m4 + vpermq m9, m11, m5 + call .pass2_main + vpermq m8, m11, m6 + vpermq m9, m11, m7 +.pass2_main: + pmulhrsw m0, m12, m8 + pmulhrsw m1, m12, m9 + paddsw m8, m8 + paddsw m9, m9 + paddsw m8, m0 + paddsw m9, m1 + pmulhrsw m8, m13 + pmulhrsw m9, m13 + jmp m(idct_16x8_internal_10bpc).write_16x4 +ALIGN function_align +.pass1_main: + pmulld m6, m10, [r6+64*0] + pmulld m7, m10, [r6+64*1] + pmulld m8, m10, [r6+64*8] + pmulld m9, m10, [r6+64*9] + add r6, 64*2 + REPX {paddd x, m11}, m6, m7, m8, m9 + REPX {psrad x, 13 }, m6, m8, m7, m9 + ret +ALIGN function_align +.pass1_main_fast: + mova ym6, [r6+64* 0] + vinserti32x8 m6, [r6+64* 4], 1 + mova ym7, [r6+64* 8] + vinserti32x8 m7, [r6+64*12], 1 + add r6, 64 + REPX {pmulld x, m10}, m6, m7 + REPX {paddd x, m11}, m6, m7 + REPX {psrad x, 13 }, m6, m7 + ret + +cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + vpbroadcastd m11, [o(pd_2)] + mova m20, [o(idct8x32p)] + pxor m21, m21 + cmp eobd, 43 + jl .fast + call .pass1_main + punpcklwd m16, m0, m1 + punpcklwd m17, m2, m3 + punpckhwd m18, m0, m1 + punpckhwd m19, m2, m3 + cmp eobd, 107 + jge .full + punpckldq m0, m16, m17 ; 0 2 + punpckhdq m1, m16, m17 ; 4 6 + punpckldq m2, m18, m19 ; 8 10 + punpckhdq m3, m18, m19 ; 12 14 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + vextracti32x8 ym16, m2, 1 + vextracti32x8 ym17, m3, 1 + call m(idct_8x16_internal_8bpc).main_fast + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .end +.full: + add cq, 64 + call .pass1_main + punpcklwd m5, m0, m1 + punpcklwd m6, m2, m3 + punpckhwd m7, m0, m1 + punpckhwd m8, m2, m3 + punpckldq m0, m16, m17 ; 0 2 + punpckhdq m1, m16, m17 ; 4 6 + punpckldq m2, m18, m19 ; 8 10 + punpckhdq m3, m18, m19 ; 12 14 + punpckldq m4, m5, m6 ; 16 18 + punpckhdq m5, m6 ; 20 22 + punpckldq m6, m7, m8 ; 24 26 + punpckhdq m7, m8 ; 28 30 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + vextracti32x8 ym16, m2, 1 + vextracti32x8 ym17, m3, 1 + vextracti32x8 ym18, m4, 1 + vextracti32x8 ym19, m5, 1 + vextracti32x8 ym20, m6, 1 + vextracti32x8 ym21, m7, 1 + call m(idct_8x16_internal_8bpc).main + REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main + jmp .end +.fast: + movshdup m8, [o(permB)] + mova ym1, [cq+128*1] + mova ym5, [cq+128*5] + mova ym7, [cq+128*3] + mova ym3, [cq+128*7] + mova ym0, [cq+128*0] + mova ym4, [cq+128*2] + mova ym2, [cq+128*4] + mova ym6, [cq+128*6] + vpermt2q m1, m8, m5 ; 1 5 + vpermt2q m3, m8, m7 ; 7 3 + vpermt2q m0, m8, m4 ; 0 2 + vpermt2q m2, m8, m6 ; 4 6 + mova [cq+128*0], ym21 + REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_10bpc).main_end + packssdw m0, m2 + packssdw m1, m3 + vpermb m0, m20, m0 + vprold m20, 16 + vpermb m2, m20, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + call m(idct_8x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2 +.end: + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper + lea r3, [strideq*2] + vpbroadcastd m12, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m11, m11 + lea r3, [dstq+r3*8] + pmulhrsw m0, m10 + pmulhrsw m1, m10 + call .write_8x4x2 + pmulhrsw m0, m10, m2 + pmulhrsw m1, m10, m3 + call .write_8x4x2 + pmulhrsw m0, m10, m4 + pmulhrsw m1, m10, m5 + call .write_8x4x2 + pmulhrsw m0, m10, m6 + pmulhrsw m1, m10, m7 +.write_8x4x2: + mova xm8, [dstq+strideq*0] + vinserti32x4 ym8, [dstq+strideq*1], 1 + vinserti32x4 m8, [dstq+strideq*2], 2 + vinserti32x4 m8, [dstq+r6 ], 3 + mova xm9, [r3 +r6 ] + vinserti32x4 ym9, [r3 +strideq*2], 1 + vinserti32x4 m9, [r3 +strideq*1], 2 + vinserti32x4 m9, [r3 +strideq*0], 3 + paddw m8, m0 + paddw m9, m1 + pmaxsw m8, m11 + pmaxsw m9, m11 + pminsw m8, m12 + pminsw m9, m12 + mova [dstq+strideq*0], xm8 + vextracti32x4 [dstq+strideq*1], ym8, 1 + vextracti32x4 [dstq+strideq*2], m8, 2 + vextracti32x4 [dstq+r6 ], m8, 3 + lea dstq, [dstq+strideq*4] + vextracti32x4 [r3 +strideq*0], m9, 3 + vextracti32x4 [r3 +strideq*1], m9, 2 + vextracti32x4 [r3 +strideq*2], ym9, 1 + mova [r3 +r6 ], xm9 + lea r3, [r3+strideq*4] + ret +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 +ALIGN function_align +.pass1_main: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x16_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_end2 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + REPX {vpermb x, m20, x}, m0, m1, m2, m3 + ret + +cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob + vpbroadcastd m9, [pw_5] + lea r4, [strideq*3] + pxor m10, m10 + lea r5, [strideq*5] + vpbroadcastd m11, [pixel_10bpc_max] + sub eobd, 107 + lea r6, [strideq+r4*2] +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + lea r7, [dstq+strideq*8] + REPX {mova [cq+128*x], m10}, 0, 1, 2, 3 + REPX {paddsw x, m9}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m10}, 4, 5, 6, 7 + REPX {psraw x, 3 }, m0, m1, m2, m3 + add cq, 64 + mova xm4, [dstq+strideq*0] + mova xm5, [dstq+strideq*1] + mova xm6, [dstq+strideq*2] + mova xm7, [dstq+r4 *1] + punpckhwd m8, m0, m1 + vinserti32x4 ym4, [dstq+strideq*4], 1 + punpcklwd m0, m1 + vinserti32x4 ym5, [dstq+r5 *1], 1 + punpckhwd m1, m2, m3 + vinserti32x4 ym6, [dstq+r4 *2], 1 + punpcklwd m2, m3 + vinserti32x4 ym7, [dstq+r6 *1], 1 + punpckhwd m3, m0, m8 + vinserti32x4 m4, [r7 +strideq*0], 2 + punpcklwd m0, m8 + vinserti32x4 m5, [r7 +strideq*1], 2 + punpckhwd m8, m2, m1 + vinserti32x4 m6, [r7 +strideq*2], 2 + punpcklwd m2, m1 + vinserti32x4 m7, [r7 +r4 *1], 2 + punpckhqdq m1, m0, m2 + vinserti32x4 m4, [r7 +strideq*4], 3 + punpcklqdq m0, m2 + vinserti32x4 m5, [r7 +r5 *1], 3 + punpcklqdq m2, m3, m8 + vinserti32x4 m6, [r7 +r4 *2], 3 + punpckhqdq m3, m8 + vinserti32x4 m7, [r7 +r6 *1], 3 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + REPX {pmaxsw x, m10}, m0, m1, m2, m3 + REPX {pminsw x, m11}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + mova [dstq+strideq*2], xm2 + mova [dstq+r4 *1], xm3 + vextracti32x4 [dstq+strideq*4], ym0, 1 + vextracti32x4 [dstq+r5 *1], ym1, 1 + vextracti32x4 [dstq+r4 *2], ym2, 1 + vextracti32x4 [dstq+r6 *1], ym3, 1 + lea dstq, [r7+strideq*8] + vextracti32x4 [r7 +strideq*0], m0, 2 + vextracti32x4 [r7 +strideq*1], m1, 2 + vextracti32x4 [r7 +strideq*2], m2, 2 + vextracti32x4 [r7 +r4 *1], m3, 2 + vextracti32x4 [r7 +strideq*4], m0, 3 + vextracti32x4 [r7 +r5 *1], m1, 3 + vextracti32x4 [r7 +r4 *2], m2, 3 + vextracti32x4 [r7 +r6 *1], m3, 3 + add eobd, 0x80000000 + jnc .loop + RET + +cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + mova m11, [o(permB)] + mova m0, [cq+64* 0] ; 0 1 + mova m4, [cq+64* 1] ; 2 3 + mova m1, [cq+64* 2] ; 4 5 + mova m8, [cq+64* 3] ; 6 7 + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psrlq m10, m11, 32 +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + mova m16, m11 + vpermi2q m16, m0, m1 ; 1 5 + mova m17, m11 + vpermi2q m17, m8, m4 ; 7 3 + cmp eobd, 43 + jl .fast + mova m18, [cq+64* 4] ; 8 9 + mova m20, [cq+64* 5] ; 10 11 + mova m6, [cq+64* 6] ; 12 13 + mova m7, [cq+64* 7] ; 14 15 + vpermt2q m0, m10, m18 ; 0 8 + vpermt2q m18, m11, m6 ; 9 13 + mova m19, m11 + vpermi2q m19, m7, m20 ; 15 11 + cmp eobd, 107 + jge .full + vpermt2q m1, m10, m6 ; 4 12 + vpermt2q m4, m10, m8 ; 2 6 + vpermt2q m7, m10, m20 ; 14 10 + mov r6d, 64*1 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + call .main_fast + call m(idct_16x16_internal_10bpc).main_end + jmp .end +.full: + mova m2, [cq+64* 8] ; 16 17 + mova m5, [cq+64* 9] ; 18 19 + mova m9, [cq+64*10] ; 20 21 + mova m21, [cq+64*11] ; 22 23 + vpermt2q m1, m10, m9 ; 4 20 + vpermt2q m7, m10, m21 ; 14 22 + vpermt2q m21, m11, m5 ; 23 19 + vpermt2q m5, m10, m20 ; 18 10 + mova m20, m11 + vpermi2q m20, m2, m9 ; 17 21 + mova m22, [cq+64*12] ; 24 25 + mova m9, [cq+64*13] ; 26 27 + mova m3, [cq+64*14] ; 28 29 + mova m23, [cq+64*15] ; 30 31 + vpermt2q m2, m10, m22 ; 16 24 + vpermt2q m22, m11, m3 ; 25 29 + vpermt2q m3, m10, m6 ; 28 12 + vpermt2q m4, m10, m9 ; 2 26 + mova m6, m10 + vpermi2q m6, m23, m8 ; 30 6 + vpermt2q m23, m11, m9 ; 31 27 + mov r6d, 64*3 + call m(idct_8x8_internal_10bpc).main + call m(idct_16x8_internal_10bpc).main + call .main + call m(idct_16x16_internal_10bpc).main_end + jmp .end +.fast: + vpermq m0, m10, m0 ; 0 0 + vpermq m1, m10, m1 ; 4 4 + vpermt2q m4, m10, m8 ; 2 6 + xor r6d, r6d + call m(idct_8x8_internal_10bpc).main_fast2 + call m(idct_16x8_internal_10bpc).main_fast2 + call .main_fast2 + call m(idct_16x16_internal_10bpc).main_end +.end: + mova m10, [o(idct32x8p)] +%if WIN64 + movaps xmm6, [cq+16*0] + movaps xmm7, [cq+16*1] +%endif + vzeroupper + psrlw m8, m10, 8 + mova m9, m8 + vpermi2w m8, m1, m5 + vpermt2w m1, m10, m5 + vprold m5, m9, 16 + vpermi2w m9, m3, m7 + vpermt2w m3, m10, m7 + vprold m10, 16 + mova m7, m5 + vpermi2w m5, m0, m4 + vpermt2w m0, m10, m4 + pxor m14, m14 + vpermi2w m7, m2, m6 + vpermt2w m2, m10, m6 +.zero_loop: + mova [cq+r6*4+64*3], m14 + mova [cq+r6*4+64*2], m14 + mova [cq+r6*4+64*1], m14 + mova [cq+r6*4+64*0], m14 + sub r6d, 64 + jge .zero_loop + punpckhdq m6, m5, m8 + punpckldq m5, m8 + punpckhdq m8, m7, m9 + punpckldq m7, m9 + punpckhdq m4, m2, m3 + punpckldq m2, m3 + punpckhdq m3, m0, m1 + punpckldq m0, m1 + vpbroadcastd m13, [o(pw_2048)] + vpbroadcastd m15, [o(pixel_10bpc_max)] + lea r5, [o_base_8bpc] + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + lea r6, [strideq*3] + pmulhrsw m0, m13 + pmulhrsw m1, m13 + pmulhrsw m2, m13 + pmulhrsw m3, m13 + call .write_32x4 + pmulhrsw m0, m13, m4 + pmulhrsw m1, m13, m5 + pmulhrsw m2, m13, m6 + pmulhrsw m3, m13, m7 +.write_32x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r6 ] + REPX {pmaxsw x, m14}, m0, m1, m2, m3 + REPX {pminsw x, m15}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r6 ], m3 + lea dstq, [dstq+strideq*4] + ret +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [o(dconly_10bpc)] + mov [cq], eobd + or r3d, 8 + add r6d, 640 + sar r6d, 10 + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw m2, r6d + paddsw m2, m3 +.dconly_loop: + paddsw m0, m2, [dstq+strideq*0] + paddsw m1, m2, [dstq+strideq*1] + psubusw m0, m3 + psubusw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vbroadcasti32x4 m23, [o(pd_4091_3973)] + vbroadcasti32x4 m7, [o(pd_201_995)] + vbroadcasti32x4 m22, [o(pd_1380_601)] + vbroadcasti32x4 m9, [o(pd_3857_4052)] + pmulld m23, m16 ; t16 t20 + pmulld m16, m7 ; t31 t27 + pmulld m22, m17 ; -t19 -t25 + pmulld m17, m9 ; t28 t24 + REPX {paddd x, m13}, m23, m16, m17 + psubd m22, m13, m22 + REPX {psrad x, 12 }, m23, m16, m22, m17 + mova m20, m23 ; t30 t26 + mova m9, m16 ; t17 t21 + mova m19, m22 ; t18 t22 + mova m18, m17 ; t29 t25 + jmp .main3 +.main_fast: ; bottom half is zero + vbroadcasti32x4 m23, [o(pd_4091_3973)] + vbroadcasti32x4 m7, [o(pd_201_995)] + vbroadcasti32x4 m20, [o(pd_2751_2106)] + vbroadcasti32x4 m9, [o(pd_3035_3513)] + vbroadcasti32x4 m21, [o(pd_3703_3290)] + vbroadcasti32x4 m10, [o(pd_1751_2440)] + vbroadcasti32x4 m22, [o(pd_1380_601)] + vbroadcasti32x4 m11, [o(pd_3857_4052)] + pmulld m23, m16 ; t16a t20a + pmulld m16, m7 ; t31a t27a + pmulld m20, m19 ; -t17a -t21a + pmulld m19, m9 ; t30a t26a + pmulld m21, m18 ; t18a t22a + pmulld m18, m10 ; t29a t25a + pmulld m22, m17 ; -t19a -t25a + pmulld m17, m11 ; t28a t24a + psubd m20, m13, m20 + psubd m22, m13, m22 + jmp .main2 +.main: + ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973 + ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106 + ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290 + ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601 + paddd m20, m13 + paddd m22, m13 +.main2: + REPX {paddd x, m13}, m16, m23, m19 + REPX {psrad x, 12 }, m16, m20, m23, m19 + psubd m9, m16, m20 ; t17 t21 + paddd m16, m20 ; t16 t20 + psubd m20, m23, m19 ; t30 t26 + paddd m23, m19 ; t31 t27 + REPX {pmaxsd x, m14}, m9, m16, m20, m23 + REPX {paddd x, m13}, m21, m18, m17 + REPX {psrad x, 12 }, m18, m22, m21, m17 + psubd m19, m22, m18 ; t18 t22 + paddd m22, m18 ; t19 t23 + psubd m18, m17, m21 ; t29 t25 + paddd m17, m21 ; t28 t24 + REPX {pmaxsd x, m14}, m19, m22, m18, m17 + REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17 +.main3: + vbroadcasti32x4 m11, [o(pd_4017_2276)] + vbroadcasti32x4 m10, [o(pd_799_3406)] + psubd m7, m0, m6 ; dct16 out15 out14 + paddd m0, m6 ; dct16 out0 out1 + psubd m6, m1, m5 ; dct16 out12 out13 + paddd m1, m5 ; dct16 out3 out2 + psubd m5, m2, m4 ; dct16 out11 out10 + paddd m2, m4 ; dct16 out4 out5 + psubd m4, m3, m8 ; dct16 out8 out9 + paddd m3, m8 ; dct16 out7 out6 + ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11 + ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3 + punpckhqdq m21, m16, m20 ; t20 t21a + punpcklqdq m16, m20 ; t16 t17a + punpcklqdq m20, m22, m19 ; t19 t18a + punpckhqdq m22, m19 ; t23 t22a + REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + punpcklqdq m19, m23, m9 ; t31 t30a + punpckhqdq m23, m9 ; t27 t26a + punpckhqdq m9, m17, m18 ; t24 t25a + punpcklqdq m17, m18 ; t28 t29a + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + psubd m18, m16, m20 ; t19a t18 + paddd m20, m16 ; t16a t17 + psubd m16, m19, m17 ; t28a t29 + paddd m19, m17 ; t31a t30 + psubd m17, m22, m21 ; t20a t21 + paddd m22, m21 ; t23a t22 + psubd m21, m9, m23 ; t27a t26 + paddd m23, m9 ; t24a t25 + REPX {pmaxsd x, m14}, m18, m16, m17, m21 + REPX {pminsd x, m15}, m16, m18, m21, m17 + ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11 + ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m20, m22, m19, m23 + REPX {pminsd x, m15}, m20, m22, m19, m23 + paddd m9, m20, m22 ; t16 t17a + psubd m20, m22 ; t23 t22a + paddd m22, m19, m23 ; t31 t30a + psubd m19, m23 ; t24 t25a + psubd m23, m16, m17 ; t20a t21 + paddd m16, m17 ; t19a t18 + psubd m17, m18, m21 ; t27a t26 + paddd m21, m18 ; t28a t29 + REPX {pmaxsd x, m14}, m20, m19, m23, m17 + REPX {pminsd x, m15}, m19, m20, m17, m23 + REPX {pmulld x, m12}, m19, m20, m17, m23 + REPX {pmaxsd x, m14}, m22, m21, m16, m9 + paddd m19, m13 + paddd m17, m13 + REPX {pminsd x, m15}, m22, m21, m16, m9 + psubd m18, m19, m20 ; t23a t22 + paddd m19, m20 ; t24a t25 + paddd m20, m17, m23 ; t27 t26a + psubd m17, m23 ; t20 t21a + REPX {psrad x, 12 }, m20, m19, m18, m17 + ret + +cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob + vpbroadcastd m5, [pw_4096] + lea r4, [strideq*3] + mova m6, [idtx32x8p] + lea r5, [strideq*5] + vpbroadcastd m9, [pixel_10bpc_max] + lea r6, [strideq+r4*2] + pxor m8, m8 + sub eobd, 107 + psrlw m7, m6, 8 +.loop: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] ; 02 13 + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] ; 46 57 + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] ; 8a 9b + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] ; ce df + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m8}, 0, 1, 2, 3 + mova m4, m6 + vpermi2w m4, m1, m3 + vpermt2w m1, m7, m3 + REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 + mova m3, m7 + vpermi2w m3, m0, m2 + vpermt2w m0, m6, m2 + add cq, 64*8 + punpcklqdq m2, m3, m1 ; 4 5 + punpckhqdq m3, m1 ; 6 7 + punpckhqdq m1, m0, m4 ; 2 3 + punpcklqdq m0, m4 ; 0 1 + mova ym4, [dstq+strideq*0] + vinserti32x8 m4, [dstq+strideq*1], 1 + paddw m0, m4 + mova ym4, [dstq+strideq*2] + vinserti32x8 m4, [dstq+r4 *1], 1 + paddw m1, m4 + mova ym4, [dstq+strideq*4] + vinserti32x8 m4, [dstq+r5 *1], 1 + paddw m2, m4 + mova ym4, [dstq+r4 *2] + vinserti32x8 m4, [dstq+r6 *1], 1 + paddw m3, m4 + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+r4 *1], m1, 1 + mova [dstq+strideq*4], ym2 + vextracti32x8 [dstq+r5 *1], m2, 1 + mova [dstq+r4 *2], ym3 + vextracti32x8 [dstq+r6 *1], m3, 1 + add dstq, 32 + add eobd, 0x80000000 + jnc .loop + RET + +%endif ; ARCH_X86_64 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm index 4fb30ef4e7a..3833e17c99f 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm @@ -361,18 +361,32 @@ ALIGN function_align %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 0, 4x4 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 - movd m1, [o(pw_2896x8)] + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 + mov r3d, 4 +.dconly: + add r5d, 128 + sar r5d, 8 +.dconly2: + imul r5d, 2896 + mova m2, [o(pixel_10bpc_max)] + add r5d, 34816 movd m0, r5d - packssdw m0, m0 - pmulhrsw m0, m1 - pshuflw m0, m0, q0000 + pshuflw m0, m0, q1111 + pxor m3, m3 punpcklqdq m0, m0 - mova m1, m0 - TAIL_CALL m(iadst_4x4_internal_16bpc).end +.dconly_loop: + movq m1, [dstq+strideq*0] + movhps m1, [dstq+strideq*1] + paddw m1, m0 + pminsw m1, m2 + pmaxsw m1, m3 + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET %endif %endmacro @@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 4x8 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 2 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 2048 - sar r5d, 12 -.end: - imul r5d, 2896 - add r5d, 34816 - movd m0, r5d - pshuflw m0, m0, q1111 - punpcklqdq m0, m0 - pxor m4, m4 - mova m3, [o(pixel_10bpc_max)] - lea r2, [strideq*3] -.loop: - movq m1, [dstq+strideq*0] - movq m2, [dstq+strideq*2] - movhps m1, [dstq+strideq*1] - movhps m2, [dstq+r2] - paddw m1, m0 - paddw m2, m0 - REPX {pminsw x, m3}, m1, m2 - REPX {pmaxsw x, m4}, m1, m2 - movq [dstq+strideq*0], m1 - movhps [dstq+strideq*1], m1 - movq [dstq+strideq*2], m2 - movhps [dstq+r2 ], m2 - lea dstq, [dstq+strideq*4] - dec r3d - jg .loop - RET + mov r3d, 8 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly %endif %endmacro @@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 4 - add r5d, 6144 - sar r5d, 13 - jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end + mov r3d, 16 + add r5d, 384 + sar r5d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2 %endif %endmacro @@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 2048 - sar r5d, 12 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 128 + sar r5d, 8 imul r5d, 2896 add r5d, 34816 movd m0, r5d @@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 2 .end: - add r5d, 6144 - sar r5d, 13 + add r5d, 384 + sar r5d, 9 .end2: imul r5d, 2896 add r5d, 34816 @@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 mov r3d, 4 %if stack_size_padded > 0 ; adjust to caller's stack allocation @@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 4 .dconly: - add r5d, 6144 - sar r5d, 13 + add r5d, 384 + sar r5d, 9 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -2755,6 +2742,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ret .round: %if ARCH_X86_64 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 pcmpeqd m8, m8 REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 mova m8, [r3+1*16] @@ -2785,6 +2774,14 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 + mova m0, [o(clip_18b_min)] + REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 + pmaxsd m0, [r3+ 0*16] + mova [r3+ 0*16], m7 + mova m7, [o(clip_18b_max)] + REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsd m7, [r3+ 0*16] + mova [r3+ 0*16], m0 pcmpeqd m0, m0 REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7 mova [r3+ 1*16], m1 @@ -3472,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 %if ARCH_X86_32 add rsp, 1*16 %endif @@ -3939,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16 %endif %ifidn %1_%2, dct_dct - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (5+ARCH_X86_64*3+WIN64)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 %endif @@ -4057,6 +4054,8 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ret .round: %if ARCH_X86_64 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 psrld m8, m11, 10 ; 2 REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 mova m8, [r3+1*16] @@ -4087,6 +4086,14 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 + mova m0, [o(clip_18b_min)] + REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 + pmaxsd m0, [r3+ 0*16] + mova [r3+ 0*16], m7 + mova m7, [o(clip_18b_max)] + REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsd m7, [r3+ 0*16] + mova [r3+ 0*16], m0 mova m0, [o(pd_2)] REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7 paddd m0, [r3+ 0*16] @@ -5162,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \ call m(idct_8x8_internal_16bpc).round1_and_write_8x8 ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (31+2*ARCH_X86_64)*16 jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2 @@ -5339,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \ %endif RET .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 add rsp, (65+4*ARCH_X86_64)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly @@ -5944,6 +5951,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ ; final sumsub for idct16 as well as idct32, plus final downshift %macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx mova m%4, [r3+16*(23-%1)] + pmaxsd m%1, m12 + pminsd m%1, m13 psubd m%3, m%1, m%4 ; idct16 out15 - n paddd m%1, m%4 ; idct16 out0 + n pmaxsd m%1, m12 @@ -6019,6 +6028,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ .loop_dct32_end: mova m0, [r3+16*16] mova m6, [r3+16*24] + pmaxsd m0, m2 + pminsd m0, m3 psubd m5, m0, m6 ; idct16 out15 - n paddd m0, m6 ; idct16 out0 + n pmaxsd m0, m2 @@ -6045,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ %endif .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 .dconly1: - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -6344,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ %endif .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ @@ -6565,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 add rsp, (5*32+1-(24+8*ARCH_X86_32))*16 @@ -6838,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 @@ -7098,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 @@ -7537,6 +7548,8 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ mova m5, [r3-16* 4] ; idct64 48 + n mova m6, [r4-16*20] ; idct64 47 - n mova m7, [r3-16*20] ; idct64 32 + n + pmaxsd m0, m12 + pminsd m0, m13 paddd m8, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m12}, m8, m0 @@ -7565,11 +7578,13 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ mova [r4-16* 4], m6 mova [r3+16*12], m8 %else + mova m5, [o(clip_18b_min)] + mova m6, [o(clip_18b_max)] mova m1, [r3+16*44] ; idct16 15 - n + pmaxsd m0, m5 + pminsd m0, m6 paddd m4, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n - mova m5, [o(clip_18b_min)] - mova m6, [o(clip_18b_max)] REPX {pmaxsd x, m5}, m4, m0 REPX {pminsd x, m6}, m4, m0 paddd m1, m4, m3 ; idct32 out0 + n @@ -7632,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 .dconly1: - add r5d, 10240 - sar r5d, 14 + add r5d, 640 + sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 @@ -7876,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 - add r5d, 2048 - sar r5d, 12 - imul r5d, 2896 - add r5d, 6144 - sar r5d, 13 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 add rsp, (1+8*32+1*WIN64)*16 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 @@ -8112,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \ ret .dconly: - imul r5d, [cq], 2896 + imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \ diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm index 092c842786d..a67f053a61b 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm @@ -126,7 +126,7 @@ pw_m2751_3035x8: dw -2751*8, 3035*8 SECTION .text -; Code size reduction trickery: Intead of using rip-relative loads with +; Code size reduction trickery: Instead of using rip-relative loads with ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a ; single rip-relative lea and then address things relative from that with ; 1-byte offsets as long as data is within +-128 bytes of the base pointer. @@ -1194,13 +1194,9 @@ cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] + mov [cq], eobd pmulhrsw xm0, xm1 - movd xm2, [o(pw_2048)] - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mova m1, m0 - jmp m(iadst_8x4_internal_8bpc).end3 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 %endif %endmacro @@ -1340,20 +1336,20 @@ cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd + or r3d, 8 +.dconly: pmulhrsw xm0, xm2 - psrlw xm2, 3 ; pw_2048 +.dconly2: + movd xm2, [pw_2048] pmulhrsw xm0, xm1 + lea r2, [strideq*3] pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 -.end: - mov r2d, 2 -.end2: - lea r3, [strideq*3] -.loop: - WRITE_8X4 0, 0, 1, 2 +.dconly_loop: + WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2 lea dstq, [dstq+strideq*4] - dec r2d - jg .loop + sub r3d, 4 + jg .dconly_loop RET %endif %endmacro @@ -1543,13 +1539,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - psrlw xm2, 3 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 + or r3d, 16 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly %endif %endmacro @@ -1902,7 +1893,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd - mov r2d, 2 + or r3d, 4 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative @@ -1911,17 +1902,17 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 vpbroadcastw m0, xm0 pxor m3, m3 .dconly_loop: - mova xm1, [dstq] - vinserti128 m1, [dstq+strideq], 1 + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 punpckhbw m2, m1, m3 punpcklbw m1, m3 paddw m2, m0 paddw m1, m0 packuswb m1, m2 - mova [dstq], xm1 - vextracti128 [dstq+strideq], m1, 1 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] - dec r2d + sub r3d, 2 jg .dconly_loop RET %endif @@ -2162,7 +2153,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 4 + or r3d, 8 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -2473,7 +2464,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 8 + or r3d, 16 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -3120,13 +3111,8 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - pmulhrsw xm0, xm2 - psrlw xm2, 2 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly .full: REPX {pmulhrsw x, m9}, m12, m13, m14, m15 pmulhrsw m6, m9, [rsp+32*2] @@ -3290,7 +3276,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 8 + or r3d, 8 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative @@ -3307,7 +3293,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob packuswb m1, m2 mova [dstq], m1 add dstq, strideq - dec r2d + dec r3d jg .dconly_loop RET .normal: @@ -3672,7 +3658,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 16 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .full: mova [tmp1q-32*4], m1 @@ -3991,7 +3977,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 16 + or r3d, 16 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 @@ -4222,7 +4208,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ @@ -4486,7 +4472,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 32 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .normal: PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 @@ -4832,7 +4818,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 16 + or r3d, 16 .dconly: pmulhrsw xm0, xm2 movd xm2, [o(pw_2048)] @@ -4856,7 +4842,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob mova [dstq+32*0], m2 mova [dstq+32*1], m3 add dstq, strideq - dec r2d + dec r3d jg .dconly_loop RET .normal: @@ -4997,7 +4983,7 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 @@ -5200,7 +5186,7 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ @@ -5381,7 +5367,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm index 7d01bccb4f5..f30f4909287 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm @@ -29,7 +29,8 @@ %if ARCH_X86_64 SECTION_RODATA 64 -int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 +const \ +int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 @@ -84,7 +85,7 @@ pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 -gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10 +gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16 int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 @@ -845,7 +846,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 punpcklwd m3, m5 ; dct8 in3 in5 punpckhwd m5, m2 ; dct16 in11 in5 punpcklwd m6, m2 ; dct4 in3 in1 -.main2: +cglobal_label .main2 vpbroadcastd m10, [o(pd_2048)] .main3: vpbroadcastq m13, [o(int_mshift)] @@ -1355,7 +1356,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m3, m3, q2031 jmp m(iadst_8x8_internal_8bpc).end2 ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -1422,7 +1423,7 @@ ALIGN function_align punpckhqdq m0, m4 ; out0 -out1 ret ALIGN function_align -.main_pass2: +cglobal_label .main_pass2 IADST8_1D_PACKED 2 ret @@ -1499,8 +1500,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128 sar r6d, 8 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly @@ -1608,7 +1609,54 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpscatterdq [r3+ym8]{k2}, m2 RET ALIGN function_align -.main: +cglobal_label .main_fast2 ; bottom three-quarters are zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + vpbroadcastd ym3, [o(pw_401_4076x8)] + vpbroadcastd ym5, [o(pw_799_4017x8)] + vpbroadcastd ym4, [o(pw_m1189_3920x8)] + pxor ym6, ym6 + punpckhwd ym2, ym0, ym0 + pmulhrsw ym2, ym3 ; t8a t15a + punpcklwd ym7, ym1, ym1 + pmulhrsw ym7, ym5 ; t4a t7a + punpckhwd ym1, ym1 + pmulhrsw ym4, ym1 ; t11a t12a + vpcmpub k7, ym13, ym10, 6 + punpcklwd ym9, ym6, ym0 + psubsw ym0, ym2, ym4 ; t11a t12a + paddsw ym8, ym2, ym4 ; t8a t15a + mova ym1, ym7 + jmp .main5 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + pxor ym6, ym6 + punpckhwd ym8, ym0, ym0 + punpckhwd ym4, ym3, ym3 + punpckhwd ym5, ym2, ym2 + punpcklwd ym7, ym1, ym1 + punpckhwd ym1, ym1 + punpcklwd ym3, ym3 + punpcklwd ym9, ym6, ym0 + punpcklwd ym6, ym2 + vpbroadcastd ym2, [o(pw_401_4076x8)] + vpbroadcastd ym0, [o(pw_m2598_3166x8)] + vpbroadcastd ym11, [o(pw_1931_3612x8)] + vpbroadcastd ym12, [o(pw_m1189_3920x8)] + pmulhrsw ym8, ym2 ; t8a t15a + vpbroadcastd ym2, [o(pw_799_4017x8)] + pmulhrsw ym0, ym4 ; t9a t14a + vpbroadcastd ym4, [o(pw_m2276_3406x8)] + pmulhrsw ym5, ym11 ; t10a t13a + pmulhrsw ym1, ym12 ; t11a t12a + pmulhrsw ym7, ym2 ; t4a t7a + pmulhrsw ym3, ym4 ; t5a t6a + vpcmpub k7, ym13, ym10, 6 + jmp .main4 +ALIGN function_align +cglobal_label .main WRAP_YMM IDCT16_1D_PACKED ret @@ -1685,13 +1733,14 @@ ALIGN function_align vpermi2q m6, m0, m2 ; in4 in8 in6 in10 vpermt2q m1, m10, m3 ; in11 in7 in9 in5 .main: - vpbroadcastd m9, [o(pd_2048)] - vpbroadcastq m13, [o(int_mshift)] - kxnorb k1, k1, k1 punpcklwd m0, m4, m5 ; in0 in15 in2 in13 punpckhwd m4, m5 ; in12 in3 in14 in1 punpcklwd m5, m6, m1 ; in4 in11 in6 in9 punpckhwd m6, m1 ; in8 in7 in10 in5 +cglobal_label .main2 + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + kxnorb k1, k1, k1 vpcmpub k7, m13, m9, 6 ; 0x33... pxor m8, m8 ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 @@ -1976,7 +2025,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd - mov r3d, 8 + or r3d, 8 .dconly: imul r6d, 181 add r6d, 128 @@ -2114,7 +2163,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vextracti32x4 [r3 +r4 ], m1, 3 RET ALIGN function_align -.main: +cglobal_label .main IDCT8_1D_PACKED ret @@ -2168,6 +2217,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 pshufd m4, m0, q1032 ; 1 0 pshufd m5, m1, q1032 ; 3 2 call .main_pass2 + movshdup m4, [o(permC)] pmulhrsw m0, m6 pmulhrsw m1, m6 psrlq m6, m4, 4 @@ -2194,9 +2244,8 @@ ALIGN function_align IADST8_1D_PACKED 1 ret ALIGN function_align -.main_pass2: +cglobal_label .main_pass2 IADST8_1D_PACKED 2 - movshdup m4, [o(permC)] pxor m5, m5 psubd m5, m6 packssdw m6, m5 @@ -2222,6 +2271,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 pshufd m4, m0, q1032 ; 1 0 pshufd m5, m1, q1032 ; 3 2 call m(iadst_16x8_internal_8bpc).main_pass2 + movshdup m4, [o(permC)] pmulhrsw m5, m6, m0 pmulhrsw m0, m6, m1 psrlq m1, m4, 12 @@ -2276,8 +2326,8 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 @@ -2456,7 +2506,7 @@ ALIGN function_align pmulhrsw m3, m4 ; t5a t6a jmp .main4 ALIGN function_align -.main: +cglobal_label .main IDCT16_1D_PACKED ret @@ -2562,6 +2612,7 @@ ALIGN function_align vshufi32x4 m1, m5, q2020 ; 2 3 vshufi32x4 m5, m7, m9, q2020 ; 10 11 vshufi32x4 m7, m9, q3131 ; 14 15 +cglobal_label .main_pass2b REPX {pshufd x, x, q1032}, m1, m3, m5, m7 call .main vpbroadcastd m8, [o(pw_2896x8)] @@ -2770,13 +2821,13 @@ ALIGN function_align vpermt2q m9, m12, m7 jmp m(idct_16x16_internal_8bpc).end -%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] - vpbroadcastd m%3, [o(pw_%4_%5x8)] - punpcklwd m%1, m%2, m%2 - pmulhrsw m%1, m%3 - vpbroadcastd m%3, [o(pw_%6_%7x8)] - punpckhwd m%2, m%2 - pmulhrsw m%2, m%3 +%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] + vpbroadcastd m%4, [o(pw_%5_%6x8)] + punpcklwd m%1, m%3, m%3 + pmulhrsw m%1, m%4 + vpbroadcastd m%4, [o(pw_%7_%8x8)] + punpckhwd m%2, m%3, m%3 + pmulhrsw m%2, m%4 %endmacro cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob @@ -2864,82 +2915,86 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 - pxor ym4, ym4 vpermt2q m2, m5, m6 ; 8 10 vpermt2q m16, m5, m17 ; 9 11 - mova ym5, ym4 - mova ym6, ym4 - mova ym7, ym4 vextracti32x8 ym3, m2, 1 ; 12 14 vextracti32x8 ym17, m16, 1 ; 13 15 - call m(idct_8x16_internal_8bpc).main + call m(idct_8x16_internal_8bpc).main_fast call .main_fast .end: - vpbroadcastd ym12, strided - vpbroadcastd m13, [o(pw_2048)] - pmulld ym7, ym12, [o(gather8d)] - REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11 + vpbroadcastd ym8, strided + pmulld ym8, [o(gather8d)] + call .main_end lea r3, [dstq+strideq*4] - shl strideq, 4 - lea r4, [dstq+strideq] - add r1, r3 kxnorb k1, k1, k1 - pxor m6, m6 + lea r4, [dstq+strideq*8] + pxor m9, m9 + lea r1, [r3+strideq*8] kmovb k2, k1 - vpgatherdq m12{k1}, [r0+ym7] + vpgatherdq m12{k1}, [r0+ym8] kmovb k1, k2 - vpgatherdq m13{k2}, [r3+ym7] + vpgatherdq m13{k2}, [r3+ym8] kmovb k2, k1 - vpgatherdq m14{k1}, [r4+ym7] + vpgatherdq m14{k1}, [r4+ym8] kmovb k1, k2 - vpgatherdq m15{k2}, [r1+ym7] - REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 - punpcklbw m4, m12, m6 - punpckhbw m12, m6 - paddw m0, m4 + vpgatherdq m15{k2}, [r1+ym8] + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m11, m12, m9 + punpckhbw m12, m9 + paddw m0, m11 paddw m1, m12 packuswb m0, m1 kmovb k2, k1 - vpscatterdq [r0+ym7]{k1}, m0 - punpcklbw m4, m13, m6 - punpckhbw m13, m6 - paddw m2, m4 + vpscatterdq [r0+ym8]{k1}, m0 + punpcklbw m12, m13, m9 + punpckhbw m13, m9 + paddw m2, m12 paddw m3, m13 packuswb m2, m3 kmovb k1, k2 - vpscatterdq [r3+ym7]{k2}, m2 - punpcklbw m4, m14, m6 - punpckhbw m14, m6 - paddw m8, m4 - paddw m9, m14 - packuswb m8, m9 + vpscatterdq [r3+ym8]{k2}, m2 + punpcklbw m13, m14, m9 + punpckhbw m14, m9 + paddw m4, m13 + paddw m5, m14 + packuswb m4, m5 kmovb k2, k1 - vpscatterdq [r4+ym7]{k1}, m8 - punpcklbw m4, m15, m6 - punpckhbw m15, m6 - paddw m10, m4 - paddw m11, m15 - packuswb m10, m11 - vpscatterdq [r1+ym7]{k2}, m10 + vpscatterdq [r4+ym8]{k1}, m4 + punpcklbw m14, m15, m9 + punpckhbw m15, m9 + paddw m6, m14 + paddw m7, m15 + packuswb m6, m7 + vpscatterdq [r1+ym8]{k2}, m6 RET .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 INIT_YMM avx512icl ALIGN function_align -.main_fast: ; bottom half is zero - ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a - ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a - ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a - ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a +cglobal_label .main_fast2 ; bottom three-quarters are zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + mova m11, m12 + mova m17, m20 + mova m15, m21 + mova m16, m14 + jmp .main4 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main3 ALIGN function_align -.main: +cglobal_label .main punpcklwd m12, m21, m14 ; in31 in1 punpckhwd m14, m21 ; in3 in29 punpcklwd m21, m20, m15 ; in27 in5 @@ -2966,6 +3021,7 @@ ALIGN function_align paddsw m21, m16 ; t20 t27 psubsw m16, m14, m19 ; t22 t25 paddsw m14, m19 ; t23 t24 +.main4: ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a @@ -2997,8 +3053,8 @@ ALIGN function_align REPX {pshufb x, m18}, m20, m11, m21, m19 ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 - packssdw m18, m13 ; t23a t22 - packssdw m12, m15 ; t24a t25 + packssdw m18, m13 ; t23a t22 + packssdw m12, m15 ; t24a t25 ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 packssdw m16, m13 ; t20 t21a @@ -3007,32 +3063,27 @@ ALIGN function_align punpckhqdq m19, m21 ; t28a t29 punpcklqdq m21, m20, m11 ; t16 t17a punpckhqdq m20, m11 ; t31 t30a - psubsw m15, m1, m19 ; out28 out29 - paddsw m1, m19 ; out3 out2 - psubsw m9, m6, m13 ; out19 out18 - paddsw m6, m13 ; out12 out13 - psubsw m10, m5, m16 ; out20 out21 - paddsw m5, m16 ; out11 out10 - psubsw m19, m3, m12 ; out24 out25 - paddsw m3, m12 ; out7 out6 - psubsw m8, m7, m21 ; out16 out17 - paddsw m7, m21 ; out15 out14 - psubsw m21, m0, m20 ; out31 out30 - paddsw m0, m20 ; out0 out1 - psubsw m11, m4, m18 ; out23 out22 - paddsw m4, m18 ; out8 out9 - psubsw m18, m2, m14 ; out27 out26 - paddsw m2, m14 ; out4 out5 INIT_ZMM avx512icl - movu m16, [o(permD+3)] - vpermt2q m0, m16, m4 ; 0 1 8 9 - vpermt2q m8, m16, m19 ; 16 17 24 25 - vpermt2q m1, m16, m5 ; 3 2 11 10 - vpermt2q m9, m16, m18 ; 19 18 27 26 - vpermt2q m2, m16, m6 ; 4 5 12 13 - vpermt2q m10, m16, m15 ; 20 21 28 29 - vpermt2q m3, m16, m7 ; 7 6 15 14 - vpermt2q m11, m16, m21 ; 23 22 31 30 + mova m15, [o(permA)] + ret +cglobal_label .main_end + vpbroadcastd m10, [o(pw_2048)] + vpermt2q m0, m15, m1 ; t0 t1 t2 t3 + vpermt2q m20, m15, m19 ; t31 t30a t29 t28a + vpermt2q m2, m15, m3 ; t4 t5 t6 t7 + vpermt2q m14, m15, m12 ; t27 t26a t25 t24a + vpermt2q m4, m15, m5 ; t8 t9 t10 t11 + vpermt2q m18, m15, m16 ; t23a t22 t21a t20 + vpermt2q m6, m15, m7 ; t12 t13 t14 t15 + vpermt2q m13, m15, m21 ; t19a t18 t17a t16 + psubsw m7, m0, m20 ; out31 out30 out29 out28 + paddsw m0, m20 ; out0 out1 out2 out3 + psubsw m5, m2, m14 ; out27 out26 out25 out24 + paddsw m2, m14 ; out4 out5 out6 out7 + psubsw m3, m4, m18 ; out23 out22 out21 out20 + paddsw m4, m18 ; out8 out9 out10 out11 + psubsw m1, m6, m13 ; out19 out18 out17 out16 + paddsw m6, m13 ; out12 out13 out14 out15 vzeroupper ret @@ -3079,16 +3130,33 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob call m(idct_8x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast .pass2: - vpbroadcastd m12, [o(pw_8192)] - vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31 - vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30 - vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29 - vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28 - vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27 - vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26 - vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15 - vshufi32x4 m0, m8, q2020 ; 0 8 16 24 - REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m10, [o(pw_8192)] + vpermt2q m0, m15, m4 ; t0 t1 t9 t8 + vpermt2q m20, m15, m18 ; t31 t30a t23a t22 + vpermt2q m3, m15, m7 ; t7 t6 t14 t15 + vpermt2q m12, m15, m21 ; t25 t24a t17a t16 + vpermt2q m2, m15, m6 ; t4 t5 t13 t12 + vpermt2q m14, m15, m13 ; t23a t22 t21a t20 + vpermt2q m1, m15, m5 ; t3 t2 t10 t11 + vpermt2q m19, m15, m16 ; t27 t26a t19a t18 + psubsw m8, m0, m20 ; out31 out30 out22 out23 + paddsw m0, m20 ; out0 out1 out9 out8 + paddsw m6, m3, m12 ; out7 out6 out14 out15 + psubsw m3, m12 ; out24 out25 out17 out16 + psubsw m5, m2, m14 ; out27 out26 out18 out19 + paddsw m4, m2, m14 ; out4 out5 out13 out12 + psubsw m7, m1, m19 ; out28 out29 out21 out20 + paddsw m2, m1, m19 ; out3 out2 out10 out11 + vzeroupper + vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 + vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 + vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27 + vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 + vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 + vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 + vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 + vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 call .main vpbroadcastd m8, [o(pw_2048)] @@ -3132,7 +3200,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 8 + or r3d, 8 .dconly2: imul r6d, 181 add r6d, 128+512 @@ -3158,7 +3226,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob jg .dconly_loop RET ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m10, [o(pd_2048)] .main2: ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a @@ -3535,7 +3603,7 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly ALIGN function_align .main_oddhalf_fast2: ; bottom three-quarters are zero @@ -3821,8 +3889,8 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -4603,7 +4671,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 ALIGN function_align .main_oddhalf_fast2: ; bottom three-quarters are zero @@ -5068,8 +5136,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 64 imul r6d, 181 - mov r3d, 64 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 @@ -5282,7 +5350,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob jnz .normal movsx r6d, word [cq] mov [cq], eobd - mov r3d, 16 + or r3d, 16 .dconly: imul r6d, 181 add r6d, 128+512 @@ -6012,8 +6080,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 64 imul r6d, 181 - mov r3d, 64 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -6674,8 +6742,8 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 32 imul r6d, 181 - mov r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -7117,7 +7185,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly ALIGN function_align .pass2_end: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h new file mode 100644 index 00000000000..33c842a9ce4 --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h @@ -0,0 +1,66 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/loopfilter.h" + +#define decl_loopfilter_sb_fns(ext) \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext)) + +decl_loopfilter_sb_fns(ssse3); +decl_loopfilter_sb_fns(avx2); +decl_loopfilter_sb_fns(avx512icl); + +static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm index 361ccc3b883..ed83000ac24 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm @@ -30,22 +30,24 @@ SECTION_RODATA 32 +pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 times 4 db 8, 9 times 4 db 0, 1 times 4 db 8, 9 -pw_1: times 16 dw 1 -pw_2: times 16 dw 2 -pw_3: times 16 dw 3 -; 4 and 16 need to be next to each other since they are used as alternates -; depending on whether bitdepth is 10 or 12 -pw_4: times 16 dw 4 -pw_16: times 16 dw 16 -pw_8: times 16 dw 8 -pw_4096: times 16 dw 4096 +pw_1: times 16 dw 1 +pw_2: times 16 dw 2 +pw_3: times 16 dw 3 +pw_4096: times 2 dw 4096 -pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 +; 10bpc/12bpc: +pw_4: times 2 dw 4 + times 2 dw 16 +clip_max: times 2 dw 511 + times 2 dw 2047 +clip_min: times 2 dw -512 + times 2 dw -2048 SECTION .text @@ -398,9 +400,10 @@ SECTION .text pmaxuw m2, [pw_1] ; I psrlw m1, m0, 4 ; H paddw m0, [pw_2] + vpbroadcastd m8, [r11] paddw m0, m0 paddw m0, m2 ; E - REPX {pmullw x, [r11]}, m0, m1, m2 + REPX {pmullw x, m8}, m0, m1, m2 psubw m8, m3, m4 ; p1-p0 psubw m9, m5, m6 ; q1-q0 @@ -430,7 +433,8 @@ SECTION .text pabsw m10, m10 pmaxuw m9, m10 %endif - pcmpgtw m9, [r11] ; !flat8in + vpbroadcastd m10, [r11] + pcmpgtw m9, m10 ; !flat8in psubw m10, m13, m3 ; p2-p1 pabsw m10, m10 @@ -503,7 +507,8 @@ SECTION .text pmaxuw m0, m2 pmaxuw m1, m10 pmaxuw m1, m0 - pcmpgtw m1, [r11] ; !flat8out + vpbroadcastd m0, [r11] + pcmpgtw m1, m0 ; !flat8out por m1, m9 ; !flat8in | !flat8out vpbroadcastd m2, [maskq+8] pand m10, m2, m12 @@ -544,12 +549,8 @@ SECTION .text %endif ; short filter - - vpbroadcastw m0, r7m - pcmpeqw m2, m2 - psrlw m0, 1 ; 511 or 2047 - pxor m2, m0 ; -512 or -2048 - + vpbroadcastd m0, [r11+8*1] ; 511 or 2047 + vpbroadcastd m2, [r11+8*2] ; -512 or -2048 psubw m10, m5, m4 paddw m11, m10, m10 paddw m11, m10 @@ -561,17 +562,18 @@ SECTION .text pminsw m10, m0 pmaxsw m10, m2 pand m8, m10 ; f&=fm - paddw m10, m8, [pw_3] - paddw m8, [pw_4] + vpbroadcastd m10, [pw_4] + paddw m10, m8 + paddw m8, [pw_3] REPX {pminsw x, m0}, m10, m8 psraw m10, 3 ; f2 psraw m8, 3 ; f1 - paddw m4, m10 - psubw m5, m8 + psubw m5, m10 + paddw m4, m8 - paddw m8, [pw_1] - psraw m8, 1 ; f=(f1+1)>>1 - pandn m8, m7, m8 ; f&=!hev + paddw m10, [pw_1] + psraw m10, 1 ; f=(f1+1)>>1 + pandn m8, m7, m10 ; f&=!hev paddw m3, m8 psubw m6, m8 pxor m8, m8 @@ -603,8 +605,8 @@ SECTION .text mova [rsp+ 0*32], m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 - psllw m8, m0, 3 ; p6*8 - paddw m8, [pw_8] + paddw m8, m0, [pw_1] + psllw m8, 3 ; p6*8+8 paddw m10, m2, m7 ; p5+p4 psubw m8, m0 paddw m10, m10 ; (p5+p4)*2 @@ -759,7 +761,6 @@ SECTION .text psubw m8, m15 paddw m8, m0 psrlw m10, m8, 4 - pand m10, m1 %ifidn %2, v mova m9, [tmpq+strideq*1] %else @@ -788,6 +789,7 @@ SECTION .text %if %1 >= 8 ; flat8 filter + vpbroadcastd m7, [pw_4096] %ifidn %2, v mova m0, [tmpq+strideq*0] ; p3 %else @@ -799,43 +801,43 @@ SECTION .text paddw m2, m0 ; p1+p0+p3 paddw m8, m5 ; 2*(p3+p2)+q0 paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 - pmulhrsw m7, m2, [pw_4096] + pmulhrsw m10, m2, m7 paddw m8, m3, m6 psubw m2, m1 paddw m2, m8 - pmulhrsw m8, m2, [pw_4096] + pmulhrsw m8, m2, m7 - paddw m10, m0, m3 - paddw m11, m4, m14 - psubw m2, m10 - paddw m2, m11 - pmulhrsw m10, m2, [pw_4096] + paddw m11, m0, m3 + paddw m1, m4, m14 + psubw m2, m11 + paddw m2, m1 + pmulhrsw m1, m2, m7 paddw m11, m0, m4 + pblendvb m4, m1, m9 paddw m1, m5, m15 psubw m2, m11 paddw m2, m1 - pmulhrsw m11, m2, [pw_4096] + pmulhrsw m11, m2, m7 paddw m2, m6 paddw m2, m15 paddw m1, m13, m5 + pblendvb m5, m11, m9 + pblendvb m13, m10, m9 psubw m2, m1 - pmulhrsw m1, m2, [pw_4096] + pmulhrsw m1, m2, m7 psubw m2, m3 + pblendvb m3, m8, m9 psubw m2, m6 - paddw m0, m15, m14 - paddw m2, m0 - pmulhrsw m2, [pw_4096] + pblendvb m6, m1, m9 + paddw m1, m15, m14 + paddw m2, m1 + pmulhrsw m2, m7 - vpblendvb m13, m13, m7, m9 - vpblendvb m3, m3, m8, m9 - vpblendvb m4, m4, m10, m9 - vpblendvb m5, m5, m11, m9 - vpblendvb m6, m6, m1, m9 - vpblendvb m14, m14, m2, m9 + pblendvb m14, m2, m9 %ifidn %2, v mova [tmpq+strideq*1], m13 ; p2 @@ -844,9 +846,7 @@ SECTION .text mova [dstq+strideq*0], m5 ; q0 mova [dstq+strideq*1], m6 ; q1 mova [dstq+strideq*2], m14 ; q2 -%else - mova m0, [rsp+5*32] -%if %1 == 8 +%elif %1 == 8 TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 ; write 8x16 @@ -871,29 +871,28 @@ SECTION .text vextracti128 [dstq+stride3q -8], m15, 1 lea dstq, [dstq+strideq*4] %else - mova m0, [rsp+6*32] + mova m8, [rsp+6*32] mova m1, [rsp+7*32] mova m2, [rsp+8*32] mova m7, [rsp+9*32] - mova m8, [rsp+5*32] - TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9 + TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9 - mova [dstq+strideq*0-16], xm0 + mova [dstq+strideq*0-16], xm8 mova [dstq+strideq*1-16], xm1 mova [dstq+strideq*2-16], xm2 mova [dstq+stride3q -16], xm7 lea tmpq, [dstq+strideq*4] - mova [tmpq+strideq*0-16], xm8 + mova [tmpq+strideq*0-16], xm0 mova [tmpq+strideq*1-16], xm13 mova [tmpq+strideq*2-16], xm3 mova [tmpq+stride3q -16], xm4 lea tmpq, [tmpq+strideq*4] - vextracti128 [tmpq+strideq*0-16], m0, 1 + vextracti128 [tmpq+strideq*0-16], m8, 1 vextracti128 [tmpq+strideq*1-16], m1, 1 vextracti128 [tmpq+strideq*2-16], m2, 1 vextracti128 [tmpq+stride3q -16], m7, 1 lea tmpq, [tmpq+strideq*4] - vextracti128 [tmpq+strideq*0-16], m8, 1 + vextracti128 [tmpq+strideq*0-16], m0, 1 vextracti128 [tmpq+strideq*1-16], m13, 1 vextracti128 [tmpq+strideq*2-16], m3, 1 vextracti128 [tmpq+stride3q -16], m4, 1 @@ -924,39 +923,38 @@ SECTION .text vextracti128 [dstq+stride3q ], m3, 1 lea dstq, [dstq+strideq*4] %endif -%endif %elif %1 == 6 ; flat6 filter - + vpbroadcastd m7, [pw_4096] paddw m8, m3, m4 paddw m8, m13 ; p2+p1+p0 paddw m11, m13, m5 paddw m8, m8 paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 - pmulhrsw m2, m8, [pw_4096] + pmulhrsw m2, m8, m7 paddw m8, m5 paddw m11, m13, m13 paddw m8, m6 psubw m8, m11 - pmulhrsw m10, m8, [pw_4096] + pmulhrsw m10, m8, m7 paddw m8, m6 paddw m11, m13, m3 paddw m8, m14 psubw m8, m11 - pmulhrsw m11, m8, [pw_4096] + pmulhrsw m11, m8, m7 psubw m8, m3 paddw m14, m14 psubw m8, m4 paddw m8, m14 - pmulhrsw m8, [pw_4096] + pmulhrsw m8, m7 - vpblendvb m3, m3, m2, m9 - vpblendvb m4, m4, m10, m9 - vpblendvb m5, m5, m11, m9 - vpblendvb m6, m6, m8, m9 + pblendvb m3, m2, m9 + pblendvb m4, m10, m9 + pblendvb m5, m11, m9 + pblendvb m6, m8, m9 %ifidn %2, v mova [tmpq+strideq*2], m3 ; p1 @@ -982,10 +980,10 @@ INIT_YMM avx2 cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq @@ -1013,7 +1011,7 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ test [maskq+0], mask_bitsd ; vmask[0] jz .end - FILTER 4, v + call .v4 .end: pslld m12, 4 @@ -1023,15 +1021,19 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ sub wd, 4 jg .loop RET +ALIGN function_align +.v4: + FILTER 4, v + ret INIT_YMM avx2 cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 @@ -1058,7 +1060,7 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter - FILTER 4, h + call .h4 jmp .end .no_filter: @@ -1071,15 +1073,19 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ sub hd, 4 jg .loop RET +ALIGN function_align +.h4: + FILTER 4, h + ret INIT_YMM avx2 cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq @@ -1100,7 +1106,7 @@ cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ test [maskq+0], mask_bitsd ; vmask[0] jz .end - FILTER 4, v + call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4 .end: pslld m12, 4 @@ -1115,10 +1121,10 @@ INIT_YMM avx2 cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits - rorx r6d, r7m, 6 - and r6d, 32 ; 0 for 10bpc, 32 for 12bpc + mov r6d, r7m lea r11, [pw_4] - add r11, r6 + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 @@ -1138,7 +1144,7 @@ cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter - FILTER 4, h + call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4 jmp .end .no_filter: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm new file mode 100644 index 00000000000..b7bc3aa106f --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm @@ -0,0 +1,912 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +l_shuf_v: times 2 db 0, 32 +pw_1: times 2 dw 1 + times 2 db 4, 36 +pw_3: times 2 dw 3 + times 2 db 8, 40 +pw_4: times 2 dw 4 + times 2 db 12, 44 +pw_16: times 2 dw 16 + times 2 db 16, 48 +pw_4096: times 2 dw 4096 + times 2 db 20, 52 +pw_16384: times 2 dw 16384 + times 2 db 24, 56 +pw_32767: times 2 dw 32767 + times 2 db 28, 60 + times 2 dw 0 +filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128 +stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25 +l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1 +clip_max: dw 511, 511, 2047, 2047 +clip_min: dw -512, -512, -2048, -2048 + +SECTION .text + +%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp + punpckhwd m%9, m%5, m%6 + punpcklwd m%5, m%6 + punpckhwd m%6, m%1, m%2 + punpcklwd m%1, m%2 + punpckhwd m%2, m%7, m%8 + punpcklwd m%7, m%8 + punpckhwd m%8, m%3, m%4 + punpcklwd m%3, m%4 + punpckhdq m%4, m%1, m%3 + punpckldq m%1, m%3 + punpckldq m%3, m%5, m%7 + punpckhdq m%5, m%7 + punpckhdq m%7, m%6, m%8 + punpckldq m%6, m%8 + punpckldq m%8, m%9, m%2 + punpckhdq m%9, m%2 + punpckhqdq m%2, m%1, m%3 + punpcklqdq m%1, m%3 + punpcklqdq m%3, m%4, m%5 + punpckhqdq m%4, m%5 + punpcklqdq m%5, m%6, m%8 + punpckhqdq m%6, m%8 + punpckhqdq m%8, m%7, m%9 + punpcklqdq m%7, m%9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] +%ifidn %2, v +%if %1 == 16 + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1 ] + mova m1, [tmpq+strideq*2 ] ; p5 + mova m2, [tmpq+stride3q ] ; p4 + mova m3, [tmpq+strideq*4 ] ; p3 + mova m4, [tmpq+stride5q ] ; p2 +%elif %1 == 6 || %1 == 8 + lea tmpq, [dstq+mstrideq*4] +%if %1 == 8 + mova m3, [tmpq+strideq*0 ] +%endif + mova m4, [tmpq+strideq*1 ] +%endif + mova m5, [dstq+mstrideq*2] ; p1 + mova m6, [dstq+mstrideq*1] ; p0 + mova m7, [dstq+strideq*0 ] ; q0 + mova m8, [dstq+strideq*1 ] ; q1 +%if %1 != 4 + mova m9, [dstq+strideq*2 ] ; q2 +%endif +%if %1 == 8 || %1 == 16 + mova m10, [dstq+stride3q ] ; q3 +%endif +%if %1 == 16 + mova m11, [dstq+strideq*4 ] ; q4 + mova m22, [dstq+stride5q ] ; q5 + mova m23, [dstq+stride3q*2] +%endif +%else ; h +%if %1 == 16 + movu ym16, [dstq+strideq*0 -16] + movu ym17, [dstq+strideq*1 -16] + movu ym18, [dstq+strideq*2 -16] + movu ym19, [dstq+stride3q -16] + movu ym20, [dstq+strideq*4 -16] + movu ym22, [dstq+stride5q -16] + movu ym23, [dstq+stride3q*2-16] + movu ym28, [dstq+stride7q -16] + lea tmpq, [dstq+strideq*8 -16] + vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1 + vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1 + vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1 + vinserti32x8 m10, m19, [tmpq+stride3q ], 1 + vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1 + vinserti32x8 m22, m22, [tmpq+stride5q ], 1 + vinserti32x8 m23, m23, [tmpq+stride3q*2], 1 + vinserti32x8 m28, m28, [tmpq+stride7q ], 1 + lea tmpq, [tmpq+strideq*8] + TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27 + movu ym16, [tmpq+strideq*0 ] + movu ym17, [tmpq+strideq*1 ] + movu ym18, [tmpq+strideq*2 ] + movu ym19, [tmpq+stride3q ] + movu ym24, [tmpq+strideq*4 ] + movu ym25, [tmpq+stride5q ] + movu ym26, [tmpq+stride3q*2] + movu ym20, [tmpq+stride7q ] + lea tmpq, [tmpq+strideq*8] + vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1 + vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1 + vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1 + vinserti32x8 m3, m19, [tmpq+stride3q ], 1 + vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1 + vinserti32x8 m5, m25, [tmpq+stride5q ], 1 + vinserti32x8 m6, m26, [tmpq+stride3q*2], 1 + vinserti32x8 m20, m20, [tmpq+stride7q ], 1 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27 + vshufi32x4 m27, m7, m0, q2020 + vshufi32x4 m7, m0, q3131 + vshufi32x4 m0, m8, m1, q2020 + vshufi32x4 m8, m1, q3131 + vshufi32x4 m1, m9, m2, q2020 + vshufi32x4 m9, m2, q3131 + vshufi32x4 m2, m10, m3, q2020 + vshufi32x4 m10, m3, q3131 + vshufi32x4 m3, m11, m4, q2020 + vshufi32x4 m11, m4, q3131 + vshufi32x4 m4, m22, m5, q2020 + vshufi32x4 m22, m5, q3131 + vshufi32x4 m5, m23, m6, q2020 + vshufi32x4 m23, m6, q3131 + vshufi32x4 m6, m28, m20, q2020 + vshufi32x4 m28, m20, q3131 +%elif %1 == 6 || %1 == 8 +%if %1 == 8 + sub dstq, 8 + movu xm16, [dstq+strideq*0 ] + movu xm17, [dstq+strideq*1 ] + movu xm18, [dstq+strideq*2 ] + movu xm19, [dstq+stride3q ] + movu xm24, [dstq+strideq*4 ] + movu xm25, [dstq+stride5q ] + movu xm26, [dstq+stride3q*2] + movu xm27, [dstq+stride7q ] + lea tmpq, [dstq+strideq*8 ] + vinserti128 ym16, [tmpq+strideq*0 ], 1 + vinserti128 ym17, [tmpq+strideq*1 ], 1 + vinserti128 ym18, [tmpq+strideq*2 ], 1 + vinserti128 ym19, [tmpq+stride3q ], 1 + vinserti128 ym24, [tmpq+strideq*4 ], 1 + vinserti128 ym25, [tmpq+stride5q ], 1 + vinserti128 ym26, [tmpq+stride3q*2], 1 + vinserti128 ym27, [tmpq+stride7q ], 1 + lea tmpq, [tmpq+strideq*8 ] + vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2 + vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2 + vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2 + vinserti32x4 m7, m19, [tmpq+stride3q ], 2 + vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2 + vinserti32x4 m9, m25, [tmpq+stride5q ], 2 + vinserti32x4 m3, m26, [tmpq+stride3q*2], 2 + vinserti32x4 m4, m27, [tmpq+stride7q ], 2 + lea tmpq, [tmpq+strideq*8 ] + vinserti32x4 m10, [tmpq+strideq*0 ], 3 + vinserti32x4 m8, [tmpq+strideq*1 ], 3 + vinserti32x4 m5, [tmpq+strideq*2 ], 3 + vinserti32x4 m7, [tmpq+stride3q ], 3 + vinserti32x4 m2, [tmpq+strideq*4 ], 3 + vinserti32x4 m9, [tmpq+stride5q ], 3 + vinserti32x4 m3, [tmpq+stride3q*2], 3 + vinserti32x4 m4, [tmpq+stride7q ], 3 +%else ; %1 == 6 + movu xm16, [dstq+strideq*0-8] + movu xm17, [dstq+strideq*1-8] + movu xm18, [dstq+strideq*2-8] + movu xm19, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4-8] + movu xm2, [tmpq+strideq*0] + movu xm9, [tmpq+strideq*1] + movu xm3, [tmpq+strideq*2] + movu xm4, [tmpq+stride3q ] + lea tmpq, [tmpq+strideq*4] + vinserti128 ym16, [tmpq+strideq*0], 1 + vinserti128 ym17, [tmpq+strideq*1], 1 + vinserti128 ym18, [tmpq+strideq*2], 1 + vinserti128 ym19, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 ym2, [tmpq+strideq*0], 1 + vinserti128 ym9, [tmpq+strideq*1], 1 + vinserti128 ym3, [tmpq+strideq*2], 1 + vinserti128 ym4, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m10, m16, [tmpq+strideq*0], 2 + vinserti32x4 m8, m17, [tmpq+strideq*1], 2 + vinserti32x4 m5, m18, [tmpq+strideq*2], 2 + vinserti32x4 m7, m19, [tmpq+stride3q ], 2 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m2, [tmpq+strideq*0], 2 + vinserti32x4 m9, [tmpq+strideq*1], 2 + vinserti32x4 m3, [tmpq+strideq*2], 2 + vinserti32x4 m4, [tmpq+stride3q ], 2 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m10, [tmpq+strideq*0], 3 + vinserti32x4 m8, [tmpq+strideq*1], 3 + vinserti32x4 m5, [tmpq+strideq*2], 3 + vinserti32x4 m7, [tmpq+stride3q ], 3 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m2, [tmpq+strideq*0], 3 + vinserti32x4 m9, [tmpq+strideq*1], 3 + vinserti32x4 m3, [tmpq+strideq*2], 3 + vinserti32x4 m4, [tmpq+stride3q ], 3 +%endif + punpcklwd m6, m10, m8 + punpckhwd m10, m8 + punpcklwd m8, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m2, m9 + punpckhwd m2, m9 + punpcklwd m9, m3, m4 + punpckhwd m3, m4 + punpckldq m4, m6, m8 + punpckhdq m6, m8 + punpckldq m8, m10, m5 + punpckhdq m10, m5 + punpckldq m5, m7, m9 + punpckhdq m7, m9 + punpckldq m9, m2, m3 + punpckhdq m2, m3 +%if %1 == 8 + punpcklqdq m3, m4, m5 +%endif + punpckhqdq m4, m5 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + punpcklqdq m7, m8, m9 + punpckhqdq m8, m9 + punpcklqdq m9, m10, m2 +%if %1 == 8 + punpckhqdq m10, m2 +%endif +%else ; %1 == 4 + kxnorb k1, k1, k1 + kmovb k2, k1 + vpgatherdq m7{k1}, [dstq+ym12-4] + lea tmpq, [dstq+strideq*2-4] + kmovb k1, k2 + vpgatherdq m4{k2}, [tmpq+ym12] + lea tmpq, [tmpq+strideq*2] + kmovb k2, k1 + vpgatherdq m5{k1}, [tmpq+ym12] + lea tmpq, [tmpq+strideq*2] + vpgatherdq m6{k2}, [tmpq+ym12] + punpcklwd m8, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m6, m8, m7 + punpckhwd m8, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + punpcklqdq m7, m8, m4 + punpckhqdq m8, m4 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + movu ym16, [lq+l_strideq*1] + movsldup m17, [l_shuf_v] + vptestnmb k1, ym16, ym16 + vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][] + vpermb m16, m17, m16 ; l[x][1] +%else + movq xm16, [lq+l_strideq*0] + movq xm17, [lq+l_strideq*1] + vinserti128 ym16, [lq+l_strideq*2], 1 + vinserti128 ym17, [lq+l_stride3q ], 1 + lea tmpq, [lq+l_strideq*4] + vinserti32x4 m16, [tmpq+l_strideq*0], 2 + vinserti32x4 m17, [tmpq+l_strideq*1], 2 + vinserti32x4 m16, [tmpq+l_strideq*2], 3 + vinserti32x4 m17, [tmpq+l_stride3q ], 3 + punpcklqdq m16, m17 + vbroadcasti32x4 m17, [l_shuf_h] + vptestnmb k1, m16, m16 + vpalignr m16{k1}, m16, 12 + pshufb m16, m17 ; l[x][1] +%endif + vpbroadcastd m20, [pw_32767] + psubw m17, m5, m6 ; p1-p0 + psubw m18, m7, m8 ; q1-q0 + vptestmw k1, m16, m16 ; L + pabsw m17, m17 + pabsw m18, m18 + vpmaxuw m20{k1}, m17, m18 + vpbroadcastw m17, [lutq+136] + psrlw m18, m16, [lutq+128] + vpbroadcastd m19, [pw_1] + pminuw m18, m17 + psrlw m17, m16, 4 ; H + paddw m16, m16 + pmaxuw m18, m19 ; I + vpaddd m16, [pw_4] {1to16} + paddw m16, m18 ; E + REPX {pmullw x, m13}, m17, m18, m16 + vpcmpw k4, m20, m17, 6 ; hev +%if %1 != 4 + psubw m19, m4, m5 ; p2-p1 + pabsw m19, m19 +%if %1 == 8 || %1 == 16 + psubw m17, m3, m4 ; p3-p2 + pabsw m17, m17 + pmaxuw m19, m17 + psubw m17, m9, m10 ; q3-q2 + pabsw m17, m17 + pmaxuw m19, m17 +%endif + psubw m17, m9, m8 ; q2-q1 + pabsw m17, m17 + pmaxuw m19, m17 +%if %1 == 16 + vpbroadcastd ym17, [maskq+4] + vpord ym17, [maskq+8] {1to8} + vptestmd k1, ym17, ym21 +%else + vptestmd k1, ym21, [maskq+4] {1to8} +%endif + pmaxuw m19, m20 + psubw m17, m4, m6 ; p2-p0 + pabsw m17, m17 + pmaxuw m17, m20 + vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks +%if %1 == 8 || %1 == 16 + psubw m19, m3, m6 ; p3-p0 + pabsw m19, m19 + pmaxuw m17, m19 + psubw m19, m7, m10 ; q3-q0 + pabsw m19, m19 + pmaxuw m17, m19 +%endif + psubw m19, m7, m9 ; q2-q0 + pabsw m19, m19 + pmaxuw m17, m19 +%endif + vpcmpw k1, m20, m18, 2 + psubw m18, m5, m8 ; p1-q1 + psubw m19, m6, m7 ; p0-q0 + pabsw m18, m18 + pabsw m19, m19 + psrlw m18, 1 + paddw m19, m19 + paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E +%if %1 != 4 + vpcmpw k2{k1}, m17, m13, 2 ; flat8in +%endif +%if %1 == 16 + psubw m20, m0, m6 + psubw m16, m1, m6 + pabsw m20, m20 + psubw m17, m2, m6 + pabsw m16, m16 + psubw m18, m11, m7 + pabsw m17, m17 + psubw m19, m22, m7 + pabsw m18, m18 + pmaxuw m20, m16 + psubw m16, m23, m7 + pabsw m19, m19 + pmaxuw m17, m18 + pabsw m16, m16 + vpandd ym18, ym21, [maskq+8] {1to8} + pmaxuw m20, m17 + pmaxuw m19, m16 + pcmpeqd ym16, ym21, ym18 + vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8 + pmaxuw m20, m19 + pcmpeqd ym17, ym21, ym18 + vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8 + vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out + pcmpeqd ym18, ym21 + vptestmb k3{k3}, ym16, ym16 ; flat8 & fm + vptestmb k2{k2}, ym17, ym17 ; flat8in + vptestmb k1{k1}, ym18, ym18 + kandnd k1, k2, k1 ; fm & !flat8 & !flat16 + kandnd k2, k3, k2 ; flat8 & !flat16 +%elif %1 == 6 || %1 == 8 + vpandd ym17, ym21, [maskq+4] {1to8} + pcmpeqd ym16, ym21, ym17 + vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8 + pcmpeqd ym17, ym21 + vptestmb k2{k2}, ym16, ym16 ; flat8 & fm + vptestmb k1{k1}, ym17, ym17 + kandnd k1, k2, k1 ; fm & !flat8 +%else ; %1 == 4 + vpandd ym16, ym21, [maskq+0] {1to8} + pcmpeqd ym16, ym21 + vptestmb k1{k1}, ym16, ym16 +%endif + + ; short filter + psubw m16, m7, m6 + vpbroadcastd m17, [pw_3] + paddw m18, m16, m16 + paddw m18, m16 + psubw m16, m5, m8 ; iclip_diff(p1-q1) + pminsw m16, m14 + vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev + knotd k4, k4 ; !hev + paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f) + vpbroadcastd m18, [pw_4] + pminsw m16, m14 + vpmaxsw m16{k1}{z}, m15 ; f&=fm + paddw m17, m16 + paddw m16, m18 + vpbroadcastd m18, [pw_16384] + pminsw m17, m14 + pminsw m16, m14 + psraw m17, 3 ; f2 + psraw m16, 3 ; f1 + paddw m6, m17 + psubw m7, m16 + vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev + psubw m17, m14, m15 ; 1023 or 4095 + pxor m18, m18 + paddw m5, m16 + psubw m8, m16 + REPX {pminsw x, m17}, m6, m7, m5, m8 + REPX {pmaxsw x, m18}, m6, m7, m5, m8 + +%if %1 == 16 ; flat16 filter + vpaddd m19, m0, [pw_1] {1to16} + paddw m16, m1, m2 ; p5+p4 + paddw m26, m1, m6 ; p5+p0 + paddw m24, m2, m7 ; p4+q0 + paddw m16, m4 ; p5+p4+p3 + paddw m17, m3, m5 ; p2+p1 + psllw m19, 3 + paddw m16, m26 ; p5*2+p4+p3+p0 + paddw m17, m24 ; p4+p2+p1+q0 + psubw m19, m0 ; p6*7+8 + paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0 + paddw m18, m3, m8 + paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0 + paddw m25, m1, m0 + paddw m16, m0, m0 + psrlw m1{k3}, m19, 4 + paddw m19, m18 + psubw m19, m16 ; +p3+q1-p6*2 + paddw m16, m2, m0 + psrlw m2{k3}, m19, 4 + psubw m19, m25 + paddw m25, m4, m9 + paddw m20, m10, m5 + paddw m19, m25 ; +p2+q2-p6-p5 + paddw m17, m0, m3 + psubw m16, m20, m16 + psrlw m3{k3}, m19, 4 + paddw m19, m16 ; +p1+q3-p6-p4 + paddw m16, m11, m6 + psubw m16, m17 + paddw m17, m0, m4 + psrlw m4{k3}, m19, 4 + paddw m19, m16 ; +p0+q4-p6-p3 + paddw m16, m22, m7 + psubw m16, m17 + paddw m17, m0, m5 + psrlw m5{k3}, m19, 4 + paddw m19, m16 ; +q0+q5-p6-p2 + paddw m16, m23, m8 + psrlw m6{k3}, m19, 4 + psubw m16, m17 + paddw m19, m16 ; +q1+q6-p6-p1 + paddw m16, m23, m9 + psrlw m7{k3}, m19, 4 + psubw m16, m26 + paddw m19, m16 ; +q2+q6-p5-p0 + paddw m16, m23, m10 + psrlw m8{k3}, m19, 4 + psubw m16, m24 + paddw m19, m16 ; +q3+q6-p4-p0 + paddw m16, m23, m11 + psrlw m9{k3}, m19, 4 + psubw m16, m18 + paddw m19, m16 ; +q4+q6-p3-q1 + paddw m16, m23, m22 + psrlw m10{k3}, m19, 4 + psubw m16, m25 + paddw m19, m16 ; +q5+q6-p2-q2 + paddw m16, m23, m23 + psrlw m11{k3}, m19, 4 + psubw m16, m20 + paddw m19, m16 ; +q6*2-p1-q3 + psrlw m22{k3}, m19, 4 +%endif +%if %1 == 8 || %1 == 16 ; flat8 filter + vpbroadcastd m20, [pw_4096] + paddw m16, m3, m4 ; p3+p2 + paddw m19, m5, m6 ; p1+p0 + paddw m17, m16, m16 ; 2*(p3+p2) + paddw m19, m3 ; p1+p0+p3 + paddw m17, m7 ; 2*(p3+p2)+q0 + paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0 + paddw m18, m4, m7 + pmulhrsw m4{k2}, m19, m20 + psubw m19, m16 + paddw m17, m5, m8 + paddw m16, m3, m5 + paddw m19, m17 + pmulhrsw m5{k2}, m19, m20 + psubw m19, m16 + paddw m16, m6, m9 + paddw m19, m16 + paddw m16, m3, m6 + pmulhrsw m6{k2}, m19, m20 + paddw m19, m10 + psubw m16, m7, m16 + paddw m19, m16 + psubw m16, m10, m18 + pmulhrsw m7{k2}, m19, m20 + paddw m16, m8 + paddw m19, m16 + psubw m16, m10, m17 + pmulhrsw m8{k2}, m19, m20 + paddw m16, m9 + paddw m19, m16 + pmulhrsw m9{k2}, m19, m20 +%elif %1 == 6 ; flat6 filter + vpbroadcastd m10, [pw_4096] + paddw m2, m5, m6 + paddw m0, m4, m7 + paddw m1, m2, m4 ; p2+p1+p0 + paddw m3, m4, m4 + paddw m1, m1 + paddw m4, m5 + paddw m1, m0 ; p2+2*(p2+p1+p0)+q0 + psubw m3, m7, m3 + pmulhrsw m5{k2}, m1, m10 + paddw m3, m8 + psubw m4, m8, m4 + paddw m1, m3 + pmulhrsw m6{k2}, m1, m10 + paddw m4, m9 + paddw m9, m9 + paddw m1, m4 + pmulhrsw m7{k2}, m1, m10 + psubw m9, m2 + paddw m1, m9 + pmulhrsw m8{k2}, m1, m10 +%endif + +%ifidn %2, v +%if %1 == 16 + mova [tmpq+strideq*2 ], m1 ; p5 + mova [tmpq+stride3q ], m2 ; p4 + mova [tmpq+strideq*4 ], m3 ; p3 + mova [tmpq+stride5q ], m4 ; p2 +%elif %1 == 8 + mova [tmpq+strideq*1 ], m4 ; p2 +%endif + mova [dstq+mstrideq*2], m5 ; p1 + mova [dstq+mstrideq ], m6 ; p0 + mova [dstq+strideq*0 ], m7 ; q0 + mova [dstq+strideq*1 ], m8 ; q1 +%if %1 == 8 || %1 == 16 + mova [dstq+strideq*2 ], m9 ; q2 +%endif +%if %1 == 16 + mova [dstq+stride3q ], m10 ; q3 + mova [dstq+strideq*4 ], m11 ; q4 + mova [dstq+stride5q ], m22 ; q5 +%endif +%else +%if %1 == 16 + TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20 + TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20 + mova [dstq+strideq*0 -16], xm27 + mova [dstq+strideq*0 ], xm7 + mova [dstq+strideq*1 -16], xm0 + mova [dstq+strideq*1 ], xm8 + mova [dstq+strideq*2 -16], xm1 + mova [dstq+strideq*2 ], xm9 + mova [dstq+stride3q -16], xm2 + mova [dstq+stride3q ], xm10 + mova [dstq+strideq*4 -16], xm3 + mova [dstq+strideq*4 ], xm11 + mova [dstq+stride5q -16], xm4 + mova [dstq+stride5q ], xm22 + mova [dstq+stride3q*2-16], xm5 + mova [dstq+stride3q*2 ], xm23 + mova [dstq+stride7q -16], xm6 + mova [dstq+stride7q ], xm28 + lea dstq, [dstq+strideq*8] + vextracti128 [dstq+strideq*0 -16], ym27, 1 + vextracti128 [dstq+strideq*0 ], ym7, 1 + vextracti128 [dstq+strideq*1 -16], ym0, 1 + vextracti128 [dstq+strideq*1 ], ym8, 1 + vextracti128 [dstq+strideq*2 -16], ym1, 1 + vextracti128 [dstq+strideq*2 ], ym9, 1 + vextracti128 [dstq+stride3q -16], ym2, 1 + vextracti128 [dstq+stride3q ], ym10, 1 + vextracti128 [dstq+strideq*4 -16], ym3, 1 + vextracti128 [dstq+strideq*4 ], ym11, 1 + vextracti128 [dstq+stride5q -16], ym4, 1 + vextracti128 [dstq+stride5q ], ym22, 1 + vextracti128 [dstq+stride3q*2-16], ym5, 1 + vextracti128 [dstq+stride3q*2 ], ym23, 1 + vextracti128 [dstq+stride7q -16], ym6, 1 + vextracti128 [dstq+stride7q ], ym28, 1 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 -16], m27, 2 + vextracti32x4 [dstq+strideq*0 ], m7, 2 + vextracti32x4 [dstq+strideq*1 -16], m0, 2 + vextracti32x4 [dstq+strideq*1 ], m8, 2 + vextracti32x4 [dstq+strideq*2 -16], m1, 2 + vextracti32x4 [dstq+strideq*2 ], m9, 2 + vextracti32x4 [dstq+stride3q -16], m2, 2 + vextracti32x4 [dstq+stride3q ], m10, 2 + vextracti32x4 [dstq+strideq*4 -16], m3, 2 + vextracti32x4 [dstq+strideq*4 ], m11, 2 + vextracti32x4 [dstq+stride5q -16], m4, 2 + vextracti32x4 [dstq+stride5q ], m22, 2 + vextracti32x4 [dstq+stride3q*2-16], m5, 2 + vextracti32x4 [dstq+stride3q*2 ], m23, 2 + vextracti32x4 [dstq+stride7q -16], m6, 2 + vextracti32x4 [dstq+stride7q ], m28, 2 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 -16], m27, 3 + vextracti32x4 [dstq+strideq*0 ], m7, 3 + vextracti32x4 [dstq+strideq*1 -16], m0, 3 + vextracti32x4 [dstq+strideq*1 ], m8, 3 + vextracti32x4 [dstq+strideq*2 -16], m1, 3 + vextracti32x4 [dstq+strideq*2 ], m9, 3 + vextracti32x4 [dstq+stride3q -16], m2, 3 + vextracti32x4 [dstq+stride3q ], m10, 3 + vextracti32x4 [dstq+strideq*4 -16], m3, 3 + vextracti32x4 [dstq+strideq*4 ], m11, 3 + vextracti32x4 [dstq+stride5q -16], m4, 3 + vextracti32x4 [dstq+stride5q ], m22, 3 + vextracti32x4 [dstq+stride3q*2-16], m5, 3 + vextracti32x4 [dstq+stride3q*2 ], m23, 3 + vextracti32x4 [dstq+stride7q -16], m6, 3 + vextracti32x4 [dstq+stride7q ], m28, 3 +%elif %1 == 8 + TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2 + movu [dstq+strideq*0 ], xm3 + movu [dstq+strideq*1 ], xm4 + movu [dstq+strideq*2 ], xm5 + movu [dstq+stride3q ], xm6 + movu [dstq+strideq*4 ], xm7 + movu [dstq+stride5q ], xm8 + movu [dstq+stride3q*2], xm9 + movu [dstq+stride7q ], xm10 + lea dstq, [dstq+strideq*8] + vextracti128 [dstq+strideq*0 ], ym3, 1 + vextracti128 [dstq+strideq*1 ], ym4, 1 + vextracti128 [dstq+strideq*2 ], ym5, 1 + vextracti128 [dstq+stride3q ], ym6, 1 + vextracti128 [dstq+strideq*4 ], ym7, 1 + vextracti128 [dstq+stride5q ], ym8, 1 + vextracti128 [dstq+stride3q*2], ym9, 1 + vextracti128 [dstq+stride7q ], ym10, 1 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 ], m3, 2 + vextracti32x4 [dstq+strideq*1 ], m4, 2 + vextracti32x4 [dstq+strideq*2 ], m5, 2 + vextracti32x4 [dstq+stride3q ], m6, 2 + vextracti32x4 [dstq+strideq*4 ], m7, 2 + vextracti32x4 [dstq+stride5q ], m8, 2 + vextracti32x4 [dstq+stride3q*2], m9, 2 + vextracti32x4 [dstq+stride7q ], m10, 2 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 ], m3, 3 + vextracti32x4 [dstq+strideq*1 ], m4, 3 + vextracti32x4 [dstq+strideq*2 ], m5, 3 + vextracti32x4 [dstq+stride3q ], m6, 3 + vextracti32x4 [dstq+strideq*4 ], m7, 3 + vextracti32x4 [dstq+stride5q ], m8, 3 + vextracti32x4 [dstq+stride3q*2], m9, 3 + vextracti32x4 [dstq+stride7q ], m10, 3 + lea dstq, [dstq+strideq*8+8] +%else ; %1 == 4 || %1 == 6 + punpcklwd m9, m5, m6 + punpckhwd m5, m6 + kxnorb k1, k1, k1 + punpcklwd m6, m7, m8 + punpckhwd m7, m8 + kmovb k2, k1 + punpckldq m8, m9, m6 + vpscatterdq [dstq+ym12-4]{k1}, m8 + punpckhdq m9, m6 + lea tmpq, [dstq+strideq*2-4] + kmovb k1, k2 + vpscatterdq [tmpq+ym12]{k2}, m9 + punpckldq m6, m5, m7 + lea tmpq, [tmpq+strideq*2] + kmovb k2, k1 + vpscatterdq [tmpq+ym12]{k1}, m6 + punpckhdq m5, m7 + lea tmpq, [tmpq+strideq*2] + vpscatterdq [tmpq+ym12]{k2}, m5 +%endif +%endif +%endmacro + +INIT_ZMM avx512icl +cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \ + lut, w, stride3, mstride, tmp, \ + mask_bits, stride5 +%define base tmpq-filter_mask + SWAP 12, 26 ; avoids clobbering xmm10 on WIN64 + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + lea stride3q, [strideq*3] + shl l_strideq, 2 + lea stride5q, [strideq*5] + shr r6d, 11 ; is_12bpc + mova ym21, [base+filter_mask] + mov mstrideq, strideq + vpbroadcastd m13, [base+pw_4+r6*8] + mov mask_bitsd, 0xff + vpbroadcastd m14, [base+clip_max+r6*4] + sub lq, l_strideq + vpbroadcastd m15, [base+clip_min+r6*4] + neg mstrideq + mov wd, wm +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + FILTER 16, v + jmp .end +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 8, v + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call .v4 +.end: + shl mask_bitsd, 8 + add dstq, 64 + pslld ym21, 8 + add lq, 32 + sub wd, 8 + jg .loop + RET +ALIGN function_align +.v4: ; called by both luma and chroma + FILTER 4, v + ret + +cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \ + lut, h, stride3, l_stride3, tmp, \ + mask_bits, stride5, stride7 + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + lea stride3q, [strideq*3] + vpbroadcastd ym12, strided + shl l_strideq, 2 + lea stride5q, [strideq*5] + shr r6d, 11 ; is_12bpc + pmulld ym12, [base+stride_mul] + lea stride7q, [strideq+stride3q*2] + mova ym21, [base+filter_mask] + mov mask_bitsd, 0xff + vpbroadcastd m13, [base+pw_4+r6*8] + sub lq, 4 + vpbroadcastd m14, [base+clip_max+r6*4] + lea l_stride3q, [l_strideq*3] + vpbroadcastd m15, [base+clip_min+r6*4] + mov hd, hm +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + FILTER 16, h + jmp .end +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 8, h + jmp .end2 +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + call .h4 +.no_filter: + lea dstq, [dstq+stride3q*8] +.end: + lea dstq, [dstq+strideq*8] +.end2: + shl mask_bitsd, 8 + pslld ym21, 8 + lea lq, [lq+l_strideq*8] + sub hd, 8 + jg .loop + RET +ALIGN function_align +.h4: ; called by both luma and chroma + FILTER 4, h + ret + +cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + shl l_strideq, 2 + lea stride3q, [strideq*3] + shr r6d, 11 ; is_12bpc + mova ym21, [base+filter_mask] + mov mstrideq, strideq + vpbroadcastd m13, [base+pw_4+r6*8] + mov mask_bitsd, 0xff + vpbroadcastd m14, [base+clip_max+r6*4] + sub lq, l_strideq + vpbroadcastd m15, [base+clip_min+r6*4] + neg mstrideq + mov wd, wm +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 6, v + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4 +.end: + shl mask_bitsd, 8 + add dstq, 64 + pslld ym21, 8 + add lq, 32 + sub wd, 8 + jg .loop + RET + +cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + vpbroadcastd ym12, strided + shl l_strideq, 2 + shr r6d, 11 ; is_12bpc + pmulld ym12, [base+stride_mul] + lea stride3q, [strideq*3] + mova ym21, [base+filter_mask] + mov mask_bitsd, 0xff + vpbroadcastd m13, [base+pw_4+r6*8] + sub lq, 4 + vpbroadcastd m14, [base+clip_max+r6*4] + lea l_stride3q, [l_strideq*3] + vpbroadcastd m15, [base+clip_min+r6*4] + mov hd, hm +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 6, h + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4 +.end: + lea tmpq, [strideq+stride3q] + shl mask_bitsd, 8 + pslld ym21, 8 + lea dstq, [dstq+tmpq*8] + lea lq, [lq+l_strideq*8] + sub hd, 8 + jg .loop + RET + +%endif ; ARCH_X86_64 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm index d6b296b19ef..84696c758ae 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm @@ -1444,7 +1444,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ cmp byte [maskq+0], 0 ; vmask[0] je .end - FILTER 4, v + call .v4 .end: add lq, 32 @@ -1453,6 +1453,10 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ sub wd, 8 jg .loop RET +ALIGN function_align +.v4: + FILTER 4, v + ret INIT_YMM avx2 cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ @@ -1481,7 +1485,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ cmp byte [maskq+0], 0 ; vmask[0] je .no_filter - FILTER 4, h + call .h4 jmp .end .no_filter: @@ -1493,6 +1497,10 @@ cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ sub hd, 8 jg .loop RET +ALIGN function_align +.h4: + FILTER 4, h + ret INIT_YMM avx2 cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ @@ -1515,7 +1523,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ cmp byte [maskq+0], 0 ; vmask[0] je .end - FILTER 4, v + call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4 .end: add lq, 32 @@ -1545,7 +1553,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \ cmp byte [maskq+0], 0 ; vmask[0] je .no_filter - FILTER 4, h + call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4 jmp .end .no_filter: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm index c09dced418b..0218b624d3c 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm @@ -80,25 +80,24 @@ SECTION .text punpckhwd m%1, m%3 kmovw k1, k6 lea t0, [dstq+strideq*4] - vpscatterdd [dstq+m29-2]{k1}, m%4 + vpscatterdd [dstq+m19-2]{k1}, m%4 kmovw k1, k6 lea t1, [dstq+strideq*8] - vpscatterdd [t0 +m29-2]{k1}, m%5 + vpscatterdd [t0 +m19-2]{k1}, m%5 kmovw k1, k6 lea t2, [t0 +strideq*8] - vpscatterdd [t1 +m29-2]{k1}, m%2 + vpscatterdd [t1 +m19-2]{k1}, m%2 kmovw k1, k6 - vpscatterdd [t2 +m29-2]{k1}, m%1 + vpscatterdd [t2 +m19-2]{k1}, m%1 %endmacro %macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem %if %1 == 0 - SWAP m16, m15 + SWAP m16, m22 %endif - ; input in m0-15 - punpcklbw m15, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m2, m3 + punpcklbw m22, m24, m26 + punpckhbw m24, m26 + punpcklbw m26, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 @@ -108,21 +107,21 @@ SECTION .text punpckhbw m8, m9 punpcklbw m9, m10, m11 punpckhbw m10, m11 - punpcklbw m11, m12, m13 - punpckhbw m12, m13 + punpcklbw m11, m25, m13 + punpckhbw m25, m13 %if %1 == 0 SWAP m13, m16 %else mova m13, %3 %endif - SWAP m16, m12 - punpcklbw m12, m14, m13 + SWAP m16, m25 + punpcklbw m25, m14, m13 punpckhbw m13, m14, m13 - ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 - punpcklwd m14, m15, m1 - punpckhwd m15, m1 - punpcklwd m1, m0, m2 - punpckhwd m0, m2 + ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13 + punpcklwd m14, m22, m26 + punpckhwd m22, m26 + punpcklwd m26, m24, m2 + punpckhwd m24, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 @@ -131,58 +130,58 @@ SECTION .text punpckhwd m7, m9 punpcklwd m9, m8, m10 punpckhwd m8, m10 - punpcklwd m10, m11, m12 - punpckhwd m11, m12 - SWAP m12, m16, m11 - punpcklwd m11, m12, m13 - punpckhwd m12, m13 - ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 + punpcklwd m10, m11, m25 + punpckhwd m11, m25 + SWAP m25, m16, m11 + punpcklwd m11, m25, m13 + punpckhwd m25, m13 + ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25 punpckldq m13, m14, m2 punpckhdq m14, m2 - punpckldq m2, m15, m3 - punpckhdq m15, m3 - punpckldq m3, m1, m5 - punpckhdq m1, m5 - punpckldq m5, m0, m4 - punpckhdq m0, m4 + punpckldq m2, m22, m3 + punpckhdq m22, m3 + punpckldq m3, m26, m5 + punpckhdq m26, m5 + punpckldq m5, m24, m4 + punpckhdq m24, m4 punpckldq m4, m6, m10 punpckhdq m6, m10 punpckldq m10, m9, m11 punpckhdq m9, m11 - punpckldq m11, m8, m12 - punpckhdq m8, m12 - SWAP m12, m16, m8 - punpckldq m8, m7, m12 - punpckhdq m7, m12 - ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 - punpcklqdq m12, m13, m4 + punpckldq m11, m8, m25 + punpckhdq m8, m25 + SWAP m25, m16, m8 + punpckldq m8, m7, m25 + punpckhdq m7, m25 + ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3 + punpcklqdq m25, m13, m4 punpckhqdq m13, m4 punpcklqdq m4, m14, m6 punpckhqdq m14, m6 punpcklqdq m6, m2, m8 punpckhqdq m2, m8 - punpcklqdq m8, m15, m7 - punpckhqdq m15, m7 + punpcklqdq m8, m22, m7 + punpckhqdq m22, m7 punpcklqdq m7, m3, m10 punpckhqdq m3, m10 - punpcklqdq m10, m1, m9 - punpckhqdq m1, m9 + punpcklqdq m10, m26, m9 + punpckhqdq m26, m9 punpcklqdq m9, m5, m11 punpckhqdq m5, m11 SWAP m11, m16 %if %2 == 0 - SWAP m16, m12 + SWAP m16, m25 %else - mova %3, m12 + mova %3, m25 %endif - punpcklqdq m12, m0, m11 - punpckhqdq m0, m11 + punpcklqdq m25, m24, m11 + punpckhqdq m24, m11 %if %2 == 0 SWAP m11, m16 %endif - ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 - SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 - SWAP 3, 14, 12, 9 + ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24 + SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22 + SWAP 3, 14, 25, 9 %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] @@ -205,7 +204,7 @@ SECTION .text %endif lea t0, [dstq+mstrideq*4] %if %1 != 6 - mova m12, [t0 +strideq*0] + mova m25, [t0 +strideq*0] %endif mova m13, [t0 +strideq*1] mova m3, [t0 +strideq*2] @@ -214,13 +213,13 @@ SECTION .text mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 - mova m15, [dstq+stride3q ] + mova m22, [dstq+stride3q ] %endif %if %1 == 16 lea t0, [dstq+strideq*4] - mova m19, [t0 +strideq*0] - mova m20, [t0 +strideq*1] - mova m21, [t0 +strideq*2] + mova m29, [t0 +strideq*0] + mova m30, [t0 +strideq*1] + mova m31, [t0 +strideq*2] %endif %endif %else ; h @@ -230,15 +229,15 @@ SECTION .text vbroadcasti32x4 m0, [hshuf4] kmovw k1, k6 lea t0, [dstq+strideq*4] - vpgatherdd m3{k1}, [dstq+m29-2] + vpgatherdd m3{k1}, [dstq+m19-2] kmovw k1, k6 lea t1, [dstq+strideq*8] - vpgatherdd m4{k1}, [t0 +m29-2] + vpgatherdd m4{k1}, [t0 +m19-2] kmovw k1, k6 lea t2, [t0 +strideq*8] - vpgatherdd m5{k1}, [t1 +m29-2] + vpgatherdd m5{k1}, [t1 +m19-2] kmovw k1, k6 - vpgatherdd m6{k1}, [t2 +m29-2] + vpgatherdd m6{k1}, [t2 +m19-2] pshufb m3, m0 pshufb m4, m0 pshufb m5, m0 @@ -257,16 +256,16 @@ SECTION .text %elif %1 == 6 || %1 == 8 kmovb k1, k7 lea t0, [dstq+strideq*1] - vpgatherdq m3{k1}, [dstq+ym31-%1/2] + vpgatherdq m3{k1}, [dstq+ym21-%1/2] kmovb k1, k7 lea t1, [dstq+strideq*2] - vpgatherdq m4{k1}, [t0 +ym31-%1/2] + vpgatherdq m4{k1}, [t0 +ym21-%1/2] kmovb k1, k7 lea t2, [dstq+stride3q ] - vpgatherdq m5{k1}, [t1 +ym31-%1/2] + vpgatherdq m5{k1}, [t1 +ym21-%1/2] kmovb k1, k7 - vextracti32x8 ym0, m31, 1 - vpgatherdq m6{k1}, [t2 +ym31-%1/2] + vextracti32x8 ym0, m21, 1 + vpgatherdq m6{k1}, [t2 +ym21-%1/2] kmovb k1, k7 vpgatherdq m12{k1}, [dstq+ym0 -%1/2] kmovb k1, k7 @@ -344,7 +343,7 @@ SECTION .text punpckhqdq m13, m5, m13 %if %1 == 8 punpcklqdq m5, m7, m12 - punpckhqdq m12, m7, m12 + punpckhqdq m25, m7, m12 ; xm3: A0-15 ; xm14: B0-15 ; xm15: C0-15 @@ -352,10 +351,11 @@ SECTION .text ; xm4: E0-15 ; xm13: F0-15 ; xm5: G0-15 - ; xm12: H0-15 - SWAP 12, 3, 15 + ; xm25: H0-15 + SWAP 25, 3, 15 SWAP 13, 14, 5, 4, 6 - ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 + SWAP 15, 22 + ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22 %else SWAP 13, 3, 14 SWAP 6, 4, 15, 5 @@ -364,8 +364,8 @@ SECTION .text %else ; 16, h ; load and 16x16 transpose. We only use 14 pixels but we'll need the ; remainder at the end for the second transpose - movu xm0, [dstq+strideq*0-8] - movu xm1, [dstq+strideq*1-8] + movu xm24, [dstq+strideq*0-8] + movu xm26, [dstq+strideq*1-8] movu xm2, [dstq+strideq*2-8] movu xm3, [dstq+stride3q -8] lea t0, [dstq+strideq*4] @@ -379,13 +379,13 @@ SECTION .text movu xm10, [t0 +strideq*2-8] movu xm11, [t0 +stride3q -8] lea t0, [t0 +strideq*4] - movu xm12, [t0 +strideq*0-8] + movu xm25, [t0 +strideq*0-8] movu xm13, [t0 +strideq*1-8] movu xm14, [t0 +strideq*2-8] - movu xm15, [t0 +stride3q -8] + movu xm22, [t0 +stride3q -8] lea t0, [t0 +strideq*4] - vinserti32x4 ym0, [t0 +strideq*0-8], 1 - vinserti32x4 ym1, [t0 +strideq*1-8], 1 + vinserti32x4 ym24, [t0 +strideq*0-8], 1 + vinserti32x4 ym26, [t0 +strideq*1-8], 1 vinserti32x4 ym2, [t0 +strideq*2-8], 1 vinserti32x4 ym3, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] @@ -399,13 +399,13 @@ SECTION .text vinserti32x4 ym10, [t0 +strideq*2-8], 1 vinserti32x4 ym11, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] - vinserti32x4 ym12, [t0 +strideq*0-8], 1 + vinserti32x4 ym25, [t0 +strideq*0-8], 1 vinserti32x4 ym13, [t0 +strideq*1-8], 1 vinserti32x4 ym14, [t0 +strideq*2-8], 1 - vinserti32x4 ym15, [t0 +stride3q -8], 1 + vinserti32x4 ym22, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] - vinserti32x4 m0, [t0 +strideq*0-8], 2 - vinserti32x4 m1, [t0 +strideq*1-8], 2 + vinserti32x4 m24, [t0 +strideq*0-8], 2 + vinserti32x4 m26, [t0 +strideq*1-8], 2 vinserti32x4 m2, [t0 +strideq*2-8], 2 vinserti32x4 m3, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] @@ -419,13 +419,13 @@ SECTION .text vinserti32x4 m10, [t0 +strideq*2-8], 2 vinserti32x4 m11, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] - vinserti32x4 m12, [t0 +strideq*0-8], 2 + vinserti32x4 m25, [t0 +strideq*0-8], 2 vinserti32x4 m13, [t0 +strideq*1-8], 2 vinserti32x4 m14, [t0 +strideq*2-8], 2 - vinserti32x4 m15, [t0 +stride3q -8], 2 + vinserti32x4 m22, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] - vinserti32x4 m0, [t0 +strideq*0-8], 3 - vinserti32x4 m1, [t0 +strideq*1-8], 3 + vinserti32x4 m24, [t0 +strideq*0-8], 3 + vinserti32x4 m26, [t0 +strideq*1-8], 3 vinserti32x4 m2, [t0 +strideq*2-8], 3 vinserti32x4 m3, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] @@ -439,41 +439,38 @@ SECTION .text vinserti32x4 m10, [t0 +strideq*2-8], 3 vinserti32x4 m11, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] - vinserti32x4 m12, [t0 +strideq*0-8], 3 + vinserti32x4 m25, [t0 +strideq*0-8], 3 vinserti32x4 m13, [t0 +strideq*1-8], 3 vinserti32x4 m14, [t0 +strideq*2-8], 3 - vinserti32x4 m15, [t0 +stride3q -8], 3 + vinserti32x4 m22, [t0 +stride3q -8], 3 ; TRANSPOSE_16X16B 0, 1, [rsp+0*64] - SWAP m16, m1 + SWAP m16, m26 SWAP m17, m2 SWAP m18, m3 - SWAP m19, m12 - SWAP m20, m13 - SWAP m21, m14 - mova [rsp+4*64], m15 - ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 - SWAP 12, 4, 7 + SWAP m29, m25 + SWAP m30, m13 + SWAP m31, m14 + mova [rsp+4*64], m22 + ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22 + SWAP 25, 4, 7 SWAP 13, 5, 8 SWAP 3, 6, 9 SWAP 10, 14 - SWAP 11, 15 + SWAP 11, 22 %endif %endif ; load L/E/I/H -%if is_uv - SWAP m22, m15 -%endif - vpbroadcastd m22, [pb_1] + vpbroadcastd m15, [pb_1] %ifidn %2, v movu m1, [lq] movu m0, [lq+l_strideq] %else kmovw k1, k6 - vpgatherdd m0{k1}, [lq+m30+4] + vpgatherdd m0{k1}, [lq+m20+4] kmovw k1, k6 - vpgatherdd m1{k1}, [lq+m30+0] + vpgatherdd m1{k1}, [lq+m20+0] %endif pxor m2, m2 pcmpeqb k1, m0, m2 @@ -484,7 +481,7 @@ SECTION .text pand m2, [pb_63]{bcstd} vpbroadcastb m1, [lutq+136] pminub m2, m1 - pmaxub m2, m22 ; I + pmaxub m2, m15 ; I pand m1, m0, [pb_240]{bcstd} psrlq m1, 4 ; H paddd m0, [pb_2]{bcstd} @@ -500,7 +497,7 @@ SECTION .text ABSSUB m9, m13, m4, m10 ; abs(p2-p0) pmaxub m9, m8 %else - ABSSUB m9, m12, m4, m10 ; abs(p3-p0) + ABSSUB m9, m25, m4, m10 ; abs(p3-p0) pmaxub m9, m8 ABSSUB m10, m13, m4, m11 ; abs(p2-p0) pmaxub m9, m10 @@ -508,17 +505,17 @@ SECTION .text ABSSUB m10, m5, m14, m11 ; abs(q2-q0) pmaxub m9, m10 %if %1 != 6 - ABSSUB m10, m5, m15, m11 ; abs(q3-q0) + ABSSUB m10, m5, m22, m11 ; abs(q3-q0) pmaxub m9, m10 %endif - vpcmpub k2{k3}, m9, m22, 2 ; le ; flat8in + vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in %if %1 == 6 ABSSUB m10, m13, m3, m1 ; abs(p2-p1) %else - ABSSUB m10, m12, m13, m11 ; abs(p3-p2) + ABSSUB m10, m25, m13, m11 ; abs(p3-p2) ABSSUB m11, m13, m3, m1 ; abs(p2-p1) pmaxub m10, m11 - ABSSUB m11, m14, m15, m1 ; abs(q3-q2) + ABSSUB m11, m14, m22, m1 ; abs(q3-q2) pmaxub m10, m11 %endif ABSSUB m11, m14, m6, m1 ; abs(q2-q1) @@ -526,16 +523,10 @@ SECTION .text %if %1 == 16 vpbroadcastd m11, [maskq+8] por m11, [maskq+4]{bcstd} - pand m11, pbmask %else - %if !is_h || %1 == 6 - pand m11, pbmask, [maskq+4]{bcstd} - %else vpbroadcastd m11, [maskq+4] - pand m11, pbmask - %endif %endif - pcmpeqd k4, m11, pbmask + vptestmd k4, m11, pbmask vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks pmaxub m8, m10 %endif @@ -554,77 +545,58 @@ SECTION .text pmaxub m1, m2 ABSSUB m2, m18, m4, m10 pmaxub m1, m2 - ABSSUB m2, m19, m5, m10 + ABSSUB m2, m29, m5, m10 pmaxub m1, m2 - ABSSUB m2, m20, m5, m10 + ABSSUB m2, m30, m5, m10 pmaxub m1, m2 - ABSSUB m2, m21, m5, m10 + ABSSUB m2, m31, m5, m10 pmaxub m1, m2 - ; - vpcmpub k4, m1, m22, 2 ; flat8out - kandq k4, k4, k2 ; flat8in & flat8out - + kandq k2, k2, k3 + vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out vpbroadcastd m2, [maskq+8] - pand m10, m2, pbmask - pcmpeqd k5, m10, pbmask + vptestmd k5, m2, pbmask vpmovm2d m7, k5 - vpmovb2m k5, m7 - kandq k4, k4, k5 ; flat16 - kandq k4, k3, k4 ; flat16 & fm + vptestmb k4{k4}, m7, m7 ; flat16 & fm por m10, m2, [maskq+4]{bcstd} - pand m2, m10, pbmask - pcmpeqd k5, m2, pbmask + vptestmd k5, m10, pbmask vpmovm2d m7, k5 - vpmovb2m k5, m7 - kandq k2, k2, k5 ; flat8in - kandq k2, k3, k2 + vptestmb k2{k2}, m7, m7 ; flat8in por m2, m10, [maskq+0]{bcstd} - pand m2, pbmask - pcmpeqd k5, m2, pbmask + vptestmd k5, m2, pbmask vpmovm2d m7, k5 - vpmovb2m k5, m7 - kandq k3, k3, k5 + vptestmb k3{k3}, m7, m7 kandnq k3, k2, k3 ; fm & !flat8 & !flat16 kandnq k2, k4, k2 ; flat8 & !flat16 %elif %1 != 4 vpbroadcastd m0, [maskq+4] - pand m2, m0, pbmask - pcmpeqd k4, m2, pbmask + vptestmd k4, m0, pbmask vpmovm2d m7, k4 - vpmovb2m k4, m7 - kandq k2, k2, k4 + vptestmb k2{k2}, m7, m7 kandq k2, k2, k3 ; flat8 & fm por m0, [maskq+0]{bcstd} - pand m0, pbmask - pcmpeqd k4, m0, pbmask + vptestmd k4, m0, pbmask vpmovm2d m7, k4 - vpmovb2m k4, m7 - kandq k3, k3, k4 + vptestmb k3{k3}, m7, m7 kandnq k3, k2, k3 ; fm & !flat8 %else %ifidn %2, v - pand m0, pbmask, [maskq+0]{bcstd} + vptestmd k4, pbmask, [maskq+0]{bcstd} %else vpbroadcastd m0, [maskq+0] - pand m0, pbmask + vptestmd k4, m0, pbmask %endif - pcmpeqd k4, m0, pbmask vpmovm2d m7, k4 - vpmovb2m k4, m7 - kandq k3, k3, k4 ; fm + vptestmb k3{k3}, m7, m7 ; fm %endif ; short filter -%if is_uv - SWAP m23, m22 - SWAP m24, m0 - SWAP m25, m12 - SWAP m26, m1 +%if %1 >= 8 + SWAP m23, m15 %endif - vpbroadcastd m23, [pb_3] - vpbroadcastd m24, [pb_4] - vpbroadcastd m25, [pb_16] - vpbroadcastd m26, [pb_64] + vpbroadcastd m15, [pb_3] + vpbroadcastd m0, [pb_4] + vpbroadcastd m12, [pb_16] + vpbroadcastd m1, [pb_64] pxor m3, pb128 pxor m6, pb128 psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev @@ -634,16 +606,16 @@ SECTION .text paddsb m10, m11 paddsb m10, m11 paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm - paddsb m8, m10, m23 - paddsb m10, m24 + paddsb m8, m10, m15 + paddsb m10, m0 pand m8, [pb_248]{bcstd} pand m10, [pb_248]{bcstd} psrlq m8, 3 psrlq m10, 3 - pxor m8, m25 - pxor m10, m25 - psubb m8, m25 ; f2 - psubb m10, m25 ; f1 + pxor m8, m12 + pxor m10, m12 + psubb m8, m12 ; f2 + psubb m10, m12 ; f1 paddsb m4, m8 psubsb m5, m10 pxor m4, pb128 @@ -652,7 +624,7 @@ SECTION .text pxor m10, pb128 pxor m8, m8 pavgb m8, m10 ; f=(f1+1)>>1 - psubb m8, m26 + psubb m8, m1 knotq k1, k1 paddsb m3{k1}, m3, m8 psubsb m6{k1}, m6, m8 @@ -664,40 +636,40 @@ SECTION .text %ifidn %2, v lea t0, [dstq+mstrideq*8] %endif - SWAP m0, m16, m14 - SWAP m2, m17, m15 + SWAP m24, m16, m14 + SWAP m2, m17, m22 SWAP m7, m18 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A ; write -6 - vpbroadcastd m26, [pb_7_1] - vpbroadcastd m25, [pb_2] - punpcklbw m14, m0, m12 - punpckhbw m15, m0, m12 - pmaddubsw m10, m14, m26 - pmaddubsw m11, m15, m26 ; p6*7+p3 + vpbroadcastd m1, [pb_7_1] + vpbroadcastd m12, [pb_2] + punpcklbw m14, m24, m25 + punpckhbw m22, m24, m25 + pmaddubsw m10, m14, m1 + pmaddubsw m11, m22, m1 ; p6*7+p3 punpcklbw m8, m2, m7 punpckhbw m9, m2, m7 - pmaddubsw m8, m25 - pmaddubsw m9, m25 + pmaddubsw m8, m12 + pmaddubsw m9, m12 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3 %ifidn %2, h vpbroadcastd m27, [pw_2048] - vpbroadcastd m26, [pb_m1_1] + vpbroadcastd m1, [pb_m1_1] %define pw2048 m27 - %define pbm1_1 m26 + %define pbm1_1 m1 %endif punpcklbw m8, m13, m3 punpckhbw m9, m13, m3 - pmaddubsw m8, m22 - pmaddubsw m9, m22 + pmaddubsw m8, m23 + pmaddubsw m9, m23 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 punpcklbw m8, m4, m5 punpckhbw m9, m4, m5 - pmaddubsw m8, m22 - pmaddubsw m9, m22 + pmaddubsw m8, m23 + pmaddubsw m9, m23 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 pmulhrsw m8, m10, pw2048 @@ -713,17 +685,17 @@ SECTION .text ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B ; write -5 pmaddubsw m14, pbm1_1 - pmaddubsw m15, pbm1_1 + pmaddubsw m22, pbm1_1 paddw m10, m14 - paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 - punpcklbw m8, m0, m6 - punpckhbw m9, m0, m6 + paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m8, m24, m6 + punpckhbw m9, m24, m6 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 SWAP m18, m8 - SWAP m22, m9 + SWAP m23, m9 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 @@ -737,8 +709,8 @@ SECTION .text ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C ; write -4 SWAP m14, m16 - punpcklbw m8, m0, m13 - punpckhbw m9, m0, m13 + punpcklbw m8, m24, m13 + punpckhbw m9, m24, m13 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 @@ -756,21 +728,21 @@ SECTION .text %ifidn %2, v vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3 %else - vpblendmb m8{k4}, m12, m8 + vpblendmb m8{k4}, m25, m8 mova [rsp+3*64], m8 %endif ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D ; write -3 - SWAP m15, m17 - punpcklbw m8, m0, m3 - punpckhbw m9, m0, m3 + SWAP m22, m17 + punpcklbw m8, m24, m3 + punpckhbw m9, m24, m3 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 - punpcklbw m8, m7, m15 - punpckhbw m7, m15 + punpcklbw m8, m7, m22 + punpckhbw m7, m22 pmaddubsw m8, pbm1_1 pmaddubsw m7, pbm1_1 paddw m10, m8 @@ -779,69 +751,69 @@ SECTION .text pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 - vpblendmb m23{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F + vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E ; write -2 %ifidn %2, v lea t0, [dstq+strideq*4] %endif - punpcklbw m8, m0, m4 - punpckhbw m9, m0, m4 + punpcklbw m8, m24, m4 + punpckhbw m9, m24, m4 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 - punpcklbw m8, m12, m19 - punpckhbw m9, m12, m19 - SWAP m1, m19 + punpcklbw m8, m25, m29 + punpckhbw m9, m25, m29 + SWAP m26, m29 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 - SWAP m19, m8 - SWAP m24, m9 + SWAP m29, m8 + SWAP m0, m9 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 - vpblendmb m25{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G + vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F ; write -1 %ifidn %2, h - SWAP m28, m0 + SWAP m28, m24 punpcklbw m8, m28, m5 - punpckhbw m0, m28, m5 + punpckhbw m24, m28, m5 %else - punpcklbw m8, m0, m5 - punpckhbw m0, m5 + punpcklbw m8, m24, m5 + punpckhbw m24, m5 %endif pmaddubsw m8, pbm1_1 - pmaddubsw m0, pbm1_1 + pmaddubsw m24, pbm1_1 paddw m10, m8 - paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 - punpcklbw m0, m13, m20 - punpckhbw m9, m13, m20 + paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m24, m13, m30 + punpckhbw m9, m13, m30 %ifidn %2, h - SWAP m27, m20 + SWAP m27, m30 %endif - SWAP m13, m23 - pmaddubsw m0, pbm1_1 + SWAP m13, m15 + pmaddubsw m24, pbm1_1 pmaddubsw m9, pbm1_1 - paddw m10, m0 + paddw m10, m24 paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 - SWAP m20, m0 - SWAP m23, m9 + SWAP m30, m24 + SWAP m15, m9 %ifidn %2, h - SWAP m9, m0 + SWAP m9, m24 %define pw2048 m9 %endif - pmulhrsw m0, m10, pw2048 + pmulhrsw m24, m10, pw2048 pmulhrsw m8, m11, pw2048 paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 - paddw m11, m22 - packuswb m0, m8 - punpcklbw m8, m3, m21 + paddw m11, m23 + packuswb m24, m8 + punpcklbw m8, m3, m31 pmaddubsw m8, pbm1_1 paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 SWAP m18, m8 @@ -851,34 +823,34 @@ SECTION .text SWAP m16, m9 %define pw2048 m16 %endif - punpckhbw m9, m3, m21 - SWAP m3, m25 + punpckhbw m9, m3, m31 + SWAP m3, m12 pmaddubsw m9, pbm1_1 paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 - SWAP m22, m9 + SWAP m23, m9 pmulhrsw m9, m11, pw2048 paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 %ifidn %2, h - SWAP m2, m26 + SWAP m2, m1 %define pbm1_1 m2 %endif - vpblendmb m26{k4}, m4, m0 ; don't clobber p0/m4 since we need it in H + vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G ; write +0 - SWAP m0, m21 ; q6 + SWAP m24, m31 ; q6 packuswb m8, m9 %ifidn %2, h - SWAP m21, m2 - %define pbm1_1 m21 + SWAP m31, m2 + %define pbm1_1 m31 %endif - vpblendmb m25{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I + vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H ; write +1 - punpcklbw m8, m4, m0 - punpckhbw m2, m4, m0 - SWAP m4, m26 + punpcklbw m8, m4, m24 + punpckhbw m2, m4, m24 + SWAP m4, m1 pmaddubsw m8, pbm1_1 pmaddubsw m2, pbm1_1 paddw m10, m8 @@ -892,9 +864,9 @@ SECTION .text ; write +2 paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 paddw m11, m7 - punpcklbw m8, m5, m0 - punpckhbw m9, m5, m0 - SWAP m5, m25 + punpcklbw m8, m5, m24 + punpckhbw m9, m5, m24 + SWAP m5, m12 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 @@ -906,10 +878,10 @@ SECTION .text ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J ; write +3 - paddw m10, m19 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 - paddw m11, m24 - punpcklbw m8, m6, m0 - punpckhbw m9, m6, m0 + paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + paddw m11, m0 + punpcklbw m8, m6, m24 + punpckhbw m9, m6, m24 SWAP 2, 6 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 @@ -921,20 +893,20 @@ SECTION .text %ifidn %2, v vmovdqu8 [t0+mstrideq]{k4}, m8 %else - SWAP m19, m16 - %define pw2048 m19 - vpblendmb m16{k4}, m15, m8 + SWAP m29, m16 + %define pw2048 m29 + vpblendmb m16{k4}, m22, m8 %endif ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K ; write +4 - paddw m10, m20 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 - paddw m11, m23 + paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + paddw m11, m15 %ifidn %2, h - SWAP m23, m8 + SWAP m15, m8 %endif - punpcklbw m8, m14, m0 - punpckhbw m9, m14, m0 + punpcklbw m8, m14, m24 + punpckhbw m9, m14, m24 SWAP 14, 7 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 @@ -946,16 +918,16 @@ SECTION .text %ifidn %2, v vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4 %else - vpblendmb m17{k4}, m1, m8 + vpblendmb m17{k4}, m26, m8 %endif ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L ; write +5 paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 - paddw m11, m22 - punpcklbw m8, m15, m0 - punpckhbw m9, m15, m0 - SWAP m20, m0 + paddw m11, m23 + punpcklbw m8, m22, m24 + punpckhbw m9, m22, m24 + SWAP m30, m24 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 @@ -979,26 +951,26 @@ SECTION .text vpbroadcastd m9, [pb_3_1] vpbroadcastd m10, [pb_2_1] %if %1 == 16 - vpbroadcastd m22, [pb_1] - vpbroadcastd m24, [pb_4] + vpbroadcastd m23, [pb_1] + vpbroadcastd m0, [pb_4] %elifidn %2, h - vpbroadcastd m21, [pb_m1_1] - %define pbm1_1 m21 + vpbroadcastd m31, [pb_m1_1] + %define pbm1_1 m31 %endif - punpcklbw m0, m12, m3 - punpckhbw m1, m12, m3 - pmaddubsw m2, m0, m9 - pmaddubsw m7, m1, m9 ; 3 * p3 + p1 + punpcklbw m24, m25, m3 + punpckhbw m26, m25, m3 + pmaddubsw m2, m24, m9 + pmaddubsw m7, m26, m9 ; 3 * p3 + p1 punpcklbw m8, m13, m4 punpckhbw m11, m13, m4 pmaddubsw m8, m10 pmaddubsw m11, m10 paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 - punpcklbw m8, m5, m24 - punpckhbw m11, m5, m24 - pmaddubsw m8, m22 - pmaddubsw m11, m22 + punpcklbw m8, m5, m0 + punpckhbw m11, m5, m0 + pmaddubsw m8, m23 + pmaddubsw m11, m23 paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 psrlw m8, m2, 3 @@ -1015,8 +987,8 @@ SECTION .text %endif %endif - pmaddubsw m8, m0, pbm1_1 - pmaddubsw m11, m1, pbm1_1 + pmaddubsw m8, m24, pbm1_1 + pmaddubsw m11, m26, pbm1_1 paddw m2, m8 paddw m7, m11 punpcklbw m8, m13, m6 @@ -1035,14 +1007,14 @@ SECTION .text SWAP m18, m8 %endif - pmaddubsw m0, m22 - pmaddubsw m1, m22 - psubw m2, m0 - psubw m7, m1 + pmaddubsw m24, m23 + pmaddubsw m26, m23 + psubw m2, m24 + psubw m7, m26 punpcklbw m8, m4, m14 punpckhbw m11, m4, m14 - pmaddubsw m8, m22 - pmaddubsw m11, m22 + pmaddubsw m8, m23 + pmaddubsw m11, m23 paddw m2, m8 paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 psrlw m8, m2, 3 @@ -1052,19 +1024,19 @@ SECTION .text %ifidn %2, v mova [t0+stride3q], m8 %else - SWAP m19, m8 + SWAP m29, m8 %endif - punpcklbw m0, m5, m15 - punpckhbw m1, m5, m15 - pmaddubsw m8, m0, m22 - pmaddubsw m11, m1, m22 + punpcklbw m24, m5, m22 + punpckhbw m26, m5, m22 + pmaddubsw m8, m24, m23 + pmaddubsw m11, m26, m23 paddw m2, m8 paddw m7, m11 - punpcklbw m8, m4, m12 - punpckhbw m11, m4, m12 - pmaddubsw m8, m22 - pmaddubsw m11, m22 + punpcklbw m8, m4, m25 + punpckhbw m11, m4, m25 + pmaddubsw m8, m23 + pmaddubsw m11, m23 psubw m2, m8 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 psrlw m8, m2, 3 @@ -1075,10 +1047,10 @@ SECTION .text mova [dstq+strideq*0], m11 %endif - pmaddubsw m0, pbm1_1 - pmaddubsw m1, pbm1_1 - paddw m2, m0 - paddw m7, m1 + pmaddubsw m24, pbm1_1 + pmaddubsw m26, pbm1_1 + paddw m2, m24 + paddw m7, m26 punpcklbw m8, m13, m6 punpckhbw m13, m6 pmaddubsw m8, pbm1_1 @@ -1093,18 +1065,18 @@ SECTION .text mova [dstq+strideq*1], m13 %endif - punpcklbw m0, m3, m6 - punpckhbw m1, m3, m6 - pmaddubsw m0, m22 - pmaddubsw m1, m22 - psubw m2, m0 - psubw m7, m1 - punpcklbw m0, m14, m15 - punpckhbw m1, m14, m15 - pmaddubsw m0, m22 - pmaddubsw m1, m22 - paddw m2, m0 - paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 + punpcklbw m24, m3, m6 + punpckhbw m26, m3, m6 + pmaddubsw m24, m23 + pmaddubsw m26, m23 + psubw m2, m24 + psubw m7, m26 + punpcklbw m24, m14, m22 + punpckhbw m26, m14, m22 + pmaddubsw m24, m23 + pmaddubsw m26, m23 + paddw m2, m24 + paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 psrlw m2, 3 psrlw m7, 3 packuswb m2, m7 @@ -1120,36 +1092,36 @@ SECTION .text %endif %ifidn %2, h - SWAP m0, m18 - SWAP m1, m19 + SWAP m24, m18 + SWAP m26, m29 %if %1 == 8 ; 16x8 transpose - punpcklbw m3, m12, m10 - punpckhbw m12, m10 - punpcklbw m10, m0, m1 - punpckhbw m0, m1 - punpcklbw m1, m11, m13 + punpcklbw m3, m25, m10 + punpckhbw m25, m10 + punpcklbw m10, m24, m26 + punpckhbw m24, m26 + punpcklbw m26, m11, m13 punpckhbw m11, m13 - punpcklbw m13, m2, m15 - punpckhbw m2, m15 + punpcklbw m13, m2, m22 + punpckhbw m2, m22 ; - punpcklwd m15, m3, m10 + punpcklwd m22, m3, m10 punpckhwd m3, m10 - punpcklwd m10, m12, m0 - punpckhwd m12, m0 - punpcklwd m0, m1, m13 - punpckhwd m1, m13 + punpcklwd m10, m25, m24 + punpckhwd m25, m24 + punpcklwd m24, m26, m13 + punpckhwd m26, m13 punpcklwd m13, m11, m2 punpckhwd m11, m2 ; - punpckldq m2, m15, m0 - punpckhdq m15, m0 - punpckldq m0, m3, m1 - punpckhdq m3, m1 - punpckldq m1, m10, m13 + punpckldq m2, m22, m24 + punpckhdq m22, m24 + punpckldq m24, m3, m26 + punpckhdq m3, m26 + punpckldq m26, m10, m13 punpckhdq m10, m13 - punpckldq m13, m12, m11 - punpckhdq m12, m11 + punpckldq m13, m25, m11 + punpckhdq m25, m11 ; write 8x32 vpbroadcastd ym16, strided pmulld ym16, [hmulD] @@ -1162,8 +1134,8 @@ SECTION .text kmovb k3, k6 kmovb k4, k6 vpscatterdq [dstq+ym16-4]{k1}, m2 - vpscatterdq [t1 +ym16-4]{k2}, m15 - vpscatterdq [t2 +ym16-4]{k3}, m0 + vpscatterdq [t1 +ym16-4]{k2}, m22 + vpscatterdq [t2 +ym16-4]{k3}, m24 vpscatterdq [t3 +ym16-4]{k4}, m3 lea t1, [t0+strideq*2] lea t2, [t0+strideq*4] @@ -1172,29 +1144,29 @@ SECTION .text kmovb k2, k6 kmovb k3, k6 kmovb k4, k6 - vpscatterdq [t0+ym16-4]{k1}, m1 + vpscatterdq [t0+ym16-4]{k1}, m26 vpscatterdq [t1+ym16-4]{k2}, m10 vpscatterdq [t2+ym16-4]{k3}, m13 - vpscatterdq [t3+ym16-4]{k4}, m12 + vpscatterdq [t3+ym16-4]{k4}, m25 %else ; 16x16 transpose and store SWAP 5, 10, 2 - SWAP 6, 0 - SWAP 7, 1 + SWAP 6, 24 + SWAP 7, 26 SWAP 8, 11 SWAP 9, 13 - mova m0, [rsp+0*64] - SWAP m1, m28 + mova m24, [rsp+0*64] + SWAP m26, m28 mova m2, [rsp+1*64] mova m3, [rsp+2*64] mova m4, [rsp+3*64] SWAP m11, m16 - SWAP m12, m17 + SWAP m25, m17 SWAP m13, m27 - SWAP m14, m20 + SWAP m14, m30 TRANSPOSE_16X16B 1, 0, [rsp+4*64] - movu [dstq+strideq*0-8], xm0 - movu [dstq+strideq*1-8], xm1 + movu [dstq+strideq*0-8], xm24 + movu [dstq+strideq*1-8], xm26 movu [dstq+strideq*2-8], xm2 movu [dstq+stride3q -8], xm3 lea t0, [dstq+strideq*4] @@ -1208,13 +1180,13 @@ SECTION .text movu [t0+strideq*2-8], xm10 movu [t0+stride3q -8], xm11 lea t0, [t0+strideq*4] - movu [t0+strideq*0-8], xm12 + movu [t0+strideq*0-8], xm25 movu [t0+strideq*1-8], xm13 movu [t0+strideq*2-8], xm14 - movu [t0+stride3q -8], xm15 + movu [t0+stride3q -8], xm22 lea t0, [t0+strideq*4] - vextracti128 [t0+strideq*0-8], ym0, 1 - vextracti128 [t0+strideq*1-8], ym1, 1 + vextracti128 [t0+strideq*0-8], ym24, 1 + vextracti128 [t0+strideq*1-8], ym26, 1 vextracti128 [t0+strideq*2-8], ym2, 1 vextracti128 [t0+stride3q -8], ym3, 1 lea t0, [t0+strideq*4] @@ -1228,13 +1200,13 @@ SECTION .text vextracti128 [t0+strideq*2-8], ym10, 1 vextracti128 [t0+stride3q -8], ym11, 1 lea t0, [t0+strideq*4] - vextracti128 [t0+strideq*0-8], ym12, 1 + vextracti128 [t0+strideq*0-8], ym25, 1 vextracti128 [t0+strideq*1-8], ym13, 1 vextracti128 [t0+strideq*2-8], ym14, 1 - vextracti128 [t0+stride3q -8], ym15, 1 + vextracti128 [t0+stride3q -8], ym22, 1 lea t0, [t0+strideq*4] - vextracti32x4 [t0+strideq*0-8], m0, 2 - vextracti32x4 [t0+strideq*1-8], m1, 2 + vextracti32x4 [t0+strideq*0-8], m24, 2 + vextracti32x4 [t0+strideq*1-8], m26, 2 vextracti32x4 [t0+strideq*2-8], m2, 2 vextracti32x4 [t0+stride3q -8], m3, 2 lea t0, [t0+strideq*4] @@ -1248,13 +1220,13 @@ SECTION .text vextracti32x4 [t0+strideq*2-8], m10, 2 vextracti32x4 [t0+stride3q -8], m11, 2 lea t0, [t0+strideq*4] - vextracti32x4 [t0+strideq*0-8], m12, 2 + vextracti32x4 [t0+strideq*0-8], m25, 2 vextracti32x4 [t0+strideq*1-8], m13, 2 vextracti32x4 [t0+strideq*2-8], m14, 2 - vextracti32x4 [t0+stride3q -8], m15, 2 + vextracti32x4 [t0+stride3q -8], m22, 2 lea t0, [t0+strideq*4] - vextracti32x4 [t0+strideq*0-8], m0, 3 - vextracti32x4 [t0+strideq*1-8], m1, 3 + vextracti32x4 [t0+strideq*0-8], m24, 3 + vextracti32x4 [t0+strideq*1-8], m26, 3 vextracti32x4 [t0+strideq*2-8], m2, 3 vextracti32x4 [t0+stride3q -8], m3, 3 lea t0, [t0+strideq*4] @@ -1268,19 +1240,15 @@ SECTION .text vextracti32x4 [t0+strideq*2-8], m10, 3 vextracti32x4 [t0+stride3q -8], m11, 3 lea t0, [t0+strideq*4] - vextracti32x4 [t0+strideq*0-8], m12, 3 + vextracti32x4 [t0+strideq*0-8], m25, 3 vextracti32x4 [t0+strideq*1-8], m13, 3 vextracti32x4 [t0+strideq*2-8], m14, 3 - vextracti32x4 [t0+stride3q -8], m15, 3 + vextracti32x4 [t0+stride3q -8], m22, 3 %endif %endif %elif %1 == 6 ; flat6 filter - SWAP m15, m23 - SWAP m0, m24 - SWAP m12, m25 - SWAP m1, m26 vpbroadcastd m15, [pb_3_1] vpbroadcastd m12, [pb_2] punpcklbw m8, m13, m5 @@ -1381,17 +1349,16 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \ mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] - mova m31, [pb_4x0_4x4_4x8_4x12] - mova m30, [pb_mask] - vpbroadcastd m29, [pb_128] + mova m21, [pb_4x0_4x4_4x8_4x12] + mova m20, [pb_mask] + vpbroadcastd m19, [pb_128] vpbroadcastd m28, [pb_m1_1] vpbroadcastd m27, [pw_2048] - %define pbshuf m31 - %define pbmask m30 - %define pb128 m29 + %define pbshuf m21 + %define pbmask m20 + %define pb128 m19 %define pbm1_1 m28 %define pw2048 m27 - %define is_uv 0 .loop: cmp word [maskq+8], 0 ; vmask[2] @@ -1411,7 +1378,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \ cmp word [maskq+0], 0 ; vmask[0] je .end - FILTER 4, v + call .v4 .end: add lq, 64 @@ -1420,6 +1387,11 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \ sub wd, 16 jg .loop RET +ALIGN function_align +RESET_MM_PERMUTATION +.v4: + FILTER 4, v + ret cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ lut, h, stride3, stride8 @@ -1429,11 +1401,11 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ lea stride3q, [strideq*3] lea stride8q, [strideq*8] kxnorw k6, k6, k6 - vpbroadcastd m29, strided - vpbroadcastd m30, l_strided - pmulld m31, m29, [hmulA] - pmulld m30, m30, [hmulB] - pmulld m29, m29, [hmulC] + vpbroadcastd m19, strided + vpbroadcastd m20, l_strided + pmulld m21, m19, [hmulA] + pmulld m20, [hmulB] + pmulld m19, [hmulC] %define pbshuf [pb_4x0_4x4_4x8_4x12] %define pbmask [pb_mask] %define pb128 [pb_128]{bcstd} @@ -1457,7 +1429,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ cmp word [maskq+0], 0 ; vmask[0] je .end - FILTER 4, h + call .h4 .end: lea lq, [lq+l_strideq*8] @@ -1466,9 +1438,13 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ sub hd, 16 jg .loop RET +ALIGN function_align RESET_MM_PERMUTATION +.h4: + FILTER 4, h + ret -cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \ +cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \ lut, w, stride3, mstride DECLARE_REG_TMP 9 shl l_strideq, 2 @@ -1476,16 +1452,15 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \ mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] - mova m20, [pb_4x0_4x4_4x8_4x12] - mova m19, [pb_mask] - vpbroadcastd m18, [pb_128] + mova m21, [pb_4x0_4x4_4x8_4x12] + mova m20, [pb_mask] + vpbroadcastd m19, [pb_128] vpbroadcastd m17, [pb_m1_1] vpbroadcastd m16, [pw_4096] - %define pbshuf m20 - %define pbmask m19 - %define pb128 m18 + %define pbshuf m21 + %define pbmask m20 + %define pb128 m19 %define pbm1_1 m17 - %define is_uv 1 .loop: cmp word [maskq+4], 0 ; vmask[1] @@ -1498,7 +1473,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \ cmp word [maskq+0], 0 ; vmask[0] je .end - FILTER 4, v + call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4 .end: add lq, 64 @@ -1525,17 +1500,14 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \ vpbroadcastd m19, strided vpbroadcastd m20, l_strided pmulld m21, m19, [hmulA] - pmulld m20, m20, [hmulB] - pmulld m19, m19, [hmulC] + pmulld m20, [hmulB] + pmulld m19, [hmulC] mova m18, [pb_mask] vpbroadcastd m17, [pb_128] vpbroadcastd m16, [pw_4096] %define pbshuf [pb_4x0_4x4_4x8_4x12] %define pbmask m18 %define pb128 m17 - %xdefine m31 m21 - %xdefine m30 m20 - %xdefine m29 m19 add l_strideq, l_strideq .loop: @@ -1549,7 +1521,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \ cmp word [maskq+0], 0 ; vmask[0] je .end - FILTER 4, h + call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4 .end: lea lq, [lq+l_strideq*8] diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h new file mode 100644 index 00000000000..de23be8866c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h @@ -0,0 +1,94 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/looprestoration.h" + +#include "common/intops.h" + +#define decl_wiener_filter_fns(ext) \ +decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \ +decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext)) + +#define decl_sgr_filter_fns(ext) \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext)) + +decl_wiener_filter_fns(sse2); +decl_wiener_filter_fns(ssse3); +decl_wiener_filter_fns(avx2); +decl_wiener_filter_fns(avx512icl); +decl_sgr_filter_fns(ssse3); +decl_sgr_filter_fns(avx2); +decl_sgr_filter_fns(avx512icl); + +static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; +#if BITDEPTH == 8 + c->wiener[0] = BF(dav1d_wiener_filter7, sse2); + c->wiener[1] = BF(dav1d_wiener_filter5, sse2); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); + c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3); + c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3); + } + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->wiener[0] = BF(dav1d_wiener_filter7, avx2); + c->wiener[1] = BF(dav1d_wiener_filter5, avx2); + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); + } + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl); +#if BITDEPTH == 8 + /* With VNNI we don't need a 5-tap version. */ + c->wiener[1] = c->wiener[0]; +#else + c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl); +#endif + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl); + } +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm index 5669ce66d8f..1e571774caf 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm @@ -329,11 +329,11 @@ ALIGN function_align packuswb m2, m4 psrlw m2, 8 vpackuswb m2{k2}, m3, m5 - mova [dstq+r10], m2 - add r10, 64 - jl .hv_loop - mov t6, t5 - mov t5, t4 + movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap + add r10, 64 ; function is used for chroma as well, and in some + jl .hv_loop ; esoteric edge cases chroma dst pointers may only + mov t6, t5 ; have a 32-byte alignment despite having a width + mov t5, t4 ; larger than 32, so use an unaligned store here. mov t4, t3 mov t3, t2 mov t2, t1 @@ -379,7 +379,7 @@ ALIGN function_align packuswb m0, m2 psrlw m0, 8 vpackuswb m0{k2}, m1, m3 - mova [dstq+r10], m0 + movu [dstq+r10], m0 add r10, 64 jl .v_loop mov t6, t5 diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc.h b/chromium/third_party/dav1d/libdav1d/src/x86/mc.h new file mode 100644 index 00000000000..65c607e180c --- /dev/null +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc.h @@ -0,0 +1,299 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018-2021, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/mc.h" + +#define decl_fn(type, name) \ + decl_##type##_fn(BF(name, sse2)); \ + decl_##type##_fn(BF(name, ssse3)); \ + decl_##type##_fn(BF(name, avx2)); \ + decl_##type##_fn(BF(name, avx512icl)); +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = BF(dav1d_put_##name, suffix) +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = BF(dav1d_prep_##name, suffix) +#define init_mc_scaled_fn(type, name, suffix) \ + c->mc_scaled[type] = BF(dav1d_put_##name, suffix) +#define init_mct_scaled_fn(type, name, suffix) \ + c->mct_scaled[type] = BF(dav1d_prep_##name, suffix) + +decl_fn(mc, dav1d_put_8tap_regular); +decl_fn(mc, dav1d_put_8tap_regular_smooth); +decl_fn(mc, dav1d_put_8tap_regular_sharp); +decl_fn(mc, dav1d_put_8tap_smooth); +decl_fn(mc, dav1d_put_8tap_smooth_regular); +decl_fn(mc, dav1d_put_8tap_smooth_sharp); +decl_fn(mc, dav1d_put_8tap_sharp); +decl_fn(mc, dav1d_put_8tap_sharp_regular); +decl_fn(mc, dav1d_put_8tap_sharp_smooth); +decl_fn(mc, dav1d_put_bilin); + +decl_fn(mct, dav1d_prep_8tap_regular); +decl_fn(mct, dav1d_prep_8tap_regular_smooth); +decl_fn(mct, dav1d_prep_8tap_regular_sharp); +decl_fn(mct, dav1d_prep_8tap_smooth); +decl_fn(mct, dav1d_prep_8tap_smooth_regular); +decl_fn(mct, dav1d_prep_8tap_smooth_sharp); +decl_fn(mct, dav1d_prep_8tap_sharp); +decl_fn(mct, dav1d_prep_8tap_sharp_regular); +decl_fn(mct, dav1d_prep_8tap_sharp_smooth); +decl_fn(mct, dav1d_prep_bilin); + +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth); +decl_fn(mc_scaled, dav1d_put_bilin_scaled); + +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth); +decl_fn(mct_scaled, dav1d_prep_bilin_scaled); + +decl_fn(avg, dav1d_avg); +decl_fn(w_avg, dav1d_w_avg); +decl_fn(mask, dav1d_mask); +decl_fn(w_mask, dav1d_w_mask_420); +decl_fn(w_mask, dav1d_w_mask_422); +decl_fn(w_mask, dav1d_w_mask_444); +decl_fn(blend, dav1d_blend); +decl_fn(blend_dir, dav1d_blend_v); +decl_fn(blend_dir, dav1d_blend_h); + +decl_fn(warp8x8, dav1d_warp_affine_8x8); +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4)); +decl_fn(warp8x8t, dav1d_warp_affine_8x8t); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4)); + +decl_fn(emu_edge, dav1d_emu_edge); + +decl_fn(resize, dav1d_resize); + +static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) + return; + +#if BITDEPTH == 8 + init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2); + + c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2); +#endif + + if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) + return; + + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + + c->avg = BF(dav1d_avg, ssse3); + c->w_avg = BF(dav1d_w_avg, ssse3); + c->mask = BF(dav1d_mask, ssse3); + c->w_mask[0] = BF(dav1d_w_mask_444, ssse3); + c->w_mask[1] = BF(dav1d_w_mask_422, ssse3); + c->w_mask[2] = BF(dav1d_w_mask_420, ssse3); + c->blend = BF(dav1d_blend, ssse3); + c->blend_v = BF(dav1d_blend_v, ssse3); + c->blend_h = BF(dav1d_blend_h, ssse3); + c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3); + c->emu_edge = BF(dav1d_emu_edge, ssse3); + c->resize = BF(dav1d_resize, ssse3); + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE41)) + return; + +#if BITDEPTH == 8 + c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4); +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) + return; + + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + + c->avg = BF(dav1d_avg, avx2); + c->w_avg = BF(dav1d_w_avg, avx2); + c->mask = BF(dav1d_mask, avx2); + c->w_mask[0] = BF(dav1d_w_mask_444, avx2); + c->w_mask[1] = BF(dav1d_w_mask_422, avx2); + c->w_mask[2] = BF(dav1d_w_mask_420, avx2); + c->blend = BF(dav1d_blend, avx2); + c->blend_v = BF(dav1d_blend_v, avx2); + c->blend_h = BF(dav1d_blend_h, avx2); + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2); + c->emu_edge = BF(dav1d_emu_edge, avx2); + c->resize = BF(dav1d_resize, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) + return; + + init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl); + + c->avg = BF(dav1d_avg, avx512icl); + c->w_avg = BF(dav1d_w_avg, avx512icl); + c->mask = BF(dav1d_mask, avx512icl); + c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl); + c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl); + c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl); + c->blend = BF(dav1d_blend, avx512icl); + c->blend_v = BF(dav1d_blend_v, avx512icl); + c->blend_h = BF(dav1d_blend_h, avx512icl); + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl); + c->resize = BF(dav1d_resize, avx512icl); +#endif +} diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm index e83b18ad969..585ba53e080 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm @@ -1604,7 +1604,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vpbroadcastd m11, [buf+ 4] vpbroadcastd m12, [buf+ 8] vpbroadcastd m13, [buf+12] - cmp wd, 16 + sub wd, 16 je .h_w16 jg .h_w32 .h_w8: @@ -3615,32 +3615,32 @@ ALIGN function_align .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq ], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm0, ym1, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + vextracti32x4 xm0, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq ], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: @@ -3860,33 +3860,33 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb m3, m15, m3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 mova [maskq], xm3 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8: @@ -4090,32 +4090,32 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: @@ -4249,32 +4249,32 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm index eb3ca1c427d..7897f1decc1 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm @@ -449,9 +449,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy pshufb ym0, ym4 pmaddubsw ym0, ym5 pmulhrsw ym0, ym3 - vpmovuswb xmm0, ym0 - movq [dstq+dsq*0], xmm0 - movhps [dstq+dsq*1], xmm0 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 @@ -755,9 +755,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy pmulhw ym1, ym6 paddw ym1, ym2 pmulhrsw ym1, ym7 - vpmovuswb xmm1, ym1 - movq [dstq+dsq*0], xmm1 - movhps [dstq+dsq*1], xmm1 + vpmovuswb xm1, ym1 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop @@ -1588,13 +1588,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_w4_loop RET .h_w8: - movu xmm0, [srcq+ssq*0] - vinserti32x4 ym0, ymm0, [srcq+ssq*1], 1 + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 - vpmovuswb xmm0, ym0 - movq [dstq+dsq*0], xmm0 - movhps [dstq+dsq*1], xmm0 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 @@ -3308,17 +3308,17 @@ ALIGN function_align cmp hd, 8 jg .w4_h16 WRAP_YMM %1 0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_ret lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_ret: RET .w4_h16: @@ -3332,29 +3332,29 @@ ALIGN function_align cmp hd, 4 jne .w8_h8 WRAP_YMM %1 0 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq ], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: %1_INC_PTR 2 lea dstq, [dstq+strideq*4] .w8_h8: %1 0 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq ], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq ], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3415,8 +3415,8 @@ ALIGN function_align paddw m0, [tmp2q+(%1+0)*mmsize] mova m1, [tmp1q+(%1+1)*mmsize] paddw m1, [tmp2q+(%1+1)*mmsize] - pmulhrsw m0, m2 - pmulhrsw m1, m2 + pmulhrsw m0, m4 + pmulhrsw m1, m4 packuswb m0, m1 %endmacro @@ -3425,13 +3425,13 @@ ALIGN function_align add tmp2q, %1*mmsize %endmacro -cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] - vpbroadcastd m2, [base+pw_1024] + vpbroadcastd m4, [base+pw_1024] add wq, r6 BIDIR_FN AVG @@ -3573,17 +3573,17 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym5, [wm_420_perm4+32], 1 vpermb ym4, ym5, ym4 vpdpbusd ym8, ym4, ym9 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: vpermb ym8, ym10, ym8 movq [maskq], xm8 @@ -3609,11 +3609,11 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpdpbusd ym8, ym4, ym9 vpermb m8, m10, m8 mova [maskq], xm8 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3627,18 +3627,18 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpdpbusd m1, m4, m9 vpermb m1, m10, m1 mova [maskq], xm1 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3766,17 +3766,17 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 movhps xm10, [wm_422_mask+16] vpdpwssd ym8, ym4, ym9 vpermb ym8, ym10, ym8 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: pand xm8, xm11 mova [maskq], xm8 @@ -3801,11 +3801,11 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb ym8, ym10, ym8 pand xm8, xm11 mova [maskq], xm8 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3819,18 +3819,18 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb m1, m10, m1 pand ym1, ym11 mova [maskq], ym1 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3936,17 +3936,17 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: RET .w4_h16: @@ -3965,11 +3965,11 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3980,18 +3980,18 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 mova [maskq], m4 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac.h b/chromium/third_party/dav1d/libdav1d/src/x86/msac.h index e11cd08c8a4..0bb632fb314 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/msac.h +++ b/chromium/third_party/dav1d/libdav1d/src/x86/msac.h @@ -28,21 +28,21 @@ #ifndef DAV1D_SRC_X86_MSAC_H #define DAV1D_SRC_X86_MSAC_H +#include "src/cpu.h" + unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s); unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f); unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf); -/* Needed for checkasm */ -unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, - size_t n_symbols); - #if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2 #define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2 @@ -55,10 +55,21 @@ unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, #if ARCH_X86_64 #define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb)) + +static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (flags & DAV1D_X86_CPU_FLAG_SSE2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; + } + + if (flags & DAV1D_X86_CPU_FLAG_AVX2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; + } +} + #elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2 #endif -void dav1d_msac_init_x86(MsacContext *const s); - #endif /* DAV1D_SRC_X86_MSAC_H */ diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c b/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c deleted file mode 100644 index a634da27c4e..00000000000 --- a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright © 2020, VideoLAN and dav1d authors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "src/cpu.h" -#include "src/msac.h" -#include "src/x86/msac.h" - -#if ARCH_X86_64 -void dav1d_msac_init_x86(MsacContext *const s) { - const unsigned flags = dav1d_get_cpu_flags(); - - if (flags & DAV1D_X86_CPU_FLAG_SSE2) { - s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; - } - - if (flags & DAV1D_X86_CPU_FLAG_AVX2) { - s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; - } -} -#endif diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c b/chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h index e3575ba4da7..de4124c436e 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c +++ b/chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h @@ -32,7 +32,7 @@ decl_splat_mv_fn(dav1d_splat_mv_sse2); decl_splat_mv_fn(dav1d_splat_mv_avx2); decl_splat_mv_fn(dav1d_splat_mv_avx512icl); -COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { +static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; |