Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86')
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/cdef.h | 87
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm | 622
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h | 81
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/ipred.h | 146
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm | 66
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm | 92
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx.h | 356
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm | 1482
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm | 2599
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm | 231
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm | 84
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm | 298
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h | 66
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm | 172
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm | 912
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm | 16
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm | 718
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h | 94
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm | 12
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc.h | 299
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm | 146
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm | 166
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/msac.h | 23
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c | 43
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h (renamed from chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c) | 2
25 files changed, 7282 insertions(+), 1531 deletions(-)
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h b/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h
new file mode 100644
index 00000000000..553d6507412
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#define decl_cdef_fns(ext) \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
+ decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
+
+decl_cdef_fns(avx512icl);
+decl_cdef_fns(avx2);
+decl_cdef_fns(sse4);
+decl_cdef_fns(ssse3);
+decl_cdef_fns(sse2);
+
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
+
+static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+#if BITDEPTH == 8
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->dir = BF(dav1d_cdef_dir, ssse3);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+ c->dir = BF(dav1d_cdef_dir, sse4);
+#if BITDEPTH == 8
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->dir = BF(dav1d_cdef_dir, avx2);
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
+ c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
+ c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm
new file mode 100644
index 00000000000..6d625a02a0c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/cdef16_avx512.asm
@@ -0,0 +1,622 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21
+ db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29
+ db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37
+ db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45
+end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
+ db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
+edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111
+ dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011
+ dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111
+pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4
+cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6
+ dw 1, 2, 1, 10, 9, 18, 8, 17
+ dw 8, 16, 8, 15, -7,-14, 1, -6
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28
+ db 2, 4, 2, 36, 34, 68, 32, 66
+ db 32, 64, 32, 62,-30,-60, 2,-28
+pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3
+sec_taps4: dw 32, 16
+pw_m16384: times 2 dw -16384
+pw_2048: times 2 dw 2048
+pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4)
+edge_mask8: dw 0x2121, 0x2020, 0x0101
+
+SECTION .text
+
+%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp
+ psubw %1, %2, %3
+ pabsw %1, %1
+ vpcmpgtw k1, %3, %2
+ vpsrlvw %7, %1, %6
+ psubusw %7, %5, %7
+ pminsw %1, %7
+ vpsubw %1{k1}, %4, %1
+%endmacro
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs4
+ lea r6, [cdef_dirs4]
+ movu xm3, [dstq+strideq*0]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ mova xm2, [leftq]
+ lea r2, [dstq+strideq*2]
+ vinserti32x4 m3, [r2+strideq*0], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m3, [r2+strideq*1], 3
+ vpermt2d m2, m5, m3
+ vinserti32x4 m1, m2, [topq+strideq*0-4], 0
+ vinserti32x4 m1, [topq+strideq*1-4], 1
+ mov r3d, edgem
+ movifnidn prid, prim
+ punpcklwd m3, m3 ; px
+ psrlw m5, 8
+ vpbroadcastd m0, [base+pd_268435568]
+ pxor m12, m12
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m6, m3, m8
+ pmaxsw m7, m3, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ call .constrain_sec
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m6, m8
+ pmaxsw m7, m8
+ pminuw m6, m9
+ pmaxsw m7, m9
+ psrldq m8, m6, 2
+ vpshldd m3, m0, 8
+ psrldq m9, m7, 2
+ paddd m0, m3
+ pminuw m6, m8
+ psrldq m0, 1
+ pmaxsw m7, m9
+ pmaxsw m0, m6
+ pminsw m0, m7
+ vpmovdw ym0, m0
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ mov r4d, dirm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym1, [base+end_perm4]
+ vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ vpermb m0, m1, m0
+.end:
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm0, ym0, 1
+ movq [r2+strideq*0], xm0
+ movhps [r2+strideq*1], xm0
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ or r3d, 0x04
+ vmovdqa32 m1{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m8, m5, m9
+ vpermi2w m8, m1, m2 ; k0p0 k1p0
+ psubw m9, m5, m9
+ vpermi2w m9, m1, m2 ; k0p1 k1p1
+ CONSTRAIN m10, m8, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ CONSTRAIN m10, m9, m3, m12, m13, m14, m11
+ vpdpwssd m0, m10, m15
+ ret
+
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7
+
+cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+ lea r6, [cdef_dirs4]
+ movu xm18, [dstq+strideq*0]
+ vinserti128 ym18, [dstq+strideq*1], 1
+ mova xm1, [leftq+16*0]
+ mova xm2, [leftq+16*1]
+ lea r2, [strideq*3]
+ vinserti32x4 m18, [dstq+strideq*2], 2
+ mova m5, [base+cdef_perm]
+ vinserti32x4 m18, [dstq+r2 ], 3
+ vpermt2d m1, m5, m18
+ vinserti32x4 m0, m1, [topq+strideq*0-4], 0
+ vinserti32x4 m0, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu xm19, [r3+strideq*0]
+ vinserti128 ym19, [r3+strideq*1], 1
+ vinserti32x4 m19, [r3+strideq*2], 2
+ vinserti32x4 m19, [r3+r2 ], 3
+ mov r3d, edgem
+ movifnidn prid, prim
+ vpermt2d m2, m5, m19
+ vpbroadcastd m16, [base+pd_268435568]
+ pxor m12, m12
+ punpcklwd m18, m18 ; px (top)
+ psrlw m5, 8
+ punpcklwd m19, m19 ; px (bottom)
+ mova m17, m16
+ vshufi32x4 m1, m2, q3210
+ cmp r3d, 0x0f
+ jne .mask_edges
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+.main:
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ vpbroadcastd m15, [base+pri_taps4+priq]
+ xor prid, prid
+ add r4d, r3d
+ cmovns prid, r4d ; pri_shift
+ mov r4d, dirm
+ vpbroadcastw m14, prid
+ mov r5d, secm
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4]
+ call .constrain
+ test r5d, r5d
+ jz .end_no_clip
+ lzcnt r5d, r5d
+ vpbroadcastw m13, secm
+ add r3d, r5d
+ pminuw m3, m18, m6
+ pmaxsw m4, m18, m6
+ pminuw m20, m19, m7
+ pmaxsw m21, m19, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ call .constrain_sec
+ pminuw m3, m6
+ pmaxsw m4, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+ pminuw m3, m6
+ pmaxsw m4, m6
+ mov r3, 0xcccccccccccccccc
+ pminuw m20, m7
+ pmaxsw m21, m7
+ kmovq k1, r3
+ pminuw m3, m8
+ pmaxsw m4, m8
+ pminuw m20, m9
+ pmaxsw m21, m9
+ vbroadcasti32x4 m0, [base+deint_shuf]
+ vpshldd m6, m20, m3, 16
+ vmovdqu8 m3{k1}, m20
+ vpshldd m18, m16, 8
+ vpshldd m7, m21, m4, 16
+ vmovdqu8 m4{k1}, m21
+ vpshldd m19, m17, 8
+ pminuw m3, m6
+ paddd m16, m18
+ pmaxsw m4, m7
+ paddd m17, m19
+ psrldq m16, 1
+ palignr m16{k1}, m17, m17, 15
+ lea r6, [dstq+strideq*4]
+ pmaxsw m16, m3
+ pminsw m16, m4
+ pshufb m16, m0
+ movq [dstq+strideq*0], xm16
+ movhps [r6 +strideq*0], xm16
+ vextracti128 xm17, ym16, 1
+ movq [dstq+strideq*1], xm17
+ movhps [r6 +strideq*1], xm17
+ vextracti32x4 xm17, m16, 2
+ movq [dstq+strideq*2], xm17
+ movhps [r6 +strideq*2], xm17
+ vextracti32x4 xm16, m16, 3
+ movq [dstq+r2 ], xm16
+ movhps [r6 +r2 ], xm16
+ RET
+.sec_only:
+ mov r4d, dirm
+ tzcnt r5d, secm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d ; sec_shift
+ call .constrain_sec
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4]
+ call .constrain
+.end_no_clip:
+ mova ym20, [base+end_perm4]
+ vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m19, m17, 8
+ paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddd m17, m19
+ vpermb m16, m20, m16
+ vpermb m17, m20, m17
+ movq [dstq+strideq*0], xm16
+ movhps [dstq+strideq*1], xm16
+ vextracti128 xm16, ym16, 1
+ movq [dstq+strideq*2], xm16
+ movhps [dstq+r2 ], xm16
+ lea dstq, [dstq+strideq*4]
+ movq [dstq+strideq*0], xm17
+ movhps [dstq+strideq*1], xm17
+ vextracti128 xm17, ym17, 1
+ movq [dstq+strideq*2], xm17
+ movhps [dstq+r2 ], xm17
+ RET
+.mask_edges:
+ vpbroadcastd m6, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ vinserti32x4 m2, [botq+strideq*0-4], 2
+ vinserti32x4 m2, [botq+strideq*1-4], 3
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ jmp .mask_edges_main
+.mask_edges_no_bottom:
+ kmovw k1, [base+edge_mask4+8+r3*2]
+.mask_edges_main:
+ mov r4d, r3d
+ or r3d, 0x0c
+ vmovdqa32 m0{k1}, m6 ; edge pixels = -16384
+ kmovw k1, [base+edge_mask4-8+r3*2]
+ or r4d, 0x04
+ vmovdqa32 m1{k1}, m6
+ kmovw k1, [base+edge_mask4-8+r4*2]
+ vmovdqa32 m2{k1}, m6
+ jmp .main
+.constrain_sec:
+ vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4]
+ vpbroadcastw m14, r3d
+ vpbroadcastd m15, [base+sec_taps4]
+.constrain:
+ paddw m7, m5, m9
+ mova m6, m0
+ vpermt2w m6, m7, m1 ; k0p0 k1p0 (top)
+ psubw m9, m5, m9
+ mova m8, m0
+ vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom)
+ CONSTRAIN m10, m6, m18, m12, m13, m14, m11
+ vpermt2w m8, m9, m1 ; k0p1 k1p1 (top)
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m7, m19, m12, m13, m14, m11
+ vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom)
+ vpdpwssd m17, m10, m15
+ CONSTRAIN m10, m8, m18, m12, m13, m14, m11
+ vpdpwssd m16, m10, m15
+ CONSTRAIN m10, m9, m19, m12, m13, m14, m11
+ vpdpwssd m17, m10, m15
+ ret
+
+cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \
+ pri, sec, dir, damping, edge
+%define base r6-cdef_dirs8
+ lea r6, [cdef_dirs8]
+ movu ym17, [dstq+strideq*0]
+ vinserti32x8 m17, [dstq+strideq*1], 1
+ movq xm4, [leftq+8*0]
+ movq xm5, [leftq+8*1]
+ psrld m2, [base+cdef_perm], 16
+ movq xm6, [leftq+8*2]
+ movq xm7, [leftq+8*3]
+ lea r2, [strideq*3]
+ movu ym16, [topq+strideq*0-4]
+ vinserti32x8 m16, [topq+strideq*1-4], 1
+ lea r3, [dstq+strideq*4]
+ movu ym18, [dstq+strideq*2]
+ vinserti32x8 m18, [dstq+r2 ], 1
+ movu ym19, [r3+strideq*0]
+ vinserti32x8 m19, [r3+strideq*1], 1
+ movu ym20, [r3+strideq*2]
+ vinserti32x8 m20, [r3+r2 ], 1
+ vshufi32x4 m0, m17, m18, q2020 ; px (top)
+ mov r3d, edgem
+ vshufi32x4 m1, m19, m20, q2020 ; px (bottom)
+ movifnidn prid, prim
+ vpermt2d m17, m2, m4
+ vpermt2d m18, m2, m5
+ pxor m12, m12
+ vpermt2d m19, m2, m6
+ vpermt2d m20, m2, m7
+ cmp r3d, 0x0f
+ jne .mask_edges
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+.main:
+ mova [rsp+64*0], m16 ; top
+ mova [rsp+64*1], m17 ; 0 1
+ mova [rsp+64*2], m18 ; 2 3
+ mova [rsp+64*3], m19 ; 4 5
+ mova [rsp+64*4], m20 ; 6 7
+ mova [rsp+64*5], m21 ; bottom
+ test prid, prid
+ jz .sec_only
+ lzcnt r4d, prid
+ rorx r3d, prid, 2
+ vpbroadcastw m13, prim
+ cmp dword r10m, 0xfff ; if (bpc == 12)
+ cmove prid, r3d ; pri >>= 2
+ mov r3d, dampingm
+ and prid, 4
+ sub r3d, 31
+ add r4d, r3d ; pri_shift
+ vpbroadcastw m14, r4d
+ mov r4d, dirm
+ vpbroadcastd m2, [base+pri_taps8+priq*2+0]
+ vpbroadcastd m3, [base+pri_taps8+priq*2+4]
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1
+ pmaxsw m14, m12
+ call .constrain
+ mov r5d, secm
+ pmullw m16, m8, m2
+ pmullw m17, m9, m2
+ test r5d, r5d
+ jnz .pri_sec
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ call .constrain
+ pmullw m8, m3
+ pmullw m9, m3
+ jmp .end_no_clip
+.pri_sec:
+ lzcnt r5d, r5d
+ add r3d, r5d ; sec_shift
+ movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1
+ pminuw m18, m0, m4
+ pmaxsw m19, m0, m4
+ pminuw m20, m1, m5
+ pmaxsw m21, m1, m5
+ call .min_max_constrain2
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2
+ pmullw m8, m3
+ pmullw m9, m3
+ vpbroadcastw m13, secm
+ vpbroadcastw m14, r3d
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3
+ mova m2, m8
+ mova m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2
+ paddw m2, m8
+ paddw m3, m9
+ call .min_max_constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3
+ paddw m2, m2
+ paddw m3, m3
+ paddw m16, m8
+ paddw m17, m9
+ call .min_max_constrain
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m2
+ paddw m17, m3
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+ paddw m16, m0
+ paddw m17, m1
+ pmaxsw m16, m18
+ pmaxsw m17, m20
+ pminsw m16, m19
+ pminsw m17, m21
+ jmp .end
+.sec_only:
+ tzcnt r5d, secm
+ mov r4d, dirm
+ mov r3d, dampingm
+ vpbroadcastw m13, secm
+ sub r3d, r5d
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0]
+ vpbroadcastw m14, r3d
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0]
+ mova m16, m8
+ mova m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1]
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+ movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1]
+ paddw m16, m16
+ paddw m17, m17
+ paddw m16, m8
+ paddw m17, m9
+ call .constrain
+.end_no_clip:
+ vpbroadcastd m10, [base+pw_2048]
+ paddw m16, m8
+ paddw m17, m9
+ psraw m8, m16, 15
+ psraw m9, m17, 15
+ paddw m16, m8
+ paddw m17, m9
+ pmulhrsw m16, m10
+ pmulhrsw m17, m10
+ paddw m16, m0
+ paddw m17, m1
+.end:
+ mova [dstq+strideq*0], xm16
+ vextracti128 [dstq+strideq*1], ym16, 1
+ vextracti32x4 [dstq+strideq*2], m16, 2
+ vextracti32x4 [dstq+r2 ], m16, 3
+ lea dstq, [dstq+strideq*4]
+ mova [dstq+strideq*0], xm17
+ vextracti128 [dstq+strideq*1], ym17, 1
+ vextracti32x4 [dstq+strideq*2], m17, 2
+ vextracti32x4 [dstq+r2 ], m17, 3
+ RET
+.mask_edges:
+ vpbroadcastd m2, [base+pw_m16384]
+ test r3b, 0x08
+ jz .mask_edges_no_bottom ; avoid buffer overread
+ movu ym21, [botq+strideq*0-4]
+ vinserti32x8 m21, [botq+strideq*1-4], 1
+ jmp .mask_edges_top
+.mask_edges_no_bottom:
+ mova m21, m2
+.mask_edges_top:
+ test r3b, 0x04
+ jnz .mask_edges_main
+ mova m16, m2
+.mask_edges_main:
+ and r3d, 0x03
+ cmp r3d, 0x03
+ je .main
+ kmovw k1, [base+edge_mask8+r3*2]
+ vmovdqa32 m16{k1}, m2 ; edge pixels = -16384
+ vmovdqa32 m17{k1}, m2
+ vmovdqa32 m18{k1}, m2
+ vmovdqa32 m19{k1}, m2
+ vmovdqa32 m20{k1}, m2
+ vmovdqa32 m21{k1}, m2
+ jmp .main
+ALIGN function_align
+.min_max_constrain:
+ pminuw m18, m4
+ pmaxsw m19, m4
+ pminuw m20, m5
+ pmaxsw m21, m5
+.min_max_constrain2:
+ pminuw m18, m6
+ pmaxsw m19, m6
+ pminuw m20, m7
+ pmaxsw m21, m7
+.constrain:
+ %define tmp rsp+gprsize+68
+ movu m4, [tmp+r5+64*0]
+ vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top)
+ movu m5, [tmp+r5+64*2]
+ vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom)
+ neg r5
+ movu m6, [tmp+r5+64*0]
+ vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top)
+ movu m7, [tmp+r5+64*2]
+ vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom)
+ CONSTRAIN m8, m4, m0, m12, m13, m14, m15
+ CONSTRAIN m9, m5, m1, m12, m13, m14, m15
+ CONSTRAIN m10, m6, m0, m12, m13, m14, m15
+ CONSTRAIN m11, m7, m1, m12, m13, m14, m15
+ paddw m8, m10
+ paddw m9, m11
+ ret
+
+%endif ; ARCH_X86_64
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h b/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h
new file mode 100644
index 00000000000..eeaa328d1e1
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/filmgrain.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/filmgrain.h"
+
+#define decl_fg_fns(ext) \
+decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \
+decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \
+decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \
+decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext))
+
+decl_fg_fns(ssse3);
+decl_fg_fns(avx2);
+decl_fg_fns(avx512icl);
+
+static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3);
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->generate_grain_y = BF(dav1d_generate_grain_y, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2);
+ c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
+ c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h
new file mode 100644
index 00000000000..7df563fee1c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/ipred.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(dav1d_##name, ssse3)); \
+ decl_##type##_fn(BF(dav1d_##name, avx2)); \
+ decl_##type##_fn(BF(dav1d_##name, avx512icl))
+#define init_fn(type0, type1, name, suffix) \
+ c->type0[type1] = BF(dav1d_##name, suffix)
+
+#define init_angular_ipred_fn(type, name, suffix) \
+ init_fn(intra_pred, type, name, suffix)
+#define init_cfl_pred_fn(type, name, suffix) \
+ init_fn(cfl_pred, type, name, suffix)
+#define init_cfl_ac_fn(type, name, suffix) \
+ init_fn(cfl_ac, type, name, suffix)
+
+decl_fn(angular_ipred, ipred_dc);
+decl_fn(angular_ipred, ipred_dc_128);
+decl_fn(angular_ipred, ipred_dc_top);
+decl_fn(angular_ipred, ipred_dc_left);
+decl_fn(angular_ipred, ipred_h);
+decl_fn(angular_ipred, ipred_v);
+decl_fn(angular_ipred, ipred_paeth);
+decl_fn(angular_ipred, ipred_smooth);
+decl_fn(angular_ipred, ipred_smooth_h);
+decl_fn(angular_ipred, ipred_smooth_v);
+decl_fn(angular_ipred, ipred_z1);
+decl_fn(angular_ipred, ipred_z2);
+decl_fn(angular_ipred, ipred_z3);
+decl_fn(angular_ipred, ipred_filter);
+
+decl_fn(cfl_pred, ipred_cfl);
+decl_fn(cfl_pred, ipred_cfl_128);
+decl_fn(cfl_pred, ipred_cfl_top);
+decl_fn(cfl_pred, ipred_cfl_left);
+
+decl_fn(cfl_ac, ipred_cfl_ac_420);
+decl_fn(cfl_ac, ipred_cfl_ac_422);
+decl_fn(cfl_ac, ipred_cfl_ac_444);
+
+decl_fn(pal_pred, pal_pred);
+
+static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
+
+ c->pal_pred = BF(dav1d_pal_pred, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx2);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx2);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx2);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
+#endif
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);
+
+ c->pal_pred = BF(dav1d_pal_pred, avx512icl);
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm
index 4a1b060bd5f..1a307adc985 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred16_avx512.asm
@@ -114,20 +114,20 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
vbroadcasti32x4 m2, [tlq]
pshufb m2, m7 ; left
PAETH 4, 5, 6
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm8, ym0, 1
+ vextracti32x4 xm9, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+r6 ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm8
+ movq [dstq+r6 ], xm9
sub hd, 8
jl .w4_end
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+r6 ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm8
+ movhps [dstq+r6 ], xm9
lea dstq, [dstq+strideq*4]
jg .w4_loop
.w4_end:
@@ -220,19 +220,19 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pshufb m3, m4
pmulhrsw m3, m5
paddw m3, m6
- vextracti32x4 xmm0, m3, 3
- vextracti32x4 xmm1, ym3, 1
- vextracti32x4 xmm2, m3, 2
- movhps [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
+ vextracti32x4 xm0, m3, 3
+ vextracti32x4 xm1, ym3, 1
+ vextracti32x4 xm2, m3, 2
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm3
add hq, 8
jg .end
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jl .w4_loop
@@ -337,20 +337,20 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
psubw m0, m6 ; left - right
pmulhrsw m0, m5
paddw m0, m6
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
sub hd, 8*2
jl .end
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.end:
@@ -472,11 +472,11 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
vpdpwssd m0, m1, m6
vpermb m0, m14, m0
pavgw ym0, ym15
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
add v_weightsq, 4*4
sub hd, 4*2
@@ -624,11 +624,11 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
pmovzxbw ym0, [idxq]
add idxq, 16
vpermw ym0, ym0, ym3
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- movq [dstq+strideq*2], xmm1
- movhps [dstq+stride3q ], xmm1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm
index 050ec9bb253..38c86b54f5c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/ipred_avx512.asm
@@ -242,9 +242,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
jmp wq
.w8:
movq xmm1, [tlq+1]
- vextracti32x4 xmm2, ym0, 1
+ vextracti32x4 xm2, ym0, 1
vpdpbusd xm0, xmm1, xm3
- paddd xmm2, xm0
+ paddd xmm2, xm2, xm0
punpckhqdq xmm0, xmm2, xmm2
paddd xmm0, xmm2
psrlq xmm1, xmm0, 32
@@ -275,9 +275,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
jmp wq
.w16:
movu xmm1, [tlq+1]
- vextracti32x4 xmm2, ym0, 1
+ vextracti32x4 xm2, ym0, 1
vpdpbusd xm0, xmm1, xm3
- paddd xmm2, xm0
+ paddd xmm2, xm2, xm0
punpckhqdq xmm0, xmm2, xmm2
paddd xmm0, xmm2
psrlq xmm1, xmm0, 32
@@ -309,8 +309,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
.w32:
movu ym1, [tlq+1]
vpdpbusd ym0, ym1, ym3
- vextracti32x4 xmm1, ym0, 1
- paddd xmm1, xm0
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
punpckhqdq xmm0, xmm1, xmm1
paddd xmm0, xmm1
psrlq xmm1, xmm0, 32
@@ -345,8 +345,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
movu ym2, [tlq+33]
vpdpbusd ym0, ym1, ym3
vpdpbusd ym0, ym2, ym3
- vextracti32x4 xmm1, ym0, 1
- paddd xmm1, xm0
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
punpckhqdq xmm0, xmm1, xmm1
paddd xmm0, xmm1
psrlq xmm1, xmm0, 32
@@ -524,12 +524,12 @@ INIT_YMM avx512icl
pextrd [dstq+stride3q ], xm0, 3
sub hd, 8
jl .w4_ret
- vextracti32x4 xmm0, m0, 1
+ vextracti32x4 xm0, m0, 1
lea dstq, [dstq+strideq*4]
- movd [dstq+strideq*0], xmm0
- pextrd [dstq+strideq*1], xmm0, 1
- pextrd [dstq+strideq*2], xmm0, 2
- pextrd [dstq+stride3q ], xmm0, 3
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.w4_ret:
@@ -545,20 +545,20 @@ INIT_ZMM avx512icl
vpbroadcastq m4, [tlq+hq-8]
pshufb m4, m9
PAETH
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
sub hd, 8
jl .w8_ret
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jg .w8_loop
.w8_ret:
@@ -639,18 +639,18 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pmaddubsw m0, m2, m0
paddw m0, m3
vpermb m0, m6, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
add hq, 8
jg .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jl .w4_loop
.ret:
@@ -669,11 +669,11 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pmaddubsw m0, m2, m0
paddw m0, m3
vpermb m0, m6, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w8_loop
@@ -785,18 +785,18 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
paddw m0, m2
paddw m0, m1
vpermb m0, m8, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
sub hd, 8
jl .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.ret:
@@ -815,11 +815,11 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
paddw m0, m2
paddw m0, m1
vpermb m0, m8, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
@@ -937,18 +937,18 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
paddw m1, m2
pavgw m0, m1
vpermb m0, m11, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
sub hd, 8
jl .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.ret:
@@ -978,11 +978,11 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
paddw m1, m2
pavgw m0, m1
vpermb m0, m11, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx.h b/chromium/third_party/dav1d/libdav1d/src/x86/itx.h
new file mode 100644
index 00000000000..46cfdb75d1d
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx.h
@@ -0,0 +1,356 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4, 4, ext); \
+decl_itx16_fns( 4, 8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8, 4, ext); \
+decl_itx16_fns( 8, 8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx2_fns ( 8, 32, ext); \
+decl_itx16_fns(16, 4, ext); \
+decl_itx16_fns(16, 8, ext); \
+decl_itx12_fns(16, 16, ext); \
+decl_itx2_fns (16, 32, ext); \
+decl_itx2_fns (32, 8, ext); \
+decl_itx2_fns (32, 16, ext); \
+decl_itx2_fns (32, 32, ext); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
+
+
+#define decl_itx2_bpc_fns(w, h, bpc, opt) \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt))
+
+#define decl_itx12_bpc_fns(w, h, bpc, opt) \
+decl_itx2_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt))
+
+#define decl_itx16_bpc_fns(w, h, bpc, opt) \
+decl_itx12_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt))
+
+#define decl_itx_bpc_fns(bpc, ext) \
+decl_itx16_bpc_fns( 4, 4, bpc, ext); \
+decl_itx16_bpc_fns( 4, 8, bpc, ext); \
+decl_itx16_bpc_fns( 4, 16, bpc, ext); \
+decl_itx16_bpc_fns( 8, 4, bpc, ext); \
+decl_itx16_bpc_fns( 8, 8, bpc, ext); \
+decl_itx16_bpc_fns( 8, 16, bpc, ext); \
+decl_itx2_bpc_fns ( 8, 32, bpc, ext); \
+decl_itx16_bpc_fns(16, 4, bpc, ext); \
+decl_itx16_bpc_fns(16, 8, bpc, ext); \
+decl_itx12_bpc_fns(16, 16, bpc, ext); \
+decl_itx2_bpc_fns (16, 32, bpc, ext); \
+decl_itx2_bpc_fns (32, 8, bpc, ext); \
+decl_itx2_bpc_fns (32, 16, bpc, ext); \
+decl_itx2_bpc_fns (32, 32, bpc, ext); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
+
+decl_itx_fns(avx512icl);
+decl_itx_bpc_fns(10, avx512icl);
+decl_itx_fns(avx2);
+decl_itx_bpc_fns(10, avx2);
+decl_itx_bpc_fns(12, avx2);
+decl_itx_fns(sse4);
+decl_itx_fns(ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
+
+static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+ assign_itx1_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+ assign_itx2_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+ assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+ assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+ assign_itx12_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+ assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+ assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+ assign_itx16_fn(pfx, w, h, ext); \
+ assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+
+#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
+ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+ BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
+
+#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext)
+
+#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext)
+
+#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext)
+
+#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \
+ assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \
+ assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext)
+
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn(, 4, 4, ssse3);
+ assign_itx16_fn(R, 4, 8, ssse3);
+ assign_itx16_fn(R, 8, 4, ssse3);
+ assign_itx16_fn(, 8, 8, ssse3);
+ assign_itx16_fn(R, 4, 16, ssse3);
+ assign_itx16_fn(R, 16, 4, ssse3);
+ assign_itx16_fn(R, 8, 16, ssse3);
+ assign_itx16_fn(R, 16, 8, ssse3);
+ assign_itx12_fn(, 16, 16, ssse3);
+ assign_itx2_fn (R, 8, 32, ssse3);
+ assign_itx2_fn (R, 32, 8, ssse3);
+ assign_itx2_fn (R, 16, 32, ssse3);
+ assign_itx2_fn (R, 32, 16, ssse3);
+ assign_itx2_fn (, 32, 32, ssse3);
+ assign_itx1_fn (R, 16, 64, ssse3);
+ assign_itx1_fn (R, 32, 64, ssse3);
+ assign_itx1_fn (R, 64, 16, ssse3);
+ assign_itx1_fn (R, 64, 32, ssse3);
+ assign_itx1_fn ( , 64, 64, ssse3);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+#if BITDEPTH == 16
+ if (bpc == 10) {
+ assign_itx16_fn(, 4, 4, sse4);
+ assign_itx16_fn(R, 4, 8, sse4);
+ assign_itx16_fn(R, 4, 16, sse4);
+ assign_itx16_fn(R, 8, 4, sse4);
+ assign_itx16_fn(, 8, 8, sse4);
+ assign_itx16_fn(R, 8, 16, sse4);
+ assign_itx16_fn(R, 16, 4, sse4);
+ assign_itx16_fn(R, 16, 8, sse4);
+ assign_itx12_fn(, 16, 16, sse4);
+ assign_itx2_fn (R, 8, 32, sse4);
+ assign_itx2_fn (R, 32, 8, sse4);
+ assign_itx2_fn (R, 16, 32, sse4);
+ assign_itx2_fn (R, 32, 16, sse4);
+ assign_itx2_fn (, 32, 32, sse4);
+ assign_itx1_fn (R, 16, 64, sse4);
+ assign_itx1_fn (R, 32, 64, sse4);
+ assign_itx1_fn (R, 64, 16, sse4);
+ assign_itx1_fn (R, 64, 32, sse4);
+ assign_itx1_fn (, 64, 64, sse4);
+ }
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx2);
+ assign_itx16_fn(R, 4, 8, avx2);
+ assign_itx16_fn(R, 4, 16, avx2);
+ assign_itx16_fn(R, 8, 4, avx2);
+ assign_itx16_fn( , 8, 8, avx2);
+ assign_itx16_fn(R, 8, 16, avx2);
+ assign_itx2_fn (R, 8, 32, avx2);
+ assign_itx16_fn(R, 16, 4, avx2);
+ assign_itx16_fn(R, 16, 8, avx2);
+ assign_itx12_fn( , 16, 16, avx2);
+ assign_itx2_fn (R, 16, 32, avx2);
+ assign_itx1_fn (R, 16, 64, avx2);
+ assign_itx2_fn (R, 32, 8, avx2);
+ assign_itx2_fn (R, 32, 16, avx2);
+ assign_itx2_fn ( , 32, 32, avx2);
+ assign_itx1_fn (R, 32, 64, avx2);
+ assign_itx1_fn (R, 64, 16, avx2);
+ assign_itx1_fn (R, 64, 32, avx2);
+ assign_itx1_fn ( , 64, 64, avx2);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 4, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 10, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
+ assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
+ } else {
+ assign_itx16_bpc_fn( , 4, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 12, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
+ }
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+#if BITDEPTH == 8
+ assign_itx16_fn( , 4, 4, avx512icl); // no wht
+ assign_itx16_fn(R, 4, 8, avx512icl);
+ assign_itx16_fn(R, 4, 16, avx512icl);
+ assign_itx16_fn(R, 8, 4, avx512icl);
+ assign_itx16_fn( , 8, 8, avx512icl);
+ assign_itx16_fn(R, 8, 16, avx512icl);
+ assign_itx2_fn (R, 8, 32, avx512icl);
+ assign_itx16_fn(R, 16, 4, avx512icl);
+ assign_itx16_fn(R, 16, 8, avx512icl);
+ assign_itx12_fn( , 16, 16, avx512icl);
+ assign_itx2_fn (R, 16, 32, avx512icl);
+ assign_itx1_fn (R, 16, 64, avx512icl);
+ assign_itx2_fn (R, 32, 8, avx512icl);
+ assign_itx2_fn (R, 32, 16, avx512icl);
+ assign_itx2_fn ( , 32, 32, avx512icl);
+ assign_itx1_fn (R, 32, 64, avx512icl);
+ assign_itx1_fn (R, 64, 16, avx512icl);
+ assign_itx1_fn (R, 64, 32, avx512icl);
+ assign_itx1_fn ( , 64, 64, avx512icl);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 8, 8, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl);
+ }
+#endif
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm
index c580944c7bb..811f711540f 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx2.asm
@@ -30,7 +30,6 @@
%if ARCH_X86_64
SECTION_RODATA 32
-pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482
itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7
@@ -39,14 +38,17 @@ iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6
iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
-%macro COEF_PAIR 2
+%macro COEF_PAIR 2-3 0
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1 (pd_%1_%2 + 4*0)
%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3
+dd -%2, -%2
+%define pd_%2_m%2 pd_%2
+%endif
%endmacro
COEF_PAIR 201, 995
@@ -56,8 +58,8 @@ COEF_PAIR 1380, 601
COEF_PAIR 1751, 2440
COEF_PAIR 2598, 1189
COEF_PAIR 2751, 2106
-COEF_PAIR 2896, 1567
-COEF_PAIR 2896, 3784
+COEF_PAIR 2896, 1567, 1
+COEF_PAIR 2896, 3784, 1
COEF_PAIR 3035, 3513
COEF_PAIR 3166, 3920
COEF_PAIR 3703, 3290
@@ -66,9 +68,6 @@ COEF_PAIR 4017, 2276
COEF_PAIR 4076, 3612
COEF_PAIR 4091, 3973
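Note: with the optional third COEF_PAIR argument set, the macro also emits the negated second constant directly after the pair, so COEF_PAIR 2896, 1567, 1 lays out the dwords 2896, 2896, 1567, 1567, -1567, -1567 and makes pd_1567_m1567 an alias of pd_1567, i.e. a 16-byte {1567, 1567, -1567, -1567} run suitable for vbroadcasti128 (likewise pd_3784_m3784 from COEF_PAIR 2896, 3784, 1).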
-%define pd_1321 (pd_1321_2482 + 4*0)
-%define pd_2482 (pd_1321_2482 + 4*4)
-
pd_8: dd 8
pd_m601: dd -601
pd_m1189: dd -1189
@@ -77,17 +76,23 @@ pd_m2106: dd -2106
pd_m2598: dd -2598
pd_m2751: dd -2751
pd_m3344: dd -3344
+pd_1024: dd 1024
+pd_1321: dd 1321
+pd_1448: dd 1448
+pd_1697: dd 1697
+pd_2482: dd 2482
+pd_3072: dd 3072 ; 1024 + 2048
pd_3803: dd 3803
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
pd_5793: dd 5793
pd_6144: dd 6144 ; 2048 + 4096
-pd_10239: dd 10239 ; 2048 + 8192 - 1
-pd_10240: dd 10240 ; 2048 + 8192
-pd_11586: dd 11586 ; 5793 * 2
-pd_34816: dd 34816 ; 2048 + 32768
-pd_38912: dd 38912 ; 2048 + 4096 + 32768
+pd_17408: dd 17408 ; 1024 + 16384
pixel_10bpc_max: times 2 dw 0x03ff
pixel_12bpc_max: times 2 dw 0x0fff
+dconly_10bpc: times 2 dw 0x7c00
+dconly_12bpc: times 2 dw 0x7000
clip_18b_min: dd -0x20000
clip_18b_max: dd 0x1ffff
clip_20b_min: dd -0x80000
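The multiplier constants here are the usual sqrt(2) fixed-point approximations at decreasing precision: 181 ~= sqrt(2)*2^7, 1448 ~= sqrt(2)*2^10, 2896 ~= sqrt(2)*2^11, 5793 ~= sqrt(2)*2^12, and 1697 ~= (sqrt(2)-1)*2^12, while the rounders spell out their bias sums in the inline comments (3072 = 1024 + 2048, 17408 = 1024 + 16384, and so on). dconly_10bpc and dconly_12bpc are 32767 - 1023 and 32767 - 4095: adding them with paddsw and subtracting them with psubusw clamps a value to the legal pixel range using saturation alone, in place of the pmaxsw/pminsw pairs used by the old dconly loops.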
@@ -214,7 +219,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
-; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2
+; flags: 1 = packed, 2 = inv_dst2
; skip round/shift if rnd is not a number
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
%if %8 < 32
@@ -241,7 +246,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
pmulld m%1, m%5
pmulld m%2, m%5
%endif
-%if %9 & 4
+%if %9 & 2
psubd m%4, m%6, m%4
psubd m%2, m%4, m%2
%else
@@ -250,17 +255,10 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
%endif
paddd m%2, m%4
%endif
-%if %9 & 2 ; invert the upper half of dst1 before rounding
- vbroadcasti128 m%4, [pw_2048_m2048]
- psubd m%1, m%3
- psignd m%1, m%4
- paddd m%1, m%6
-%else
%ifnum %6
paddd m%1, m%6
%endif
psubd m%1, m%3
-%endif
%ifnum %6
psrad m%2, 12
psrad m%1, 12
@@ -287,37 +285,39 @@ ALIGN function_align
%endif
%endmacro
-%macro INV_TXFM_4X4_FN 2 ; type1, type2
- INV_TXFM_FN %1, %2, 0, 4x4
-%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
- movd xm1, [pw_2896x8]
- mov [cq], eobd ; 0
- add r6d, 2048
- sar r6d, 12
- movd xm0, r6d
- packssdw xm0, xm0
- pmulhrsw xm0, xm1
- vpbroadcastw xm0, xm0
- mova xm1, xm0
- jmp m(iadst_4x4_internal_10bpc).end
-%endif
-%endmacro
-
-%macro INV_TXFM_4X4_12BPC_FN 2 ; type1, type2
- INV_TXFM_FN %1, %2, 0, 4x4, 12
+%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
+ INV_TXFM_FN %1, %2, 0, 4x4, %3
%ifidn %1_%2, dct_dct
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
imul r6d, [cq], 181
mov [cq], eobd ; 0
+ or r3d, 4
+.dconly2:
add r6d, 128
sar r6d, 8
+.dconly3:
imul r6d, 181
- add r6d, 128
- sar r6d, 8
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
- vpbroadcastd m0, xm0
- mova m1, m0
- jmp m(iadst_4x4_internal_12bpc).end
+ paddsw xm0, xm2
+ vpbroadcastw xm0, xm0
+.dconly_loop:
+ movq xm1, [dstq+strideq*0]
+ movhps xm1, [dstq+strideq*1]
+ paddsw xm1, xm0
+ psubusw xm1, xm2
+ movq [dstq+strideq*0], xm1
+ movhps [dstq+strideq*1], xm1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ WRAP_XMM RET
+%else
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
+%endif
%endif
%endmacro
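Each 181-based step in the new dconly path is identical to the 2896-based step it replaces, since 2896 = 16*181, 2048 = 16*128 and 34816 = 16*2176. A minimal scalar sketch of the 4x4 10 bpc case (illustrative names only, not dav1d API; arithmetic right shift assumed, matching sar):

    #include <stdint.h>

    /* Illustrative sketch of the 4x4 10 bpc dconly DC value. */
    static int dconly_dc_4x4(int coef)
    {
        int dc = (coef * 181 + 128) >> 8;  /* == (coef*2896 + 2048) >> 12 */
        return (dc * 181 + 2176) >> 12;    /* == (dc*2896 + 34816) >> 16  */
    }

    /* What the paddsw/psubusw pair against dconly_10bpc (0x7c00 = 32767-1023)
     * achieves per pixel: clamp(pixel + dc, 0, 1023) via saturation alone. */
    static uint16_t add_dc_pixel_10bpc(uint16_t pixel, int dc)
    {
        int v = (int)pixel + dc;
        return (uint16_t)(v < 0 ? 0 : v > 1023 ? 1023 : v);
    }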
@@ -399,12 +399,50 @@ INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity
+%macro IADST4_1D 0
+ vpbroadcastd m5, [pd_1321]
+ vpbroadcastd m7, [pd_2482]
+ pmulld m4, m0, m5 ; 1321*in0
+ pmulld m6, m3, m7 ; 2482*in3
+ paddd m4, m6 ; 1321*in0 + 2482*in3
+ pmulld m6, m0, m7 ; 2482*in0
+ paddd m0, m3 ; in0 + in3
+ paddd m7, m5 ; pd_3803
+ pmulld m5, m2 ; 1321*in2
+ pmulld m3, m7 ; 3803*in3
+ pmulld m7, m2 ; 3803*in2
+ psubd m2, m0 ; in2 - in0 - in3
+ vpbroadcastd m0, [pd_m3344]
+ pmulld m1, m0 ; -t3
+ pmulld m2, m0 ; out2 (unrounded)
+ psubd m6, m5 ; 2482*in0 - 1321*in2
+ paddd m4, m7 ; t0
+ psubd m6, m3 ; t1
+ paddd m3, m4, m6
+ psubd m4, m1 ; out0 (unrounded)
+ psubd m6, m1 ; out1 (unrounded)
+ paddd m3, m1 ; out3 (unrounded)
+%endmacro
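IADST4_1D is the 4-point inverse ADST on 32-bit lanes, leaving out0..out3 unrounded in m4, m6, m2, m3; the callers round and shift afterwards (the 10 bpc pass adds 2048 and shifts by 12). The paddd m7, m5 step relies on 1321 + 2482 = 3803. A reference-only C equivalent (hypothetical helper, not dav1d API; 18-bit-clipped inputs keep all products within 32 bits):

    /* Reference-only model of IADST4_1D (sinpi1..4 = 1321, 2482, 3344, 3803). */
    static void iadst4_1d_ref(const int in[4], int out[4])
    {
        const int t0 = 1321 * in[0] + 3803 * in[2] + 2482 * in[3];
        const int t1 = 2482 * in[0] - 1321 * in[2] - 3803 * in[3];
        const int t3 = 3344 * in[1];
        out[0] = t0 + t3;                        /* m4 */
        out[1] = t1 + t3;                        /* m6 */
        out[2] = 3344 * (in[0] - in[2] + in[3]); /* m2 */
        out[3] = t0 + t1 - t3;                   /* m3 */
    }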
+
cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call .main
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_2048]
+ mova m2, [itx4_shuf]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
packssdw m0, m1
- vpermd m0, m4, m0
- psrld m4, 4
- pshufb m0, m4
+ vpermd m0, m2, m0
+ psrld m2, 4
+ pshufb m0, m2
+%if WIN64
+ movaps xmm6, [rsp+ 8]
+ movaps xmm7, [rsp+24]
+%endif
jmp tx2q
.pass2:
lea r6, [deint_shuf+128]
@@ -436,35 +474,16 @@ cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
RET
ALIGN function_align
.main:
- mova m2, [cq+16*2]
- vbroadcasti128 m5, [cq+16*0]
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+%if WIN64
+ movaps [rsp+16], xmm6
+ movaps [rsp+32], xmm7
+%endif
.main2:
- mova m0, [pd_1321_2482]
- vpbroadcastd m3, [pd_3803]
- vpbroadcastd m1, [pd_m3344]
- pmulld m4, m0, m2
- pmulld m3, m2
- pmulld m0, m5
- vpbroadcastd m5, [pd_2048]
- psubd xm2, [cq+16*3]
- psubd m2, [cq+16*0]
- pmulld m2, m1 ; t2 t3
- vpermq m4, m4, q1032
- paddd m4, m3
- psubd m0, m4
- paddd xm4, xm4
- paddd m4, m0 ; t0 t1
- vinserti128 m3, m2, xm4, 1 ; t2 t0
- paddd m0, m4, m5
- psubd xm4, xm2
- psubd m1, m0, m2
- vpermq m2, m2, q3232 ; t3 t3
- psubd m1, m4
- mova m4, [itx4_shuf]
- paddd m0, m2 ; out0 out1
- paddd m1, m3 ; out2 out3
- psrad m0, 12
- psrad m1, 12
+ WRAP_XMM IADST4_1D
ret
INV_TXFM_4X4_FN flipadst, dct
@@ -474,12 +493,9 @@ INV_TXFM_4X4_FN flipadst, identity
cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
- packssdw m0, m1
- psrld m1, m4, 8
- vpermd m0, m1, m0
- psrld m4, 4
- pshufb m0, m4
- jmp tx2q
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_10bpc).pass1_end
.pass2:
lea r6, [deint_shuf+128]
vextracti128 xm1, m0, 1
@@ -556,19 +572,20 @@ cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
movhps [r6 +strideq*1], xm1
RET
-INV_TXFM_4X4_12BPC_FN dct, dct
-INV_TXFM_4X4_12BPC_FN dct, identity
-INV_TXFM_4X4_12BPC_FN dct, adst
-INV_TXFM_4X4_12BPC_FN dct, flipadst
+INV_TXFM_4X4_FN dct, dct, 12
+INV_TXFM_4X4_FN dct, identity, 12
+INV_TXFM_4X4_FN dct, adst, 12
+INV_TXFM_4X4_FN dct, flipadst, 12
-cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
+cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(idct_4x4_internal_10bpc).main
mova m3, [idct4_12_shuf]
mova m4, [idct4_12_shuf2]
- vpermd m2, m3, m0
- vpermd m1, m4, m1
- jmp m(iadst_4x4_internal_12bpc).pass1_end
+ vpermd m2, m4, m1
+ vpermd m1, m3, m0
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
+ vpbroadcastd m5, [pd_2048]
vpermq m0, m0, q3120
vpermq m1, m1, q3120
call m(idct_4x4_internal_10bpc).main2
@@ -576,33 +593,52 @@ cglobal idct_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
vpermq m1, m1, q2031
jmp m(iadst_4x4_internal_12bpc).end
-INV_TXFM_4X4_12BPC_FN adst, dct
-INV_TXFM_4X4_12BPC_FN adst, adst
-INV_TXFM_4X4_12BPC_FN adst, flipadst
-INV_TXFM_4X4_12BPC_FN adst, identity
+INV_TXFM_4X4_FN adst, dct, 12
+INV_TXFM_4X4_FN adst, adst, 12
+INV_TXFM_4X4_FN adst, flipadst, 12
+INV_TXFM_4X4_FN adst, identity, 12
-cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
+cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
- vpermd m2, m4, m0
- vpermd m1, m4, m1
+ vinserti128 m1, m4, xm6, 1
+ vinserti128 m2, xm3, 1
.pass1_end:
- punpcklqdq m0, m2, m1
- punpckhqdq m1, m2, m1
+ mova m3, [itx4_shuf]
+ vpbroadcastd m5, [pd_1024]
+ psrad m1, 1
+ psrad m2, 1
+ vpermd m1, m3, m1
+ vpermd m2, m3, m2
+ paddd m1, m5
+ paddd m2, m5
+ psrad m1, 11
+ psrad m2, 11
.pass1_end2:
vpbroadcastd m3, [clip_18b_min]
vpbroadcastd m4, [clip_18b_max]
+ punpcklqdq m0, m1, m2
+ punpckhqdq m1, m2
pmaxsd m0, m3
pmaxsd m1, m3
pminsd m0, m4
pminsd m1, m4
jmp tx2q
.pass2:
- mova [cq+16*0], m0
- vextracti128 [cq+16*3], m1, 1
- mova m2, m1
- vpermq m5, m0, q1010
- call m(iadst_4x4_internal_10bpc).main2
+ call .main_pass2
+ vinserti128 m0, m4, xm6, 1
+ vinserti128 m1, m2, xm3, 1
+.pass2_end:
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5
+ paddd m1, m5
+ psrad m0, 12
+ psrad m1, 12
.end:
+%if WIN64
+ WIN64_RESTORE_XMM_INTERNAL
+ %assign xmm_regs_used 6
+%endif
+.end2:
vpbroadcastd m4, [pw_16384]
movq xm2, [dstq+strideq*0]
movq xm3, [dstq+strideq*1]
@@ -627,53 +663,53 @@ cglobal iadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
movhps [r6 +strideq*0], xm0
movhps [r6 +strideq*1], xm1
RET
+.main_pass2:
+ vextracti128 xm3, m1, 1
+ mova xm2, xm1
+ vextracti128 xm1, m0, 1
+ jmp m(iadst_4x4_internal_10bpc).main2
-INV_TXFM_4X4_12BPC_FN flipadst, dct
-INV_TXFM_4X4_12BPC_FN flipadst, adst
-INV_TXFM_4X4_12BPC_FN flipadst, flipadst
-INV_TXFM_4X4_12BPC_FN flipadst, identity
+INV_TXFM_4X4_FN flipadst, dct, 12
+INV_TXFM_4X4_FN flipadst, adst, 12
+INV_TXFM_4X4_FN flipadst, flipadst, 12
+INV_TXFM_4X4_FN flipadst, identity, 12
-cglobal iflipadst_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
+cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
- psrld m4, 8
- vpermd m2, m4, m0
- vpermd m1, m4, m1
- punpckhqdq m0, m1, m2
- punpcklqdq m1, m2
- jmp m(iadst_4x4_internal_12bpc).pass1_end2
+ vinserti128 m1, m3, xm2, 1
+ vinserti128 m2, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass1_end
.pass2:
- mova [cq+16*0], m0
- vextracti128 [cq+16*3], m1, 1
- mova m2, m1
- vpermq m5, m0, q1010
- call m(iadst_4x4_internal_10bpc).main2
- vpermq m2, m0, q1032
- vpermq m0, m1, q1032
- mova m1, m2
- jmp m(iadst_4x4_internal_12bpc).end
-
-INV_TXFM_4X4_12BPC_FN identity, dct
-INV_TXFM_4X4_12BPC_FN identity, adst
-INV_TXFM_4X4_12BPC_FN identity, flipadst
-INV_TXFM_4X4_12BPC_FN identity, identity
-
-cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
- vpbroadcastd m1, [pd_5793]
- pmulld m0, m1, [cq+32*0]
- pmulld m1, [cq+32*1]
+ call m(iadst_4x4_internal_12bpc).main_pass2
+ vinserti128 m0, m3, xm2, 1
+ vinserti128 m1, m6, xm4, 1
+ jmp m(iadst_4x4_internal_12bpc).pass2_end
+
+INV_TXFM_4X4_FN identity, dct, 12
+INV_TXFM_4X4_FN identity, adst, 12
+INV_TXFM_4X4_FN identity, flipadst, 12
+INV_TXFM_4X4_FN identity, identity, 12
+
+cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
+ mova m2, [itx4_shuf]
+ vpbroadcastd m3, [pd_1697]
+ vpermd m0, m2, [cq+32*0]
+ vpermd m2, m2, [cq+32*1]
vpbroadcastd m5, [pd_2048]
- mova m3, [itx4_shuf]
- paddd m0, m5
+ pmulld m1, m3, m0
+ pmulld m3, m2
paddd m1, m5
- psrad m0, 12
+ paddd m3, m5
psrad m1, 12
- vpermd m2, m3, m0
- vpermd m1, m3, m1
- jmp m(iadst_4x4_internal_12bpc).pass1_end
+ psrad m3, 12
+ paddd m1, m0
+ paddd m2, m3
+ jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
; m0 = in0 in1
; m1 = in2 in3
vpbroadcastd m3, [pd_5793]
+ vpbroadcastd m5, [pd_2048]
pmulld m0, m3
pmulld m1, m3
paddd m0, m5 ; 2048
@@ -685,34 +721,19 @@ cglobal iidentity_4x4_internal_12bpc, 0, 7, 6, dst, stride, c, eob, tx2
%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x8, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ vpbroadcastd xm2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 8
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 2048
- sar r6d, 12
-.end:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
- movd xm0, r6d
- vpbroadcastw xm0, xm0
- vpbroadcastd xm3, [pixel_%3bpc_max]
- pxor xm2, xm2
-.end_loop:
- movq xm1, [dstq+strideq*0]
- movhps xm1, [dstq+strideq*1]
- paddw xm1, xm0
- pmaxsw xm1, xm2
- pminsw xm1, xm3
- movq [dstq+strideq*0], xm1
- movhps [dstq+strideq*1], xm1
- lea dstq, [dstq+strideq*2]
- sub r3d, 2
- jg .end_loop
- WRAP_XMM RET
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
+%else
+ jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
+%endif
%endif
%endmacro
@@ -797,12 +818,14 @@ INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
-.pass1:
call m(iadst_8x4_internal_10bpc).main
- psrad m0, m4, 12
- psrad m1, m5, 12
- psrad m2, 12
- psrad m3, 12
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass1_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
jmp tx2q
.pass2:
call .pass2_main
@@ -918,13 +941,13 @@ INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
-.pass1:
call m(iadst_8x4_internal_10bpc).main
- psrad m0, m3, 12
- psrad m1, m2, 12
- psrad m2, m5, 12
- psrad m3, m4, 12
- jmp tx2q
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m2, m5, m6
+ paddd m3, m5, m4
+ jmp m(iadst_4x8_internal_10bpc).pass1_end
.pass2:
call m(iadst_4x8_internal_10bpc).pass2_main
mova xm4, [pw_2048_m2048]
@@ -1070,7 +1093,16 @@ INV_TXFM_4X8_FN adst, flipadst, 12
INV_TXFM_4X8_FN adst, identity, 12
cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
- jmp m(iadst_4x8_internal_10bpc).pass1
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m4, 1
+ psrad m1, m6, 1
+ psrad m2, 1
+ psrad m3, 1
+.pass1_end:
+ vpbroadcastd m5, [pd_1024]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 11}, m0, m1, m2, m3
+ jmp tx2q
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
@@ -1146,7 +1178,12 @@ INV_TXFM_4X8_FN flipadst, flipadst, 12
INV_TXFM_4X8_FN flipadst, identity, 12
cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
- jmp m(iflipadst_4x8_internal_10bpc).pass1
+ call m(iadst_8x4_internal_10bpc).main
+ psrad m0, m3, 1
+ psrad m1, m2, 1
+ psrad m2, m6, 1
+ psrad m3, m4, 1
+ jmp m(iadst_4x8_internal_12bpc).pass1_end
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
@@ -1180,12 +1217,13 @@ cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x16, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd xm2, [dconly_%3bpc]
mov [cq], eobd ; 0
- mov r3d, 16
- add r6d, 6144
- sar r6d, 13
- jmp m(inv_txfm_add_dct_dct_4x8_%3bpc).end
+ or r3d, 16
+ add r6d, 384
+ sar r6d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
%endif
%endmacro
@@ -1196,7 +1234,7 @@ INV_TXFM_4X16_FN dct, flipadst
cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
.pass1:
- vpbroadcastd m10, [pd_6144]
+ vpbroadcastd m10, [pd_3072]
mova m1, [cq+32*2]
mova m3, [cq+32*6]
mova m5, [cq+32*3]
@@ -1241,7 +1279,7 @@ ALIGN function_align
vpbroadcastd m4, [pd_3784]
vpbroadcastd m8, [pd_1567]
vpbroadcastd m9, [pd_2048]
- vpbroadcastd m6, [pd_2896]
+ vpbroadcastd m6, [pd_1448]
ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
ret
@@ -1253,7 +1291,7 @@ ALIGN function_align
psubd m0, m2
paddd m9, m4, m6
psubd m4, m6
- REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
+ REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
psubd m2, m0, m1
paddd m1, m0
psubd m6, m4, m5
@@ -1304,7 +1342,6 @@ INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
-.pass1:
call m(iadst_16x4_internal_10bpc).main
vpbroadcastd m6, [pd_6144]
call m(iadst_16x4_internal_10bpc).main_end
@@ -1545,7 +1582,6 @@ INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
-.pass1:
vpbroadcastd m7, [pd_5793]
pmulld m0, m7, [cq+32*0]
pmulld m4, m7, [cq+32*1]
@@ -1678,7 +1714,16 @@ INV_TXFM_4X16_FN adst, flipadst, 12
INV_TXFM_4X16_FN adst, identity, 12
cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
- jmp m(iadst_4x16_internal_10bpc).pass1
+ call .main_pass1
+ psrad m0, m4, 12
+ psrad m1, m5, 12
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ psrad m6, 12
+ psrad m7, 12
+ jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -1740,6 +1785,22 @@ ALIGN function_align
vperm2i128 m4, m8, m9, 0x20 ; 8 10
vperm2i128 m6, m8, m9, 0x31 ; 12 14
ret
+ALIGN function_align
+.main_pass1:
+ call m(iadst_16x4_internal_10bpc).main
+ vpbroadcastd m6, [pd_3072]
+ paddd m10, m4, m5
+ psubd m4, m3
+ psubd m5, m3
+ paddd m3, m10
+ psubd m8, m7, m1
+ paddd m7, m9
+ psubd m9, m1
+ paddd m7, m1
+ REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
+ REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
+ paddd m6, m0
+ ret
INV_TXFM_4X16_FN flipadst, dct, 12
INV_TXFM_4X16_FN flipadst, adst, 12
@@ -1747,7 +1808,16 @@ INV_TXFM_4X16_FN flipadst, flipadst, 12
INV_TXFM_4X16_FN flipadst, identity, 12
cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
- jmp m(iflipadst_4x16_internal_10bpc).pass1
+ call m(iadst_4x16_internal_12bpc).main_pass1
+ psrad m0, m3, 12
+ psrad m1, m2, 12
+ psrad m2, m5, 12
+ psrad m3, m4, 12
+ psrad m4, m7, 12
+ psrad m5, m6, 12
+ psrad m6, m9, 12
+ psrad m7, m8, 12
+ jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -1772,17 +1842,49 @@ INV_TXFM_4X16_FN identity, flipadst, 12
INV_TXFM_4X16_FN identity, identity, 12
cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
- jmp m(iidentity_4x16_internal_10bpc).pass1
+ vpbroadcastd m8, [pd_1697]
+ mova m0, [cq+32*0]
+ mova m4, [cq+32*1]
+ mova m1, [cq+32*2]
+ mova m5, [cq+32*3]
+ vpbroadcastd m9, [pd_6144]
+ pmulld m2, m8, m0
+ pmulld m6, m8, m4
+ pmulld m3, m8, m1
+ pmulld m7, m8, m5
+ mova m10, [cq+32*4]
+ mova m11, [cq+32*5]
+ mova m12, [cq+32*6]
+ mova m13, [cq+32*7]
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m0, m2
+ pmulld m2, m8, m10
+ paddd m4, m6
+ pmulld m6, m8, m11
+ paddd m1, m3
+ pmulld m3, m8, m12
+ paddd m5, m7
+ pmulld m7, m8, m13
+ REPX {psrad x, 1 }, m0, m4, m1, m5
+ REPX {paddd x, m9}, m2, m6, m3, m7
+ REPX {psrad x, 12}, m2, m6, m3, m7
+ paddd m2, m10
+ paddd m6, m11
+ paddd m3, m12
+ paddd m7, m13
+ REPX {psrad x, 1 }, m2, m6, m3, m7
+ jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
- vpbroadcastd m8, [pd_11586]
- vpbroadcastd m9, [pd_2048]
+ vpbroadcastd m8, [pd_5793]
+ vpbroadcastd m9, [pd_1024]
REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
- REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
@@ -1795,37 +1897,21 @@ cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 8x4, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
- movd xm0, r6d
- vpbroadcastw m0, xm0
-.end:
- vpbroadcastd m4, [pixel_%3bpc_max]
- pxor m3, m3
- mova xm1, [dstq+strideq*0]
- vinserti128 m1, [dstq+strideq*1], 1
- lea r6, [dstq+strideq*2]
- mova xm2, [r6 +strideq*0]
- vinserti128 m2, [r6 +strideq*1], 1
- paddw m1, m0
- paddw m2, m0
- pmaxsw m1, m3
- pmaxsw m2, m3
- pminsw m1, m4
- pminsw m2, m4
- mova [dstq+strideq*0], xm1
- vextracti128 [dstq+strideq*1], m1, 1
- mova [r6 +strideq*0], xm2
- vextracti128 [r6 +strideq*1], m2, 1
- RET
+ or r3d, 4
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+%else
+ jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
+%endif
%endif
%endmacro
@@ -1960,32 +2046,7 @@ ALIGN function_align
REPX {paddd x, m4}, m0, m3, m2, m1
REPX {psrad x, 12}, m0, m3, m2, m1
.main2:
- vbroadcasti128 m6, [pd_1321]
- vbroadcasti128 m7, [pd_2482]
- pmulld m4, m0, m6 ; 1321*in0
- pmulld m5, m3, m7 ; 2482*in3
- paddd m4, m5 ; 1321*in0 + 2482*in3
- pmulld m5, m0, m7 ; 2482*in0
- paddd m0, m3 ; in0 + in3
- paddd m7, m6 ; pd_3803
- pmulld m6, m2 ; 1321*in2
- pmulld m3, m7 ; 3803*in3
- pmulld m7, m2 ; 3803*in2
- psubd m2, m0 ; in2 - in0 - in3
- vpbroadcastd m0, [pd_m3344]
- psubd m5, m6 ; 2482*in0 - 1321*in2
- vpbroadcastd m6, [pd_2048]
- psubd m5, m3 ; t1
- pmulld m2, m0 ; t2
- pmulld m1, m0 ; -t3
- paddd m4, m7 ; t0
- paddd m5, m6
- paddd m3, m4, m5
- paddd m4, m6
- psubd m4, m1 ; out0 (unshifted)
- psubd m5, m1 ; out1 (unshifted)
- paddd m2, m6 ; out2 (unshifted)
- paddd m3, m1 ; out3 (unshifted)
+ IADST4_1D
ret
INV_TXFM_8X4_FN flipadst, dct
@@ -2103,10 +2164,13 @@ cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call .pass2_main
- psrad m0, m4, 12
- psrad m1, m5, 12
- psrad m2, 12
- psrad m3, 12
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m4
+ paddd m1, m5, m6
+ paddd m2, m5
+ paddd m3, m5
+.pass2_end:
+ REPX {psrad x, 12}, m0, m1, m2, m3
.end:
vpbroadcastd m4, [pw_16384]
REPX {psrad x, 3}, m0, m1, m2, m3
@@ -2162,11 +2226,12 @@ cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call m(iadst_8x4_internal_12bpc).pass2_main
- psrad m0, m3, 12
- psrad m3, m4, 12
- psrad m1, m2, 12
- psrad m2, m5, 12
- jmp m(iadst_8x4_internal_12bpc).end
+ vpbroadcastd m5, [pd_2048]
+ paddd m0, m5, m3
+ paddd m1, m5, m2
+ paddd m3, m5, m4
+ paddd m2, m5, m6
+ jmp m(iadst_8x4_internal_12bpc).pass2_end
INV_TXFM_8X4_FN identity, dct, 12
INV_TXFM_8X4_FN identity, adst, 12
@@ -2197,32 +2262,36 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 8x8, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
- mov [cq], eobd ; 0
- mov r3d, 8
+ vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
.dconly:
- add r6d, 6144
- sar r6d, 13
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
+ paddsw xm0, xm2
vpbroadcastw m0, xm0
- vpbroadcastd m3, [pixel_%3bpc_max]
- pxor m2, m2
.dconly_loop:
mova xm1, [dstq+strideq*0]
vinserti128 m1, [dstq+strideq*1], 1
- paddw m1, m0
- pmaxsw m1, m2
- pminsw m1, m3
+ paddsw m1, m0
+ psubusw m1, m2
mova [dstq+strideq*0], xm1
vextracti128 [dstq+strideq*1], m1, 1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
+%else
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
%endif
%endmacro
@@ -2245,7 +2314,7 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a
psubd m%10, m%7, m%9 ; t7
paddd m%7, m%9 ; out6
- vpbroadcastd m%9, [pd_2896]
+ vpbroadcastd m%9, [pd_1448]
psubd m%4, m%8, m%6 ; t3
paddd m%8, m%6 ; -out7
psubd m%6, m%1, m%3 ; t2
@@ -2255,10 +2324,10 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
- psubd m%5, m%6, m%4 ; (t2 - t3) * 2896
- paddd m%4, m%6 ; (t2 + t3) * 2896
- psubd m%6, m%3, m%10 ; (t6 - t7) * 2896
- paddd m%3, m%10 ; (t6 + t7) * 2896
+ psubd m%5, m%6, m%4 ; (t2 - t3) * 1448
+ paddd m%4, m%6 ; (t2 + t3) * 1448
+ psubd m%6, m%3, m%10 ; (t6 - t7) * 1448
+ paddd m%3, m%10 ; (t6 + t7) * 1448
%endmacro
INV_TXFM_8X8_FN dct, dct
@@ -2430,8 +2499,8 @@ ALIGN function_align
vpbroadcastd m11, [pd_2048]
.main2:
IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
- psrld m8, 11 ; pd_1
- vpbroadcastd m9, [pd_6144]
+ psrld m8, 10 ; pd_1
+ vpbroadcastd m9, [pd_3072]
ret
ALIGN function_align
.main_end:
@@ -2440,14 +2509,14 @@ ALIGN function_align
paddd m6, m8
psubd m7, m8, m7
REPX {psrad x, 1 }, m0, m1, m6, m7
- ; (1 + ((x + 2048) >> 12)) >> 1 = (6144 + x) >> 13
- ; (1 - ((x + 2048) >> 12)) >> 1 = (6143 - x) >> 13
- psubd m8, m9, m8 ; pd_6143
+ ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
+ ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
+ psubd m8, m9, m8 ; pd_3071
paddd m2, m9
psubd m3, m8, m3
paddd m4, m9
psubd m5, m8, m5
- REPX {psrad x, 13}, m2, m3, m4, m5
+ REPX {psrad x, 12}, m2, m3, m4, m5
ret
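The two comment identities above hold exactly for arithmetic shifts, which is what allows the separate add-one-and-halve step to be folded into a single biased add before the final 12-bit shift. A throwaway check (hypothetical helper; assumes arithmetic >> of negative values, as sar/psrad provide):

    #include <assert.h>

    /* Throwaway check of the folded-rounding identities noted above. */
    static void check_fold_identities(void)
    {
        for (int x = -(1 << 18); x <= (1 << 18); x++) {
            assert(((1 + ((x + 1024) >> 11)) >> 1) == ((3072 + x) >> 12));
            assert(((1 - ((x + 1024) >> 11)) >> 1) == ((3071 - x) >> 12));
        }
    }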
INV_TXFM_8X8_FN flipadst, dct
@@ -2496,10 +2565,10 @@ ALIGN function_align
paddd m5, m9, m2
psubd m2, m8, m3
paddd m3, m9, m4
- psrad m4, m2, 13
- psrad m2, m10, 13
- psrad m3, 13
- psrad m5, 13
+ psrad m4, m2, 12
+ psrad m2, m10, 12
+ psrad m3, 12
+ psrad m5, 12
ret
INV_TXFM_8X8_FN identity, dct
@@ -2681,13 +2750,13 @@ ALIGN function_align
paddd m6, m9
psubd m7, m9, m7
REPX {psrad x, 4}, m0, m1, m6, m7
- vpbroadcastd m9, [pd_34816]
- psubd m8, m9, m8 ; 34815
+ vpbroadcastd m9, [pd_17408]
+ psubd m8, m9, m8 ; 17407
paddd m2, m9
psubd m3, m8, m3
paddd m4, m9
psubd m5, m8, m5
- REPX {psrad x, 16}, m2, m3, m4, m5
+ REPX {psrad x, 15}, m2, m3, m4, m5
ret
INV_TXFM_8X8_FN flipadst, dct, 12
@@ -2729,13 +2798,14 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
INV_TXFM_FN %1, %2, %3, 8x16, %4
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_%4bpc]
mov [cq], eobd ; 0
- mov r3d, 16
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- jmp m(inv_txfm_add_dct_dct_8x8_%4bpc).dconly
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
%endif
%endmacro
@@ -2904,7 +2974,7 @@ ALIGN function_align
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
- ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
psubd m3, m1, m4 ; t10
paddd m1, m4 ; t9
psubd m4, m0, m2 ; t11a
@@ -3269,7 +3339,7 @@ cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
call m(iadst_16x8_internal_10bpc).pass1_rotations
.pass2_end:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
jmp m(idct_8x16_internal_12bpc).end
ALIGN function_align
.pass2_main:
@@ -3302,9 +3372,9 @@ ALIGN function_align
pmaxsd m7, m13, [cq+32* 3] ; 3
REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
call m(iadst_16x8_internal_10bpc).main_part2
- vpbroadcastd m14, [pd_34816]
+ vpbroadcastd m14, [pd_17408]
psrld m15, 11 ; pd_1
- psubd m13, m14, m15 ; pd_34815
+ psubd m13, m14, m15 ; pd_17407
pslld m15, 3 ; pd_8
ret
@@ -3357,49 +3427,52 @@ ALIGN function_align
m8, m9, m10, m11, m12, m13, m14
pminsd m15, [cq]
mova [cq], m7
- vpbroadcastd m7, [pd_11586]
+ vpbroadcastd m7, [pd_5793]
REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
pmulld m7, [cq]
mova [cq], m15
- vpbroadcastd m15, [pd_2048]
+ vpbroadcastd m15, [pd_1024]
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
paddd m15, [cq]
- REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
ret
%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 16x4, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
- mov [cq], eobd ; 0
- mov r3d, 4
+ vpbroadcastd m3, [dconly_%3bpc]
+%if %3 = 10
.dconly:
- add r6d, 6144
- sar r6d, 13
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 4
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ add r6d, 384
+ sar r6d, 9
+.dconly3:
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
+ paddsw xm0, xm3
vpbroadcastw m0, xm0
- vpbroadcastd m4, [pixel_%3bpc_max]
- pxor m3, m3
.dconly_loop:
- paddw m1, m0, [dstq+strideq*0]
- paddw m2, m0, [dstq+strideq*1]
- pmaxsw m1, m3
- pmaxsw m2, m3
- pminsw m1, m4
- pminsw m2, m4
+ paddsw m1, m0, [dstq+strideq*0]
+ paddsw m2, m0, [dstq+strideq*1]
+ psubusw m1, m3
+ psubusw m2, m3
mova [dstq+strideq*0], m1
mova [dstq+strideq*1], m2
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
+%else
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+%endif
%endif
%endmacro
@@ -3480,13 +3553,30 @@ ALIGN function_align
.pass1_main2:
ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
- psubd m4, m10, m5 ; t9 -t10
+ vbroadcasti128 m12, [pd_3784_m3784]
+ psubd m4, m10, m5
paddd m10, m5 ; t8 t11
- psubd m5, m11, m6 ; t14 -t13
+ psignd m4, m12 ; t9 t10
+ psubd m5, m11, m6
paddd m11, m6 ; t15 t12
- REPX {pmaxsd x, m8}, m4, m5, m10, m11
- REPX {pminsd x, m9}, m4, m5, m10, m11
- ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2
+ psignd m5, m12 ; t14 t13
+ vpbroadcastd m6, [pd_1567]
+ vpbroadcastd m13, [pd_3784]
+ REPX {pmaxsd x, m8}, m5, m4
+ REPX {pminsd x, m9}, m5, m4
+ pmulld m12, m5
+ pmulld m5, m6
+ vbroadcasti128 m6, [pd_1567_m1567]
+ pmulld m13, m4
+ pmulld m4, m6
+ REPX {pmaxsd x, m8}, m10, m11, m0, m1
+ REPX {pminsd x, m9}, m10, m11, m0, m1
+ paddd m12, m7
+ paddd m5, m7
+ paddd m4, m12
+ psubd m5, m13
+ psrad m4, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
vpbroadcastd m12, [pd_2896]
punpckhqdq m6, m11, m5
punpcklqdq m11, m4
@@ -3500,8 +3590,8 @@ ALIGN function_align
REPX {pminsd x, m9}, m5, m6
pmulld m5, m12
pmulld m6, m12
- REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10
- REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10
+ REPX {pmaxsd x, m8}, m2, m3, m11, m10
+ REPX {pminsd x, m9}, m2, m3, m11, m10
ret
ALIGN function_align
.pass1_main3:
@@ -3565,10 +3655,10 @@ cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
jmp m(idct_16x4_internal_10bpc).end
ALIGN function_align
.main:
- vbroadcasti128 m6, [pd_1321]
+ vpbroadcastd m6, [pd_1321]
mova m0, [cq+32*0]
mova m1, [cq+32*1]
- vbroadcasti128 m7, [pd_2482]
+ vpbroadcastd m7, [pd_2482]
mova m2, [cq+32*6]
mova m3, [cq+32*7]
pmulld m4, m0, m6
@@ -3663,8 +3753,7 @@ INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
-.pass1:
- vpbroadcastd m8, [pd_11586]
+ vpbroadcastd m8, [pd_5793]
vpermq m0, [cq+32*0], q3120 ; 0 1
vpermq m1, [cq+32*1], q3120 ; 2 3
vpermq m2, [cq+32*2], q3120 ; 4 5
@@ -3673,10 +3762,10 @@ cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpermq m5, [cq+32*5], q3120 ; a b
vpermq m6, [cq+32*6], q3120 ; c d
vpermq m7, [cq+32*7], q3120 ; e f
- vpbroadcastd m9, [pd_6144]
+ vpbroadcastd m9, [pd_3072]
REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
- REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
call m(idct_16x4_internal_10bpc).transpose_4x16_packed
@@ -3729,17 +3818,15 @@ cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
pmulld m2, m6, m11
pmulld m4, m6, m12
pmulld m6, m13
- vpbroadcastd m10, [pd_2048]
+ vpbroadcastd m10, [pd_17408]
call m(idct_4x16_internal_10bpc).pass1_main2
- REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
- vpbroadcastd m4, [pw_16384]
vpbroadcastd m5, [pixel_12bpc_max]
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
- REPX {pmulhrsw x, m4}, m0, m1, m2, m3
jmp m(idct_16x4_internal_10bpc).end2
INV_TXFM_16X4_FN adst, dct, 12
@@ -3824,7 +3911,37 @@ INV_TXFM_16X4_FN identity, flipadst, 12
INV_TXFM_16X4_FN identity, identity, 12
cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
- jmp m(iidentity_16x4_internal_10bpc).pass1
+ vpbroadcastd m8, [pd_1697]
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpbroadcastd m9, [pd_3072]
+ pmulld m4, m8, m0
+ pmulld m5, m8, m1
+ pmulld m6, m8, m2
+ pmulld m7, m8, m3
+ vpermq m10, [cq+32*4], q3120 ; 8 9
+ vpermq m11, [cq+32*5], q3120 ; a b
+ vpermq m12, [cq+32*6], q3120 ; c d
+ vpermq m13, [cq+32*7], q3120 ; e f
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m0, m4
+ pmulld m4, m8, m10
+ paddd m1, m5
+ pmulld m5, m8, m11
+ paddd m2, m6
+ pmulld m6, m8, m12
+ paddd m3, m7
+ pmulld m7, m8, m13
+ REPX {paddd x, m9}, m4, m5, m6, m7
+ REPX {psrad x, 12}, m4, m5, m6, m7
+ paddd m4, m10
+ paddd m5, m11
+ paddd m6, m12
+ paddd m7, m13
+ jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -3844,13 +3961,14 @@ cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 16x8, %3
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%3bpc]
mov [cq], eobd ; 0
- mov r3d, 8
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- jmp m(inv_txfm_add_dct_dct_16x4_%3bpc).dconly
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
%endif
%endmacro
@@ -4013,13 +4131,13 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
.pass1:
lea r6, [rsp+32*4]
call .main
- vpbroadcastd m14, [pd_6144]
+ vpbroadcastd m14, [pd_3072]
psrld m15, 11 ; pd_1
- psubd m13, m14, m15 ; pd_6143
+ psubd m13, m14, m15 ; pd_3071
call .pass1_rotations
.pass1_end:
REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11
jmp tx2q
.pass2:
call m(idct_16x8_internal_10bpc).transpose
@@ -4127,8 +4245,6 @@ ALIGN function_align
pmaxsd m10, m13
pminsd m9, m14
pminsd m10, m14
- pmulld m9, m15
- pmulld m10, m15
mova [r6-32*4], m1
mova m11, [r6-32*1] ; t7a
mova m1, [r6-32*2] ; t6a
@@ -4140,7 +4256,6 @@ ALIGN function_align
pmaxsd m2, m13
pminsd m8, m14
pminsd m2, m14
- pmulld m8, m15
mova [r6-32*1], m11
mova [r6-32*3], m2
mova m1, [r6+32*3] ; t15
@@ -4153,8 +4268,6 @@ ALIGN function_align
pmaxsd m11, m13
pminsd m7, m14
pminsd m11, m14
- pmulld m7, m15
- pmulld m11, m15
mova [r6-32*2], m12
pminsd m1, m14, [r6+32*0] ; t10a
pminsd m12, m14, [r6+32*1] ; t11a
@@ -4162,13 +4275,13 @@ ALIGN function_align
paddd m1, m4 ; -out1
psubd m4, m5, m12 ; t11
paddd m5, m12 ; out14
- pmulld m12, m15, [r6-32*3] ; t6
+ vpbroadcastd m12, [pd_1448]
pmaxsd m6, m13
pmaxsd m4, m13
pminsd m6, m14
pminsd m4, m14
- pmulld m6, m15
- pmulld m4, m15
+ REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
+ pmulld m12, [r6-32*3] ; t6
mova [r6-32*3], m5
paddd m5, m11, m7 ; -out5 (unshifted)
psubd m11, m7 ; out10 (unshifted)
@@ -4233,7 +4346,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
.pass1:
lea r6, [rsp+32*4]
call m(iadst_16x8_internal_10bpc).main
- vpbroadcastd m14, [pd_6144]
+ vpbroadcastd m14, [pd_3072]
psrld m15, 11
psubd m13, m14, m15
call .pass1_rotations
@@ -4313,16 +4426,16 @@ cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
mova [rsp], m15
- vpbroadcastd m15, [pd_11586]
+ vpbroadcastd m15, [pd_5793]
REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
pmulld m15, [rsp]
mova [rsp], m7
- vpbroadcastd m7, [pd_6144]
+ vpbroadcastd m7, [pd_3072]
REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [rsp]
- REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
@@ -4340,6 +4453,10 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_max]
jmp m(idct_16x8_internal_10bpc).pass1
.pass2:
+ call .pass2_main
+ RET
+ALIGN function_align
+.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@@ -4383,8 +4500,7 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpermq m1, m5, q3120
vpermq m2, m6, q3120
vpermq m3, m7, q3120
- call m(idct_16x8_internal_10bpc).write_16x4_zero
- RET
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
ALIGN function_align
.write_16x4_start:
vpbroadcastd m9, [pixel_12bpc_max]
@@ -4403,7 +4519,8 @@ cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp m(iadst_16x8_internal_10bpc).pass1
.pass2:
call .pass2_main
- jmp m(idct_16x8_internal_12bpc).end
+ call m(idct_16x8_internal_12bpc).end
+ RET
ALIGN function_align
.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
@@ -4483,12 +4600,13 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
INV_TXFM_FN %1, %2, %3, 16x16, %4
%ifidn %1_%2, dct_dct
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_%4bpc]
mov [cq], eobd ; 0
- mov r3d, 16
- add r6d, 10240
- sar r6d, 14
- jmp m(inv_txfm_add_dct_dct_16x4_%4bpc).dconly2
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
%endif
%endmacro
@@ -4756,17 +4874,17 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
add cq, 32
call .main
sub cq, 32
- vpbroadcastd m8, [pd_10240]
+ vpbroadcastd m8, [pd_5120]
paddd m4, m8
paddd m6, m8
paddd m9, m8
paddd m11, m8
- vpbroadcastd m8, [pd_10239]
+ vpbroadcastd m8, [pd_5119]
psubd m5, m8, m5
psubd m7, m8, m7
psubd m10, m8, m10
psubd m12, m8, m12
- REPX {psrad x, 14}, m4, m5, m6, m7, m9, m10, m11, m12
+ REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
@@ -4797,8 +4915,8 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
.fast:
add r6, 32*8
call .main
- vpbroadcastd m14, [pd_10240]
- vpbroadcastd m13, [pd_10239]
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
psrld m15, 10 ; pd_2
paddd m0, m15
psubd m1, m15, m1
@@ -4818,7 +4936,7 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
psubd m15, [r6-32*4]
.pass1_end:
REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
sub r6, 32*8
jmp tx2q
.pass2:
@@ -4892,17 +5010,17 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
add cq, 32
call m(iadst_16x16_internal_10bpc).main
sub cq, 32
- vpbroadcastd m8, [pd_10240]
+ vpbroadcastd m8, [pd_5120]
paddd m11, m8
paddd m9, m8
paddd m6, m8
paddd m4, m8
- vpbroadcastd m8, [pd_10239]
+ vpbroadcastd m8, [pd_5119]
psubd m12, m8, m12
psubd m10, m8, m10
psubd m7, m8, m7
psubd m5, m8, m5
- REPX {psrad x, 14}, m12, m11, m10, m9, m7, m6, m5, m4
+ REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
mova [r6+32*0], m12
mova [r6+32*1], m11
mova [r6+32*2], m10
@@ -4933,8 +5051,8 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
.fast:
add r6, 32*8
call m(iadst_16x16_internal_10bpc).main
- vpbroadcastd m14, [pd_10240]
- vpbroadcastd m13, [pd_10239]
+ vpbroadcastd m14, [pd_5120]
+ vpbroadcastd m13, [pd_5119]
psrld m15, 10 ; pd_2
psubd m8, m13, m7
paddd m7, m14, m9
@@ -4996,9 +5114,8 @@ INV_TXFM_16X16_FN identity, dct, -92
INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
-.pass1:
- vpbroadcastd m15, [pd_11586]
- vpbroadcastd m7, [pd_10240]
+ vpbroadcastd m15, [pd_5793]
+ vpbroadcastd m7, [pd_5120]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
@@ -5010,7 +5127,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
pmulld m3, m15, [cq+r3+32*39]
add r6, 32*4
REPX {paddd x, m7}, m0, m1, m2, m3
- REPX {psrad x, 14}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
mova [r6+32*0], m0
mova [r6+32*1], m1
mova [r6+32*2], m2
@@ -5038,7 +5155,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [cq]
- REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
@@ -5203,7 +5320,7 @@ cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
call m(iadst_16x8_internal_10bpc).pass1_rotations
.pass2_part3:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
.end:
packssdw m15, m14
packssdw m14, m13, m12
@@ -5320,15 +5437,15 @@ ALIGN function_align
REPX {pminsd x, m14}, m1, m3, m4, m6
.pass2_fast2:
call m(iadst_16x8_internal_10bpc).main_part2
- vpbroadcastd m14, [pd_34816]
+ vpbroadcastd m14, [pd_17408]
psrld m15, 11 ; pd_1
- psubd m13, m14, m15 ; pd_34815
+ psubd m13, m14, m15 ; pd_17407
pslld m15, 3 ; pd_8
ret
ALIGN function_align
.pass2_part2:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
- REPX {psrad x, 16}, m4, m5, m6, m7, m8, m9, m10, m11
+ REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
@@ -5375,8 +5492,73 @@ cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
INV_TXFM_16X16_FN identity, dct, -92, 12
INV_TXFM_16X16_FN identity, identity, 0, 12
+%macro IDTX16_12BPC 1 ; src
+ pmulld m6, m7, m%1
+ paddd m6, m15
+ psrad m6, 12
+ paddd m6, m%1
+ psrad m%1, m6, 1
+%endmacro
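IDTX16_12BPC computes (x + ((x*1697 + 5120) >> 12)) >> 1 in place, which for arithmetic shifts equals the 10 bpc pass1 formula (x*5793 + 5120) >> 13 rewritten with 5793 = 4096 + 1697; the 32-bit pmulld therefore only ever multiplies by the smaller 1697 factor, leaving headroom for the wider 12 bpc coefficients.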
+
cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
- jmp m(iidentity_16x16_internal_10bpc).pass1
+ vpbroadcastd m7, [pd_1697]
+ vpbroadcastd m15, [pd_5120]
+ lea r6, [rsp+32*4]
+ sub eobd, 36
+ jl .fast
+ mov r3, -32*8*4
+.righthalf:
+ mova m10, [cq+r3+32*33]
+ mova m11, [cq+r3+32*35]
+ mova m12, [cq+r3+32*37]
+ mova m13, [cq+r3+32*39]
+ add r6, 32*4
+ pmulld m0, m7, m10
+ pmulld m1, m7, m11
+ pmulld m2, m7, m12
+ pmulld m3, m7, m13
+ REPX {paddd x, m15}, m0, m1, m2, m3
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ paddd m0, m10
+ paddd m1, m11
+ paddd m2, m12
+ paddd m3, m13
+ REPX {psrad x, 1 }, m0, m1, m2, m3
+ mova [r6+32*0], m0
+ mova [r6+32*1], m1
+ mova [r6+32*2], m2
+ mova [r6+32*3], m3
+ add r3, 32*8
+ jl .righthalf
+.fast:
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 1]
+ mova m2, [cq+64* 2]
+ mova m3, [cq+64* 3]
+ mova m4, [cq+64* 4]
+ mova m5, [cq+64* 5]
+ mova m8, [cq+64* 6]
+ mova m9, [cq+64* 7]
+ REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
+ mova [cq+64*0], m8
+ mova [cq+64*1], m9
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64* 9]
+ mova m10, [cq+64*10]
+ mova m11, [cq+64*11]
+ mova m12, [cq+64*12]
+ mova m13, [cq+64*13]
+ mova m14, [cq+64*14]
+ REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
+ mova m6, [cq+64*15]
+ pmulld m7, m6
+ paddd m7, m15
+ psrad m7, 12
+ paddd m7, m6
+ mova m6, [cq+64*0]
+ psrad m15, m7, 1
+ mova m7, [cq+64*1]
+ jmp tx2q
.pass2:
call m(iidentity_8x16_internal_12bpc).pass2_main
call m(idct_16x16_internal_10bpc).transpose_fast
@@ -5429,7 +5611,7 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
call m(idct_16x16_internal_12bpc).write_16x16
RET
-%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift
+%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
mova m%4, [r6+32*(%1-4)]
mova m%2, [r5+32*(3-%1)]
mova m%5, [r4+32*(%1-4)]
@@ -5446,8 +5628,10 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx
paddd m%2, m%3, m%5 ; out15 - n
psubd m%3, m%5 ; out16 + n
REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%if %7 & 1
packssdw m%1, m%3 ; out0 + n, out16 + n
packssdw m%2, m%4 ; out15 - n, out31 - n
+%endif
%endmacro
cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
@@ -5574,14 +5758,15 @@ cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
call m(idct_8x8_internal_10bpc).write_8x4
RET
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 32
- add r6d, 10240
- sar r6d, 14
- jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
ALIGN function_align
-.pass1_main:
+.pass1_main_part1:
mova m0, [cq+128*0]
mova m1, [cq+128*1]
mova m2, [cq+128*2]
@@ -5590,7 +5775,6 @@ ALIGN function_align
mova m5, [cq+128*5]
mova m6, [cq+128*6]
mova m7, [cq+128*7]
- add cq, 32
call m(idct_8x8_internal_10bpc).main
psrld m1, m11, 10 ; pd_2
REPX {paddd x, m1}, m0, m6, m5, m3
@@ -5603,6 +5787,11 @@ ALIGN function_align
psubd m4, m3, m8 ; out4
paddd m3, m8 ; out3
REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.pass1_main:
+ call .pass1_main_part1
+ add cq, 32
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
@@ -5665,7 +5854,7 @@ ALIGN function_align
vpbroadcastd m15, [pd_4017]
vpbroadcastd m10, [pd_799]
ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
- ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
psubd m3, m0, m6 ; t19a
paddd m0, m6 ; t16a
psubd m6, m7, m1 ; t28a
@@ -5734,7 +5923,7 @@ ALIGN function_align
vpbroadcastd m15, [pd_2276]
vpbroadcastd m10, [pd_3406]
ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
- ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
psubd m3, m0, m6 ; t27a
paddd m0, m6 ; t24a
psubd m6, m7, m1 ; t20a
@@ -5747,8 +5936,8 @@ ALIGN function_align
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
- ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
- ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20
mova m9, [r6-32*4] ; t16a
mova m10, [r6-32*3] ; t17
psubd m2, m9, m7 ; t23
@@ -5881,8 +6070,9 @@ ALIGN function_align
ret
cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
- vpbroadcastd m5, [pw_5]
vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_5]
pxor m6, m6
mov r6d, eobd
add eobb, 21
@@ -5947,30 +6137,262 @@ ALIGN function_align
vextracti128 [dstq+r4 ], m3, 1
ret
+cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ mov r4, cq
+ lea r6, [rsp+32*4]
+ call .pass1_main
+ cmp eobd, 43
+ jge .eob43
+ jmp .pass2_fast
+.eob43:
+ call .pass1_main
+ cmp eobd, 107
+ jge .eob107
+.pass2_fast:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ REPX {pminsd x, m13}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ jmp .pass2_end
+.eob107:
+ call .pass1_main
+ cmp eobd, 171
+ jge .eob171
+ jmp .pass2
+.eob171:
+ call .pass1_main
+.pass2:
+ mov cq, r4
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ pmaxsd m0, m12, [cq+128*1+ 0]
+ pmaxsd m1, m12, [cq+128*7+ 0]
+ pmaxsd m2, m12, [cq+128*1+32]
+ pmaxsd m3, m12, [cq+128*7+32]
+ pmaxsd m4, m12, [cq+128*1+64]
+ pmaxsd m5, m12, [cq+128*7+64]
+ pmaxsd m6, m12, [cq+128*1+96]
+ pmaxsd m7, m12, [cq+128*7+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ pmaxsd m0, m12, [cq+128*3+ 0]
+ pmaxsd m1, m12, [cq+128*5+ 0]
+ pmaxsd m2, m12, [cq+128*3+32]
+ pmaxsd m3, m12, [cq+128*5+32]
+ pmaxsd m4, m12, [cq+128*3+64]
+ pmaxsd m5, m12, [cq+128*5+64]
+ pmaxsd m6, m12, [cq+128*3+96]
+ pmaxsd m7, m12, [cq+128*5+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ pmaxsd m0, m12, [cq+128*2+ 0]
+ pmaxsd m1, m12, [cq+128*6+ 0]
+ pmaxsd m2, m12, [cq+128*2+32]
+ pmaxsd m3, m12, [cq+128*6+32]
+ pmaxsd m4, m12, [cq+128*2+64]
+ pmaxsd m5, m12, [cq+128*6+64]
+ pmaxsd m6, m12, [cq+128*2+96]
+ pmaxsd m7, m12, [cq+128*6+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ pmaxsd m0, m12, [cq+128*0+ 0]
+ pmaxsd m1, m12, [cq+128*4+ 0]
+ pmaxsd m2, m12, [cq+128*0+32]
+ pmaxsd m3, m12, [cq+128*4+32]
+ pmaxsd m4, m12, [cq+128*0+64]
+ pmaxsd m5, m12, [cq+128*4+64]
+ pmaxsd m6, m12, [cq+128*0+96]
+ pmaxsd m7, m12, [cq+128*4+96]
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+.pass2_end:
+ psrld m11, 8 ; pd_8
+ IDCT32_END 0, 15, 8, 9, 10, 4
+ IDCT32_END 1, 14, 8, 9, 10, 4
+ punpckhqdq m8, m0, m1 ; 16 17 (interleaved)
+ punpcklqdq m0, m1 ; 0 1 (interleaved)
+ punpcklqdq m1, m14, m15 ; 14 15 (interleaved)
+ punpckhqdq m14, m15 ; 30 31 (interleaved)
+ mova [r5+32*3], m8
+ mova [r5+32*2], m14
+ IDCT32_END 2, 15, 8, 9, 10, 4
+ IDCT32_END 3, 14, 8, 9, 10, 4
+ punpckhqdq m8, m2, m3 ; 18 19 (interleaved)
+ punpcklqdq m2, m3 ; 2 3 (interleaved)
+ punpcklqdq m3, m14, m15 ; 12 13 (interleaved)
+ punpckhqdq m14, m15 ; 28 29 (interleaved)
+ mova [r5+32*1], m8
+ mova [r5+32*0], m14
+ IDCT32_END 4, 15, 8, 9, 10, 4
+ IDCT32_END 5, 14, 8, 9, 10, 4
+ punpckhqdq m8, m4, m5 ; 20 21 (interleaved)
+ punpcklqdq m4, m5 ; 4 5 (interleaved)
+ punpcklqdq m5, m14, m15 ; 10 11 (interleaved)
+ punpckhqdq m14, m15 ; 26 27 (interleaved)
+ mova [r5-32*1], m8
+ mova [r5-32*2], m14
+ IDCT32_END 6, 15, 8, 9, 10, 4
+ IDCT32_END 7, 14, 8, 9, 10, 4
+ punpckhqdq m8, m6, m7 ; 22 23 (interleaved)
+ punpcklqdq m6, m7 ; 6 7 (interleaved)
+ punpcklqdq m7, m14, m15 ; 8 9 (interleaved)
+ punpckhqdq m14, m15 ; 24 25 (interleaved)
+ mova [r5-32*3], m8
+ mova [r5-32*4], m14
+ mova m15, m1
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m2, q3120
+ call m(idct_8x8_internal_12bpc).write_8x4_start
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m4, q3120
+ vpermq m1, m6, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m7, q3120
+ vpermq m1, m5, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, m3, q3120
+ vpermq m1, m15, q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*3], q3120
+ vpermq m1, [r5+32*1], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*1], q3120
+ vpermq m1, [r5-32*3], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5-32*4], q3120
+ vpermq m1, [r5-32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ vpermq m0, [r5+32*0], q3120
+ vpermq m1, [r5+32*2], q3120
+ call m(idct_8x8_internal_10bpc).write_8x4
+ RET
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m2, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+ALIGN function_align
+.pass1_main:
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
+ TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
+ mova [cq+128*0], m0
+ mova [cq+128*1], m1
+ mova [cq+128*2], m2
+ mova [cq+128*3], m3
+ mova [cq+128*4], m4
+ mova [cq+128*5], m5
+ mova [cq+128*6], m6
+ mova [cq+128*7], m7
+ add cq, 32
+ ret
+ALIGN function_align
+.main_end:
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2, 0
+ mova [cq+32*16], m8
+ mova [cq+32*31], m9
+ IDCT32_END 1, 14, 8, 9, 10, 2, 0
+ mova [cq+32*17], m8
+ mova [cq+32*30], m9
+ mova [cq+32*14], m14
+ IDCT32_END 2, 14, 8, 9, 10, 2, 0
+ mova [cq+32*18], m8
+ mova [cq+32*29], m9
+ mova [cq+32*13], m14
+ IDCT32_END 3, 14, 8, 9, 10, 2, 0
+ mova [cq+32*19], m8
+ mova [cq+32*28], m9
+ mova [cq+32*12], m14
+ IDCT32_END 4, 14, 8, 9, 10, 2, 0
+ mova [cq+32*20], m8
+ mova [cq+32*27], m9
+ mova [cq+32* 0], m0
+ mova [cq+32* 1], m1
+ mova [cq+32* 2], m2
+ IDCT32_END 5, 10, 0, 1, 2, 2, 0
+ mova [cq+32*21], m0
+ mova [cq+32*26], m1
+ IDCT32_END 6, 9, 0, 1, 2, 2, 0
+ mova [cq+32*22], m0
+ mova [cq+32*25], m1
+ IDCT32_END 7, 8, 0, 1, 2, 2, 0
+ mova [cq+32*23], m0
+ mova [cq+32*24], m1
+ mova m0, [cq+32* 0]
+ mova m1, [cq+32* 1]
+ mova m2, [cq+32* 2]
+ mova m11, m14
+ mova m12, [cq+32*12]
+ mova m13, [cq+32*13]
+ mova m14, [cq+32*14]
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1
+
cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .full
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 8
+ or r3d, 8
.dconly:
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
+ paddsw xm0, xm3
vpbroadcastw m0, xm0
- vpbroadcastd m4, [pixel_10bpc_max]
- pxor m3, m3
.dconly_loop:
- paddw m1, m0, [dstq+32*0]
- paddw m2, m0, [dstq+32*1]
- pmaxsw m1, m3
- pmaxsw m2, m3
- pminsw m1, m4
- pminsw m2, m4
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ psubusw m1, m3
+ psubusw m2, m3
mova [dstq+32*0], m1
mova [dstq+32*1], m2
add dstq, strideq
@@ -5979,6 +6401,39 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
RET
.full:
PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_18b_min]
+ vpbroadcastd m13, [clip_18b_max]
+ call .pass1
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ lea r6, [deint_shuf+128]
+ vpbroadcastd m11, [pw_2048]
+ mov r4, dstq
+ call .pass2
+ mova m0, [r5+32*3] ; 16 17
+ mova m1, [r5+32*2] ; 30 31
+ mova m2, [r5+32*1] ; 18 19
+ mova m3, [r5+32*0] ; 28 29
+ mova m4, [r5-32*1] ; 20 21
+ mova m5, [r5-32*2] ; 26 27
+ mova m6, [r5-32*3] ; 22 23
+ mova m7, [r5-32*4] ; 24 25
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ lea dstq, [r4+32]
+ call .pass2
+ RET
+ALIGN function_align
+.pass2:
+ call m(idct_16x8_internal_8bpc).main
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ call m(idct_16x8_internal_10bpc).write_16x4_start
+ pmulhrsw m0, m11, m4
+ pmulhrsw m1, m11, m5
+ pmulhrsw m2, m11, m6
+ pmulhrsw m3, m11, m7
+ jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ALIGN function_align
+.pass1:
mova m0, [cq+32* 1]
mova m1, [cq+32* 7]
mova m2, [cq+32* 9]
@@ -5988,10 +6443,7 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
mova m6, [cq+32*25]
mova m7, [cq+32*31]
vpbroadcastd m11, [pd_2048]
- vpbroadcastd m12, [clip_18b_min]
- vpbroadcastd m13, [clip_18b_max]
vpbroadcastd m14, [pd_2896]
- lea r6, [rsp+32*4]
call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
mova m0, [cq+32* 3]
mova m1, [cq+32* 5]
@@ -6021,37 +6473,12 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
mova m7, [cq+32*28]
call m(idct_8x8_internal_10bpc).main
call m(idct_8x16_internal_10bpc).main_evenhalf
- call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
- lea r6, [deint_shuf+128]
- vpbroadcastd m11, [pw_2048]
- mov r4, dstq
- call .pass2
- mova m0, [r5+32*3] ; 16 17
- mova m1, [r5+32*2] ; 30 31
- mova m2, [r5+32*1] ; 18 19
- mova m3, [r5+32*0] ; 28 29
- mova m4, [r5-32*1] ; 20 21
- mova m5, [r5-32*2] ; 26 27
- mova m6, [r5-32*3] ; 22 23
- mova m7, [r5-32*4] ; 24 25
- call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
- lea dstq, [r4+32]
- call .pass2
- RET
-ALIGN function_align
-.pass2:
- call m(idct_16x8_internal_8bpc).main
- REPX {pmulhrsw x, m11}, m0, m1, m2, m3
- call m(idct_16x8_internal_10bpc).write_16x4_start
- pmulhrsw m0, m11, m4
- pmulhrsw m1, m11, m5
- pmulhrsw m2, m11, m6
- pmulhrsw m3, m11, m7
- jmp m(idct_16x8_internal_10bpc).write_16x4_zero
+ ret
cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
- vpbroadcastd m5, [pw_4096]
vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_4096]
pxor m6, m6
mov r6d, eobd
add eobb, 21
@@ -6078,6 +6505,47 @@ cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
jge .loop
RET
+cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
+ test eobd, eobd
+ jnz .full
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_12bpc]
+ mov [cq], eobd ; 0
+ or r3d, 8
+ jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.full:
+ PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
+ lea r6, [rsp+32*4]
+ vpbroadcastd m12, [clip_20b_min]
+ vpbroadcastd m13, [clip_20b_max]
+ call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
+ call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
+ mov r4, dstq
+ call m(idct_16x8_internal_12bpc).pass2_main
+ mova m0, [cq+32* 0] ; 16
+ mova m1, [cq+32* 1] ; 17
+ mova m2, [cq+32* 2] ; 18
+ mova m3, [cq+32* 3] ; 19
+ mova m4, [cq+32* 4] ; 20
+ mova m5, [cq+32* 5] ; 21
+ mova m6, [cq+32* 6] ; 22
+ mova m7, [cq+32* 7] ; 23
+ mova m8, [cq+32* 8] ; 24
+ mova m9, [cq+32* 9] ; 25
+ mova m10, [cq+32*10] ; 26
+ mova m11, [cq+32*11] ; 27
+ mova m12, [cq+32*12] ; 28
+ mova m13, [cq+32*13] ; 29
+ mova m14, [cq+32*14] ; 30
+ mova m15, [cq+32*15] ; 31
+ lea dstq, [r4+32]
+ call m(idct_16x8_internal_12bpc).pass2_main
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
+ vpbroadcastd m7, [pixel_12bpc_max]
+ jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
+
%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
mova m%4, [%2]
paddsw m%3, m%1, m%4
@@ -6121,13 +6589,14 @@ cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
jmp .fast
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 32
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
.eob44:
mova [r4+16*0], xm0
mova [r4+16*1], xm3
@@ -6472,14 +6941,15 @@ cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
jmp .end
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 16
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
.full:
add cq, 32
@@ -6742,9 +7212,10 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
.fast:
lea r4, [rsp+32*71]
@@ -7019,12 +7490,13 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 64
- add r6d, 10240
- sar r6d, 14
- jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+ or r3d, 64
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
.fast:
lea r4, [rsp+32*38]
pxor m0, m0
@@ -7246,7 +7718,7 @@ ALIGN function_align
REPX {pmaxsd x, m12}, m8, m1, m6, m2
REPX {pminsd x, m13}, m8, m1, m6, m2
ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
- ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
REPX {pmaxsd x, m12}, m0, m3, m7, m4
REPX {pminsd x, m13}, m0, m3, m7, m4
vpbroadcastd m10, [r5+4*10]
@@ -7301,7 +7773,7 @@ ALIGN function_align
REPX {pmaxsd x, m12}, m8, m1, m3, m4
REPX {pminsd x, m13}, m8, m1, m3, m4
ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
- ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
REPX {pmaxsd x, m12}, m0, m2, m5, m7
REPX {pminsd x, m13}, m0, m5, m2, m7
psubd m6, m2, m7 ; t48a
@@ -7358,14 +7830,15 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [dconly_10bpc]
mov [cq], eobd ; 0
- mov r3d, 64
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ or r3d, 64
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
.fast:
lea r4, [rsp+32*70]
@@ -7540,30 +8013,26 @@ ALIGN function_align
cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .normal
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 16
+ or r3d, 16
.dconly:
- add r6d, 10240
- sar r6d, 14
+ add r6d, 640
+ sar r6d, 10
.dconly2:
- imul r6d, 2896
- add r6d, 34816
- sar r6d, 16
+ vpbroadcastd m5, [dconly_10bpc]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
movd xm0, r6d
-%if WIN64
- movaps [rsp+8], xmm6
-%endif
+ paddsw xm0, xm5
vpbroadcastw m0, xm0
- vpbroadcastd m6, [pixel_10bpc_max]
- pxor m5, m5
.dconly_loop:
- paddw m1, m0, [dstq+32*0]
- paddw m2, m0, [dstq+32*1]
- paddw m3, m0, [dstq+32*2]
- paddw m4, m0, [dstq+32*3]
- REPX {pmaxsw x, m5}, m1, m2, m3, m4
- REPX {pminsw x, m6}, m1, m2, m3, m4
+ paddsw m1, m0, [dstq+32*0]
+ paddsw m2, m0, [dstq+32*1]
+ paddsw m3, m0, [dstq+32*2]
+ paddsw m4, m0, [dstq+32*3]
+ REPX {psubusw x, m5}, m1, m2, m3, m4
mova [dstq+32*0], m1
mova [dstq+32*1], m2
mova [dstq+32*2], m3
@@ -7571,9 +8040,6 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
add dstq, strideq
dec r3d
jg .dconly_loop
-%if WIN64
- movaps xmm6, [rsp+8]
-%endif
RET
.normal:
PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
@@ -7814,14 +8280,14 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 32
- add r6d, 2048
- sar r6d, 12
- imul r6d, 2896
- add r6d, 6144
- sar r6d, 13
+ or r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
.fast:
pxor m0, m0
@@ -7963,9 +8429,9 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
call .main
jmp .pass2
.dconly:
- imul r6d, [cq], 2896
+ imul r6d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 64
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
.fast:
pxor m0, m0
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm
new file mode 100644
index 00000000000..b05fde54dc8
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_avx512.asm
@@ -0,0 +1,2599 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23
+ db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31
+ db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55
+ db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63
+idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51
+ db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59
+ db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17
+ db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25
+iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23
+ db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31
+ db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19
+ db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27
+permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13
+ db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29
+ db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15
+ db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31
+permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2
+ db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6
+ db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7
+ db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3
+permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6
+ db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14
+ db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7
+ db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15
+idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
+ db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
+ db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
+ db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
+idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25
+ db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57
+ db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29
+ db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61
+idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30
+ db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62
+ db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31
+ db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63
+
+pw_2048_m2048: times 16 dw 2048
+pw_m2048_2048: times 16 dw -2048
+pw_2048: times 16 dw 2048
+
+; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-
+%macro COEF_PAIR 2-3 0 ; a, b, flags
+%if %3 == 1
+pd_%1_m%2: dd %1, %1, -%2, -%2
+%define pd_%1 (pd_%1_m%2 + 4*0)
+%define pd_m%2 (pd_%1_m%2 + 4*2)
+%elif %3 == 2
+pd_m%1_%2: dd -%1, -%1, %2, %2
+%define pd_m%1 (pd_m%1_%2 + 4*0)
+%define pd_%2 (pd_m%1_%2 + 4*2)
+%else
+pd_%1_%2: dd %1, %1, %2, %2
+%define pd_%1 (pd_%1_%2 + 4*0)
+%define pd_%2 (pd_%1_%2 + 4*2)
+%if %3 == 3
+%define pd_%2_m%2 pd_%2
+dd -%2, -%2
+%endif
+%endif
+%endmacro
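+; e.g. "COEF_PAIR 799, 2276, 1" emits pd_799_m2276: dd 799, 799, -2276, -2276
+; and defines pd_799 and pd_m2276 as byte offsets 0 and 8 into that row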
+
+COEF_PAIR 201, 995
+COEF_PAIR 401, 1189, 1
+COEF_PAIR 401, 1931
+COEF_PAIR 401, 3920
+COEF_PAIR 799, 2276, 1
+COEF_PAIR 799, 3406
+COEF_PAIR 799, 4017
+COEF_PAIR 1380, 601
+COEF_PAIR 1751, 2440
+COEF_PAIR 2598, 1189
+COEF_PAIR 2598, 1931, 2
+COEF_PAIR 2598, 3612
+COEF_PAIR 2751, 2106
+COEF_PAIR 2896, 1567, 3
+COEF_PAIR 2896, 3784, 3
+COEF_PAIR 3035, 3513
+COEF_PAIR 3166, 1931
+COEF_PAIR 3166, 3612
+COEF_PAIR 3166, 3920
+COEF_PAIR 3703, 3290
+COEF_PAIR 3857, 4052
+COEF_PAIR 4017, 2276
+COEF_PAIR 4017, 3406
+COEF_PAIR 4076, 1189
+COEF_PAIR 4076, 3612
+COEF_PAIR 4076, 3920
+COEF_PAIR 4091, 3973
+
+pw_5: times 2 dw 5
+pw_4096: times 2 dw 4096
+pw_1697x16: times 2 dw 1697*16
+pw_2896x8: times 2 dw 2896*8
+pixel_10bpc_max: times 2 dw 0x03ff
+dconly_10bpc: times 2 dw 0x7c00
+clip_18b_min: dd -0x20000
+clip_18b_max: dd 0x1ffff
+pd_1: dd 1
+pd_2: dd 2
+pd_1448: dd 1448
+pd_2048: dd 2048
+pd_3071: dd 3071 ; 1024 + 2048 - 1
+pd_3072: dd 3072 ; 1024 + 2048
+pd_5119: dd 5119 ; 1024 + 4096 - 1
+pd_5120: dd 5120 ; 1024 + 4096
+pd_5793: dd 5793
+
+cextern int8_permA
+cextern idct_8x8_internal_8bpc_avx512icl.main
+cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_8x16_internal_8bpc_avx512icl.main
+cextern idct_8x16_internal_8bpc_avx512icl.main2
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast
+cextern idct_8x16_internal_8bpc_avx512icl.main_fast2
+cextern iadst_8x16_internal_8bpc_avx512icl.main2
+cextern idct_16x8_internal_8bpc_avx512icl.main
+cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2
+cextern idct_16x16_internal_8bpc_avx512icl.main
+cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2
+cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end
+cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main
+
+SECTION .text
+
+%define o_base (pw_2048+4*128)
+%define o_base_8bpc (int8_permA+64*18)
+%define o(x) (r5 - o_base + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+INIT_ZMM avx512icl
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 1 = inv_dst1, 2 = inv_dst2
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+%if %8 < 4096
+ vpbroadcastd m%3, [o(pd_%8)]
+%else
+ vbroadcasti32x4 m%3, [o(pd_%8)]
+%endif
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+%if %7 < 4096
+ vpbroadcastd m%5, [o(pd_%7)]
+%else
+ vbroadcasti32x4 m%5, [o(pd_%7)]
+%endif
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 2
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+%if %9 & 1
+ psubd m%1, m%3, m%1
+%else
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
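+; coef arguments below 32 select a register (m%7/m%8), values below 4096 use a
+; dword broadcast of pd_<coef>, and underscore-joined pairs such as 799_3406
+; broadcast the matching 128-bit COEF_PAIR row (the underscore makes the token
+; compare as a large number, selecting the vbroadcasti32x4 path)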
+
+%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
+cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_10bpc)
+ lea r5, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%4_internal_10bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+%if %3
+ add eobd, %3
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
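+ ; (the TIMES count below is 1 only when %%p1 lies further down, so the jmp is
+ ; omitted when execution can simply fall through)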
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+.dconly:
+ add r6d, 384
+ sar r6d, 9
+.dconly2:
+ vpbroadcastd ym2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw ym1, r6d
+ paddsw ym1, ym2
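+ ; ym2 holds 0x7c00 (dconly_10bpc); since 0x7c00+0x3ff == 0x7fff, the saturating
+ ; paddsw in the loop clips the high end at the 10-bit maximum and the following
+ ; psubusw clips negative results to zero, avoiding a separate pmaxsw/pminsw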
+.dconly_loop:
+ mova xm0, [dstq+strideq*0]
+ vinserti32x4 ym0, [dstq+strideq*1], 1
+ paddsw ym0, ym1
+ psubusw ym0, ym2
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+
+cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call .load
+ vpermi2q m1, m0, m2 ; 1 5
+ vpermi2q m3, m6, m4 ; 7 3
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ call .main_end
+ mova m4, [o(idct8x8p)]
+ packssdw m0, m2 ; 0 1 4 5
+ packssdw m1, m3 ; 3 2 7 6
+ vpermb m0, m4, m0
+ vprolq m1, 32
+ vpermb m2, m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ call m(idct_8x8_internal_8bpc).main
+ mova m10, [permC]
+ vpbroadcastd m12, [pw_2048]
+.end:
+ vpermt2q m0, m10, m1
+ vpermt2q m2, m10, m3
+.end2:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m10, m10
+ pmulhrsw m8, m12, m0
+ call .write_8x4_start
+ pmulhrsw m8, m12, m2
+.write_8x4:
+ lea dstq, [dstq+strideq*4]
+ add cq, 64*2
+.write_8x4_start:
+ mova xm9, [dstq+strideq*0]
+ vinserti32x4 ym9, [dstq+strideq*1], 1
+ vinserti32x4 m9, [dstq+strideq*2], 2
+ vinserti32x4 m9, [dstq+r6 ], 3
+ mova [cq+64*0], m10
+ mova [cq+64*1], m10
+ paddw m9, m8
+ pmaxsw m9, m10
+ pminsw m9, m11
+ mova [dstq+strideq*0], xm9
+ vextracti32x4 [dstq+strideq*1], ym9, 1
+ vextracti32x4 [dstq+strideq*2], m9, 2
+ vextracti32x4 [dstq+r6 ], m9, 3
+ ret
+ALIGN function_align
+.load:
+ mova m0, [cq+64*0] ; 0 1
+ mova m4, [cq+64*1] ; 2 3
+ mova m1, [o(permB)]
+ mova m2, [cq+64*2] ; 4 5
+ mova m6, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m5, m1, 32
+ vpbroadcastd m12, [o(pd_2896)]
+ mova m3, m1
+ vpbroadcastd m11, [o(pd_1)]
+ ret
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m8, [o(pd_799_4017)]
+ pmulld m8, m1 ; t4 t7
+ vpmulld m0, [o(pd_2896)] {1to16} ; dct4 out0 out1
+ REPX {paddd x, m13}, m8, m0
+ REPX {psrad x, 12 }, m8, m0
+ pmulld m3, m8, m12
+ mova m2, m0 ; dct4 out3 out2
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m3, [o(pd_4017_3406)]
+ vbroadcasti32x4 m8, [o(pd_799_m2276)]
+ vbroadcasti32x4 m2, [o(pd_2896_3784)]
+ vbroadcasti32x4 m9, [o(pd_2896_1567)]
+ pmulld m3, m1 ; t4a t5a
+ pmulld m1, m8 ; t7a t6a
+ pmulld m2, m0 ; t0 t3
+ pmulld m0, m9 ; t1 t2
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276
+ ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784
+.main2:
+ REPX {paddd x, m13}, m1, m3, m0, m2
+ REPX {psrad x, 12 }, m1, m3, m0, m2
+ punpcklqdq m8, m1, m3 ; t4a t7a
+ punpckhqdq m1, m3 ; t5a t6a
+ psubd m3, m8, m1 ; t5a t6a
+ paddd m8, m1 ; t4 t7
+ pmaxsd m3, m14
+ punpckhqdq m1, m2, m0 ; t3 t2
+ pminsd m3, m15
+ punpcklqdq m2, m0 ; t0 t1
+ pmulld m3, m12
+ paddd m0, m2, m1 ; dct4 out0 out1
+ psubd m2, m1 ; dct4 out3 out2
+ REPX {pmaxsd x, m14}, m8, m0, m2
+ REPX {pminsd x, m15}, m8, m0, m2
+.main3:
+ pshufd m1, m3, q1032
+ paddd m3, m13
+ psubd m9, m3, m1
+ paddd m3, m1
+ psrad m9, 12
+ psrad m3, 12
+ punpckhqdq m1, m8, m3 ; t7 t6
+ shufpd m8, m9, 0xaa ; t4 t5
+ ret
+.main_end:
+ paddd m0, m11
+ paddd m2, m11
+ psubd m3, m0, m1 ; out7 out6
+ paddd m0, m1 ; out0 out1
+ paddd m1, m2, m8 ; out3 out2
+ psubd m2, m8 ; out4 out5
+ REPX {vpsravd x, m11}, m0, m2, m3, m1
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+INV_TXFM_8X8_FN adst, adst
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call .main
+ punpckldq m1, m2, m4 ; out4 out6
+ punpckhdq m2, m0 ; -out5 -out7
+ punpckldq m0, m3 ; out0 out2
+ punpckhdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.pass1_end:
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m0, m1 ; 0 2 4 6
+ packssdw m4, m3 ; 1 3 5 7
+ psrlq m1, [o(permB)], 8
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ psrlq m2, m1, 32
+ vpermi2q m1, m0, m3
+ vpermt2q m0, m2, m3
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ movu m10, [permC+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ jmp m(idct_8x8_internal_10bpc).end
+.main_pass2:
+ vextracti32x8 ym2, m0, 1
+ vextracti32x8 ym3, m1, 1
+ lea r5, [o_base_8bpc]
+ pshufd ym4, ym0, q1032
+ pshufd ym5, ym1, q1032
+ jmp m(iadst_8x8_internal_8bpc).main_pass2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612
+ ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189
+ psubd m4, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ REPX {pmaxsd x, m14}, m4, m2, m0, m1
+ REPX {pminsd x, m15}, m4, m2, m0, m1
+ pxor m5, m5
+ psubd m5, m4
+ shufpd m4, m2, 0xaa ; t4 t7
+ shufpd m2, m5, 0xaa ; t5 -t6
+ ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784
+ punpckhqdq m3, m0, m1
+ punpcklqdq m0, m1
+ psubd m1, m0, m3 ; t2 t3
+ paddd m0, m3 ; out0 -out7
+ punpckhqdq m3, m4, m2 ; t7a t6a
+ punpcklqdq m4, m2 ; t5a t4a
+ psubd m2, m4, m3 ; t7 t6
+ paddd m4, m3 ; out6 -out1
+ REPX {pmaxsd x, m14}, m1, m2
+ REPX {pminsd x, m15}, m1, m2
+ shufpd m3, m1, m2, 0xaa
+ shufpd m1, m2, 0x55
+ pmulld m3, m12
+ pmulld m1, m12
+ paddd m3, m13
+ psubd m2, m3, m1
+ paddd m3, m1
+ psrad m2, 12 ; out4 -out5
+ pshufd m3, m3, q1032
+ psrad m3, 12 ; out2 -out3
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, identity
+INV_TXFM_8X8_FN flipadst, flipadst
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x8_internal_10bpc).load
+ vpermi2q m1, m6, m2 ; 7 5
+ vpermi2q m3, m4, m0 ; 3 1
+ vpermt2q m0, m5, m4 ; 0 2
+ vpermt2q m2, m5, m6 ; 4 6
+ call m(iadst_8x8_internal_10bpc).main
+ punpckhdq m1, m3, m4 ; -out3 -out1
+ punpckldq m3, m0 ; out2 out0
+ punpckhdq m0, m2 ; -out7 -out5
+ punpckldq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_8x8_internal_10bpc).main_pass2
+ movu m10, [permC+1]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ lea r6, [strideq*3]
+ vpermt2q m0, m10, m1 ; 7 6 5 4
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m2, m10, m3 ; 3 2 1 0
+ pxor m10, m10
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m0
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ packssdw m1, [cq+64*2] ; 0 4 1 5
+ mova m2, [cq+64*1] ; 2 6 3 7
+ packssdw m2, [cq+64*3]
+ mova m0, [o(idtx8x8p)]
+ vpermb m1, m0, m1
+ vpermb m2, m0, m2
+ punpckldq m0, m1, m2 ; 0 1 4 5
+ punpckhdq m1, m2 ; 2 3 6 7
+ jmp tx2q
+.pass2:
+ movu m3, [o(permC+2)]
+ vpbroadcastd m12, [o(pw_4096)]
+ psrlq m2, m3, 32
+ vpermi2q m2, m0, m1
+ vpermt2q m0, m3, m1
+ jmp m(idct_8x8_internal_10bpc).end2
+
+%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 8x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, adst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call .load
+ call .main
+ call .main_end
+.pass1_end:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ jmp tx2q
+.pass2:
+ mova m8, [o(idct8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpcklqdq m8, m0, m2 ; 15 1
+ punpckhqdq m0, m2 ; 7 9
+ punpckhqdq m1, m5, m4 ; 3 13
+ punpcklqdq m5, m4 ; 11 5
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym7, m8, 1 ; 14 2
+ vextracti32x8 ym3, m0, 1 ; 6 10
+ vextracti32x8 ym6, m1, 1 ; 12 4
+ vextracti32x8 ym9, m5, 1 ; 8 0
+ call m(idct_8x16_internal_8bpc).main2
+ mova m8, [permC]
+ vpbroadcastd m12, [pw_2048]
+ vpermt2q m0, m8, m1
+ lea r6, [strideq*3]
+ vpermt2q m2, m8, m3
+ vpbroadcastd m11, [pixel_10bpc_max]
+ vpermt2q m4, m8, m5
+ pxor m10, m10
+ vpermt2q m6, m8, m7
+ pmulhrsw m8, m12, m0
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m2
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*1]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*7]
+ mova ym7, [cq+64*3]
+ call .round_input_fast
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ movu m6, [o(permC+3)]
+ packssdw m3, m1, m3
+ packssdw m1, m0, m2
+ vprolq m3, 32
+ vpermd m1, m6, m1
+ vpermd m3, m6, m3
+ mova ym0, ym1 ; 0 4
+ vextracti32x8 ym1, m1, 1 ; 1 5
+ mova ym2, ym3 ; 2 6
+ vextracti32x8 ym3, m3, 1 ; 3 7
+ jmp tx2q
+ALIGN function_align
+.round_input_fast:
+ movshdup m8, [o(permB)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpermt2q m0, m8, m4
+ vpermt2q m1, m8, m5
+ vpermt2q m2, m8, m6
+ vpermt2q m3, m8, m7
+ vpbroadcastd m13, [o(pd_2048)]
+ REPX {pmulld x, m12}, m0, m1, m2, m3
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {paddd x, m13}, m0, m1, m2, m3
+ vpbroadcastd m11, [o(pd_1)]
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+ALIGN function_align
+.load:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+.load2:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m0, m12, [cq+64*0]
+ pmulld m1, m12, [cq+64*1]
+ pmulld m2, m12, [cq+64*2]
+ pmulld m3, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pmulld m4, m12, [cq+64*4]
+ pmulld m5, m12, [cq+64*5]
+ pmulld m6, m12, [cq+64*6]
+ pmulld m7, m12, [cq+64*7]
+ REPX {paddd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, 13, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, 13, 799, 4017 ; t4a t7a
+ pmulld m0, m12
+ pmulld m4, m12
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ psubd m5, m7, m3 ; t6a
+ paddd m7, m3 ; t7
+ pmaxsd m5, m14
+ pmaxsd m1, m14
+ pminsd m5, m15
+ pminsd m1, m15
+ pmulld m5, m12
+ pmulld m1, m12
+ ITX_MULSUB_2D 2, 6, 3, 9, 10, 13, 1567, 3784 ; t2 t3
+ pmaxsd m8, m14
+ pmaxsd m7, m14
+ paddd m0, m13
+ pminsd m8, m15
+ psubd m3, m0, m4
+ paddd m5, m13
+ paddd m0, m4
+ psubd m4, m5, m1
+ paddd m5, m1
+ REPX {psrad x, 12 }, m3, m5, m0, m4
+ paddd m1, m3, m2 ; dct4 out1
+ psubd m2, m3, m2 ; dct4 out2
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ pminsd m6, m15, m7
+ REPX {pmaxsd x, m14}, m0, m1, m2, m3
+ REPX {pminsd x, m15}, m0, m1, m2, m3
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_1)]
+.main_end2:
+ REPX {paddd x, m11}, m0, m1, m2, m3
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ psubd m6, m1, m5 ; out6
+ paddd m1, m5 ; out1
+ psubd m5, m2, m4 ; out5
+ paddd m2, m4 ; out2
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+ REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, identity, 35
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, adst
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call .main
+ psrad m0, 1
+ psrad m1, 1
+ psrad m6, m10, 1
+ psrad m7, m11, 1
+ psrad m2, 12
+ psrad m3, 12
+ psrad m4, m8, 12
+ psrad m5, m9, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call .fast_main
+ punpcklqdq m1, m2, m4 ; out4 out6
+ punpckhqdq m2, m0 ; -out5 -out7
+ punpcklqdq m0, m3 ; out0 out2
+ punpckhqdq m4, m3 ; -out1 -out3
+ paddd m1, m11
+ psubd m3, m11, m2
+ paddd m0, m11
+ psubd m4, m11, m4
+.fast_end:
+ movu m5, [o(permC+3)]
+ REPX {psrad x, 1}, m1, m0, m3, m4
+ packssdw m2, m0, m1 ; 0 2 4 6
+ packssdw m3, m4, m3 ; 1 3 5 7
+ vpermd m2, m5, m2
+ vpermd m3, m5, m3
+ mova ym0, ym2
+ vextracti32x8 ym2, m2, 1
+ mova ym1, ym3
+ vextracti32x8 ym3, m3, 1
+ jmp tx2q
+.pass2:
+ call .pass2_main
+ movu m4, [permB+2]
+ vbroadcasti32x8 m12, [pw_2048_m2048+16]
+ psrlq m7, m4, 8
+ vpermi2q m4, m0, m3 ; 0 1 2 3
+ psrlq m5, m7, 24
+ vpermi2q m7, m0, m3 ; 12 13 14 15
+ psrlq m6, m5, 8
+ vpermq m5, m5, m1 ; 4 5 6 7
+ vpermq m6, m6, m2 ; 8 9 10 11
+.pass2_end:
+ vpbroadcastd m11, [pixel_10bpc_max]
+ pxor m10, m10
+ lea r6, [strideq*3]
+ pmulhrsw m8, m12, m4
+ call m(idct_8x8_internal_10bpc).write_8x4_start
+ pmulhrsw m8, m12, m5
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m6
+ call m(idct_8x8_internal_10bpc).write_8x4
+ pmulhrsw m8, m12, m7
+ jmp m(idct_8x8_internal_10bpc).write_8x4
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7
+ vpbroadcastd m10, [o(pd_1567)]
+ vpbroadcastd m11, [o(pd_3784)]
+ ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a
+ vpbroadcastd m12, [o(pd_1448)]
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m14}, m5, m3, m2, m9
+ REPX {pminsd x, m15}, m5, m3, m2, m9
+ REPX {pmulld x, m12}, m5, m3, m2, m9
+ vpbroadcastd m4, [o(pd_1)]
+ psubd m8, m5, m3 ; (t2 - t3) * 1448
+ paddd m3, m5 ; (t2 + t3) * 1448
+ psubd m5, m2, m9 ; (t6 - t7) * 1448
+ paddd m2, m9 ; (t6 + t7) * 1448
+ vpbroadcastd m9, [o(pd_3072)]
+ paddd m0, m4
+ psubd m1, m4, m1
+ paddd m10, m6, m4
+ psubd m11, m4, m7
+ paddd m2, m9
+ paddd m8, m9
+ vpbroadcastd m9, [o(pd_3071)]
+ psubd m3, m9, m3
+ psubd m9, m5
+ ret
+ALIGN function_align
+.fast_main:
+ mova ym0, [cq+64*0]
+ mova ym4, [cq+64*2]
+ mova ym1, [cq+64*7]
+ mova ym5, [cq+64*5]
+ mova ym2, [cq+64*4]
+ mova ym6, [cq+64*6]
+ mova ym3, [cq+64*3]
+ mova ym7, [cq+64*1]
+ call m(idct_8x16_internal_10bpc).round_input_fast
+ jmp m(iadst_8x8_internal_10bpc).main
+ALIGN function_align
+.pass2_main:
+ mova m8, [o(iadst8x16p)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3
+ vpbroadcastd m10, [o(pw_2896x8)]
+ punpckhdq m5, m0, m1
+ punpckldq m0, m1
+ punpckhdq m1, m2, m3
+ punpckldq m2, m3
+ lea r5, [o_base_8bpc]
+ punpckhqdq m4, m0, m2 ; 12 3 14 1
+ punpcklqdq m0, m2 ; 0 15 2 13
+ punpckhqdq m6, m5, m1 ; 8 7 10 5
+ punpcklqdq m5, m1 ; 4 11 6 9
+ call m(iadst_8x16_internal_8bpc).main2
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m10 ; -out7 out4 out6 -out5
+ pmulhrsw m2, m10 ; out8 -out11 -out9 out10
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, identity, 35
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 43
+ jl .fast
+ call m(idct_8x16_internal_10bpc).load
+ call m(iadst_8x16_internal_10bpc).main
+ psrad m7, m0, 1
+ psrad m0, m11, 1
+ psrad m6, m1, 1
+ psrad m1, m10, 1
+ psrad m5, m2, 12
+ psrad m2, m9, 12
+ psrad m4, m3, 12
+ psrad m3, m8, 12
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_8x16_internal_10bpc).fast_main
+ punpckhqdq m1, m3, m4 ; -out3 -out1
+ punpcklqdq m3, m0 ; out2 out0
+ punpckhqdq m0, m2 ; -out7 -out5
+ punpcklqdq m4, m2 ; out6 out4
+ psubd m1, m11, m1
+ paddd m3, m11
+ psubd m0, m11, m0
+ paddd m4, m11
+ jmp m(iadst_8x16_internal_10bpc).fast_end
+.pass2:
+ call m(iadst_8x16_internal_10bpc).pass2_main
+ movu m7, [permB+2]
+ vbroadcasti32x8 m12, [pw_m2048_2048+16]
+ psrlq m4, m7, 8
+ vpermi2q m7, m3, m0 ; 3 2 1 0
+ psrlq m5, m4, 24
+ vpermi2q m4, m3, m0 ; 15 14 13 12
+ psrlq m6, m5, 8
+ vpermq m5, m5, m2 ; 11 10 9 8
+ vpermq m6, m6, m1 ; 7 6 5 4
+ jmp m(iadst_8x16_internal_10bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ jmp m(idct_8x16_internal_10bpc).pass1_end
+.pass2:
+ vpbroadcastd m8, [o(pw_1697x16)]
+ pmulhrsw m4, m8, m0
+ pmulhrsw m5, m8, m1
+ pmulhrsw m6, m8, m2
+ pmulhrsw m7, m8, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ vpbroadcastd m7, [o(pw_2048)]
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ vpbroadcastd m6, [o(pixel_10bpc_max)]
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m1
+ punpckhdq m4, m1
+ pxor m5, m5
+ punpckhqdq m1, m0, m2 ; 1 5 9 13
+ punpcklqdq m0, m2 ; 0 4 8 12
+ punpcklqdq m2, m3, m4 ; 2 6 10 14
+ punpckhqdq m3, m4 ; 3 7 11 15
+ lea r6, [strideq*3]
+ pmulhrsw m0, m7
+ call .write_8x4_start
+ pmulhrsw m0, m7, m1
+ call .write_8x4
+ pmulhrsw m0, m7, m2
+ call .write_8x4
+ pmulhrsw m0, m7, m3
+.write_8x4:
+ add dstq, strideq
+ add cq, 64*2
+.write_8x4_start:
+ mova xm4, [dstq+strideq*0]
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ vinserti32x4 m4, [dstq+strideq*8], 2
+ vinserti32x4 m4, [dstq+r6*4 ], 3
+ mova [cq+64*0], m5
+ mova [cq+64*1], m5
+ paddw m4, m0
+ pmaxsw m4, m5
+ pminsw m4, m6
+ mova [dstq+strideq*0], xm4
+ vextracti32x4 [dstq+strideq*4], ym4, 1
+ vextracti32x4 [dstq+strideq*8], m4, 2
+ vextracti32x4 [dstq+r6*4 ], m4, 3
+ ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x8
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 8
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 384
+ sar r6d, 9
+.dconly:
+ vpbroadcastd m2, [o(dconly_10bpc)]
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m1, r6d
+ paddsw m1, m2
+.dconly_loop:
+ mova ym0, [dstq+strideq*0]
+ vinserti32x8 m0, [dstq+strideq*1], 1
+ paddsw m0, m1
+ psubusw m0, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, -21
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, adst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m4, m12, [cq+64*0] ; 0 1
+ pmulld m9, m12, [cq+64*1] ; 2 3
+ pmulld m8, m12, [cq+64*2] ; 4 5
+ pmulld m7, m12, [cq+64*3] ; 6 7
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m2, m2
+ mova m15, [o(permB)]
+ REPX {mova [cq+64*x], m2}, 0, 1, 2, 3
+ psrlq m0, m15, 32
+ REPX {paddd x, m13}, m4, m9, m8, m7
+ vpbroadcastd m14, [o(clip_18b_min)]
+ REPX {psrad x, 12 }, m4, m8, m9, m7
+ mova m1, m0
+ vpermi2q m0, m4, m8 ; 0 4
+ cmp eobd, 43
+ jl .fast
+ pmulld m5, m12, [cq+64*4] ; 8 9
+ pmulld m10, m12, [cq+64*5] ; 10 11
+ pmulld m11, m12, [cq+64*6] ; 12 13
+ pmulld m6, m12, [cq+64*7] ; 14 15
+ REPX {mova [cq+64*x], m2}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m5, m10, m11, m6
+ REPX {psrad x, 12 }, m10, m5, m11, m6
+ mova m2, m1
+ vpermi2q m1, m9, m10 ; 2 10
+ mova m3, m2
+ vpermi2q m2, m5, m11 ; 8 12
+ vpermi2q m3, m6, m7 ; 14 6
+ vpermt2q m4, m15, m11 ; 1 13
+ vpermt2q m6, m15, m9 ; 15 3
+ vpermt2q m5, m15, m8 ; 9 5
+ vpermt2q m7, m15, m10 ; 7 11
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main
+ call .main
+ jmp .pass1_end
+.fast:
+ vpermi2q m1, m9, m7 ; 2 6
+ vpermt2q m4, m15, m9 ; 1 3
+ vpermt2q m7, m15, m8 ; 7 5
+ vpbroadcastd m15, [o(clip_18b_max)]
+ call m(idct_8x8_internal_10bpc).main_fast
+ call .main_fast
+.pass1_end:
+ call m(idct_8x16_internal_10bpc).main_end
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+.pass1_end2:
+ mova m10, m9
+ mova m11, m8
+ call .transpose_16x8
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x8_internal_8bpc).main
+ movshdup m4, [permC]
+ vpbroadcastd m13, [pw_2048]
+ psrlq m5, m4, 8
+ vpermq m0, m4, m0
+ vpermq m1, m5, m1
+ vpermq m2, m4, m2
+ vpermq m3, m5, m3
+.end:
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ pmulhrsw m8, m13, m0
+ pmulhrsw m9, m13, m1
+ lea r6, [strideq*3]
+ call .write_16x4
+ pmulhrsw m8, m13, m2
+ pmulhrsw m9, m13, m3
+.write_16x4:
+ mova ym10, [dstq+strideq*0]
+ vinserti32x8 m10, [dstq+strideq*1], 1
+ paddw m8, m10
+ mova ym10, [dstq+strideq*2]
+ vinserti32x8 m10, [dstq+r6 ], 1
+ paddw m9, m10
+ pmaxsw m8, m14
+ pmaxsw m9, m14
+ pminsw m8, m15
+ pminsw m9, m15
+ mova [dstq+strideq*0], ym8
+ vextracti32x8 [dstq+strideq*1], m8, 1
+ mova [dstq+strideq*2], ym9
+ vextracti32x8 [dstq+r6 ], m9, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ pmulld m6, m4 ; t15 t12
+ pmulld m4, m3 ; t9 t10
+ REPX {paddd x, m13}, m6, m4
+ REPX {psrad x, 12 }, m6, m4
+ mova m5, m6 ; t14 t13
+ mova m9, m4 ; t8 t11
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m6, [o(pd_4076_3920)]
+ vbroadcasti32x4 m3, [o(pd_401_m1189)]
+ vbroadcasti32x4 m5, [o(pd_m2598_1931)]
+ vbroadcasti32x4 m9, [o(pd_3166_3612)]
+ pmulld m6, m4 ; t15a t12a
+ pmulld m4, m3 ; t8a t11a
+ pmulld m5, m7 ; t9a t10a
+ pmulld m7, m9 ; t14a t13a
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189
+ ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612
+.main2:
+ REPX {paddd x, m13}, m4, m6, m5, m7
+ REPX {psrad x, 12 }, m4, m5, m6, m7
+ paddd m9, m4, m5 ; t8 t11
+ psubd m4, m5 ; t9 t10
+ psubd m5, m6, m7 ; t14 t13
+ paddd m6, m7 ; t15 t12
+ REPX {pmaxsd x, m14}, m5, m4, m9, m6
+ REPX {pminsd x, m15}, m5, m4, m9, m6
+.main3:
+ psubd m3, m0, m1 ; dct8 out7 out6
+ paddd m0, m1 ; dct8 out0 out1
+ vbroadcasti32x4 m7, [o(pd_3784_m3784)]
+ pmulld m7, m5
+ vpmulld m5, [o(pd_1567)] {1to16}
+ paddd m1, m2, m8 ; dct8 out3 out2
+ psubd m2, m8 ; dct8 out4 out5
+ vbroadcasti32x4 m8, [o(pd_1567_m1567)]
+ pmulld m8, m4
+ vpmulld m4, [o(pd_3784)] {1to16}
+ REPX {pmaxsd x, m14}, m0, m1
+ REPX {pminsd x, m15}, m0, m1
+ paddd m7, m13
+ paddd m5, m13
+ paddd m7, m8
+ psubd m5, m4
+ psrad m7, 12 ; t14a t10a
+ psrad m5, 12 ; t9a t13a
+ punpckhqdq m4, m9, m7
+ punpcklqdq m8, m9, m5
+ punpckhqdq m5, m6, m5
+ punpcklqdq m6, m7
+ psubd m7, m8, m4 ; t11a t10
+ paddd m8, m4 ; t8a t9
+ psubd m4, m6, m5 ; t12a t13
+ paddd m6, m5 ; t15a t14
+ REPX {pmaxsd x, m14}, m4, m7
+ REPX {pminsd x, m15}, m4, m7
+ pmulld m4, m12
+ pmulld m7, m12
+ REPX {pmaxsd x, m14}, m2, m3, m6, m8
+ REPX {pminsd x, m15}, m2, m3, m6, m8
+ paddd m4, m13
+ paddd m5, m4, m7
+ psubd m4, m7
+ psrad m4, 12 ; t11 t10a
+ psrad m5, 12 ; t12 t13a
+ ret
+ALIGN function_align
+.transpose_16x8:
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ vpermi2d m8, m0, m2
+ vpermt2d m0, m9, m2
+ vpermi2d m10, m1, m3
+ vpermi2d m11, m1, m3
+ punpckhwd m3, m8, m0
+ punpcklwd m1, m8, m0
+ punpckhwd m4, m10, m11
+ punpcklwd m2, m10, m11
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, identity, -21
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, adst
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ call .main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
+ jmp m(idct_16x8_internal_10bpc).pass1_end2
+.pass2:
+ call .main_pass2
+ vpermq m8, m13, m0
+ vpermq m9, m13, m1
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m13, m2
+ vpermq m9, m13, m3
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m12, [o(pd_2896)]
+ pmulld m2, m12, [cq+64*0]
+ pmulld m7, m12, [cq+64*1]
+ pmulld m1, m12, [cq+64*2]
+ pmulld m5, m12, [cq+64*3]
+ vpbroadcastd m13, [o(pd_2048)]
+ pxor m4, m4
+ mova m10, [o(permB)]
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ REPX {paddd x, m13}, m2, m7, m1, m5
+ psrlq m6, m10, 32
+ REPX {psrad x, 12 }, m2, m7, m1, m5
+ mova m0, m6
+ vpermi2q m0, m2, m7 ; 0 2
+ vpermt2q m7, m10, m2 ; 3 1
+ mova m2, m6
+ vpermi2q m2, m1, m5 ; 4 6
+ vpermt2q m5, m10, m1 ; 7 5
+ cmp eobd, 43
+ jl .main_fast
+ pmulld m8, m12, [cq+64*4]
+ pmulld m3, m12, [cq+64*5]
+ pmulld m9, m12, [cq+64*6]
+ pmulld m1, m12, [cq+64*7]
+ REPX {mova [cq+64*x], m4}, 4, 5, 6, 7
+ REPX {paddd x, m13}, m8, m3, m9, m1
+ REPX {psrad x, 12 }, m8, m3, m9, m1
+ mova m4, m6
+ vpermi2q m4, m8, m3 ; 8 10
+ vpermt2q m3, m10, m8 ; 11 9
+ vpermi2q m6, m9, m1 ; 12 14
+ vpermt2q m1, m10, m9 ; 15 13
+.main:
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601
+ jmp .main2
+.main_fast:
+ vbroadcasti32x4 m1, [o(pd_4091_3973)]
+ vbroadcasti32x4 m8, [o(pd_201_995)]
+ vbroadcasti32x4 m3, [o(pd_3703_3290)]
+ vbroadcasti32x4 m9, [o(pd_1751_2440)]
+ vbroadcasti32x4 m4, [o(pd_2751_2106)]
+ vbroadcasti32x4 m10, [o(pd_3035_3513)]
+ vbroadcasti32x4 m6, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m1, m0
+ pmulld m0, m8
+ pmulld m3, m2
+ pmulld m2, m9
+ pmulld m4, m5
+ pmulld m5, m10
+ pmulld m6, m7
+ pmulld m7, m11
+.main2:
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ REPX {psubd x, m13, x}, m1, m3
+ REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7
+ psubd m8, m0, m4 ; t8a t10a
+ paddd m0, m4 ; t0a t2a
+ psubd m4, m1, m5 ; t9a t11a
+ paddd m1, m5 ; t1a t3a
+ psubd m5, m2, m6 ; t12a t14a
+ paddd m2, m6 ; t4a t6a
+ psubd m6, m3, m7 ; t13a t15a
+ paddd m3, m7 ; t5a t7a
+ REPX {pmaxsd x, m14}, m8, m4, m5, m6
+ REPX {pminsd x, m15}, m8, m4, m5, m6
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m2, m1, m3
+ REPX {pminsd x, m15}, m0, m2, m1, m3
+ psubd m7, m0, m2 ; t4 t6
+ paddd m0, m2 ; t0 t2
+ psubd m2, m1, m3 ; t5 t7
+ paddd m1, m3 ; t1 t3
+ psubd m3, m4, m6 ; t12a t14a
+ paddd m4, m6 ; t8a t10a
+ psubd m6, m8, m5 ; t13a t15a
+ paddd m8, m5 ; t9a t11a
+ REPX {pmaxsd x, m14}, m7, m3, m2, m6
+ REPX {pminsd x, m15}, m7, m3, m2, m6
+ punpcklqdq m5, m3, m7 ; t12a t4
+ punpckhqdq m3, m7 ; t14a t6
+ punpckhqdq m7, m6, m2 ; t15a t7
+ punpcklqdq m6, m2 ; t13a t5
+ vpbroadcastd m11, [o(pd_1567)]
+ vpbroadcastd m10, [o(pd_3784)]
+ ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11
+ ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10
+ REPX {pmaxsd x, m14}, m0, m4, m1, m8
+ REPX {pminsd x, m15}, m0, m4, m1, m8
+ punpckhqdq m2, m4, m0 ; t10a t2
+ punpcklqdq m4, m0 ; t8a t0
+ punpckhqdq m0, m8, m1 ; t11a t3
+ punpcklqdq m8, m1 ; t9a t1
+ paddd m1, m6, m7 ; out2 -out3
+ psubd m6, m7 ; t14a t6
+ paddd m7, m5, m3 ; -out13 out12
+ psubd m5, m3 ; t15a t7
+ psubd m3, m8, m0 ; t11 t3a
+ paddd m8, m0 ; out14 -out15
+ paddd m0, m4, m2 ; -out1 out0
+ psubd m4, m2 ; t10 t2a
+ REPX {pmaxsd x, m14}, m6, m5, m3, m4
+ mov r6d, 0x3333
+ REPX {pminsd x, m15}, m6, m5, m3, m4
+ kmovw k1, r6d
+ REPX {pmulld x, m12}, m6, m5, m3, m4
+ pxor m9, m9
+ REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8
+ paddd m6, m13
+ paddd m4, m13
+ paddd m2, m6, m5 ; -out5 out4
+ psubd m6, m5 ; out10 -out11
+ psubd m5, m4, m3 ; -out9 out8
+ paddd m3, m4 ; out6 -out7
+ REPX {psrad x, 12}, m2, m3, m5, m6
+ REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6
+ ret
+ALIGN function_align
+.main_pass2:
+ lea r5, [o_base_8bpc]
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m13, [permC]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ lea r6, [strideq*3]
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, identity, -21
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_10bpc).main_pass1
+ vpbroadcastd m9, [o(pd_1)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+ call m(iadst_16x8_internal_10bpc).main_pass2
+ psrlq m13, 8
+ vpermq m8, m13, m3
+ vpermq m9, m13, m2
+ call m(idct_16x8_internal_10bpc).write_16x4
+ vpermq m8, m13, m1
+ vpermq m9, m13, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+ call m(idct_8x16_internal_10bpc).load2
+ vpbroadcastd m8, [o(pd_5793)]
+ vpbroadcastd m9, [o(pd_3072)]
+ pxor m10, m10
+ REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m10}, 0, 1, 2, 3
+ REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m10}, 4, 5, 6, 7
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ psrlq m8, [o(permA)], 16
+ psrlq m9, m8, 8
+ mova m10, m8
+ mova m11, m9
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ jmp tx2q
+.pass2:
+ movshdup m4, [o(permC)]
+ vpbroadcastd m13, [o(pw_4096)]
+ REPX {vpermq x, m4, x}, m0, m1, m2, m3
+ jmp m(idct_16x8_internal_10bpc).end
+
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 16x16
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 181
+ mov [cq], eobd ; 0
+ or r3d, 16
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, 28
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, adst
+
+cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ cmp eobd, 36
+ jl .fast
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 2]
+ mova m2, [cq+64* 4]
+ mova m3, [cq+64* 6]
+ mova m4, [cq+64* 8]
+ mova m5, [cq+64*10]
+ mova m6, [cq+64*12]
+ mova m7, [cq+64*14]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+64* 1]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m22, [cq+64*13]
+ mova m23, [cq+64*15]
+ call .main
+ call .main_end
+.pass1_end:
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+.pass1_end2:
+ punpckhwd m8, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m4, m5
+ punpcklwd m4, m5
+ punpcklwd m5, m6, m7
+ punpckhwd m6, m7
+ punpckhdq m7, m0, m2
+ punpckldq m0, m2
+ punpckhdq m2, m8, m1
+ punpckldq m8, m1
+ punpckhdq m1, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m3, m6
+ punpckldq m3, m6
+ vshufi32x4 m6, m0, m4, q3232
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m4, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m7, ym1, 1
+ vshufi32x4 m7, m1, q3232
+ vshufi32x4 m1, m2, m5, q3232
+ vinserti32x8 m2, ym5, 1
+ vshufi32x4 m5, m7, m1, q2020 ; 10 11
+ vshufi32x4 m7, m1, q3131 ; 14 15
+ vshufi32x4 m1, m3, m2, q2020 ; 2 3
+ vshufi32x4 m3, m2, q3131 ; 6 7
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+.pass1_end3:
+ mov r6d, 64*12
+ pxor m8, m8
+.zero_loop:
+ mova [cq+r6+64*3], m8
+ mova [cq+r6+64*2], m8
+ mova [cq+r6+64*1], m8
+ mova [cq+r6+64*0], m8
+ sub r6d, 64*4
+ jge .zero_loop
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(idct_16x16_internal_8bpc).main
+ movshdup m10, [permC]
+ vpbroadcastd m13, [pw_2048]
+ psrlq m11, m10, 8
+ vpermq m8, m10, m0
+ vpermq m0, m11, m7
+ vpermq m7, m11, m1
+ vpermq m1, m10, m6
+ vpermq m6, m10, m2
+ vpermq m2, m11, m5
+ vpermq m5, m11, m3
+ vpermq m3, m10, m4
+.pass2_end:
+ lea r6, [strideq*3]
+ vpbroadcastd m15, [pixel_10bpc_max]
+ pxor m14, m14
+ pmulhrsw m8, m13, m8
+ pmulhrsw m9, m13, m7
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m8, m13, m6
+ pmulhrsw m9, m13, m5
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m8, m13, m3
+ pmulhrsw m9, m13, m2
+ call m(idct_16x8_internal_10bpc).write_16x4
+ pmulhrsw m8, m13, m1
+ pmulhrsw m9, m13, m0
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+.fast:
+ mova ym0, [cq+64*0]
+ mova ym2, [cq+64*4]
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+64*2]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*1]
+ mova ym5, [cq+64*3]
+ mova ym6, [cq+64*5]
+ mova ym7, [cq+64*7]
+ vpermt2q m0, m8, m2 ; 0 4
+ vpermt2q m1, m8, m3 ; 2 6
+ vpermt2q m4, m8, m5 ; 1 3
+ vpermt2q m7, m8, m6 ; 7 5
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ vpbroadcastd m11, [o(pd_2)]
+ call m(idct_8x16_internal_10bpc).main_end2
+ mova m8, [o(permA)]
+ psrlq m9, m8, 8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, 13, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, 13, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, 13, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, 13, 1931, 3612 ; t10a, t13a
+ paddd m9, m20, m16 ; t8
+ psubd m20, m16, m20 ; t9
+ psubd m16, m22, m18 ; t10
+ paddd m18, m22 ; t11
+ paddd m22, m23, m19 ; t15
+ psubd m23, m19 ; t14
+ psubd m19, m17, m21 ; t13
+ paddd m17, m21 ; t12
+ vpbroadcastd m11, [o(pd_3784)]
+ REPX {pmaxsd x, m14}, m20, m23, m16, m19
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m20, m23, m16, m19
+ ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11
+ ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m9, m18, m22, m17
+ REPX {pminsd x, m15}, m9, m18, m22, m17
+ paddd m21, m20, m19 ; t14
+ psubd m20, m19 ; t13
+ psubd m19, m9, m18 ; t11a
+ paddd m9, m18 ; t8a
+ psubd m18, m23, m16 ; t10
+ paddd m16, m23 ; t9
+ psubd m23, m22, m17 ; t12a
+ paddd m22, m17 ; t15a
+ REPX {pmaxsd x, m14}, m20, m23, m18, m19
+ REPX {pminsd x, m15}, m20, m23, m18, m19
+ REPX {pmulld x, m12}, m20, m23, m18, m19
+ psubd m7, m0, m6 ; dct8 out7
+ paddd m0, m6 ; dct8 out0
+ psubd m6, m1, m5 ; dct8 out6
+ paddd m1, m5 ; dct8 out1
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1
+ psubd m5, m2, m4 ; dct8 out5
+ paddd m2, m4 ; dct8 out2
+ REPX {pminsd x, m15}, m7, m0, m6, m1
+ psubd m4, m3, m8 ; dct8 out4
+ paddd m3, m8 ; dct8 out3
+ REPX {pmaxsd x, m14}, m5, m2, m4, m3
+ paddd m20, m13
+ paddd m23, m13
+ REPX {pminsd x, m15}, m5, m2, m4, m3
+ psubd m17, m20, m18 ; t10a
+ paddd m20, m18 ; t13a
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ psubd m18, m23, m19 ; t11
+ paddd m19, m23 ; t12
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+.main_end:
+ vpbroadcastd m11, [o(pd_2)]
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m23, m0, m22 ; out15
+ paddd m0, m22 ; out0
+ psubd m22, m1, m21 ; out14
+ paddd m1, m21 ; out1
+ psubd m21, m2, m20 ; out13
+ paddd m2, m20 ; out2
+ psubd m20, m3, m19 ; out12
+ paddd m3, m19 ; out3
+ psubd m19, m4, m18 ; out11
+ paddd m4, m18 ; out4
+ psubd m18, m5, m17 ; out10
+ paddd m5, m17 ; out5
+ psubd m17, m6, m16 ; out9
+ paddd m6, m16 ; out6
+ psubd m16, m7, m9 ; out8
+ paddd m7, m9 ; out7
+ REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \
+ m4, m20, m5, m21, m6, m22, m7, m23
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m20
+ packssdw m5, m21
+ packssdw m6, m22
+ packssdw m7, m23
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, flipadst
+INV_TXFM_16X16_FN adst, adst
+
+cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call .main_pass1
+ packssdw m0, m16
+ packssdw m1, m17
+ packssdw m2, m18
+ packssdw m3, m19
+ packssdw m4, m5, m20
+ packssdw m5, m6, m21
+ packssdw m6, m7, m22
+ packssdw m7, m8, m23
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call .main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ paddd m0, m9
+ psubd m1, m9, m1
+ paddd m2, m9
+ psubd m3, m9, m3
+ paddd m4, m9, m5
+ psubd m5, m9, m6
+ paddd m6, m9, m7
+ psubd m7, m9, m8
+.pass1_fast_end:
+ mova m9, [o(permA)]
+ psrlq m8, m9, 8
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+.pass1_fast_end2:
+ mova m10, m9
+ mova m11, m8
+ call m(idct_16x8_internal_10bpc).transpose_16x8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
+ jmp tx2q
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m10, [permC]
+ mova m13, [pw_2048_m2048]
+ psrlq m11, m10, 8
+ vpermq m8, m11, m0
+ vpermq m0, m10, m7
+ vpermq m7, m11, m1
+ vpermq m1, m10, m6
+ vpermq m6, m11, m2
+ vpermq m2, m10, m5
+ vpermq m5, m11, m3
+ vpermq m3, m10, m4
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+ALIGN function_align
+.main_pass1:
+ mova m0, [cq+64* 0]
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m23, [cq+64*15]
+ vpbroadcastd m13, [o(pd_2048)]
+ ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0
+ mova m7, [cq+64* 7]
+ mova m16, [cq+64* 8]
+ ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8
+ mova m2, [cq+64* 2]
+ mova m21, [cq+64*13]
+ ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2
+ mova m5, [cq+64* 5]
+ mova m18, [cq+64*10]
+ ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10
+ mova m4, [cq+64* 4]
+ mova m19, [cq+64*11]
+ ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4
+ mova m3, [cq+64* 3]
+ mova m20, [cq+64*12]
+ ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12
+ mova m6, [cq+64* 6]
+ mova m17, [cq+64* 9]
+ ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6
+ mova m1, [cq+64* 1]
+ mova m22, [cq+64*14]
+ ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psubd m9, m23, m7 ; t9a
+ paddd m23, m7 ; t1a
+ psubd m7, m2, m18 ; t10a
+ paddd m18, m2 ; t2a
+ REPX {pmaxsd x, m14}, m9, m23, m7, m18
+ psubd m2, m17, m1 ; t15a
+ paddd m17, m1 ; t7a
+ REPX {pminsd x, m15}, m9, m23, m7, m18
+ psubd m1, m21, m5 ; t11a
+ paddd m21, m5 ; t3a
+ REPX {pmaxsd x, m14}, m2, m17, m1, m21
+ psubd m5, m4, m20 ; t12a
+ paddd m4, m20 ; t4a
+ REPX {pminsd x, m15}, m2, m17, m1, m21
+ psubd m20, m19, m3 ; t13a
+ paddd m19, m3 ; t5a
+ REPX {pmaxsd x, m14}, m5, m4, m20, m19
+ psubd m8, m6, m22 ; t14a
+ paddd m6, m22 ; t6a
+ REPX {pminsd x, m15}, m5, m4, m20, m19
+ psubd m22, m0, m16 ; t8a
+ paddd m16, m0 ; t0a
+ REPX {pmaxsd x, m14}, m8, m6, m22, m16
+ vpbroadcastd m11, [o(pd_4017)]
+ vpbroadcastd m10, [o(pd_799)]
+ REPX {pminsd x, m15}, m8, m6, m22, m16
+ ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8
+ ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13
+ vpbroadcastd m11, [o(pd_2276)]
+ vpbroadcastd m10, [o(pd_3406)]
+ ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10
+ ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15
+ paddd m0, m16, m4 ; t0
+ psubd m16, m4 ; t4
+ psubd m3, m23, m19 ; t5
+ paddd m23, m19 ; t1
+ REPX {pmaxsd x, m14}, m0, m16, m3, m23
+ psubd m19, m18, m6 ; t6
+ paddd m18, m6 ; t2
+ REPX {pminsd x, m15}, m0, m16, m3, m23
+ psubd m6, m21, m17 ; t7
+ paddd m21, m17 ; t3
+ REPX {pmaxsd x, m14}, m19, m18, m6, m21
+ paddd m17, m9, m20 ; t8a
+ psubd m9, m20 ; t12a
+ REPX {pminsd x, m15}, m19, m18, m6, m21
+ psubd m20, m22, m5 ; t13a
+ paddd m22, m5 ; t9a
+ REPX {pmaxsd x, m14}, m17, m9, m20, m22
+ psubd m5, m1, m2 ; t14a
+ paddd m1, m2 ; t10a
+ REPX {pminsd x, m15}, m17, m9, m20, m22
+ psubd m2, m7, m8 ; t15a
+ paddd m7, m8 ; t11a
+ REPX {pmaxsd x, m14}, m5, m1, m2, m7
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ REPX {pminsd x, m15}, m5, m1, m2, m7
+ ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a
+ ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a
+ ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12
+ ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15
+ psubd m8, m0, m18 ; t2a
+ paddd m0, m18 ; out0
+ psubd m18, m23, m21 ; t3a
+ paddd m23, m21 ; -out15
+ paddd m21, m9, m5 ; -out13
+ psubd m9, m5 ; t15a
+ psubd m5, m3, m6 ; t6
+ paddd m3, m6 ; -out3
+ REPX {pmaxsd x, m14}, m8, m18, m9, m5
+ psubd m6, m20, m2 ; t14a
+ paddd m2, m20 ; out2
+ paddd m20, m16, m19 ; out12
+ psubd m16, m19 ; t7
+ REPX {pminsd x, m15}, m8, m18, m9, m5
+ psubd m19, m22, m7 ; t11
+ paddd m22, m7 ; out14
+ psubd m7, m17, m1 ; t10
+ paddd m1, m17 ; -out1
+ REPX {pmaxsd x, m14}, m6, m16, m19, m7
+ vpbroadcastd m12, [o(pd_1448)]
+ vpbroadcastd m4, [o(pd_2)]
+ vpbroadcastd m10, [o(pd_5120)]
+ vpbroadcastd m11, [o(pd_5119)]
+ REPX {pminsd x, m15}, m6, m16, m19, m7
+ psubd m17, m7, m19 ; -out9
+ paddd m7, m19 ; out6
+ psubd m19, m5, m16 ; -out11
+ paddd m5, m16 ; out4
+ REPX {pmulld x, m12}, m17, m7, m19, m5
+ psubd m16, m8, m18 ; out8
+ paddd m8, m18 ; -out7
+ psubd m18, m6, m9 ; out10
+ paddd m6, m9 ; -out5
+ REPX {pmulld x, m12}, m16, m8, m18, m6
+ REPX {paddd x, m4 }, m0, m2, m20, m22
+ REPX {psubd x, m4, x}, m1, m3, m21, m23
+ REPX {paddd x, m10 }, m7, m5, m16, m18
+ REPX {psubd x, m11, x}, m17, m19, m8, m6
+ REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3
+ REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8
+ ret
+ALIGN function_align
+.main_pass1_fast:
+ mova ym0, [cq+64*0]
+ mova ym1, [cq+64*2]
+ movshdup m8, [o(permB)]
+ mova ym6, [cq+64*1]
+ mova ym7, [cq+64*3]
+ mova ym2, [cq+64*4]
+ mova ym3, [cq+64*6]
+ mova ym4, [cq+64*5]
+ mova ym5, [cq+64*7]
+ vpermt2q m0, m8, m1 ; 0 2
+ vpermt2q m7, m8, m6 ; 3 1
+ vpermt2q m2, m8, m3 ; 4 6
+ vpermt2q m5, m8, m4 ; 7 5
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m12, [o(pd_2896)]
+ jmp m(iadst_16x8_internal_10bpc).main_fast
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ cmp eobd, 36
+ jl .fast
+ call m(iadst_16x16_internal_10bpc).main_pass1
+ packssdw m4, m19, m3
+ packssdw m3, m20, m5
+ packssdw m5, m18, m2
+ packssdw m2, m21, m6
+ packssdw m6, m17, m1
+ packssdw m1, m22, m7
+ packssdw m7, m16, m0
+ packssdw m0, m23, m8
+ jmp m(idct_16x16_internal_10bpc).pass1_end
+.fast:
+ call m(iadst_16x16_internal_10bpc).main_pass1_fast
+ vpbroadcastd m9, [o(pd_2)]
+ psubd m4, m9, m3
+ paddd m3, m9, m5
+ paddd m5, m9, m2
+ psubd m2, m9, m6
+ psubd m6, m9, m1
+ paddd m1, m9, m7
+ paddd m7, m9, m0
+ psubd m0, m9, m8
+ jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
+.pass2:
+ lea r5, [o_base_8bpc]
+ call m(iadst_16x16_internal_8bpc).main_pass2b
+ movshdup m10, [permC]
+ movu m13, [pw_m2048_2048]
+ psrlq m11, m10, 8
+ vpermq m8, m11, m7
+ vpermq m7, m11, m6
+ vpermq m6, m11, m5
+ vpermq m5, m11, m4
+ vpermq m3, m10, m3
+ vpermq m2, m10, m2
+ vpermq m1, m10, m1
+ vpermq m0, m10, m0
+ jmp m(idct_16x16_internal_10bpc).pass2_end
+
+INV_TXFM_16X16_FN identity, dct, -92
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+ vpbroadcastd m10, [o(pd_5793)]
+ vpbroadcastd m11, [o(pd_5120)]
+ mov r6, cq
+ cmp eobd, 36
+ jl .fast
+ call .pass1_main
+ packssdw m0, m6, m8
+ packssdw m1, m7, m9
+ call .pass1_main
+ packssdw m2, m6, m8
+ packssdw m3, m7, m9
+ call .pass1_main
+ packssdw m4, m6, m8
+ packssdw m5, m7, m9
+ call .pass1_main
+ packssdw m6, m8
+ packssdw m7, m9
+ jmp m(idct_16x16_internal_10bpc).pass1_end2
+.fast:
+ call .pass1_main_fast
+ packssdw m0, m6, m7
+ call .pass1_main_fast
+ packssdw m1, m6, m7
+ call .pass1_main_fast
+ packssdw m2, m6, m7
+ call .pass1_main_fast
+ packssdw m3, m6, m7
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckldq m3, m4, m1
+ punpckhdq m4, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ pxor m7, m7
+ vshufi32x4 m2, m0, m3, q3131
+ vshufi32x4 m0, m3, q2020
+ vshufi32x4 m3, m1, m4, q3131
+ vshufi32x4 m1, m4, q2020
+ REPX {mova x, m7}, m4, m5, m6
+ jmp m(idct_16x16_internal_10bpc).pass1_end3
+.pass2:
+ movshdup m11, [o(permC)]
+ vpbroadcastd m12, [o(pw_1697x16)]
+ lea r6, [strideq*3]
+ vpbroadcastd m13, [o(pw_2048)]
+ pxor m14, m14
+ vpbroadcastd m15, [pixel_10bpc_max]
+ vpermq m8, m11, m0
+ vpermq m9, m11, m1
+ call .pass2_main
+ vpermq m8, m11, m2
+ vpermq m9, m11, m3
+ call .pass2_main
+ vpermq m8, m11, m4
+ vpermq m9, m11, m5
+ call .pass2_main
+ vpermq m8, m11, m6
+ vpermq m9, m11, m7
+.pass2_main:
+ pmulhrsw m0, m12, m8
+ pmulhrsw m1, m12, m9
+ paddsw m8, m8
+ paddsw m9, m9
+ paddsw m8, m0
+ paddsw m9, m1
+ pmulhrsw m8, m13
+ pmulhrsw m9, m13
+ jmp m(idct_16x8_internal_10bpc).write_16x4
+ALIGN function_align
+.pass1_main:
+ pmulld m6, m10, [r6+64*0]
+ pmulld m7, m10, [r6+64*1]
+ pmulld m8, m10, [r6+64*8]
+ pmulld m9, m10, [r6+64*9]
+ add r6, 64*2
+ REPX {paddd x, m11}, m6, m7, m8, m9
+ REPX {psrad x, 13 }, m6, m8, m7, m9
+ ret
+ALIGN function_align
+.pass1_main_fast:
+ mova ym6, [r6+64* 0]
+ vinserti32x8 m6, [r6+64* 4], 1
+ mova ym7, [r6+64* 8]
+ vinserti32x8 m7, [r6+64*12], 1
+ add r6, 64
+ REPX {pmulld x, m10}, m6, m7
+ REPX {paddd x, m11}, m6, m7
+ REPX {psrad x, 13 }, m6, m7
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ vpbroadcastd m11, [o(pd_2)]
+ mova m20, [o(idct8x32p)]
+ pxor m21, m21
+ cmp eobd, 43
+ jl .fast
+ call .pass1_main
+ punpcklwd m16, m0, m1
+ punpcklwd m17, m2, m3
+ punpckhwd m18, m0, m1
+ punpckhwd m19, m2, m3
+ cmp eobd, 107
+ jge .full
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ call m(idct_8x16_internal_8bpc).main_fast
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+ jmp .end
+.full:
+ add cq, 64
+ call .pass1_main
+ punpcklwd m5, m0, m1
+ punpcklwd m6, m2, m3
+ punpckhwd m7, m0, m1
+ punpckhwd m8, m2, m3
+ punpckldq m0, m16, m17 ; 0 2
+ punpckhdq m1, m16, m17 ; 4 6
+ punpckldq m2, m18, m19 ; 8 10
+ punpckhdq m3, m18, m19 ; 12 14
+ punpckldq m4, m5, m6 ; 16 18
+ punpckhdq m5, m6 ; 20 22
+ punpckldq m6, m7, m8 ; 24 26
+ punpckhdq m7, m8 ; 28 30
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ vextracti32x8 ym16, m2, 1
+ vextracti32x8 ym17, m3, 1
+ vextracti32x8 ym18, m4, 1
+ vextracti32x8 ym19, m5, 1
+ vextracti32x8 ym20, m6, 1
+ vextracti32x8 ym21, m7, 1
+ call m(idct_8x16_internal_8bpc).main
+ REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .end
+.fast:
+ movshdup m8, [o(permB)]
+ mova ym1, [cq+128*1]
+ mova ym5, [cq+128*5]
+ mova ym7, [cq+128*3]
+ mova ym3, [cq+128*7]
+ mova ym0, [cq+128*0]
+ mova ym4, [cq+128*2]
+ mova ym2, [cq+128*4]
+ mova ym6, [cq+128*6]
+ vpermt2q m1, m8, m5 ; 1 5
+ vpermt2q m3, m8, m7 ; 7 3
+ vpermt2q m0, m8, m4 ; 0 2
+ vpermt2q m2, m8, m6 ; 4 6
+ mova [cq+128*0], ym21
+ REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x8_internal_10bpc).main_end
+ packssdw m0, m2
+ packssdw m1, m3
+ vpermb m0, m20, m0
+ vprold m20, 16
+ vpermb m2, m20, m1
+ punpckhdq m1, m0, m2
+ punpckldq m0, m2
+ lea r5, [o_base_8bpc]
+ vextracti32x8 ym14, m0, 1
+ vextracti32x8 ym15, m1, 1
+ call m(idct_8x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2
+.end:
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper
+ lea r3, [strideq*2]
+ vpbroadcastd m12, [pixel_10bpc_max]
+ lea r6, [strideq*3]
+ pxor m11, m11
+ lea r3, [dstq+r3*8]
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ call .write_8x4x2
+ pmulhrsw m0, m10, m2
+ pmulhrsw m1, m10, m3
+ call .write_8x4x2
+ pmulhrsw m0, m10, m4
+ pmulhrsw m1, m10, m5
+ call .write_8x4x2
+ pmulhrsw m0, m10, m6
+ pmulhrsw m1, m10, m7
+.write_8x4x2:
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ vinserti32x4 m8, [dstq+strideq*2], 2
+ vinserti32x4 m8, [dstq+r6 ], 3
+ mova xm9, [r3 +r6 ]
+ vinserti32x4 ym9, [r3 +strideq*2], 1
+ vinserti32x4 m9, [r3 +strideq*1], 2
+ vinserti32x4 m9, [r3 +strideq*0], 3
+ paddw m8, m0
+ paddw m9, m1
+ pmaxsw m8, m11
+ pmaxsw m9, m11
+ pminsw m8, m12
+ pminsw m9, m12
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r6 ], m8, 3
+ lea dstq, [dstq+strideq*4]
+ vextracti32x4 [r3 +strideq*0], m9, 3
+ vextracti32x4 [r3 +strideq*1], m9, 2
+ vextracti32x4 [r3 +strideq*2], ym9, 1
+ mova [r3 +r6 ], xm9
+ lea r3, [r3+strideq*4]
+ ret
+.dconly:
+ imul r6d, [cq], 181
+ mov [cq], eobd
+ or r3d, 32
+ add r6d, 640
+ sar r6d, 10
+ jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+ALIGN function_align
+.pass1_main:
+ mova m0, [cq+128*0]
+ mova m1, [cq+128*1]
+ mova m2, [cq+128*2]
+ mova m3, [cq+128*3]
+ mova m4, [cq+128*4]
+ mova m5, [cq+128*5]
+ mova m6, [cq+128*6]
+ mova m7, [cq+128*7]
+ REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(idct_8x16_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_end2
+ packssdw m0, m4
+ packssdw m1, m5
+ packssdw m2, m6
+ packssdw m3, m7
+ REPX {vpermb x, m20, x}, m0, m1, m2, m3
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob
+ vpbroadcastd m9, [pw_5]
+ lea r4, [strideq*3]
+ pxor m10, m10
+ lea r5, [strideq*5]
+ vpbroadcastd m11, [pixel_10bpc_max]
+ sub eobd, 107
+ lea r6, [strideq+r4*2]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ lea r7, [dstq+strideq*8]
+ REPX {mova [cq+128*x], m10}, 0, 1, 2, 3
+ REPX {paddsw x, m9}, m0, m1, m2, m3
+ REPX {mova [cq+128*x], m10}, 4, 5, 6, 7
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ add cq, 64
+ mova xm4, [dstq+strideq*0]
+ mova xm5, [dstq+strideq*1]
+ mova xm6, [dstq+strideq*2]
+ mova xm7, [dstq+r4 *1]
+ punpckhwd m8, m0, m1
+ vinserti32x4 ym4, [dstq+strideq*4], 1
+ punpcklwd m0, m1
+ vinserti32x4 ym5, [dstq+r5 *1], 1
+ punpckhwd m1, m2, m3
+ vinserti32x4 ym6, [dstq+r4 *2], 1
+ punpcklwd m2, m3
+ vinserti32x4 ym7, [dstq+r6 *1], 1
+ punpckhwd m3, m0, m8
+ vinserti32x4 m4, [r7 +strideq*0], 2
+ punpcklwd m0, m8
+ vinserti32x4 m5, [r7 +strideq*1], 2
+ punpckhwd m8, m2, m1
+ vinserti32x4 m6, [r7 +strideq*2], 2
+ punpcklwd m2, m1
+ vinserti32x4 m7, [r7 +r4 *1], 2
+ punpckhqdq m1, m0, m2
+ vinserti32x4 m4, [r7 +strideq*4], 3
+ punpcklqdq m0, m2
+ vinserti32x4 m5, [r7 +r5 *1], 3
+ punpcklqdq m2, m3, m8
+ vinserti32x4 m6, [r7 +r4 *2], 3
+ punpckhqdq m3, m8
+ vinserti32x4 m7, [r7 +r6 *1], 3
+ paddw m0, m4
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ REPX {pmaxsw x, m10}, m0, m1, m2, m3
+ REPX {pminsw x, m11}, m0, m1, m2, m3
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm1
+ mova [dstq+strideq*2], xm2
+ mova [dstq+r4 *1], xm3
+ vextracti32x4 [dstq+strideq*4], ym0, 1
+ vextracti32x4 [dstq+r5 *1], ym1, 1
+ vextracti32x4 [dstq+r4 *2], ym2, 1
+ vextracti32x4 [dstq+r6 *1], ym3, 1
+ lea dstq, [r7+strideq*8]
+ vextracti32x4 [r7 +strideq*0], m0, 2
+ vextracti32x4 [r7 +strideq*1], m1, 2
+ vextracti32x4 [r7 +strideq*2], m2, 2
+ vextracti32x4 [r7 +r4 *1], m3, 2
+ vextracti32x4 [r7 +strideq*4], m0, 3
+ vextracti32x4 [r7 +r5 *1], m1, 3
+ vextracti32x4 [r7 +r4 *2], m2, 3
+ vextracti32x4 [r7 +r6 *1], m3, 3
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m11, [o(permB)]
+ mova m0, [cq+64* 0] ; 0 1
+ mova m4, [cq+64* 1] ; 2 3
+ mova m1, [cq+64* 2] ; 4 5
+ mova m8, [cq+64* 3] ; 6 7
+ vpbroadcastd m12, [o(pd_2896)]
+ vpbroadcastd m13, [o(pd_2048)]
+ vpbroadcastd m14, [o(clip_18b_min)]
+ vpbroadcastd m15, [o(clip_18b_max)]
+ psrlq m10, m11, 32
+%if WIN64
+ movaps [cq+16*0], xmm6
+ movaps [cq+16*1], xmm7
+%endif
+ mova m16, m11
+ vpermi2q m16, m0, m1 ; 1 5
+ mova m17, m11
+ vpermi2q m17, m8, m4 ; 7 3
+ cmp eobd, 43
+ jl .fast
+ mova m18, [cq+64* 4] ; 8 9
+ mova m20, [cq+64* 5] ; 10 11
+ mova m6, [cq+64* 6] ; 12 13
+ mova m7, [cq+64* 7] ; 14 15
+ vpermt2q m0, m10, m18 ; 0 8
+ vpermt2q m18, m11, m6 ; 9 13
+ mova m19, m11
+ vpermi2q m19, m7, m20 ; 15 11
+ cmp eobd, 107
+ jge .full
+ vpermt2q m1, m10, m6 ; 4 12
+ vpermt2q m4, m10, m8 ; 2 6
+ vpermt2q m7, m10, m20 ; 14 10
+ mov r6d, 64*1
+ call m(idct_8x8_internal_10bpc).main_fast
+ call m(idct_16x8_internal_10bpc).main_fast
+ call .main_fast
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.full:
+ mova m2, [cq+64* 8] ; 16 17
+ mova m5, [cq+64* 9] ; 18 19
+ mova m9, [cq+64*10] ; 20 21
+ mova m21, [cq+64*11] ; 22 23
+ vpermt2q m1, m10, m9 ; 4 20
+ vpermt2q m7, m10, m21 ; 14 22
+ vpermt2q m21, m11, m5 ; 23 19
+ vpermt2q m5, m10, m20 ; 18 10
+ mova m20, m11
+ vpermi2q m20, m2, m9 ; 17 21
+ mova m22, [cq+64*12] ; 24 25
+ mova m9, [cq+64*13] ; 26 27
+ mova m3, [cq+64*14] ; 28 29
+ mova m23, [cq+64*15] ; 30 31
+ vpermt2q m2, m10, m22 ; 16 24
+ vpermt2q m22, m11, m3 ; 25 29
+ vpermt2q m3, m10, m6 ; 28 12
+ vpermt2q m4, m10, m9 ; 2 26
+ mova m6, m10
+ vpermi2q m6, m23, m8 ; 30 6
+ vpermt2q m23, m11, m9 ; 31 27
+ mov r6d, 64*3
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_16x8_internal_10bpc).main
+ call .main
+ call m(idct_16x16_internal_10bpc).main_end
+ jmp .end
+.fast:
+ vpermq m0, m10, m0 ; 0 0
+ vpermq m1, m10, m1 ; 4 4
+ vpermt2q m4, m10, m8 ; 2 6
+ xor r6d, r6d
+ call m(idct_8x8_internal_10bpc).main_fast2
+ call m(idct_16x8_internal_10bpc).main_fast2
+ call .main_fast2
+ call m(idct_16x16_internal_10bpc).main_end
+.end:
+ mova m10, [o(idct32x8p)]
+%if WIN64
+ movaps xmm6, [cq+16*0]
+ movaps xmm7, [cq+16*1]
+%endif
+ vzeroupper
+ psrlw m8, m10, 8
+ mova m9, m8
+ vpermi2w m8, m1, m5
+ vpermt2w m1, m10, m5
+ vprold m5, m9, 16
+ vpermi2w m9, m3, m7
+ vpermt2w m3, m10, m7
+ vprold m10, 16
+ mova m7, m5
+ vpermi2w m5, m0, m4
+ vpermt2w m0, m10, m4
+ pxor m14, m14
+ vpermi2w m7, m2, m6
+ vpermt2w m2, m10, m6
+.zero_loop:
+ mova [cq+r6*4+64*3], m14
+ mova [cq+r6*4+64*2], m14
+ mova [cq+r6*4+64*1], m14
+ mova [cq+r6*4+64*0], m14
+ sub r6d, 64
+ jge .zero_loop
+ punpckhdq m6, m5, m8
+ punpckldq m5, m8
+ punpckhdq m8, m7, m9
+ punpckldq m7, m9
+ punpckhdq m4, m2, m3
+ punpckldq m2, m3
+ punpckhdq m3, m0, m1
+ punpckldq m0, m1
+ vpbroadcastd m13, [o(pw_2048)]
+ vpbroadcastd m15, [o(pixel_10bpc_max)]
+ lea r5, [o_base_8bpc]
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ lea r6, [strideq*3]
+ pmulhrsw m0, m13
+ pmulhrsw m1, m13
+ pmulhrsw m2, m13
+ pmulhrsw m3, m13
+ call .write_32x4
+ pmulhrsw m0, m13, m4
+ pmulhrsw m1, m13, m5
+ pmulhrsw m2, m13, m6
+ pmulhrsw m3, m13, m7
+.write_32x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r6 ]
+ REPX {pmaxsw x, m14}, m0, m1, m2, m3
+ REPX {pminsw x, m15}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r6 ], m3
+ lea dstq, [dstq+strideq*4]
+ ret
+.dconly:
+ imul r6d, [cq], 181
+ vpbroadcastd m3, [o(dconly_10bpc)]
+ mov [cq], eobd
+ or r3d, 8
+ add r6d, 640
+ sar r6d, 10
+ imul r6d, 181
+ add r6d, 2176
+ sar r6d, 12
+ vpbroadcastw m2, r6d
+ paddsw m2, m3
+.dconly_loop:
+ paddsw m0, m2, [dstq+strideq*0]
+ paddsw m1, m2, [dstq+strideq*1]
+ psubusw m0, m3
+ psubusw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m9, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16 t20
+ pmulld m16, m7 ; t31 t27
+ pmulld m22, m17 ; -t19 -t25
+ pmulld m17, m9 ; t28 t24
+ REPX {paddd x, m13}, m23, m16, m17
+ psubd m22, m13, m22
+ REPX {psrad x, 12 }, m23, m16, m22, m17
+ mova m20, m23 ; t30 t26
+ mova m9, m16 ; t17 t21
+ mova m19, m22 ; t18 t22
+ mova m18, m17 ; t29 t25
+ jmp .main3
+.main_fast: ; bottom half is zero
+ vbroadcasti32x4 m23, [o(pd_4091_3973)]
+ vbroadcasti32x4 m7, [o(pd_201_995)]
+ vbroadcasti32x4 m20, [o(pd_2751_2106)]
+ vbroadcasti32x4 m9, [o(pd_3035_3513)]
+ vbroadcasti32x4 m21, [o(pd_3703_3290)]
+ vbroadcasti32x4 m10, [o(pd_1751_2440)]
+ vbroadcasti32x4 m22, [o(pd_1380_601)]
+ vbroadcasti32x4 m11, [o(pd_3857_4052)]
+ pmulld m23, m16 ; t16a t20a
+ pmulld m16, m7 ; t31a t27a
+ pmulld m20, m19 ; -t17a -t21a
+ pmulld m19, m9 ; t30a t26a
+ pmulld m21, m18 ; t18a t22a
+ pmulld m18, m10 ; t29a t25a
+ pmulld m22, m17 ; -t19a -t25a
+ pmulld m17, m11 ; t28a t24a
+ psubd m20, m13, m20
+ psubd m22, m13, m22
+ jmp .main2
+.main:
+ ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973
+ ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106
+ ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290
+ ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601
+ paddd m20, m13
+ paddd m22, m13
+.main2:
+ REPX {paddd x, m13}, m16, m23, m19
+ REPX {psrad x, 12 }, m16, m20, m23, m19
+ psubd m9, m16, m20 ; t17 t21
+ paddd m16, m20 ; t16 t20
+ psubd m20, m23, m19 ; t30 t26
+ paddd m23, m19 ; t31 t27
+ REPX {pmaxsd x, m14}, m9, m16, m20, m23
+ REPX {paddd x, m13}, m21, m18, m17
+ REPX {psrad x, 12 }, m18, m22, m21, m17
+ psubd m19, m22, m18 ; t18 t22
+ paddd m22, m18 ; t19 t23
+ psubd m18, m17, m21 ; t29 t25
+ paddd m17, m21 ; t28 t24
+ REPX {pmaxsd x, m14}, m19, m22, m18, m17
+ REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17
+.main3:
+ vbroadcasti32x4 m11, [o(pd_4017_2276)]
+ vbroadcasti32x4 m10, [o(pd_799_3406)]
+ psubd m7, m0, m6 ; dct16 out15 out14
+ paddd m0, m6 ; dct16 out0 out1
+ psubd m6, m1, m5 ; dct16 out12 out13
+ paddd m1, m5 ; dct16 out3 out2
+ psubd m5, m2, m4 ; dct16 out11 out10
+ paddd m2, m4 ; dct16 out4 out5
+ psubd m4, m3, m8 ; dct16 out8 out9
+ paddd m3, m8 ; dct16 out7 out6
+ ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11
+ ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3
+ punpckhqdq m21, m16, m20 ; t20 t21a
+ punpcklqdq m16, m20 ; t16 t17a
+ punpcklqdq m20, m22, m19 ; t19 t18a
+ punpckhqdq m22, m19 ; t23 t22a
+ REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpcklqdq m19, m23, m9 ; t31 t30a
+ punpckhqdq m23, m9 ; t27 t26a
+ punpckhqdq m9, m17, m18 ; t24 t25a
+ punpcklqdq m17, m18 ; t28 t29a
+ vpbroadcastd m11, [o(pd_3784)]
+ vpbroadcastd m10, [o(pd_1567)]
+ psubd m18, m16, m20 ; t19a t18
+ paddd m20, m16 ; t16a t17
+ psubd m16, m19, m17 ; t28a t29
+ paddd m19, m17 ; t31a t30
+ psubd m17, m22, m21 ; t20a t21
+ paddd m22, m21 ; t23a t22
+ psubd m21, m9, m23 ; t27a t26
+ paddd m23, m9 ; t24a t25
+ REPX {pmaxsd x, m14}, m18, m16, m17, m21
+ REPX {pminsd x, m15}, m16, m18, m21, m17
+ ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11
+ ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2
+ REPX {pmaxsd x, m14}, m20, m22, m19, m23
+ REPX {pminsd x, m15}, m20, m22, m19, m23
+ paddd m9, m20, m22 ; t16 t17a
+ psubd m20, m22 ; t23 t22a
+ paddd m22, m19, m23 ; t31 t30a
+ psubd m19, m23 ; t24 t25a
+ psubd m23, m16, m17 ; t20a t21
+ paddd m16, m17 ; t19a t18
+ psubd m17, m18, m21 ; t27a t26
+ paddd m21, m18 ; t28a t29
+ REPX {pmaxsd x, m14}, m20, m19, m23, m17
+ REPX {pminsd x, m15}, m19, m20, m17, m23
+ REPX {pmulld x, m12}, m19, m20, m17, m23
+ REPX {pmaxsd x, m14}, m22, m21, m16, m9
+ paddd m19, m13
+ paddd m17, m13
+ REPX {pminsd x, m15}, m22, m21, m16, m9
+ psubd m18, m19, m20 ; t23a t22
+ paddd m19, m20 ; t24a t25
+ paddd m20, m17, m23 ; t27 t26a
+ psubd m17, m23 ; t20 t21a
+ REPX {psrad x, 12 }, m20, m19, m18, m17
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob
+ vpbroadcastd m5, [pw_4096]
+ lea r4, [strideq*3]
+ mova m6, [idtx32x8p]
+ lea r5, [strideq*5]
+ vpbroadcastd m9, [pixel_10bpc_max]
+ lea r6, [strideq+r4*2]
+ pxor m8, m8
+ sub eobd, 107
+ psrlw m7, m6, 8
+.loop:
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1] ; 02 13
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3] ; 46 57
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5] ; 8a 9b
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7] ; ce df
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ REPX {mova [cq+64*x], m8}, 0, 1, 2, 3
+ mova m4, m6
+ vpermi2w m4, m1, m3
+ vpermt2w m1, m7, m3
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ mova m3, m7
+ vpermi2w m3, m0, m2
+ vpermt2w m0, m6, m2
+ add cq, 64*8
+ punpcklqdq m2, m3, m1 ; 4 5
+ punpckhqdq m3, m1 ; 6 7
+ punpckhqdq m1, m0, m4 ; 2 3
+ punpcklqdq m0, m4 ; 0 1
+ mova ym4, [dstq+strideq*0]
+ vinserti32x8 m4, [dstq+strideq*1], 1
+ paddw m0, m4
+ mova ym4, [dstq+strideq*2]
+ vinserti32x8 m4, [dstq+r4 *1], 1
+ paddw m1, m4
+ mova ym4, [dstq+strideq*4]
+ vinserti32x8 m4, [dstq+r5 *1], 1
+ paddw m2, m4
+ mova ym4, [dstq+r4 *2]
+ vinserti32x8 m4, [dstq+r6 *1], 1
+ paddw m3, m4
+ REPX {pmaxsw x, m8}, m0, m1, m2, m3
+ REPX {pminsw x, m9}, m0, m1, m2, m3
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym1
+ vextracti32x8 [dstq+r4 *1], m1, 1
+ mova [dstq+strideq*4], ym2
+ vextracti32x8 [dstq+r5 *1], m2, 1
+ mova [dstq+r4 *2], ym3
+ vextracti32x8 [dstq+r6 *1], m3, 1
+ add dstq, 32
+ add eobd, 0x80000000
+ jnc .loop
+ RET
+
+%endif ; ARCH_X86_64
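
The 10bpc store helpers in the file above (.write_8x4x2, .write_32x4 and the identity store loops) all end the same way: the inverse-transform residual is added to the destination pixels with paddw, clamped with pmaxsw/pminsw against zero and pixel_10bpc_max, and written back. A minimal scalar sketch of that add-and-clamp step in C, assuming pixel_10bpc_max is the usual 10-bit ceiling of 1023 and using an illustrative row-major residual layout (names are not from the source):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of the vectorized paddw + pmaxsw(0) + pminsw(pixel_10bpc_max)
     * store used by the 10bpc write helpers. */
    static void add_residual_clamp_10bpc(uint16_t *dst, ptrdiff_t stride,
                                         const int16_t *residual, int w, int h)
    {
        const int pixel_max = 1023; /* assumed value of pixel_10bpc_max */
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int v = dst[x] + residual[x];
                if (v < 0) v = 0;                 /* pmaxsw with zero          */
                if (v > pixel_max) v = pixel_max; /* pminsw with pixel_max     */
                dst[x] = (uint16_t)v;
            }
            dst += stride;
            residual += w;
        }
    }
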
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
index 4fb30ef4e7a..3833e17c99f 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx16_sse.asm
@@ -361,18 +361,32 @@ ALIGN function_align
%macro INV_TXFM_4X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
- movd m1, [o(pw_2896x8)]
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
+ mov r3d, 4
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
movd m0, r5d
- packssdw m0, m0
- pmulhrsw m0, m1
- pshuflw m0, m0, q0000
+ pshuflw m0, m0, q1111
+ pxor m3, m3
punpcklqdq m0, m0
- mova m1, m0
- TAIL_CALL m(iadst_4x4_internal_16bpc).end
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
%endif
%endmacro
@@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 4x8
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 2
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 2048
- sar r5d, 12
-.end:
- imul r5d, 2896
- add r5d, 34816
- movd m0, r5d
- pshuflw m0, m0, q1111
- punpcklqdq m0, m0
- pxor m4, m4
- mova m3, [o(pixel_10bpc_max)]
- lea r2, [strideq*3]
-.loop:
- movq m1, [dstq+strideq*0]
- movq m2, [dstq+strideq*2]
- movhps m1, [dstq+strideq*1]
- movhps m2, [dstq+r2]
- paddw m1, m0
- paddw m2, m0
- REPX {pminsw x, m3}, m1, m2
- REPX {pmaxsw x, m4}, m1, m2
- movq [dstq+strideq*0], m1
- movhps [dstq+strideq*1], m1
- movq [dstq+strideq*2], m2
- movhps [dstq+r2 ], m2
- lea dstq, [dstq+strideq*4]
- dec r3d
- jg .loop
- RET
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
%endif
%endmacro
@@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- mov r3d, 4
- add r5d, 6144
- sar r5d, 13
- jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end
+ mov r3d, 16
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
%endif
%endmacro
@@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 2048
- sar r5d, 12
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 128
+ sar r5d, 8
imul r5d, 2896
add r5d, 34816
movd m0, r5d
@@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 2
.end:
- add r5d, 6144
- sar r5d, 13
+ add r5d, 384
+ sar r5d, 9
.end2:
imul r5d, 2896
add r5d, 34816
@@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
mov r3d, 4
%if stack_size_padded > 0
; adjust to caller's stack allocation
@@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
.dconly:
- add r5d, 6144
- sar r5d, 13
+ add r5d, 384
+ sar r5d, 9
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -2755,6 +2742,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
ret
.round:
%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
pcmpeqd m8, m8
REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
@@ -2785,6 +2774,14 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; and out0-15 is now in m0-15
%else
mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
pcmpeqd m0, m0
REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
mova [r3+ 1*16], m1
@@ -3472,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
%if ARCH_X86_32
add rsp, 1*16
%endif
@@ -3939,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (5+ARCH_X86_64*3+WIN64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
%endif
@@ -4057,6 +4054,8 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
ret
.round:
%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
psrld m8, m11, 10 ; 2
REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
@@ -4087,6 +4086,14 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
; and out0-15 is now in m0-15
%else
mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
mova m0, [o(pd_2)]
REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
paddd m0, [r3+ 0*16]
@@ -5162,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (31+2*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
@@ -5339,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
%endif
RET
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
add rsp, (65+4*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
@@ -5944,6 +5951,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
; final sumsub for idct16 as well as idct32, plus final downshift
%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx
mova m%4, [r3+16*(23-%1)]
+ pmaxsd m%1, m12
+ pminsd m%1, m13
psubd m%3, m%1, m%4 ; idct16 out15 - n
paddd m%1, m%4 ; idct16 out0 + n
pmaxsd m%1, m12
@@ -6019,6 +6028,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
.loop_dct32_end:
mova m0, [r3+16*16]
mova m6, [r3+16*24]
+ pmaxsd m0, m2
+ pminsd m0, m3
psubd m5, m0, m6 ; idct16 out15 - n
paddd m0, m6 ; idct16 out0 + n
pmaxsd m0, m2
@@ -6045,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
.dconly1:
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -6344,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
@@ -6565,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
@@ -6838,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
@@ -7098,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
@@ -7537,6 +7548,8 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova m5, [r3-16* 4] ; idct64 48 + n
mova m6, [r4-16*20] ; idct64 47 - n
mova m7, [r3-16*20] ; idct64 32 + n
+ pmaxsd m0, m12
+ pminsd m0, m13
paddd m8, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m12}, m8, m0
@@ -7565,11 +7578,13 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova [r4-16* 4], m6
mova [r3+16*12], m8
%else
+ mova m5, [o(clip_18b_min)]
+ mova m6, [o(clip_18b_max)]
mova m1, [r3+16*44] ; idct16 15 - n
+ pmaxsd m0, m5
+ pminsd m0, m6
paddd m4, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
- mova m5, [o(clip_18b_min)]
- mova m6, [o(clip_18b_max)]
REPX {pmaxsd x, m5}, m4, m0
REPX {pminsd x, m6}, m4, m0
paddd m1, m4, m3 ; idct32 out0 + n
@@ -7632,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
.dconly1:
- add r5d, 10240
- sar r5d, 14
+ add r5d, 640
+ sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@@ -7876,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
- add r5d, 2048
- sar r5d, 12
- imul r5d, 2896
- add r5d, 6144
- sar r5d, 13
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
add rsp, (1+8*32+1*WIN64)*16
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
@@ -8112,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
ret
.dconly:
- imul r5d, [cq], 2896
+ imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
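
One change repeated throughout the itx16_sse.asm hunks above is worth spelling out: every DC-only path replaces the rounded multiply by 2896 (add 2048, sar 12) with a rounded multiply by 181 (add 128, sar 8). Since 2896 = 181*16 and 2048 = 128*16, the two forms are bit-exact, and the smaller products keep the intermediates further from 32-bit overflow when the 1/sqrt(2) scaling is chained for the rectangular sizes. A quick self-check in C (function names are illustrative, not from the source; assumes arithmetic right shift of negative values, as on the targeted compilers):

    #include <assert.h>
    #include <stdio.h>

    /* round(x / sqrt(2)) as computed by the old and new DC-only code. */
    static int dc_scale_old(int x) { return (x * 2896 + 2048) >> 12; }
    static int dc_scale_new(int x) { return (x * 181  +  128) >>  8; }

    int main(void)
    {
        /* DC coefficients of high-bitdepth blocks fit comfortably in this range. */
        for (int x = -(1 << 18); x <= (1 << 18); x++)
            assert(dc_scale_old(x) == dc_scale_new(x));
        puts("old and new DC scaling are bit-exact");
        return 0;
    }
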
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm
index 092c842786d..a67f053a61b 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx2.asm
@@ -126,7 +126,7 @@ pw_m2751_3035x8: dw -2751*8, 3035*8
SECTION .text
-; Code size reduction trickery: Intead of using rip-relative loads with
+; Code size reduction trickery: Instead of using rip-relative loads with
; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
; single rip-relative lea and then address things relative from that with
; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
@@ -1194,13 +1194,9 @@ cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
+ mov [cq], eobd
pmulhrsw xm0, xm1
- movd xm2, [o(pw_2048)]
- pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
- vpbroadcastw m0, xm0
- mova m1, m0
- jmp m(iadst_8x4_internal_8bpc).end3
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
%endif
%endmacro
@@ -1340,20 +1336,20 @@ cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
+ or r3d, 8
+.dconly:
pmulhrsw xm0, xm2
- psrlw xm2, 3 ; pw_2048
+.dconly2:
+ movd xm2, [pw_2048]
pmulhrsw xm0, xm1
+ lea r2, [strideq*3]
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
-.end:
- mov r2d, 2
-.end2:
- lea r3, [strideq*3]
-.loop:
- WRITE_8X4 0, 0, 1, 2
+.dconly_loop:
+ WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2
lea dstq, [dstq+strideq*4]
- dec r2d
- jg .loop
+ sub r3d, 4
+ jg .dconly_loop
RET
%endif
%endmacro
@@ -1543,13 +1539,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
- psrlw xm2, 3 ; pw_2048
- pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
- vpbroadcastw m0, xm0
- mov r2d, 4
- jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2
+ or r3d, 16
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
%endif
%endmacro
@@ -1902,7 +1893,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
mov [cq], eobd
- mov r2d, 2
+ or r3d, 4
.dconly:
pmulhrsw xm0, xm2
movd xm2, [pw_2048] ; intentionally rip-relative
@@ -1911,17 +1902,17 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastw m0, xm0
pxor m3, m3
.dconly_loop:
- mova xm1, [dstq]
- vinserti128 m1, [dstq+strideq], 1
+ mova xm1, [dstq+strideq*0]
+ vinserti128 m1, [dstq+strideq*1], 1
punpckhbw m2, m1, m3
punpcklbw m1, m3
paddw m2, m0
paddw m1, m0
packuswb m1, m2
- mova [dstq], xm1
- vextracti128 [dstq+strideq], m1, 1
+ mova [dstq+strideq*0], xm1
+ vextracti128 [dstq+strideq*1], m1, 1
lea dstq, [dstq+strideq*2]
- dec r2d
+ sub r3d, 2
jg .dconly_loop
RET
%endif
@@ -2162,7 +2153,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 4
+ or r3d, 8
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
%endif
%endmacro
@@ -2473,7 +2464,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 8
+ or r3d, 16
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
%endif
%endmacro
@@ -3120,13 +3111,8 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- pmulhrsw xm0, xm2
- psrlw xm2, 2 ; pw_2048
- pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
- vpbroadcastw m0, xm0
- mov r2d, 8
- jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2
+ or r3d, 32
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
.full:
REPX {pmulhrsw x, m9}, m12, m13, m14, m15
pmulhrsw m6, m9, [rsp+32*2]
@@ -3290,7 +3276,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 8
+ or r3d, 8
.dconly:
pmulhrsw xm0, xm2
movd xm2, [pw_2048] ; intentionally rip-relative
@@ -3307,7 +3293,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
packuswb m1, m2
mova [dstq], m1
add dstq, strideq
- dec r2d
+ dec r3d
jg .dconly_loop
RET
.normal:
@@ -3672,7 +3658,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 16
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.full:
mova [tmp1q-32*4], m1
@@ -3991,7 +3977,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 16
+ or r3d, 16
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
.normal:
PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
@@ -4222,7 +4208,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
.normal:
PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
@@ -4486,7 +4472,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 32
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
.normal:
PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
@@ -4832,7 +4818,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 16
+ or r3d, 16
.dconly:
pmulhrsw xm0, xm2
movd xm2, [o(pw_2048)]
@@ -4856,7 +4842,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
mova [dstq+32*0], m2
mova [dstq+32*1], m3
add dstq, strideq
- dec r2d
+ dec r3d
jg .dconly_loop
RET
.normal:
@@ -4997,7 +4983,7 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 64
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
.normal:
PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
@@ -5200,7 +5186,7 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
movd xm2, [o(pw_16384)]
mov [cq], eobd
pmulhrsw xm0, xm1
- mov r2d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
.normal:
PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
@@ -5381,7 +5367,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_8192)]
mov [cq], eobd
- mov r2d, 64
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
.normal:
PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
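
The 8-bit AVX2 DC-only tails above lean on pmulhrsw with coefficients stored pre-multiplied by 8 (pw_2896x8 and friends). pmulhrsw is a signed 16-bit multiply returning the rounded high half, so scaling the coefficient by 8 turns its >>15 into the familiar 12-bit rounded multiply used elsewhere in the transforms. A scalar model of one lane with that equivalence checked (helper names are illustrative; the saturating 0x8000*0x8000 corner case is ignored since these coefficients never reach it):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of one pmulhrsw lane: (a*b + 0x4000) >> 15. */
    static int16_t pmulhrsw_lane(int16_t a, int16_t b)
    {
        return (int16_t)((a * b + 0x4000) >> 15);
    }

    int main(void)
    {
        /* pmulhrsw by pw_2896x8 (= 2896*8) equals the 12-bit rounded
         * multiply (a*2896 + 2048) >> 12. */
        for (int a = -2048; a <= 2047; a++)
            assert(pmulhrsw_lane((int16_t)a, 2896 * 8) ==
                   ((a * 2896 + 2048) >> 12));
        puts("pw_2896x8 + pmulhrsw == (a*2896 + 2048) >> 12");
        return 0;
    }
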
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm
index 7d01bccb4f5..f30f4909287 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/itx_avx512.asm
@@ -29,7 +29,8 @@
%if ARCH_X86_64
SECTION_RODATA 64
-int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+const \
+int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
@@ -84,7 +85,7 @@ pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11
gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13
gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10
-gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10
+gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16
int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
@@ -845,7 +846,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
punpcklwd m3, m5 ; dct8 in3 in5
punpckhwd m5, m2 ; dct16 in11 in5
punpcklwd m6, m2 ; dct4 in3 in1
-.main2:
+cglobal_label .main2
vpbroadcastd m10, [o(pd_2048)]
.main3:
vpbroadcastq m13, [o(int_mshift)]
@@ -1355,7 +1356,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vpermq m3, m3, q2031
jmp m(iadst_8x8_internal_8bpc).end2
ALIGN function_align
-.main:
+cglobal_label .main
IDCT8_1D_PACKED
ret
@@ -1422,7 +1423,7 @@ ALIGN function_align
punpckhqdq m0, m4 ; out0 -out1
ret
ALIGN function_align
-.main_pass2:
+cglobal_label .main_pass2
IADST8_1D_PACKED 2
ret
@@ -1499,8 +1500,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_dct
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 16
imul r6d, 181
- mov r3d, 16
add r6d, 128
sar r6d, 8
jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
@@ -1608,7 +1609,54 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vpscatterdq [r3+ym8]{k2}, m2
RET
ALIGN function_align
-.main:
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ vpbroadcastd ym3, [o(pw_401_4076x8)]
+ vpbroadcastd ym5, [o(pw_799_4017x8)]
+ vpbroadcastd ym4, [o(pw_m1189_3920x8)]
+ pxor ym6, ym6
+ punpckhwd ym2, ym0, ym0
+ pmulhrsw ym2, ym3 ; t8a t15a
+ punpcklwd ym7, ym1, ym1
+ pmulhrsw ym7, ym5 ; t4a t7a
+ punpckhwd ym1, ym1
+ pmulhrsw ym4, ym1 ; t11a t12a
+ vpcmpub k7, ym13, ym10, 6
+ punpcklwd ym9, ym6, ym0
+ psubsw ym0, ym2, ym4 ; t11a t12a
+ paddsw ym8, ym2, ym4 ; t8a t15a
+ mova ym1, ym7
+ jmp .main5
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ vpbroadcastd ym10, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ pxor ym6, ym6
+ punpckhwd ym8, ym0, ym0
+ punpckhwd ym4, ym3, ym3
+ punpckhwd ym5, ym2, ym2
+ punpcklwd ym7, ym1, ym1
+ punpckhwd ym1, ym1
+ punpcklwd ym3, ym3
+ punpcklwd ym9, ym6, ym0
+ punpcklwd ym6, ym2
+ vpbroadcastd ym2, [o(pw_401_4076x8)]
+ vpbroadcastd ym0, [o(pw_m2598_3166x8)]
+ vpbroadcastd ym11, [o(pw_1931_3612x8)]
+ vpbroadcastd ym12, [o(pw_m1189_3920x8)]
+ pmulhrsw ym8, ym2 ; t8a t15a
+ vpbroadcastd ym2, [o(pw_799_4017x8)]
+ pmulhrsw ym0, ym4 ; t9a t14a
+ vpbroadcastd ym4, [o(pw_m2276_3406x8)]
+ pmulhrsw ym5, ym11 ; t10a t13a
+ pmulhrsw ym1, ym12 ; t11a t12a
+ pmulhrsw ym7, ym2 ; t4a t7a
+ pmulhrsw ym3, ym4 ; t5a t6a
+ vpcmpub k7, ym13, ym10, 6
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
WRAP_YMM IDCT16_1D_PACKED
ret
@@ -1685,13 +1733,14 @@ ALIGN function_align
vpermi2q m6, m0, m2 ; in4 in8 in6 in10
vpermt2q m1, m10, m3 ; in11 in7 in9 in5
.main:
- vpbroadcastd m9, [o(pd_2048)]
- vpbroadcastq m13, [o(int_mshift)]
- kxnorb k1, k1, k1
punpcklwd m0, m4, m5 ; in0 in15 in2 in13
punpckhwd m4, m5 ; in12 in3 in14 in1
punpcklwd m5, m6, m1 ; in4 in11 in6 in9
punpckhwd m6, m1 ; in8 in7 in10 in5
+cglobal_label .main2
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ kxnorb k1, k1, k1
vpcmpub k7, m13, m9, 6 ; 0x33...
pxor m8, m8
ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5
@@ -1976,7 +2025,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_dct
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 8
+ or r3d, 8
.dconly:
imul r6d, 181
add r6d, 128
@@ -2114,7 +2163,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vextracti32x4 [r3 +r4 ], m1, 3
RET
ALIGN function_align
-.main:
+cglobal_label .main
IDCT8_1D_PACKED
ret
@@ -2168,6 +2217,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
pshufd m4, m0, q1032 ; 1 0
pshufd m5, m1, q1032 ; 3 2
call .main_pass2
+ movshdup m4, [o(permC)]
pmulhrsw m0, m6
pmulhrsw m1, m6
psrlq m6, m4, 4
@@ -2194,9 +2244,8 @@ ALIGN function_align
IADST8_1D_PACKED 1
ret
ALIGN function_align
-.main_pass2:
+cglobal_label .main_pass2
IADST8_1D_PACKED 2
- movshdup m4, [o(permC)]
pxor m5, m5
psubd m5, m6
packssdw m6, m5
@@ -2222,6 +2271,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
pshufd m4, m0, q1032 ; 1 0
pshufd m5, m1, q1032 ; 3 2
call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m4, [o(permC)]
pmulhrsw m5, m6, m0
pmulhrsw m0, m6, m1
psrlq m1, m4, 12
@@ -2276,8 +2326,8 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_dct
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 16
imul r6d, 181
- mov r3d, 16
add r6d, 128+512
sar r6d, 8+2
jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
@@ -2456,7 +2506,7 @@ ALIGN function_align
pmulhrsw m3, m4 ; t5a t6a
jmp .main4
ALIGN function_align
-.main:
+cglobal_label .main
IDCT16_1D_PACKED
ret
@@ -2562,6 +2612,7 @@ ALIGN function_align
vshufi32x4 m1, m5, q2020 ; 2 3
vshufi32x4 m5, m7, m9, q2020 ; 10 11
vshufi32x4 m7, m9, q3131 ; 14 15
+cglobal_label .main_pass2b
REPX {pshufd x, x, q1032}, m1, m3, m5, m7
call .main
vpbroadcastd m8, [o(pw_2896x8)]
@@ -2770,13 +2821,13 @@ ALIGN function_align
vpermt2q m9, m12, m7
jmp m(idct_16x16_internal_8bpc).end
-%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
- vpbroadcastd m%3, [o(pw_%4_%5x8)]
- punpcklwd m%1, m%2, m%2
- pmulhrsw m%1, m%3
- vpbroadcastd m%3, [o(pw_%6_%7x8)]
- punpckhwd m%2, m%2
- pmulhrsw m%2, m%3
+%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4]
+ vpbroadcastd m%4, [o(pw_%5_%6x8)]
+ punpcklwd m%1, m%3, m%3
+ pmulhrsw m%1, m%4
+ vpbroadcastd m%4, [o(pw_%7_%8x8)]
+ punpckhwd m%2, m%3, m%3
+ pmulhrsw m%2, m%4
%endmacro
cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
@@ -2864,82 +2915,86 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6
vinserti32x4 ym14, ym16, xm17, 1 ; 1 3
vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7
- pxor ym4, ym4
vpermt2q m2, m5, m6 ; 8 10
vpermt2q m16, m5, m17 ; 9 11
- mova ym5, ym4
- mova ym6, ym4
- mova ym7, ym4
vextracti32x8 ym3, m2, 1 ; 12 14
vextracti32x8 ym17, m16, 1 ; 13 15
- call m(idct_8x16_internal_8bpc).main
+ call m(idct_8x16_internal_8bpc).main_fast
call .main_fast
.end:
- vpbroadcastd ym12, strided
- vpbroadcastd m13, [o(pw_2048)]
- pmulld ym7, ym12, [o(gather8d)]
- REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11
+ vpbroadcastd ym8, strided
+ pmulld ym8, [o(gather8d)]
+ call .main_end
lea r3, [dstq+strideq*4]
- shl strideq, 4
- lea r4, [dstq+strideq]
- add r1, r3
kxnorb k1, k1, k1
- pxor m6, m6
+ lea r4, [dstq+strideq*8]
+ pxor m9, m9
+ lea r1, [r3+strideq*8]
kmovb k2, k1
- vpgatherdq m12{k1}, [r0+ym7]
+ vpgatherdq m12{k1}, [r0+ym8]
kmovb k1, k2
- vpgatherdq m13{k2}, [r3+ym7]
+ vpgatherdq m13{k2}, [r3+ym8]
kmovb k2, k1
- vpgatherdq m14{k1}, [r4+ym7]
+ vpgatherdq m14{k1}, [r4+ym8]
kmovb k1, k2
- vpgatherdq m15{k2}, [r1+ym7]
- REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
- punpcklbw m4, m12, m6
- punpckhbw m12, m6
- paddw m0, m4
+ vpgatherdq m15{k2}, [r1+ym8]
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m11, m12, m9
+ punpckhbw m12, m9
+ paddw m0, m11
paddw m1, m12
packuswb m0, m1
kmovb k2, k1
- vpscatterdq [r0+ym7]{k1}, m0
- punpcklbw m4, m13, m6
- punpckhbw m13, m6
- paddw m2, m4
+ vpscatterdq [r0+ym8]{k1}, m0
+ punpcklbw m12, m13, m9
+ punpckhbw m13, m9
+ paddw m2, m12
paddw m3, m13
packuswb m2, m3
kmovb k1, k2
- vpscatterdq [r3+ym7]{k2}, m2
- punpcklbw m4, m14, m6
- punpckhbw m14, m6
- paddw m8, m4
- paddw m9, m14
- packuswb m8, m9
+ vpscatterdq [r3+ym8]{k2}, m2
+ punpcklbw m13, m14, m9
+ punpckhbw m14, m9
+ paddw m4, m13
+ paddw m5, m14
+ packuswb m4, m5
kmovb k2, k1
- vpscatterdq [r4+ym7]{k1}, m8
- punpcklbw m4, m15, m6
- punpckhbw m15, m6
- paddw m10, m4
- paddw m11, m15
- packuswb m10, m11
- vpscatterdq [r1+ym7]{k2}, m10
+ vpscatterdq [r4+ym8]{k1}, m4
+ punpcklbw m14, m15, m9
+ punpckhbw m15, m9
+ paddw m6, m14
+ paddw m7, m15
+ packuswb m6, m7
+ vpscatterdq [r1+ym8]{k2}, m6
RET
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 32
+ or r3d, 32
imul r6d, 181
add r6d, 128+512
sar r6d, 8+2
jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
INIT_YMM avx512icl
ALIGN function_align
-.main_fast: ; bottom half is zero
- ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
- ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
- ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
- ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+cglobal_label .main_fast2 ; bottom three-quarters are zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ mova m11, m12
+ mova m17, m20
+ mova m15, m21
+ mova m16, m14
+ jmp .main4
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+ ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
jmp .main3
ALIGN function_align
-.main:
+cglobal_label .main
punpcklwd m12, m21, m14 ; in31 in1
punpckhwd m14, m21 ; in3 in29
punpcklwd m21, m20, m15 ; in27 in5
@@ -2966,6 +3021,7 @@ ALIGN function_align
paddsw m21, m16 ; t20 t27
psubsw m16, m14, m19 ; t22 t25
paddsw m14, m19 ; t23 t24
+.main4:
ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a
ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a
ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a
@@ -2997,8 +3053,8 @@ ALIGN function_align
REPX {pshufb x, m18}, m20, m11, m21, m19
ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a
ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25
- packssdw m18, m13 ; t23a t22
- packssdw m12, m15 ; t24a t25
+ packssdw m18, m13 ; t23a t22
+ packssdw m12, m15 ; t24a t25
ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a
ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27
packssdw m16, m13 ; t20 t21a
@@ -3007,32 +3063,27 @@ ALIGN function_align
punpckhqdq m19, m21 ; t28a t29
punpcklqdq m21, m20, m11 ; t16 t17a
punpckhqdq m20, m11 ; t31 t30a
- psubsw m15, m1, m19 ; out28 out29
- paddsw m1, m19 ; out3 out2
- psubsw m9, m6, m13 ; out19 out18
- paddsw m6, m13 ; out12 out13
- psubsw m10, m5, m16 ; out20 out21
- paddsw m5, m16 ; out11 out10
- psubsw m19, m3, m12 ; out24 out25
- paddsw m3, m12 ; out7 out6
- psubsw m8, m7, m21 ; out16 out17
- paddsw m7, m21 ; out15 out14
- psubsw m21, m0, m20 ; out31 out30
- paddsw m0, m20 ; out0 out1
- psubsw m11, m4, m18 ; out23 out22
- paddsw m4, m18 ; out8 out9
- psubsw m18, m2, m14 ; out27 out26
- paddsw m2, m14 ; out4 out5
INIT_ZMM avx512icl
- movu m16, [o(permD+3)]
- vpermt2q m0, m16, m4 ; 0 1 8 9
- vpermt2q m8, m16, m19 ; 16 17 24 25
- vpermt2q m1, m16, m5 ; 3 2 11 10
- vpermt2q m9, m16, m18 ; 19 18 27 26
- vpermt2q m2, m16, m6 ; 4 5 12 13
- vpermt2q m10, m16, m15 ; 20 21 28 29
- vpermt2q m3, m16, m7 ; 7 6 15 14
- vpermt2q m11, m16, m21 ; 23 22 31 30
+ mova m15, [o(permA)]
+ ret
+cglobal_label .main_end
+ vpbroadcastd m10, [o(pw_2048)]
+ vpermt2q m0, m15, m1 ; t0 t1 t2 t3
+ vpermt2q m20, m15, m19 ; t31 t30a t29 t28a
+ vpermt2q m2, m15, m3 ; t4 t5 t6 t7
+ vpermt2q m14, m15, m12 ; t27 t26a t25 t24a
+ vpermt2q m4, m15, m5 ; t8 t9 t10 t11
+ vpermt2q m18, m15, m16 ; t23a t22 t21a t20
+ vpermt2q m6, m15, m7 ; t12 t13 t14 t15
+ vpermt2q m13, m15, m21 ; t19a t18 t17a t16
+ psubsw m7, m0, m20 ; out31 out30 out29 out28
+ paddsw m0, m20 ; out0 out1 out2 out3
+ psubsw m5, m2, m14 ; out27 out26 out25 out24
+ paddsw m2, m14 ; out4 out5 out6 out7
+ psubsw m3, m4, m18 ; out23 out22 out21 out20
+ paddsw m4, m18 ; out8 out9 out10 out11
+ psubsw m1, m6, m13 ; out19 out18 out17 out16
+ paddsw m6, m13 ; out12 out13 out14 out15
vzeroupper
ret
@@ -3079,16 +3130,33 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
call m(idct_8x16_internal_8bpc).main
call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
.pass2:
- vpbroadcastd m12, [o(pw_8192)]
- vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31
- vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30
- vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29
- vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28
- vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27
- vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26
- vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15
- vshufi32x4 m0, m8, q2020 ; 0 8 16 24
- REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpbroadcastd m10, [o(pw_8192)]
+ vpermt2q m0, m15, m4 ; t0 t1 t9 t8
+ vpermt2q m20, m15, m18 ; t31 t30a t23a t22
+ vpermt2q m3, m15, m7 ; t7 t6 t14 t15
+ vpermt2q m12, m15, m21 ; t25 t24a t17a t16
+ vpermt2q m2, m15, m6 ; t4 t5 t13 t12
+ vpermt2q m14, m15, m13 ; t23a t22 t21a t20
+ vpermt2q m1, m15, m5 ; t3 t2 t10 t11
+ vpermt2q m19, m15, m16 ; t27 t26a t19a t18
+ psubsw m8, m0, m20 ; out31 out30 out22 out23
+ paddsw m0, m20 ; out0 out1 out9 out8
+ paddsw m6, m3, m12 ; out7 out6 out14 out15
+ psubsw m3, m12 ; out24 out25 out17 out16
+ psubsw m5, m2, m14 ; out27 out26 out18 out19
+ paddsw m4, m2, m14 ; out4 out5 out13 out12
+ psubsw m7, m1, m19 ; out28 out29 out21 out20
+ paddsw m2, m1, m19 ; out3 out2 out10 out11
+ vzeroupper
+ vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25
+ vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24
+ vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27
+ vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26
+ vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29
+ vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28
+ vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31
+ vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
call .main
vpbroadcastd m8, [o(pw_2048)]
@@ -3132,7 +3200,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 8
+ or r3d, 8
.dconly2:
imul r6d, 181
add r6d, 128+512
@@ -3158,7 +3226,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
jg .dconly_loop
RET
ALIGN function_align
-.main:
+cglobal_label .main
vpbroadcastd m10, [o(pd_2048)]
.main2:
ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
@@ -3535,7 +3603,7 @@ ALIGN function_align
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
ALIGN function_align
.main_oddhalf_fast2: ; bottom three-quarters are zero
@@ -3821,8 +3889,8 @@ ALIGN function_align
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 16
imul r6d, 181
- mov r3d, 16
add r6d, 128
sar r6d, 8
imul r6d, 181
@@ -4603,7 +4671,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 32
+ or r3d, 32
jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
ALIGN function_align
.main_oddhalf_fast2: ; bottom three-quarters are zero
@@ -5068,8 +5136,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 64
imul r6d, 181
- mov r3d, 64
add r6d, 128+512
sar r6d, 8+2
jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
@@ -5282,7 +5350,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
jnz .normal
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 16
+ or r3d, 16
.dconly:
imul r6d, 181
add r6d, 128+512
@@ -6012,8 +6080,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 64
imul r6d, 181
- mov r3d, 64
add r6d, 128
sar r6d, 8
imul r6d, 181
@@ -6674,8 +6742,8 @@ ALIGN function_align
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
+ or r3d, 32
imul r6d, 181
- mov r3d, 32
add r6d, 128
sar r6d, 8
imul r6d, 181
@@ -7117,7 +7185,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
.dconly:
movsx r6d, word [cq]
mov [cq], eobd
- mov r3d, 64
+ or r3d, 64
jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
ALIGN function_align
.pass2_end:
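A note on the .dconly hunks above: every variant performs the same rounding step, multiplying the DC coefficient by 181 and round-shifting by 8, once or twice depending on transform size, with the output shift folded into the bias and shift count (the 128+512 / 8+2 and plain 128 / 8 pairs visible in the hunks). Since 181/256 is approximately 1/sqrt(2), this applies the inverse-transform 1/sqrt(2) scaling in integer arithmetic. The mov r3d, N -> or r3d, N changes in the same hunks shrink the immediate encoding from 5 to 3 bytes and rely on eob (held in r3d) being zero when the dc-only path is taken. A minimal scalar sketch of the rounding step follows; the helper names and the particular bias/shift pairing are illustrative, not lifted from dav1d's C code.

    /* Assumes >> acts as an arithmetic shift for negative values, matching
     * the asm's sar instruction. */
    static inline int dc_rescale(int dc, int bias, int shift) {
        return (dc * 181 + bias) >> shift;  /* dc *= ~1/sqrt(2), with rounding */
    }

    /* e.g. the "add r6d, 128+512; sar r6d, 8+2" variant seen above: */
    static inline int dc_rescale_example(int dc) {
        return dc_rescale(dc, 128 + 512, 8 + 2);
    }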
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h
new file mode 100644
index 00000000000..33c842a9ce4
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/loopfilter.h"
+
+#define decl_loopfilter_sb_fns(ext) \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \
+decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext))
+
+decl_loopfilter_sb_fns(ssse3);
+decl_loopfilter_sb_fns(avx2);
+decl_loopfilter_sb_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3);
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl);
+ c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl);
+ c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl);
+ c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl);
+#endif
+}
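The header above only declares the per-ISA entry points and overrides the function table when the required CPU flags are present; the bitdepth-templated generic init (not part of this diff) is what calls it. A rough sketch of that call site, with the fallback assignment elided and names kept illustrative rather than verbatim:

    #include "src/loopfilter.h"

    void dav1d_loop_filter_dsp_init(Dav1dLoopFilterDSPContext *const c) {
        /* ... assign the C fallbacks to c->loop_filter_sb[plane][dir] ... */
    #if HAVE_ASM && ARCH_X86
        loop_filter_dsp_init_x86(c);  /* upgrades to SSSE3/AVX2/AVX-512 where supported */
    #endif
    }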
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm
index 361ccc3b883..ed83000ac24 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx2.asm
@@ -30,22 +30,24 @@
SECTION_RODATA 32
+pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
times 4 db 8, 9
times 4 db 0, 1
times 4 db 8, 9
-pw_1: times 16 dw 1
-pw_2: times 16 dw 2
-pw_3: times 16 dw 3
-; 4 and 16 need to be next to each other since they are used as alternates
-; depending on whether bitdepth is 10 or 12
-pw_4: times 16 dw 4
-pw_16: times 16 dw 16
-pw_8: times 16 dw 8
-pw_4096: times 16 dw 4096
+pw_1: times 16 dw 1
+pw_2: times 16 dw 2
+pw_3: times 16 dw 3
+pw_4096: times 2 dw 4096
-pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
+; 10bpc/12bpc:
+pw_4: times 2 dw 4
+ times 2 dw 16
+clip_max: times 2 dw 511
+ times 2 dw 2047
+clip_min: times 2 dw -512
+ times 2 dw -2048
SECTION .text
@@ -398,9 +400,10 @@ SECTION .text
pmaxuw m2, [pw_1] ; I
psrlw m1, m0, 4 ; H
paddw m0, [pw_2]
+ vpbroadcastd m8, [r11]
paddw m0, m0
paddw m0, m2 ; E
- REPX {pmullw x, [r11]}, m0, m1, m2
+ REPX {pmullw x, m8}, m0, m1, m2
psubw m8, m3, m4 ; p1-p0
psubw m9, m5, m6 ; q1-q0
@@ -430,7 +433,8 @@ SECTION .text
pabsw m10, m10
pmaxuw m9, m10
%endif
- pcmpgtw m9, [r11] ; !flat8in
+ vpbroadcastd m10, [r11]
+ pcmpgtw m9, m10 ; !flat8in
psubw m10, m13, m3 ; p2-p1
pabsw m10, m10
@@ -503,7 +507,8 @@ SECTION .text
pmaxuw m0, m2
pmaxuw m1, m10
pmaxuw m1, m0
- pcmpgtw m1, [r11] ; !flat8out
+ vpbroadcastd m0, [r11]
+ pcmpgtw m1, m0 ; !flat8out
por m1, m9 ; !flat8in | !flat8out
vpbroadcastd m2, [maskq+8]
pand m10, m2, m12
@@ -544,12 +549,8 @@ SECTION .text
%endif
; short filter
-
- vpbroadcastw m0, r7m
- pcmpeqw m2, m2
- psrlw m0, 1 ; 511 or 2047
- pxor m2, m0 ; -512 or -2048
-
+ vpbroadcastd m0, [r11+8*1] ; 511 or 2047
+ vpbroadcastd m2, [r11+8*2] ; -512 or -2048
psubw m10, m5, m4
paddw m11, m10, m10
paddw m11, m10
@@ -561,17 +562,18 @@ SECTION .text
pminsw m10, m0
pmaxsw m10, m2
pand m8, m10 ; f&=fm
- paddw m10, m8, [pw_3]
- paddw m8, [pw_4]
+ vpbroadcastd m10, [pw_4]
+ paddw m10, m8
+ paddw m8, [pw_3]
REPX {pminsw x, m0}, m10, m8
psraw m10, 3 ; f2
psraw m8, 3 ; f1
- paddw m4, m10
- psubw m5, m8
+ psubw m5, m10
+ paddw m4, m8
- paddw m8, [pw_1]
- psraw m8, 1 ; f=(f1+1)>>1
- pandn m8, m7, m8 ; f&=!hev
+ paddw m10, [pw_1]
+ psraw m10, 1 ; f=(f1+1)>>1
+ pandn m8, m7, m10 ; f&=!hev
paddw m3, m8
psubw m6, m8
pxor m8, m8
@@ -603,8 +605,8 @@ SECTION .text
mova [rsp+ 0*32], m9
; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
- psllw m8, m0, 3 ; p6*8
- paddw m8, [pw_8]
+ paddw m8, m0, [pw_1]
+ psllw m8, 3 ; p6*8+8
paddw m10, m2, m7 ; p5+p4
psubw m8, m0
paddw m10, m10 ; (p5+p4)*2
@@ -759,7 +761,6 @@ SECTION .text
psubw m8, m15
paddw m8, m0
psrlw m10, m8, 4
- pand m10, m1
%ifidn %2, v
mova m9, [tmpq+strideq*1]
%else
@@ -788,6 +789,7 @@ SECTION .text
%if %1 >= 8
; flat8 filter
+ vpbroadcastd m7, [pw_4096]
%ifidn %2, v
mova m0, [tmpq+strideq*0] ; p3
%else
@@ -799,43 +801,43 @@ SECTION .text
paddw m2, m0 ; p1+p0+p3
paddw m8, m5 ; 2*(p3+p2)+q0
paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0
- pmulhrsw m7, m2, [pw_4096]
+ pmulhrsw m10, m2, m7
paddw m8, m3, m6
psubw m2, m1
paddw m2, m8
- pmulhrsw m8, m2, [pw_4096]
+ pmulhrsw m8, m2, m7
- paddw m10, m0, m3
- paddw m11, m4, m14
- psubw m2, m10
- paddw m2, m11
- pmulhrsw m10, m2, [pw_4096]
+ paddw m11, m0, m3
+ paddw m1, m4, m14
+ psubw m2, m11
+ paddw m2, m1
+ pmulhrsw m1, m2, m7
paddw m11, m0, m4
+ pblendvb m4, m1, m9
paddw m1, m5, m15
psubw m2, m11
paddw m2, m1
- pmulhrsw m11, m2, [pw_4096]
+ pmulhrsw m11, m2, m7
paddw m2, m6
paddw m2, m15
paddw m1, m13, m5
+ pblendvb m5, m11, m9
+ pblendvb m13, m10, m9
psubw m2, m1
- pmulhrsw m1, m2, [pw_4096]
+ pmulhrsw m1, m2, m7
psubw m2, m3
+ pblendvb m3, m8, m9
psubw m2, m6
- paddw m0, m15, m14
- paddw m2, m0
- pmulhrsw m2, [pw_4096]
+ pblendvb m6, m1, m9
+ paddw m1, m15, m14
+ paddw m2, m1
+ pmulhrsw m2, m7
- vpblendvb m13, m13, m7, m9
- vpblendvb m3, m3, m8, m9
- vpblendvb m4, m4, m10, m9
- vpblendvb m5, m5, m11, m9
- vpblendvb m6, m6, m1, m9
- vpblendvb m14, m14, m2, m9
+ pblendvb m14, m2, m9
%ifidn %2, v
mova [tmpq+strideq*1], m13 ; p2
@@ -844,9 +846,7 @@ SECTION .text
mova [dstq+strideq*0], m5 ; q0
mova [dstq+strideq*1], m6 ; q1
mova [dstq+strideq*2], m14 ; q2
-%else
- mova m0, [rsp+5*32]
-%if %1 == 8
+%elif %1 == 8
TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1
; write 8x16
@@ -871,29 +871,28 @@ SECTION .text
vextracti128 [dstq+stride3q -8], m15, 1
lea dstq, [dstq+strideq*4]
%else
- mova m0, [rsp+6*32]
+ mova m8, [rsp+6*32]
mova m1, [rsp+7*32]
mova m2, [rsp+8*32]
mova m7, [rsp+9*32]
- mova m8, [rsp+5*32]
- TRANSPOSE8X8W 0, 1, 2, 7, 8, 13, 3, 4, 9
+ TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9
- mova [dstq+strideq*0-16], xm0
+ mova [dstq+strideq*0-16], xm8
mova [dstq+strideq*1-16], xm1
mova [dstq+strideq*2-16], xm2
mova [dstq+stride3q -16], xm7
lea tmpq, [dstq+strideq*4]
- mova [tmpq+strideq*0-16], xm8
+ mova [tmpq+strideq*0-16], xm0
mova [tmpq+strideq*1-16], xm13
mova [tmpq+strideq*2-16], xm3
mova [tmpq+stride3q -16], xm4
lea tmpq, [tmpq+strideq*4]
- vextracti128 [tmpq+strideq*0-16], m0, 1
+ vextracti128 [tmpq+strideq*0-16], m8, 1
vextracti128 [tmpq+strideq*1-16], m1, 1
vextracti128 [tmpq+strideq*2-16], m2, 1
vextracti128 [tmpq+stride3q -16], m7, 1
lea tmpq, [tmpq+strideq*4]
- vextracti128 [tmpq+strideq*0-16], m8, 1
+ vextracti128 [tmpq+strideq*0-16], m0, 1
vextracti128 [tmpq+strideq*1-16], m13, 1
vextracti128 [tmpq+strideq*2-16], m3, 1
vextracti128 [tmpq+stride3q -16], m4, 1
@@ -924,39 +923,38 @@ SECTION .text
vextracti128 [dstq+stride3q ], m3, 1
lea dstq, [dstq+strideq*4]
%endif
-%endif
%elif %1 == 6
; flat6 filter
-
+ vpbroadcastd m7, [pw_4096]
paddw m8, m3, m4
paddw m8, m13 ; p2+p1+p0
paddw m11, m13, m5
paddw m8, m8
paddw m8, m11 ; p2+2*(p2+p1+p0)+q0
- pmulhrsw m2, m8, [pw_4096]
+ pmulhrsw m2, m8, m7
paddw m8, m5
paddw m11, m13, m13
paddw m8, m6
psubw m8, m11
- pmulhrsw m10, m8, [pw_4096]
+ pmulhrsw m10, m8, m7
paddw m8, m6
paddw m11, m13, m3
paddw m8, m14
psubw m8, m11
- pmulhrsw m11, m8, [pw_4096]
+ pmulhrsw m11, m8, m7
psubw m8, m3
paddw m14, m14
psubw m8, m4
paddw m8, m14
- pmulhrsw m8, [pw_4096]
+ pmulhrsw m8, m7
- vpblendvb m3, m3, m2, m9
- vpblendvb m4, m4, m10, m9
- vpblendvb m5, m5, m11, m9
- vpblendvb m6, m6, m8, m9
+ pblendvb m3, m2, m9
+ pblendvb m4, m10, m9
+ pblendvb m5, m11, m9
+ pblendvb m6, m8, m9
%ifidn %2, v
mova [tmpq+strideq*2], m3 ; p1
@@ -982,10 +980,10 @@ INIT_YMM avx2
cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
dst, stride, mask, l, l_stride, lut, \
w, stride3, mstride, tmp, mask_bits
- rorx r6d, r7m, 6
- and r6d, 32 ; 0 for 10bpc, 32 for 12bpc
+ mov r6d, r7m
lea r11, [pw_4]
- add r11, r6
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
mov wd, wm
shl l_strideq, 2
sub lq, l_strideq
@@ -1013,7 +1011,7 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
test [maskq+0], mask_bitsd ; vmask[0]
jz .end
- FILTER 4, v
+ call .v4
.end:
pslld m12, 4
@@ -1023,15 +1021,19 @@ cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
sub wd, 4
jg .loop
RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
INIT_YMM avx2
cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
dst, stride, mask, l, l_stride, lut, \
h, stride3, l_stride3, tmp, mask_bits
- rorx r6d, r7m, 6
- and r6d, 32 ; 0 for 10bpc, 32 for 12bpc
+ mov r6d, r7m
lea r11, [pw_4]
- add r11, r6
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
mov hd, hm
shl l_strideq, 2
sub lq, 4
@@ -1058,7 +1060,7 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
test [maskq+0], mask_bitsd ; vmask[0]
jz .no_filter
- FILTER 4, h
+ call .h4
jmp .end
.no_filter:
@@ -1071,15 +1073,19 @@ cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
sub hd, 4
jg .loop
RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
INIT_YMM avx2
cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
dst, stride, mask, l, l_stride, lut, \
w, stride3, mstride, tmp, mask_bits
- rorx r6d, r7m, 6
- and r6d, 32 ; 0 for 10bpc, 32 for 12bpc
+ mov r6d, r7m
lea r11, [pw_4]
- add r11, r6
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
mov wd, wm
shl l_strideq, 2
sub lq, l_strideq
@@ -1100,7 +1106,7 @@ cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
test [maskq+0], mask_bitsd ; vmask[0]
jz .end
- FILTER 4, v
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4
.end:
pslld m12, 4
@@ -1115,10 +1121,10 @@ INIT_YMM avx2
cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
dst, stride, mask, l, l_stride, lut, \
h, stride3, l_stride3, tmp, mask_bits
- rorx r6d, r7m, 6
- and r6d, 32 ; 0 for 10bpc, 32 for 12bpc
+ mov r6d, r7m
lea r11, [pw_4]
- add r11, r6
+ shr r6d, 11 ; is_12bpc
+ lea r11, [r11+r6*4]
mov hd, hm
shl l_strideq, 2
sub lq, 4
@@ -1138,7 +1144,7 @@ cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
test [maskq+0], mask_bitsd ; vmask[0]
jz .no_filter
- FILTER 4, h
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4
jmp .end
.no_filter:
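The prologue change repeated in each entry point above replaces the rorx/and trick with a plain shift: bitdepth_max (r7m) is 1023 for 10 bpc and 4095 for 12 bpc, so shifting it right by 11 yields a 0/1 flag, and r11 then points at the matching half of the interleaved 10/12-bpc constant pairs added to the RODATA block (the 4/16 threshold scale, clip_max, clip_min). A scalar sketch of the same selection, with illustrative variable names:

    #include <stdint.h>

    /* bitdepth_max is 1023 (10 bpc) or 4095 (12 bpc) */
    static void select_bpc_constants(int bitdepth_max, int16_t *thresh_scale,
                                     int16_t *clip_hi, int16_t *clip_lo)
    {
        const int is_12bpc = bitdepth_max >> 11;     /* 0 or 1 */
        *thresh_scale = is_12bpc ?    16 :    4;     /* 1 << (bitdepth - 8) */
        *clip_hi      = is_12bpc ?  2047 :  511;     /* clip_max pair */
        *clip_lo      = is_12bpc ? -2048 : -512;     /* clip_min pair */
    }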
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm
new file mode 100644
index 00000000000..b7bc3aa106f
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm
@@ -0,0 +1,912 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
+l_shuf_v: times 2 db 0, 32
+pw_1: times 2 dw 1
+ times 2 db 4, 36
+pw_3: times 2 dw 3
+ times 2 db 8, 40
+pw_4: times 2 dw 4
+ times 2 db 12, 44
+pw_16: times 2 dw 16
+ times 2 db 16, 48
+pw_4096: times 2 dw 4096
+ times 2 db 20, 52
+pw_16384: times 2 dw 16384
+ times 2 db 24, 56
+pw_32767: times 2 dw 32767
+ times 2 db 28, 60
+ times 2 dw 0
+filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128
+stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25
+l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1
+clip_max: dw 511, 511, 2047, 2047
+clip_min: dw -512, -512, -2048, -2048
+
+SECTION .text
+
+%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp
+ punpckhwd m%9, m%5, m%6
+ punpcklwd m%5, m%6
+ punpckhwd m%6, m%1, m%2
+ punpcklwd m%1, m%2
+ punpckhwd m%2, m%7, m%8
+ punpcklwd m%7, m%8
+ punpckhwd m%8, m%3, m%4
+ punpcklwd m%3, m%4
+ punpckhdq m%4, m%1, m%3
+ punpckldq m%1, m%3
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%5, m%7
+ punpckhdq m%7, m%6, m%8
+ punpckldq m%6, m%8
+ punpckldq m%8, m%9, m%2
+ punpckhdq m%9, m%2
+ punpckhqdq m%2, m%1, m%3
+ punpcklqdq m%1, m%3
+ punpcklqdq m%3, m%4, m%5
+ punpckhqdq m%4, m%5
+ punpcklqdq m%5, m%6, m%8
+ punpckhqdq m%6, m%8
+ punpckhqdq m%8, m%7, m%9
+ punpcklqdq m%7, m%9
+%endmacro
+
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%ifidn %2, v
+%if %1 == 16
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1 ]
+ mova m1, [tmpq+strideq*2 ] ; p5
+ mova m2, [tmpq+stride3q ] ; p4
+ mova m3, [tmpq+strideq*4 ] ; p3
+ mova m4, [tmpq+stride5q ] ; p2
+%elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 == 8
+ mova m3, [tmpq+strideq*0 ]
+%endif
+ mova m4, [tmpq+strideq*1 ]
+%endif
+ mova m5, [dstq+mstrideq*2] ; p1
+ mova m6, [dstq+mstrideq*1] ; p0
+ mova m7, [dstq+strideq*0 ] ; q0
+ mova m8, [dstq+strideq*1 ] ; q1
+%if %1 != 4
+ mova m9, [dstq+strideq*2 ] ; q2
+%endif
+%if %1 == 8 || %1 == 16
+ mova m10, [dstq+stride3q ] ; q3
+%endif
+%if %1 == 16
+ mova m11, [dstq+strideq*4 ] ; q4
+ mova m22, [dstq+stride5q ] ; q5
+ mova m23, [dstq+stride3q*2]
+%endif
+%else ; h
+%if %1 == 16
+ movu ym16, [dstq+strideq*0 -16]
+ movu ym17, [dstq+strideq*1 -16]
+ movu ym18, [dstq+strideq*2 -16]
+ movu ym19, [dstq+stride3q -16]
+ movu ym20, [dstq+strideq*4 -16]
+ movu ym22, [dstq+stride5q -16]
+ movu ym23, [dstq+stride3q*2-16]
+ movu ym28, [dstq+stride7q -16]
+ lea tmpq, [dstq+strideq*8 -16]
+ vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m10, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1
+ vinserti32x8 m22, m22, [tmpq+stride5q ], 1
+ vinserti32x8 m23, m23, [tmpq+stride3q*2], 1
+ vinserti32x8 m28, m28, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8]
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27
+ movu ym16, [tmpq+strideq*0 ]
+ movu ym17, [tmpq+strideq*1 ]
+ movu ym18, [tmpq+strideq*2 ]
+ movu ym19, [tmpq+stride3q ]
+ movu ym24, [tmpq+strideq*4 ]
+ movu ym25, [tmpq+stride5q ]
+ movu ym26, [tmpq+stride3q*2]
+ movu ym20, [tmpq+stride7q ]
+ lea tmpq, [tmpq+strideq*8]
+ vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m3, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1
+ vinserti32x8 m5, m25, [tmpq+stride5q ], 1
+ vinserti32x8 m6, m26, [tmpq+stride3q*2], 1
+ vinserti32x8 m20, m20, [tmpq+stride7q ], 1
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27
+ vshufi32x4 m27, m7, m0, q2020
+ vshufi32x4 m7, m0, q3131
+ vshufi32x4 m0, m8, m1, q2020
+ vshufi32x4 m8, m1, q3131
+ vshufi32x4 m1, m9, m2, q2020
+ vshufi32x4 m9, m2, q3131
+ vshufi32x4 m2, m10, m3, q2020
+ vshufi32x4 m10, m3, q3131
+ vshufi32x4 m3, m11, m4, q2020
+ vshufi32x4 m11, m4, q3131
+ vshufi32x4 m4, m22, m5, q2020
+ vshufi32x4 m22, m5, q3131
+ vshufi32x4 m5, m23, m6, q2020
+ vshufi32x4 m23, m6, q3131
+ vshufi32x4 m6, m28, m20, q2020
+ vshufi32x4 m28, m20, q3131
+%elif %1 == 6 || %1 == 8
+%if %1 == 8
+ sub dstq, 8
+ movu xm16, [dstq+strideq*0 ]
+ movu xm17, [dstq+strideq*1 ]
+ movu xm18, [dstq+strideq*2 ]
+ movu xm19, [dstq+stride3q ]
+ movu xm24, [dstq+strideq*4 ]
+ movu xm25, [dstq+stride5q ]
+ movu xm26, [dstq+stride3q*2]
+ movu xm27, [dstq+stride7q ]
+ lea tmpq, [dstq+strideq*8 ]
+ vinserti128 ym16, [tmpq+strideq*0 ], 1
+ vinserti128 ym17, [tmpq+strideq*1 ], 1
+ vinserti128 ym18, [tmpq+strideq*2 ], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ vinserti128 ym24, [tmpq+strideq*4 ], 1
+ vinserti128 ym25, [tmpq+stride5q ], 1
+ vinserti128 ym26, [tmpq+stride3q*2], 1
+ vinserti128 ym27, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2
+ vinserti32x4 m9, m25, [tmpq+stride5q ], 2
+ vinserti32x4 m3, m26, [tmpq+stride3q*2], 2
+ vinserti32x4 m4, m27, [tmpq+stride7q ], 2
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, [tmpq+strideq*0 ], 3
+ vinserti32x4 m8, [tmpq+strideq*1 ], 3
+ vinserti32x4 m5, [tmpq+strideq*2 ], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ vinserti32x4 m2, [tmpq+strideq*4 ], 3
+ vinserti32x4 m9, [tmpq+stride5q ], 3
+ vinserti32x4 m3, [tmpq+stride3q*2], 3
+ vinserti32x4 m4, [tmpq+stride7q ], 3
+%else ; %1 == 6
+ movu xm16, [dstq+strideq*0-8]
+ movu xm17, [dstq+strideq*1-8]
+ movu xm18, [dstq+strideq*2-8]
+ movu xm19, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4-8]
+ movu xm2, [tmpq+strideq*0]
+ movu xm9, [tmpq+strideq*1]
+ movu xm3, [tmpq+strideq*2]
+ movu xm4, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym16, [tmpq+strideq*0], 1
+ vinserti128 ym17, [tmpq+strideq*1], 1
+ vinserti128 ym18, [tmpq+strideq*2], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym2, [tmpq+strideq*0], 1
+ vinserti128 ym9, [tmpq+strideq*1], 1
+ vinserti128 ym3, [tmpq+strideq*2], 1
+ vinserti128 ym4, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, m16, [tmpq+strideq*0], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 2
+ vinserti32x4 m9, [tmpq+strideq*1], 2
+ vinserti32x4 m3, [tmpq+strideq*2], 2
+ vinserti32x4 m4, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, [tmpq+strideq*0], 3
+ vinserti32x4 m8, [tmpq+strideq*1], 3
+ vinserti32x4 m5, [tmpq+strideq*2], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 3
+ vinserti32x4 m9, [tmpq+strideq*1], 3
+ vinserti32x4 m3, [tmpq+strideq*2], 3
+ vinserti32x4 m4, [tmpq+stride3q ], 3
+%endif
+ punpcklwd m6, m10, m8
+ punpckhwd m10, m8
+ punpcklwd m8, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m2, m9
+ punpckhwd m2, m9
+ punpcklwd m9, m3, m4
+ punpckhwd m3, m4
+ punpckldq m4, m6, m8
+ punpckhdq m6, m8
+ punpckldq m8, m10, m5
+ punpckhdq m10, m5
+ punpckldq m5, m7, m9
+ punpckhdq m7, m9
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+%if %1 == 8
+ punpcklqdq m3, m4, m5
+%endif
+ punpckhqdq m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m9
+ punpckhqdq m8, m9
+ punpcklqdq m9, m10, m2
+%if %1 == 8
+ punpckhqdq m10, m2
+%endif
+%else ; %1 == 4
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdq m7{k1}, [dstq+ym12-4]
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpgatherdq m4{k2}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpgatherdq m5{k1}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ vpgatherdq m6{k2}, [tmpq+ym12]
+ punpcklwd m8, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m8, m7
+ punpckhwd m8, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m4
+ punpckhqdq m8, m4
+%endif
+%endif
+
+ ; load L/E/I/H
+%ifidn %2, v
+ movu ym16, [lq+l_strideq*1]
+ movsldup m17, [l_shuf_v]
+ vptestnmb k1, ym16, ym16
+ vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][]
+ vpermb m16, m17, m16 ; l[x][1]
+%else
+ movq xm16, [lq+l_strideq*0]
+ movq xm17, [lq+l_strideq*1]
+ vinserti128 ym16, [lq+l_strideq*2], 1
+ vinserti128 ym17, [lq+l_stride3q ], 1
+ lea tmpq, [lq+l_strideq*4]
+ vinserti32x4 m16, [tmpq+l_strideq*0], 2
+ vinserti32x4 m17, [tmpq+l_strideq*1], 2
+ vinserti32x4 m16, [tmpq+l_strideq*2], 3
+ vinserti32x4 m17, [tmpq+l_stride3q ], 3
+ punpcklqdq m16, m17
+ vbroadcasti32x4 m17, [l_shuf_h]
+ vptestnmb k1, m16, m16
+ vpalignr m16{k1}, m16, 12
+ pshufb m16, m17 ; l[x][1]
+%endif
+ vpbroadcastd m20, [pw_32767]
+ psubw m17, m5, m6 ; p1-p0
+ psubw m18, m7, m8 ; q1-q0
+ vptestmw k1, m16, m16 ; L
+ pabsw m17, m17
+ pabsw m18, m18
+ vpmaxuw m20{k1}, m17, m18
+ vpbroadcastw m17, [lutq+136]
+ psrlw m18, m16, [lutq+128]
+ vpbroadcastd m19, [pw_1]
+ pminuw m18, m17
+ psrlw m17, m16, 4 ; H
+ paddw m16, m16
+ pmaxuw m18, m19 ; I
+ vpaddd m16, [pw_4] {1to16}
+ paddw m16, m18 ; E
+ REPX {pmullw x, m13}, m17, m18, m16
+ vpcmpw k4, m20, m17, 6 ; hev
+%if %1 != 4
+ psubw m19, m4, m5 ; p2-p1
+ pabsw m19, m19
+%if %1 == 8 || %1 == 16
+ psubw m17, m3, m4 ; p3-p2
+ pabsw m17, m17
+ pmaxuw m19, m17
+ psubw m17, m9, m10 ; q3-q2
+ pabsw m17, m17
+ pmaxuw m19, m17
+%endif
+ psubw m17, m9, m8 ; q2-q1
+ pabsw m17, m17
+ pmaxuw m19, m17
+%if %1 == 16
+ vpbroadcastd ym17, [maskq+4]
+ vpord ym17, [maskq+8] {1to8}
+ vptestmd k1, ym17, ym21
+%else
+ vptestmd k1, ym21, [maskq+4] {1to8}
+%endif
+ pmaxuw m19, m20
+ psubw m17, m4, m6 ; p2-p0
+ pabsw m17, m17
+ pmaxuw m17, m20
+ vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks
+%if %1 == 8 || %1 == 16
+ psubw m19, m3, m6 ; p3-p0
+ pabsw m19, m19
+ pmaxuw m17, m19
+ psubw m19, m7, m10 ; q3-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ psubw m19, m7, m9 ; q2-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ vpcmpw k1, m20, m18, 2
+ psubw m18, m5, m8 ; p1-q1
+ psubw m19, m6, m7 ; p0-q0
+ pabsw m18, m18
+ pabsw m19, m19
+ psrlw m18, 1
+ paddw m19, m19
+ paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+%if %1 != 4
+ vpcmpw k2{k1}, m17, m13, 2 ; flat8in
+%endif
+%if %1 == 16
+ psubw m20, m0, m6
+ psubw m16, m1, m6
+ pabsw m20, m20
+ psubw m17, m2, m6
+ pabsw m16, m16
+ psubw m18, m11, m7
+ pabsw m17, m17
+ psubw m19, m22, m7
+ pabsw m18, m18
+ pmaxuw m20, m16
+ psubw m16, m23, m7
+ pabsw m19, m19
+ pmaxuw m17, m18
+ pabsw m16, m16
+ vpandd ym18, ym21, [maskq+8] {1to8}
+ pmaxuw m20, m17
+ pmaxuw m19, m16
+ pcmpeqd ym16, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8
+ pmaxuw m20, m19
+ pcmpeqd ym17, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8
+ vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out
+ pcmpeqd ym18, ym21
+ vptestmb k3{k3}, ym16, ym16 ; flat8 & fm
+ vptestmb k2{k2}, ym17, ym17 ; flat8in
+ vptestmb k1{k1}, ym18, ym18
+ kandnd k1, k2, k1 ; fm & !flat8 & !flat16
+ kandnd k2, k3, k2 ; flat8 & !flat16
+%elif %1 == 6 || %1 == 8
+ vpandd ym17, ym21, [maskq+4] {1to8}
+ pcmpeqd ym16, ym21, ym17
+ vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8
+ pcmpeqd ym17, ym21
+ vptestmb k2{k2}, ym16, ym16 ; flat8 & fm
+ vptestmb k1{k1}, ym17, ym17
+ kandnd k1, k2, k1 ; fm & !flat8
+%else ; %1 == 4
+ vpandd ym16, ym21, [maskq+0] {1to8}
+ pcmpeqd ym16, ym21
+ vptestmb k1{k1}, ym16, ym16
+%endif
+
+ ; short filter
+ psubw m16, m7, m6
+ vpbroadcastd m17, [pw_3]
+ paddw m18, m16, m16
+ paddw m18, m16
+ psubw m16, m5, m8 ; iclip_diff(p1-q1)
+ pminsw m16, m14
+ vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev
+ knotd k4, k4 ; !hev
+ paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f)
+ vpbroadcastd m18, [pw_4]
+ pminsw m16, m14
+ vpmaxsw m16{k1}{z}, m15 ; f&=fm
+ paddw m17, m16
+ paddw m16, m18
+ vpbroadcastd m18, [pw_16384]
+ pminsw m17, m14
+ pminsw m16, m14
+ psraw m17, 3 ; f2
+ psraw m16, 3 ; f1
+ paddw m6, m17
+ psubw m7, m16
+ vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev
+ psubw m17, m14, m15 ; 1023 or 4095
+ pxor m18, m18
+ paddw m5, m16
+ psubw m8, m16
+ REPX {pminsw x, m17}, m6, m7, m5, m8
+ REPX {pmaxsw x, m18}, m6, m7, m5, m8
+
+%if %1 == 16 ; flat16 filter
+ vpaddd m19, m0, [pw_1] {1to16}
+ paddw m16, m1, m2 ; p5+p4
+ paddw m26, m1, m6 ; p5+p0
+ paddw m24, m2, m7 ; p4+q0
+ paddw m16, m4 ; p5+p4+p3
+ paddw m17, m3, m5 ; p2+p1
+ psllw m19, 3
+ paddw m16, m26 ; p5*2+p4+p3+p0
+ paddw m17, m24 ; p4+p2+p1+q0
+ psubw m19, m0 ; p6*7+8
+ paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0
+ paddw m18, m3, m8
+ paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0
+ paddw m25, m1, m0
+ paddw m16, m0, m0
+ psrlw m1{k3}, m19, 4
+ paddw m19, m18
+ psubw m19, m16 ; +p3+q1-p6*2
+ paddw m16, m2, m0
+ psrlw m2{k3}, m19, 4
+ psubw m19, m25
+ paddw m25, m4, m9
+ paddw m20, m10, m5
+ paddw m19, m25 ; +p2+q2-p6-p5
+ paddw m17, m0, m3
+ psubw m16, m20, m16
+ psrlw m3{k3}, m19, 4
+ paddw m19, m16 ; +p1+q3-p6-p4
+ paddw m16, m11, m6
+ psubw m16, m17
+ paddw m17, m0, m4
+ psrlw m4{k3}, m19, 4
+ paddw m19, m16 ; +p0+q4-p6-p3
+ paddw m16, m22, m7
+ psubw m16, m17
+ paddw m17, m0, m5
+ psrlw m5{k3}, m19, 4
+ paddw m19, m16 ; +q0+q5-p6-p2
+ paddw m16, m23, m8
+ psrlw m6{k3}, m19, 4
+ psubw m16, m17
+ paddw m19, m16 ; +q1+q6-p6-p1
+ paddw m16, m23, m9
+ psrlw m7{k3}, m19, 4
+ psubw m16, m26
+ paddw m19, m16 ; +q2+q6-p5-p0
+ paddw m16, m23, m10
+ psrlw m8{k3}, m19, 4
+ psubw m16, m24
+ paddw m19, m16 ; +q3+q6-p4-p0
+ paddw m16, m23, m11
+ psrlw m9{k3}, m19, 4
+ psubw m16, m18
+ paddw m19, m16 ; +q4+q6-p3-q1
+ paddw m16, m23, m22
+ psrlw m10{k3}, m19, 4
+ psubw m16, m25
+ paddw m19, m16 ; +q5+q6-p2-q2
+ paddw m16, m23, m23
+ psrlw m11{k3}, m19, 4
+ psubw m16, m20
+ paddw m19, m16 ; +q6*2-p1-q3
+ psrlw m22{k3}, m19, 4
+%endif
+%if %1 == 8 || %1 == 16 ; flat8 filter
+ vpbroadcastd m20, [pw_4096]
+ paddw m16, m3, m4 ; p3+p2
+ paddw m19, m5, m6 ; p1+p0
+ paddw m17, m16, m16 ; 2*(p3+p2)
+ paddw m19, m3 ; p1+p0+p3
+ paddw m17, m7 ; 2*(p3+p2)+q0
+ paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0
+ paddw m18, m4, m7
+ pmulhrsw m4{k2}, m19, m20
+ psubw m19, m16
+ paddw m17, m5, m8
+ paddw m16, m3, m5
+ paddw m19, m17
+ pmulhrsw m5{k2}, m19, m20
+ psubw m19, m16
+ paddw m16, m6, m9
+ paddw m19, m16
+ paddw m16, m3, m6
+ pmulhrsw m6{k2}, m19, m20
+ paddw m19, m10
+ psubw m16, m7, m16
+ paddw m19, m16
+ psubw m16, m10, m18
+ pmulhrsw m7{k2}, m19, m20
+ paddw m16, m8
+ paddw m19, m16
+ psubw m16, m10, m17
+ pmulhrsw m8{k2}, m19, m20
+ paddw m16, m9
+ paddw m19, m16
+ pmulhrsw m9{k2}, m19, m20
+%elif %1 == 6 ; flat6 filter
+ vpbroadcastd m10, [pw_4096]
+ paddw m2, m5, m6
+ paddw m0, m4, m7
+ paddw m1, m2, m4 ; p2+p1+p0
+ paddw m3, m4, m4
+ paddw m1, m1
+ paddw m4, m5
+ paddw m1, m0 ; p2+2*(p2+p1+p0)+q0
+ psubw m3, m7, m3
+ pmulhrsw m5{k2}, m1, m10
+ paddw m3, m8
+ psubw m4, m8, m4
+ paddw m1, m3
+ pmulhrsw m6{k2}, m1, m10
+ paddw m4, m9
+ paddw m9, m9
+ paddw m1, m4
+ pmulhrsw m7{k2}, m1, m10
+ psubw m9, m2
+ paddw m1, m9
+ pmulhrsw m8{k2}, m1, m10
+%endif
+
+%ifidn %2, v
+%if %1 == 16
+ mova [tmpq+strideq*2 ], m1 ; p5
+ mova [tmpq+stride3q ], m2 ; p4
+ mova [tmpq+strideq*4 ], m3 ; p3
+ mova [tmpq+stride5q ], m4 ; p2
+%elif %1 == 8
+ mova [tmpq+strideq*1 ], m4 ; p2
+%endif
+ mova [dstq+mstrideq*2], m5 ; p1
+ mova [dstq+mstrideq ], m6 ; p0
+ mova [dstq+strideq*0 ], m7 ; q0
+ mova [dstq+strideq*1 ], m8 ; q1
+%if %1 == 8 || %1 == 16
+ mova [dstq+strideq*2 ], m9 ; q2
+%endif
+%if %1 == 16
+ mova [dstq+stride3q ], m10 ; q3
+ mova [dstq+strideq*4 ], m11 ; q4
+ mova [dstq+stride5q ], m22 ; q5
+%endif
+%else
+%if %1 == 16
+ TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20
+ mova [dstq+strideq*0 -16], xm27
+ mova [dstq+strideq*0 ], xm7
+ mova [dstq+strideq*1 -16], xm0
+ mova [dstq+strideq*1 ], xm8
+ mova [dstq+strideq*2 -16], xm1
+ mova [dstq+strideq*2 ], xm9
+ mova [dstq+stride3q -16], xm2
+ mova [dstq+stride3q ], xm10
+ mova [dstq+strideq*4 -16], xm3
+ mova [dstq+strideq*4 ], xm11
+ mova [dstq+stride5q -16], xm4
+ mova [dstq+stride5q ], xm22
+ mova [dstq+stride3q*2-16], xm5
+ mova [dstq+stride3q*2 ], xm23
+ mova [dstq+stride7q -16], xm6
+ mova [dstq+stride7q ], xm28
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 -16], ym27, 1
+ vextracti128 [dstq+strideq*0 ], ym7, 1
+ vextracti128 [dstq+strideq*1 -16], ym0, 1
+ vextracti128 [dstq+strideq*1 ], ym8, 1
+ vextracti128 [dstq+strideq*2 -16], ym1, 1
+ vextracti128 [dstq+strideq*2 ], ym9, 1
+ vextracti128 [dstq+stride3q -16], ym2, 1
+ vextracti128 [dstq+stride3q ], ym10, 1
+ vextracti128 [dstq+strideq*4 -16], ym3, 1
+ vextracti128 [dstq+strideq*4 ], ym11, 1
+ vextracti128 [dstq+stride5q -16], ym4, 1
+ vextracti128 [dstq+stride5q ], ym22, 1
+ vextracti128 [dstq+stride3q*2-16], ym5, 1
+ vextracti128 [dstq+stride3q*2 ], ym23, 1
+ vextracti128 [dstq+stride7q -16], ym6, 1
+ vextracti128 [dstq+stride7q ], ym28, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 2
+ vextracti32x4 [dstq+strideq*0 ], m7, 2
+ vextracti32x4 [dstq+strideq*1 -16], m0, 2
+ vextracti32x4 [dstq+strideq*1 ], m8, 2
+ vextracti32x4 [dstq+strideq*2 -16], m1, 2
+ vextracti32x4 [dstq+strideq*2 ], m9, 2
+ vextracti32x4 [dstq+stride3q -16], m2, 2
+ vextracti32x4 [dstq+stride3q ], m10, 2
+ vextracti32x4 [dstq+strideq*4 -16], m3, 2
+ vextracti32x4 [dstq+strideq*4 ], m11, 2
+ vextracti32x4 [dstq+stride5q -16], m4, 2
+ vextracti32x4 [dstq+stride5q ], m22, 2
+ vextracti32x4 [dstq+stride3q*2-16], m5, 2
+ vextracti32x4 [dstq+stride3q*2 ], m23, 2
+ vextracti32x4 [dstq+stride7q -16], m6, 2
+ vextracti32x4 [dstq+stride7q ], m28, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 3
+ vextracti32x4 [dstq+strideq*0 ], m7, 3
+ vextracti32x4 [dstq+strideq*1 -16], m0, 3
+ vextracti32x4 [dstq+strideq*1 ], m8, 3
+ vextracti32x4 [dstq+strideq*2 -16], m1, 3
+ vextracti32x4 [dstq+strideq*2 ], m9, 3
+ vextracti32x4 [dstq+stride3q -16], m2, 3
+ vextracti32x4 [dstq+stride3q ], m10, 3
+ vextracti32x4 [dstq+strideq*4 -16], m3, 3
+ vextracti32x4 [dstq+strideq*4 ], m11, 3
+ vextracti32x4 [dstq+stride5q -16], m4, 3
+ vextracti32x4 [dstq+stride5q ], m22, 3
+ vextracti32x4 [dstq+stride3q*2-16], m5, 3
+ vextracti32x4 [dstq+stride3q*2 ], m23, 3
+ vextracti32x4 [dstq+stride7q -16], m6, 3
+ vextracti32x4 [dstq+stride7q ], m28, 3
+%elif %1 == 8
+ TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2
+ movu [dstq+strideq*0 ], xm3
+ movu [dstq+strideq*1 ], xm4
+ movu [dstq+strideq*2 ], xm5
+ movu [dstq+stride3q ], xm6
+ movu [dstq+strideq*4 ], xm7
+ movu [dstq+stride5q ], xm8
+ movu [dstq+stride3q*2], xm9
+ movu [dstq+stride7q ], xm10
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 ], ym3, 1
+ vextracti128 [dstq+strideq*1 ], ym4, 1
+ vextracti128 [dstq+strideq*2 ], ym5, 1
+ vextracti128 [dstq+stride3q ], ym6, 1
+ vextracti128 [dstq+strideq*4 ], ym7, 1
+ vextracti128 [dstq+stride5q ], ym8, 1
+ vextracti128 [dstq+stride3q*2], ym9, 1
+ vextracti128 [dstq+stride7q ], ym10, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 2
+ vextracti32x4 [dstq+strideq*1 ], m4, 2
+ vextracti32x4 [dstq+strideq*2 ], m5, 2
+ vextracti32x4 [dstq+stride3q ], m6, 2
+ vextracti32x4 [dstq+strideq*4 ], m7, 2
+ vextracti32x4 [dstq+stride5q ], m8, 2
+ vextracti32x4 [dstq+stride3q*2], m9, 2
+ vextracti32x4 [dstq+stride7q ], m10, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 3
+ vextracti32x4 [dstq+strideq*1 ], m4, 3
+ vextracti32x4 [dstq+strideq*2 ], m5, 3
+ vextracti32x4 [dstq+stride3q ], m6, 3
+ vextracti32x4 [dstq+strideq*4 ], m7, 3
+ vextracti32x4 [dstq+stride5q ], m8, 3
+ vextracti32x4 [dstq+stride3q*2], m9, 3
+ vextracti32x4 [dstq+stride7q ], m10, 3
+ lea dstq, [dstq+strideq*8+8]
+%else ; %1 == 4 || %1 == 6
+ punpcklwd m9, m5, m6
+ punpckhwd m5, m6
+ kxnorb k1, k1, k1
+ punpcklwd m6, m7, m8
+ punpckhwd m7, m8
+ kmovb k2, k1
+ punpckldq m8, m9, m6
+ vpscatterdq [dstq+ym12-4]{k1}, m8
+ punpckhdq m9, m6
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpscatterdq [tmpq+ym12]{k2}, m9
+ punpckldq m6, m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpscatterdq [tmpq+ym12]{k1}, m6
+ punpckhdq m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ vpscatterdq [tmpq+ym12]{k2}, m5
+%endif
+%endif
+%endmacro
+
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride, tmp, \
+ mask_bits, stride5
+%define base tmpq-filter_mask
+ SWAP 12, 26 ; avoids clobbering xmm10 on WIN64
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, v
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call .v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4: ; called by both luma and chroma
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, l_stride3, tmp, \
+ mask_bits, stride5, stride7
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride7q, [strideq+stride3q*2]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, h
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, h
+ jmp .end2
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+ call .h4
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+.end:
+ lea dstq, [dstq+strideq*8]
+.end2:
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4: ; called by both luma and chroma
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ shl l_strideq, 2
+ lea stride3q, [strideq*3]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride3q, [strideq*3]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, h
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4
+.end:
+ lea tmpq, [strideq+stride3q]
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea dstq, [dstq+tmpq*8]
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
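For readers tracing the "; short filter" block inside the FILTER macro above: it vectorizes the standard AV1 4-tap filter, in which a clipped delta built from 3*(q0-p0) plus a hev-gated p1-q1 term is applied to p0/q0, and a halved delta to p1/q1 when hev is not set; clip_min/clip_max from the RODATA block give the bitdepth-dependent difference range, and clip_max - clip_min (the "; 1023 or 4095" line) reconstructs the pixel maximum for the final clamp. A scalar sketch of the same operation follows; helper and parameter names are illustrative, not dav1d's C reference code.

    #include <stdint.h>

    static inline int iclip(int v, int lo, int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* clip_lo/clip_hi are -512/511 (10 bpc) or -2048/2047 (12 bpc);
     * pixel_max is 1023 or 4095; hev/fm are the masks computed earlier.
     * Assumes arithmetic right shift for negative values, as psraw does. */
    static void short_filter(int16_t *p1, int16_t *p0, int16_t *q0, int16_t *q1,
                             int hev, int fm, int clip_lo, int clip_hi, int pixel_max)
    {
        if (!fm) return;
        int f = hev ? iclip(*p1 - *q1, clip_lo, clip_hi) : 0;
        f = iclip(3 * (*q0 - *p0) + f, clip_lo, clip_hi);
        const int f1 = iclip(f + 4, clip_lo, clip_hi) >> 3;
        const int f2 = iclip(f + 3, clip_lo, clip_hi) >> 3;
        *p0 = (int16_t)iclip(*p0 + f2, 0, pixel_max);
        *q0 = (int16_t)iclip(*q0 - f1, 0, pixel_max);
        if (!hev) {
            const int f3 = (f1 + 1) >> 1;  /* the pmulhrsw-by-16384 step */
            *p1 = (int16_t)iclip(*p1 + f3, 0, pixel_max);
            *q1 = (int16_t)iclip(*q1 - f3, 0, pixel_max);
        }
    }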
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm
index d6b296b19ef..84696c758ae 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx2.asm
@@ -1444,7 +1444,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
cmp byte [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call .v4
.end:
add lq, 32
@@ -1453,6 +1453,10 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
sub wd, 8
jg .loop
RET
+ALIGN function_align
+.v4:
+ FILTER 4, v
+ ret
INIT_YMM avx2
cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
@@ -1481,7 +1485,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
cmp byte [maskq+0], 0 ; vmask[0]
je .no_filter
- FILTER 4, h
+ call .h4
jmp .end
.no_filter:
@@ -1493,6 +1497,10 @@ cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
sub hd, 8
jg .loop
RET
+ALIGN function_align
+.h4:
+ FILTER 4, h
+ ret
INIT_YMM avx2
cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
@@ -1515,7 +1523,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
cmp byte [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4
.end:
add lq, 32
@@ -1545,7 +1553,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
cmp byte [maskq+0], 0 ; vmask[0]
je .no_filter
- FILTER 4, h
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4
jmp .end
.no_filter:
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
index c09dced418b..0218b624d3c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
@@ -80,25 +80,24 @@ SECTION .text
punpckhwd m%1, m%3
kmovw k1, k6
lea t0, [dstq+strideq*4]
- vpscatterdd [dstq+m29-2]{k1}, m%4
+ vpscatterdd [dstq+m19-2]{k1}, m%4
kmovw k1, k6
lea t1, [dstq+strideq*8]
- vpscatterdd [t0 +m29-2]{k1}, m%5
+ vpscatterdd [t0 +m19-2]{k1}, m%5
kmovw k1, k6
lea t2, [t0 +strideq*8]
- vpscatterdd [t1 +m29-2]{k1}, m%2
+ vpscatterdd [t1 +m19-2]{k1}, m%2
kmovw k1, k6
- vpscatterdd [t2 +m29-2]{k1}, m%1
+ vpscatterdd [t2 +m19-2]{k1}, m%1
%endmacro
%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
- SWAP m16, m15
+ SWAP m16, m22
%endif
- ; input in m0-15
- punpcklbw m15, m0, m1
- punpckhbw m0, m1
- punpcklbw m1, m2, m3
+ punpcklbw m22, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m2, m3
punpckhbw m2, m3
punpcklbw m3, m4, m5
punpckhbw m4, m5
@@ -108,21 +107,21 @@ SECTION .text
punpckhbw m8, m9
punpcklbw m9, m10, m11
punpckhbw m10, m11
- punpcklbw m11, m12, m13
- punpckhbw m12, m13
+ punpcklbw m11, m25, m13
+ punpckhbw m25, m13
%if %1 == 0
SWAP m13, m16
%else
mova m13, %3
%endif
- SWAP m16, m12
- punpcklbw m12, m14, m13
+ SWAP m16, m25
+ punpcklbw m25, m14, m13
punpckhbw m13, m14, m13
- ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
- punpcklwd m14, m15, m1
- punpckhwd m15, m1
- punpcklwd m1, m0, m2
- punpckhwd m0, m2
+ ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+ punpcklwd m14, m22, m26
+ punpckhwd m22, m26
+ punpcklwd m26, m24, m2
+ punpckhwd m24, m2
punpcklwd m2, m3, m5
punpckhwd m3, m5
punpcklwd m5, m4, m6
@@ -131,58 +130,58 @@ SECTION .text
punpckhwd m7, m9
punpcklwd m9, m8, m10
punpckhwd m8, m10
- punpcklwd m10, m11, m12
- punpckhwd m11, m12
- SWAP m12, m16, m11
- punpcklwd m11, m12, m13
- punpckhwd m12, m13
- ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+ punpcklwd m10, m11, m25
+ punpckhwd m11, m25
+ SWAP m25, m16, m11
+ punpcklwd m11, m25, m13
+ punpckhwd m25, m13
+ ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
punpckldq m13, m14, m2
punpckhdq m14, m2
- punpckldq m2, m15, m3
- punpckhdq m15, m3
- punpckldq m3, m1, m5
- punpckhdq m1, m5
- punpckldq m5, m0, m4
- punpckhdq m0, m4
+ punpckldq m2, m22, m3
+ punpckhdq m22, m3
+ punpckldq m3, m26, m5
+ punpckhdq m26, m5
+ punpckldq m5, m24, m4
+ punpckhdq m24, m4
punpckldq m4, m6, m10
punpckhdq m6, m10
punpckldq m10, m9, m11
punpckhdq m9, m11
- punpckldq m11, m8, m12
- punpckhdq m8, m12
- SWAP m12, m16, m8
- punpckldq m8, m7, m12
- punpckhdq m7, m12
- ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
- punpcklqdq m12, m13, m4
+ punpckldq m11, m8, m25
+ punpckhdq m8, m25
+ SWAP m25, m16, m8
+ punpckldq m8, m7, m25
+ punpckhdq m7, m25
+ ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m25, m13, m4
punpckhqdq m13, m4
punpcklqdq m4, m14, m6
punpckhqdq m14, m6
punpcklqdq m6, m2, m8
punpckhqdq m2, m8
- punpcklqdq m8, m15, m7
- punpckhqdq m15, m7
+ punpcklqdq m8, m22, m7
+ punpckhqdq m22, m7
punpcklqdq m7, m3, m10
punpckhqdq m3, m10
- punpcklqdq m10, m1, m9
- punpckhqdq m1, m9
+ punpcklqdq m10, m26, m9
+ punpckhqdq m26, m9
punpcklqdq m9, m5, m11
punpckhqdq m5, m11
SWAP m11, m16
%if %2 == 0
- SWAP m16, m12
+ SWAP m16, m25
%else
- mova %3, m12
+ mova %3, m25
%endif
- punpcklqdq m12, m0, m11
- punpckhqdq m0, m11
+ punpcklqdq m25, m24, m11
+ punpckhqdq m24, m11
%if %2 == 0
SWAP m11, m16
%endif
- ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
- SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
- SWAP 3, 14, 12, 9
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
+ SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+ SWAP 3, 14, 25, 9
%endmacro
%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
@@ -205,7 +204,7 @@ SECTION .text
%endif
lea t0, [dstq+mstrideq*4]
%if %1 != 6
- mova m12, [t0 +strideq*0]
+ mova m25, [t0 +strideq*0]
%endif
mova m13, [t0 +strideq*1]
mova m3, [t0 +strideq*2]
@@ -214,13 +213,13 @@ SECTION .text
mova m6, [dstq+strideq*1]
mova m14, [dstq+strideq*2]
%if %1 != 6
- mova m15, [dstq+stride3q ]
+ mova m22, [dstq+stride3q ]
%endif
%if %1 == 16
lea t0, [dstq+strideq*4]
- mova m19, [t0 +strideq*0]
- mova m20, [t0 +strideq*1]
- mova m21, [t0 +strideq*2]
+ mova m29, [t0 +strideq*0]
+ mova m30, [t0 +strideq*1]
+ mova m31, [t0 +strideq*2]
%endif
%endif
%else ; h
@@ -230,15 +229,15 @@ SECTION .text
vbroadcasti32x4 m0, [hshuf4]
kmovw k1, k6
lea t0, [dstq+strideq*4]
- vpgatherdd m3{k1}, [dstq+m29-2]
+ vpgatherdd m3{k1}, [dstq+m19-2]
kmovw k1, k6
lea t1, [dstq+strideq*8]
- vpgatherdd m4{k1}, [t0 +m29-2]
+ vpgatherdd m4{k1}, [t0 +m19-2]
kmovw k1, k6
lea t2, [t0 +strideq*8]
- vpgatherdd m5{k1}, [t1 +m29-2]
+ vpgatherdd m5{k1}, [t1 +m19-2]
kmovw k1, k6
- vpgatherdd m6{k1}, [t2 +m29-2]
+ vpgatherdd m6{k1}, [t2 +m19-2]
pshufb m3, m0
pshufb m4, m0
pshufb m5, m0
@@ -257,16 +256,16 @@ SECTION .text
%elif %1 == 6 || %1 == 8
kmovb k1, k7
lea t0, [dstq+strideq*1]
- vpgatherdq m3{k1}, [dstq+ym31-%1/2]
+ vpgatherdq m3{k1}, [dstq+ym21-%1/2]
kmovb k1, k7
lea t1, [dstq+strideq*2]
- vpgatherdq m4{k1}, [t0 +ym31-%1/2]
+ vpgatherdq m4{k1}, [t0 +ym21-%1/2]
kmovb k1, k7
lea t2, [dstq+stride3q ]
- vpgatherdq m5{k1}, [t1 +ym31-%1/2]
+ vpgatherdq m5{k1}, [t1 +ym21-%1/2]
kmovb k1, k7
- vextracti32x8 ym0, m31, 1
- vpgatherdq m6{k1}, [t2 +ym31-%1/2]
+ vextracti32x8 ym0, m21, 1
+ vpgatherdq m6{k1}, [t2 +ym21-%1/2]
kmovb k1, k7
vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
kmovb k1, k7
@@ -344,7 +343,7 @@ SECTION .text
punpckhqdq m13, m5, m13
%if %1 == 8
punpcklqdq m5, m7, m12
- punpckhqdq m12, m7, m12
+ punpckhqdq m25, m7, m12
; xm3: A0-15
; xm14: B0-15
; xm15: C0-15
@@ -352,10 +351,11 @@ SECTION .text
; xm4: E0-15
; xm13: F0-15
; xm5: G0-15
- ; xm12: H0-15
- SWAP 12, 3, 15
+ ; xm25: H0-15
+ SWAP 25, 3, 15
SWAP 13, 14, 5, 4, 6
- ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+ SWAP 15, 22
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
%else
SWAP 13, 3, 14
SWAP 6, 4, 15, 5
@@ -364,8 +364,8 @@ SECTION .text
%else ; 16, h
; load and 16x16 transpose. We only use 14 pixels but we'll need the
; remainder at the end for the second transpose
- movu xm0, [dstq+strideq*0-8]
- movu xm1, [dstq+strideq*1-8]
+ movu xm24, [dstq+strideq*0-8]
+ movu xm26, [dstq+strideq*1-8]
movu xm2, [dstq+strideq*2-8]
movu xm3, [dstq+stride3q -8]
lea t0, [dstq+strideq*4]
@@ -379,13 +379,13 @@ SECTION .text
movu xm10, [t0 +strideq*2-8]
movu xm11, [t0 +stride3q -8]
lea t0, [t0 +strideq*4]
- movu xm12, [t0 +strideq*0-8]
+ movu xm25, [t0 +strideq*0-8]
movu xm13, [t0 +strideq*1-8]
movu xm14, [t0 +strideq*2-8]
- movu xm15, [t0 +stride3q -8]
+ movu xm22, [t0 +stride3q -8]
lea t0, [t0 +strideq*4]
- vinserti32x4 ym0, [t0 +strideq*0-8], 1
- vinserti32x4 ym1, [t0 +strideq*1-8], 1
+ vinserti32x4 ym24, [t0 +strideq*0-8], 1
+ vinserti32x4 ym26, [t0 +strideq*1-8], 1
vinserti32x4 ym2, [t0 +strideq*2-8], 1
vinserti32x4 ym3, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
@@ -399,13 +399,13 @@ SECTION .text
vinserti32x4 ym10, [t0 +strideq*2-8], 1
vinserti32x4 ym11, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
- vinserti32x4 ym12, [t0 +strideq*0-8], 1
+ vinserti32x4 ym25, [t0 +strideq*0-8], 1
vinserti32x4 ym13, [t0 +strideq*1-8], 1
vinserti32x4 ym14, [t0 +strideq*2-8], 1
- vinserti32x4 ym15, [t0 +stride3q -8], 1
+ vinserti32x4 ym22, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
- vinserti32x4 m0, [t0 +strideq*0-8], 2
- vinserti32x4 m1, [t0 +strideq*1-8], 2
+ vinserti32x4 m24, [t0 +strideq*0-8], 2
+ vinserti32x4 m26, [t0 +strideq*1-8], 2
vinserti32x4 m2, [t0 +strideq*2-8], 2
vinserti32x4 m3, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
@@ -419,13 +419,13 @@ SECTION .text
vinserti32x4 m10, [t0 +strideq*2-8], 2
vinserti32x4 m11, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
- vinserti32x4 m12, [t0 +strideq*0-8], 2
+ vinserti32x4 m25, [t0 +strideq*0-8], 2
vinserti32x4 m13, [t0 +strideq*1-8], 2
vinserti32x4 m14, [t0 +strideq*2-8], 2
- vinserti32x4 m15, [t0 +stride3q -8], 2
+ vinserti32x4 m22, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
- vinserti32x4 m0, [t0 +strideq*0-8], 3
- vinserti32x4 m1, [t0 +strideq*1-8], 3
+ vinserti32x4 m24, [t0 +strideq*0-8], 3
+ vinserti32x4 m26, [t0 +strideq*1-8], 3
vinserti32x4 m2, [t0 +strideq*2-8], 3
vinserti32x4 m3, [t0 +stride3q -8], 3
lea t0, [t0 +strideq*4]
@@ -439,41 +439,38 @@ SECTION .text
vinserti32x4 m10, [t0 +strideq*2-8], 3
vinserti32x4 m11, [t0 +stride3q -8], 3
lea t0, [t0 +strideq*4]
- vinserti32x4 m12, [t0 +strideq*0-8], 3
+ vinserti32x4 m25, [t0 +strideq*0-8], 3
vinserti32x4 m13, [t0 +strideq*1-8], 3
vinserti32x4 m14, [t0 +strideq*2-8], 3
- vinserti32x4 m15, [t0 +stride3q -8], 3
+ vinserti32x4 m22, [t0 +stride3q -8], 3
;
TRANSPOSE_16X16B 0, 1, [rsp+0*64]
- SWAP m16, m1
+ SWAP m16, m26
SWAP m17, m2
SWAP m18, m3
- SWAP m19, m12
- SWAP m20, m13
- SWAP m21, m14
- mova [rsp+4*64], m15
- ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
- SWAP 12, 4, 7
+ SWAP m29, m25
+ SWAP m30, m13
+ SWAP m31, m14
+ mova [rsp+4*64], m22
+ ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+ SWAP 25, 4, 7
SWAP 13, 5, 8
SWAP 3, 6, 9
SWAP 10, 14
- SWAP 11, 15
+ SWAP 11, 22
%endif
%endif
; load L/E/I/H
-%if is_uv
- SWAP m22, m15
-%endif
- vpbroadcastd m22, [pb_1]
+ vpbroadcastd m15, [pb_1]
%ifidn %2, v
movu m1, [lq]
movu m0, [lq+l_strideq]
%else
kmovw k1, k6
- vpgatherdd m0{k1}, [lq+m30+4]
+ vpgatherdd m0{k1}, [lq+m20+4]
kmovw k1, k6
- vpgatherdd m1{k1}, [lq+m30+0]
+ vpgatherdd m1{k1}, [lq+m20+0]
%endif
pxor m2, m2
pcmpeqb k1, m0, m2
@@ -484,7 +481,7 @@ SECTION .text
pand m2, [pb_63]{bcstd}
vpbroadcastb m1, [lutq+136]
pminub m2, m1
- pmaxub m2, m22 ; I
+ pmaxub m2, m15 ; I
pand m1, m0, [pb_240]{bcstd}
psrlq m1, 4 ; H
paddd m0, [pb_2]{bcstd}
@@ -500,7 +497,7 @@ SECTION .text
ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
pmaxub m9, m8
%else
- ABSSUB m9, m12, m4, m10 ; abs(p3-p0)
+ ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
pmaxub m9, m8
ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
pmaxub m9, m10
@@ -508,17 +505,17 @@ SECTION .text
ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
pmaxub m9, m10
%if %1 != 6
- ABSSUB m10, m5, m15, m11 ; abs(q3-q0)
+ ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
pmaxub m9, m10
%endif
- vpcmpub k2{k3}, m9, m22, 2 ; le ; flat8in
+ vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
%if %1 == 6
ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
%else
- ABSSUB m10, m12, m13, m11 ; abs(p3-p2)
+ ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
pmaxub m10, m11
- ABSSUB m11, m14, m15, m1 ; abs(q3-q2)
+ ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
pmaxub m10, m11
%endif
ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
@@ -526,16 +523,10 @@ SECTION .text
%if %1 == 16
vpbroadcastd m11, [maskq+8]
por m11, [maskq+4]{bcstd}
- pand m11, pbmask
%else
- %if !is_h || %1 == 6
- pand m11, pbmask, [maskq+4]{bcstd}
- %else
vpbroadcastd m11, [maskq+4]
- pand m11, pbmask
- %endif
%endif
- pcmpeqd k4, m11, pbmask
+ vptestmd k4, m11, pbmask
vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
pmaxub m8, m10
%endif
@@ -554,77 +545,58 @@ SECTION .text
pmaxub m1, m2
ABSSUB m2, m18, m4, m10
pmaxub m1, m2
- ABSSUB m2, m19, m5, m10
+ ABSSUB m2, m29, m5, m10
pmaxub m1, m2
- ABSSUB m2, m20, m5, m10
+ ABSSUB m2, m30, m5, m10
pmaxub m1, m2
- ABSSUB m2, m21, m5, m10
+ ABSSUB m2, m31, m5, m10
pmaxub m1, m2
- ;
- vpcmpub k4, m1, m22, 2 ; flat8out
- kandq k4, k4, k2 ; flat8in & flat8out
-
+ kandq k2, k2, k3
+ vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
vpbroadcastd m2, [maskq+8]
- pand m10, m2, pbmask
- pcmpeqd k5, m10, pbmask
+ vptestmd k5, m2, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k4, k4, k5 ; flat16
- kandq k4, k3, k4 ; flat16 & fm
+ vptestmb k4{k4}, m7, m7 ; flat16 & fm
por m10, m2, [maskq+4]{bcstd}
- pand m2, m10, pbmask
- pcmpeqd k5, m2, pbmask
+ vptestmd k5, m10, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k2, k2, k5 ; flat8in
- kandq k2, k3, k2
+ vptestmb k2{k2}, m7, m7 ; flat8in
por m2, m10, [maskq+0]{bcstd}
- pand m2, pbmask
- pcmpeqd k5, m2, pbmask
+ vptestmd k5, m2, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k3, k3, k5
+ vptestmb k3{k3}, m7, m7
kandnq k3, k2, k3 ; fm & !flat8 & !flat16
kandnq k2, k4, k2 ; flat8 & !flat16
%elif %1 != 4
vpbroadcastd m0, [maskq+4]
- pand m2, m0, pbmask
- pcmpeqd k4, m2, pbmask
+ vptestmd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k2, k2, k4
+ vptestmb k2{k2}, m7, m7
kandq k2, k2, k3 ; flat8 & fm
por m0, [maskq+0]{bcstd}
- pand m0, pbmask
- pcmpeqd k4, m0, pbmask
+ vptestmd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k3, k3, k4
+ vptestmb k3{k3}, m7, m7
kandnq k3, k2, k3 ; fm & !flat8
%else
%ifidn %2, v
- pand m0, pbmask, [maskq+0]{bcstd}
+ vptestmd k4, pbmask, [maskq+0]{bcstd}
%else
vpbroadcastd m0, [maskq+0]
- pand m0, pbmask
+ vptestmd k4, m0, pbmask
%endif
- pcmpeqd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k3, k3, k4 ; fm
+ vptestmb k3{k3}, m7, m7 ; fm
%endif
; short filter
-%if is_uv
- SWAP m23, m22
- SWAP m24, m0
- SWAP m25, m12
- SWAP m26, m1
+%if %1 >= 8
+ SWAP m23, m15
%endif
- vpbroadcastd m23, [pb_3]
- vpbroadcastd m24, [pb_4]
- vpbroadcastd m25, [pb_16]
- vpbroadcastd m26, [pb_64]
+ vpbroadcastd m15, [pb_3]
+ vpbroadcastd m0, [pb_4]
+ vpbroadcastd m12, [pb_16]
+ vpbroadcastd m1, [pb_64]
pxor m3, pb128
pxor m6, pb128
psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
@@ -634,16 +606,16 @@ SECTION .text
paddsb m10, m11
paddsb m10, m11
paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
- paddsb m8, m10, m23
- paddsb m10, m24
+ paddsb m8, m10, m15
+ paddsb m10, m0
pand m8, [pb_248]{bcstd}
pand m10, [pb_248]{bcstd}
psrlq m8, 3
psrlq m10, 3
- pxor m8, m25
- pxor m10, m25
- psubb m8, m25 ; f2
- psubb m10, m25 ; f1
+ pxor m8, m12
+ pxor m10, m12
+ psubb m8, m12 ; f2
+ psubb m10, m12 ; f1
paddsb m4, m8
psubsb m5, m10
pxor m4, pb128
@@ -652,7 +624,7 @@ SECTION .text
pxor m10, pb128
pxor m8, m8
pavgb m8, m10 ; f=(f1+1)>>1
- psubb m8, m26
+ psubb m8, m1
knotq k1, k1
paddsb m3{k1}, m3, m8
psubsb m6{k1}, m6, m8
@@ -664,40 +636,40 @@ SECTION .text
%ifidn %2, v
lea t0, [dstq+mstrideq*8]
%endif
- SWAP m0, m16, m14
- SWAP m2, m17, m15
+ SWAP m24, m16, m14
+ SWAP m2, m17, m22
SWAP m7, m18
; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
; write -6
- vpbroadcastd m26, [pb_7_1]
- vpbroadcastd m25, [pb_2]
- punpcklbw m14, m0, m12
- punpckhbw m15, m0, m12
- pmaddubsw m10, m14, m26
- pmaddubsw m11, m15, m26 ; p6*7+p3
+ vpbroadcastd m1, [pb_7_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m14, m24, m25
+ punpckhbw m22, m24, m25
+ pmaddubsw m10, m14, m1
+ pmaddubsw m11, m22, m1 ; p6*7+p3
punpcklbw m8, m2, m7
punpckhbw m9, m2, m7
- pmaddubsw m8, m25
- pmaddubsw m9, m25
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3
%ifidn %2, h
vpbroadcastd m27, [pw_2048]
- vpbroadcastd m26, [pb_m1_1]
+ vpbroadcastd m1, [pb_m1_1]
%define pw2048 m27
- %define pbm1_1 m26
+ %define pbm1_1 m1
%endif
punpcklbw m8, m13, m3
punpckhbw m9, m13, m3
- pmaddubsw m8, m22
- pmaddubsw m9, m22
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
punpcklbw m8, m4, m5
punpckhbw m9, m4, m5
- pmaddubsw m8, m22
- pmaddubsw m9, m22
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
pmulhrsw m8, m10, pw2048
@@ -713,17 +685,17 @@ SECTION .text
; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
; write -5
pmaddubsw m14, pbm1_1
- pmaddubsw m15, pbm1_1
+ pmaddubsw m22, pbm1_1
paddw m10, m14
- paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
- punpcklbw m8, m0, m6
- punpckhbw m9, m0, m6
+ paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m24, m6
+ punpckhbw m9, m24, m6
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
SWAP m18, m8
- SWAP m22, m9
+ SWAP m23, m9
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
@@ -737,8 +709,8 @@ SECTION .text
; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
; write -4
SWAP m14, m16
- punpcklbw m8, m0, m13
- punpckhbw m9, m0, m13
+ punpcklbw m8, m24, m13
+ punpckhbw m9, m24, m13
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -756,21 +728,21 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
%else
- vpblendmb m8{k4}, m12, m8
+ vpblendmb m8{k4}, m25, m8
mova [rsp+3*64], m8
%endif
; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
; write -3
- SWAP m15, m17
- punpcklbw m8, m0, m3
- punpckhbw m9, m0, m3
+ SWAP m22, m17
+ punpcklbw m8, m24, m3
+ punpckhbw m9, m24, m3
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
- punpcklbw m8, m7, m15
- punpckhbw m7, m15
+ punpcklbw m8, m7, m22
+ punpckhbw m7, m22
pmaddubsw m8, pbm1_1
pmaddubsw m7, pbm1_1
paddw m10, m8
@@ -779,69 +751,69 @@ SECTION .text
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
- vpblendmb m23{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+ vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
; write -2
%ifidn %2, v
lea t0, [dstq+strideq*4]
%endif
- punpcklbw m8, m0, m4
- punpckhbw m9, m0, m4
+ punpcklbw m8, m24, m4
+ punpckhbw m9, m24, m4
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
- punpcklbw m8, m12, m19
- punpckhbw m9, m12, m19
- SWAP m1, m19
+ punpcklbw m8, m25, m29
+ punpckhbw m9, m25, m29
+ SWAP m26, m29
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
- SWAP m19, m8
- SWAP m24, m9
+ SWAP m29, m8
+ SWAP m0, m9
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
- vpblendmb m25{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+ vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
; write -1
%ifidn %2, h
- SWAP m28, m0
+ SWAP m28, m24
punpcklbw m8, m28, m5
- punpckhbw m0, m28, m5
+ punpckhbw m24, m28, m5
%else
- punpcklbw m8, m0, m5
- punpckhbw m0, m5
+ punpcklbw m8, m24, m5
+ punpckhbw m24, m5
%endif
pmaddubsw m8, pbm1_1
- pmaddubsw m0, pbm1_1
+ pmaddubsw m24, pbm1_1
paddw m10, m8
- paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
- punpcklbw m0, m13, m20
- punpckhbw m9, m13, m20
+ paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m24, m13, m30
+ punpckhbw m9, m13, m30
%ifidn %2, h
- SWAP m27, m20
+ SWAP m27, m30
%endif
- SWAP m13, m23
- pmaddubsw m0, pbm1_1
+ SWAP m13, m15
+ pmaddubsw m24, pbm1_1
pmaddubsw m9, pbm1_1
- paddw m10, m0
+ paddw m10, m24
paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
- SWAP m20, m0
- SWAP m23, m9
+ SWAP m30, m24
+ SWAP m15, m9
%ifidn %2, h
- SWAP m9, m0
+ SWAP m9, m24
%define pw2048 m9
%endif
- pmulhrsw m0, m10, pw2048
+ pmulhrsw m24, m10, pw2048
pmulhrsw m8, m11, pw2048
paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
- paddw m11, m22
- packuswb m0, m8
- punpcklbw m8, m3, m21
+ paddw m11, m23
+ packuswb m24, m8
+ punpcklbw m8, m3, m31
pmaddubsw m8, pbm1_1
paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
SWAP m18, m8
@@ -851,34 +823,34 @@ SECTION .text
SWAP m16, m9
%define pw2048 m16
%endif
- punpckhbw m9, m3, m21
- SWAP m3, m25
+ punpckhbw m9, m3, m31
+ SWAP m3, m12
pmaddubsw m9, pbm1_1
paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
- SWAP m22, m9
+ SWAP m23, m9
pmulhrsw m9, m11, pw2048
paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
%ifidn %2, h
- SWAP m2, m26
+ SWAP m2, m1
%define pbm1_1 m2
%endif
- vpblendmb m26{k4}, m4, m0 ; don't clobber p0/m4 since we need it in H
+ vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H
; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
; write +0
- SWAP m0, m21 ; q6
+ SWAP m24, m31 ; q6
packuswb m8, m9
%ifidn %2, h
- SWAP m21, m2
- %define pbm1_1 m21
+ SWAP m31, m2
+ %define pbm1_1 m31
%endif
- vpblendmb m25{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+ vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
; write +1
- punpcklbw m8, m4, m0
- punpckhbw m2, m4, m0
- SWAP m4, m26
+ punpcklbw m8, m4, m24
+ punpckhbw m2, m4, m24
+ SWAP m4, m1
pmaddubsw m8, pbm1_1
pmaddubsw m2, pbm1_1
paddw m10, m8
@@ -892,9 +864,9 @@ SECTION .text
; write +2
paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
paddw m11, m7
- punpcklbw m8, m5, m0
- punpckhbw m9, m5, m0
- SWAP m5, m25
+ punpcklbw m8, m5, m24
+ punpckhbw m9, m5, m24
+ SWAP m5, m12
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -906,10 +878,10 @@ SECTION .text
; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
; write +3
- paddw m10, m19 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
- paddw m11, m24
- punpcklbw m8, m6, m0
- punpckhbw m9, m6, m0
+ paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ paddw m11, m0
+ punpcklbw m8, m6, m24
+ punpckhbw m9, m6, m24
SWAP 2, 6
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
@@ -921,20 +893,20 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+mstrideq]{k4}, m8
%else
- SWAP m19, m16
- %define pw2048 m19
- vpblendmb m16{k4}, m15, m8
+ SWAP m29, m16
+ %define pw2048 m29
+ vpblendmb m16{k4}, m22, m8
%endif
; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
; write +4
- paddw m10, m20 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
- paddw m11, m23
+ paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ paddw m11, m15
%ifidn %2, h
- SWAP m23, m8
+ SWAP m15, m8
%endif
- punpcklbw m8, m14, m0
- punpckhbw m9, m14, m0
+ punpcklbw m8, m14, m24
+ punpckhbw m9, m14, m24
SWAP 14, 7
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
@@ -946,16 +918,16 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
%else
- vpblendmb m17{k4}, m1, m8
+ vpblendmb m17{k4}, m26, m8
%endif
; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
; write +5
paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
- paddw m11, m22
- punpcklbw m8, m15, m0
- punpckhbw m9, m15, m0
- SWAP m20, m0
+ paddw m11, m23
+ punpcklbw m8, m22, m24
+ punpckhbw m9, m22, m24
+ SWAP m30, m24
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -979,26 +951,26 @@ SECTION .text
vpbroadcastd m9, [pb_3_1]
vpbroadcastd m10, [pb_2_1]
%if %1 == 16
- vpbroadcastd m22, [pb_1]
- vpbroadcastd m24, [pb_4]
+ vpbroadcastd m23, [pb_1]
+ vpbroadcastd m0, [pb_4]
%elifidn %2, h
- vpbroadcastd m21, [pb_m1_1]
- %define pbm1_1 m21
+ vpbroadcastd m31, [pb_m1_1]
+ %define pbm1_1 m31
%endif
- punpcklbw m0, m12, m3
- punpckhbw m1, m12, m3
- pmaddubsw m2, m0, m9
- pmaddubsw m7, m1, m9 ; 3 * p3 + p1
+ punpcklbw m24, m25, m3
+ punpckhbw m26, m25, m3
+ pmaddubsw m2, m24, m9
+ pmaddubsw m7, m26, m9 ; 3 * p3 + p1
punpcklbw m8, m13, m4
punpckhbw m11, m13, m4
pmaddubsw m8, m10
pmaddubsw m11, m10
paddw m2, m8
paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
- punpcklbw m8, m5, m24
- punpckhbw m11, m5, m24
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ punpcklbw m8, m5, m0
+ punpckhbw m11, m5, m0
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
paddw m2, m8
paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
psrlw m8, m2, 3
@@ -1015,8 +987,8 @@ SECTION .text
%endif
%endif
- pmaddubsw m8, m0, pbm1_1
- pmaddubsw m11, m1, pbm1_1
+ pmaddubsw m8, m24, pbm1_1
+ pmaddubsw m11, m26, pbm1_1
paddw m2, m8
paddw m7, m11
punpcklbw m8, m13, m6
@@ -1035,14 +1007,14 @@ SECTION .text
SWAP m18, m8
%endif
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- psubw m2, m0
- psubw m7, m1
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
punpcklbw m8, m4, m14
punpckhbw m11, m4, m14
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
paddw m2, m8
paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
psrlw m8, m2, 3
@@ -1052,19 +1024,19 @@ SECTION .text
%ifidn %2, v
mova [t0+stride3q], m8
%else
- SWAP m19, m8
+ SWAP m29, m8
%endif
- punpcklbw m0, m5, m15
- punpckhbw m1, m5, m15
- pmaddubsw m8, m0, m22
- pmaddubsw m11, m1, m22
+ punpcklbw m24, m5, m22
+ punpckhbw m26, m5, m22
+ pmaddubsw m8, m24, m23
+ pmaddubsw m11, m26, m23
paddw m2, m8
paddw m7, m11
- punpcklbw m8, m4, m12
- punpckhbw m11, m4, m12
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ punpcklbw m8, m4, m25
+ punpckhbw m11, m4, m25
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
psubw m2, m8
psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
psrlw m8, m2, 3
@@ -1075,10 +1047,10 @@ SECTION .text
mova [dstq+strideq*0], m11
%endif
- pmaddubsw m0, pbm1_1
- pmaddubsw m1, pbm1_1
- paddw m2, m0
- paddw m7, m1
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m26, pbm1_1
+ paddw m2, m24
+ paddw m7, m26
punpcklbw m8, m13, m6
punpckhbw m13, m6
pmaddubsw m8, pbm1_1
@@ -1093,18 +1065,18 @@ SECTION .text
mova [dstq+strideq*1], m13
%endif
- punpcklbw m0, m3, m6
- punpckhbw m1, m3, m6
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- psubw m2, m0
- psubw m7, m1
- punpcklbw m0, m14, m15
- punpckhbw m1, m14, m15
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- paddw m2, m0
- paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
+ punpcklbw m24, m3, m6
+ punpckhbw m26, m3, m6
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m24, m14, m22
+ punpckhbw m26, m14, m22
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ paddw m2, m24
+ paddw m7, m26 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
psrlw m2, 3
psrlw m7, 3
packuswb m2, m7
@@ -1120,36 +1092,36 @@ SECTION .text
%endif
%ifidn %2, h
- SWAP m0, m18
- SWAP m1, m19
+ SWAP m24, m18
+ SWAP m26, m29
%if %1 == 8
; 16x8 transpose
- punpcklbw m3, m12, m10
- punpckhbw m12, m10
- punpcklbw m10, m0, m1
- punpckhbw m0, m1
- punpcklbw m1, m11, m13
+ punpcklbw m3, m25, m10
+ punpckhbw m25, m10
+ punpcklbw m10, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m11, m13
punpckhbw m11, m13
- punpcklbw m13, m2, m15
- punpckhbw m2, m15
+ punpcklbw m13, m2, m22
+ punpckhbw m2, m22
;
- punpcklwd m15, m3, m10
+ punpcklwd m22, m3, m10
punpckhwd m3, m10
- punpcklwd m10, m12, m0
- punpckhwd m12, m0
- punpcklwd m0, m1, m13
- punpckhwd m1, m13
+ punpcklwd m10, m25, m24
+ punpckhwd m25, m24
+ punpcklwd m24, m26, m13
+ punpckhwd m26, m13
punpcklwd m13, m11, m2
punpckhwd m11, m2
;
- punpckldq m2, m15, m0
- punpckhdq m15, m0
- punpckldq m0, m3, m1
- punpckhdq m3, m1
- punpckldq m1, m10, m13
+ punpckldq m2, m22, m24
+ punpckhdq m22, m24
+ punpckldq m24, m3, m26
+ punpckhdq m3, m26
+ punpckldq m26, m10, m13
punpckhdq m10, m13
- punpckldq m13, m12, m11
- punpckhdq m12, m11
+ punpckldq m13, m25, m11
+ punpckhdq m25, m11
; write 8x32
vpbroadcastd ym16, strided
pmulld ym16, [hmulD]
@@ -1162,8 +1134,8 @@ SECTION .text
kmovb k3, k6
kmovb k4, k6
vpscatterdq [dstq+ym16-4]{k1}, m2
- vpscatterdq [t1 +ym16-4]{k2}, m15
- vpscatterdq [t2 +ym16-4]{k3}, m0
+ vpscatterdq [t1 +ym16-4]{k2}, m22
+ vpscatterdq [t2 +ym16-4]{k3}, m24
vpscatterdq [t3 +ym16-4]{k4}, m3
lea t1, [t0+strideq*2]
lea t2, [t0+strideq*4]
@@ -1172,29 +1144,29 @@ SECTION .text
kmovb k2, k6
kmovb k3, k6
kmovb k4, k6
- vpscatterdq [t0+ym16-4]{k1}, m1
+ vpscatterdq [t0+ym16-4]{k1}, m26
vpscatterdq [t1+ym16-4]{k2}, m10
vpscatterdq [t2+ym16-4]{k3}, m13
- vpscatterdq [t3+ym16-4]{k4}, m12
+ vpscatterdq [t3+ym16-4]{k4}, m25
%else
; 16x16 transpose and store
SWAP 5, 10, 2
- SWAP 6, 0
- SWAP 7, 1
+ SWAP 6, 24
+ SWAP 7, 26
SWAP 8, 11
SWAP 9, 13
- mova m0, [rsp+0*64]
- SWAP m1, m28
+ mova m24, [rsp+0*64]
+ SWAP m26, m28
mova m2, [rsp+1*64]
mova m3, [rsp+2*64]
mova m4, [rsp+3*64]
SWAP m11, m16
- SWAP m12, m17
+ SWAP m25, m17
SWAP m13, m27
- SWAP m14, m20
+ SWAP m14, m30
TRANSPOSE_16X16B 1, 0, [rsp+4*64]
- movu [dstq+strideq*0-8], xm0
- movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*0-8], xm24
+ movu [dstq+strideq*1-8], xm26
movu [dstq+strideq*2-8], xm2
movu [dstq+stride3q -8], xm3
lea t0, [dstq+strideq*4]
@@ -1208,13 +1180,13 @@ SECTION .text
movu [t0+strideq*2-8], xm10
movu [t0+stride3q -8], xm11
lea t0, [t0+strideq*4]
- movu [t0+strideq*0-8], xm12
+ movu [t0+strideq*0-8], xm25
movu [t0+strideq*1-8], xm13
movu [t0+strideq*2-8], xm14
- movu [t0+stride3q -8], xm15
+ movu [t0+stride3q -8], xm22
lea t0, [t0+strideq*4]
- vextracti128 [t0+strideq*0-8], ym0, 1
- vextracti128 [t0+strideq*1-8], ym1, 1
+ vextracti128 [t0+strideq*0-8], ym24, 1
+ vextracti128 [t0+strideq*1-8], ym26, 1
vextracti128 [t0+strideq*2-8], ym2, 1
vextracti128 [t0+stride3q -8], ym3, 1
lea t0, [t0+strideq*4]
@@ -1228,13 +1200,13 @@ SECTION .text
vextracti128 [t0+strideq*2-8], ym10, 1
vextracti128 [t0+stride3q -8], ym11, 1
lea t0, [t0+strideq*4]
- vextracti128 [t0+strideq*0-8], ym12, 1
+ vextracti128 [t0+strideq*0-8], ym25, 1
vextracti128 [t0+strideq*1-8], ym13, 1
vextracti128 [t0+strideq*2-8], ym14, 1
- vextracti128 [t0+stride3q -8], ym15, 1
+ vextracti128 [t0+stride3q -8], ym22, 1
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m0, 2
- vextracti32x4 [t0+strideq*1-8], m1, 2
+ vextracti32x4 [t0+strideq*0-8], m24, 2
+ vextracti32x4 [t0+strideq*1-8], m26, 2
vextracti32x4 [t0+strideq*2-8], m2, 2
vextracti32x4 [t0+stride3q -8], m3, 2
lea t0, [t0+strideq*4]
@@ -1248,13 +1220,13 @@ SECTION .text
vextracti32x4 [t0+strideq*2-8], m10, 2
vextracti32x4 [t0+stride3q -8], m11, 2
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m12, 2
+ vextracti32x4 [t0+strideq*0-8], m25, 2
vextracti32x4 [t0+strideq*1-8], m13, 2
vextracti32x4 [t0+strideq*2-8], m14, 2
- vextracti32x4 [t0+stride3q -8], m15, 2
+ vextracti32x4 [t0+stride3q -8], m22, 2
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m0, 3
- vextracti32x4 [t0+strideq*1-8], m1, 3
+ vextracti32x4 [t0+strideq*0-8], m24, 3
+ vextracti32x4 [t0+strideq*1-8], m26, 3
vextracti32x4 [t0+strideq*2-8], m2, 3
vextracti32x4 [t0+stride3q -8], m3, 3
lea t0, [t0+strideq*4]
@@ -1268,19 +1240,15 @@ SECTION .text
vextracti32x4 [t0+strideq*2-8], m10, 3
vextracti32x4 [t0+stride3q -8], m11, 3
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m12, 3
+ vextracti32x4 [t0+strideq*0-8], m25, 3
vextracti32x4 [t0+strideq*1-8], m13, 3
vextracti32x4 [t0+strideq*2-8], m14, 3
- vextracti32x4 [t0+stride3q -8], m15, 3
+ vextracti32x4 [t0+stride3q -8], m22, 3
%endif
%endif
%elif %1 == 6
; flat6 filter
- SWAP m15, m23
- SWAP m0, m24
- SWAP m12, m25
- SWAP m1, m26
vpbroadcastd m15, [pb_3_1]
vpbroadcastd m12, [pb_2]
punpcklbw m8, m13, m5
@@ -1381,17 +1349,16 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
mov mstrideq, strideq
neg mstrideq
lea stride3q, [strideq*3]
- mova m31, [pb_4x0_4x4_4x8_4x12]
- mova m30, [pb_mask]
- vpbroadcastd m29, [pb_128]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
vpbroadcastd m28, [pb_m1_1]
vpbroadcastd m27, [pw_2048]
- %define pbshuf m31
- %define pbmask m30
- %define pb128 m29
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
%define pbm1_1 m28
%define pw2048 m27
- %define is_uv 0
.loop:
cmp word [maskq+8], 0 ; vmask[2]
@@ -1411,7 +1378,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call .v4
.end:
add lq, 64
@@ -1420,6 +1387,11 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
sub wd, 16
jg .loop
RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+ FILTER 4, v
+ ret
cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
lut, h, stride3, stride8
@@ -1429,11 +1401,11 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
lea stride3q, [strideq*3]
lea stride8q, [strideq*8]
kxnorw k6, k6, k6
- vpbroadcastd m29, strided
- vpbroadcastd m30, l_strided
- pmulld m31, m29, [hmulA]
- pmulld m30, m30, [hmulB]
- pmulld m29, m29, [hmulC]
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
%define pbshuf [pb_4x0_4x4_4x8_4x12]
%define pbmask [pb_mask]
%define pb128 [pb_128]{bcstd}
@@ -1457,7 +1429,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, h
+ call .h4
.end:
lea lq, [lq+l_strideq*8]
@@ -1466,9 +1438,13 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
sub hd, 16
jg .loop
RET
+ALIGN function_align
RESET_MM_PERMUTATION
+.h4:
+ FILTER 4, h
+ ret
-cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
lut, w, stride3, mstride
DECLARE_REG_TMP 9
shl l_strideq, 2
@@ -1476,16 +1452,15 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
mov mstrideq, strideq
neg mstrideq
lea stride3q, [strideq*3]
- mova m20, [pb_4x0_4x4_4x8_4x12]
- mova m19, [pb_mask]
- vpbroadcastd m18, [pb_128]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
vpbroadcastd m17, [pb_m1_1]
vpbroadcastd m16, [pw_4096]
- %define pbshuf m20
- %define pbmask m19
- %define pb128 m18
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
%define pbm1_1 m17
- %define is_uv 1
.loop:
cmp word [maskq+4], 0 ; vmask[1]
@@ -1498,7 +1473,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4
.end:
add lq, 64
@@ -1525,17 +1500,14 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
vpbroadcastd m19, strided
vpbroadcastd m20, l_strided
pmulld m21, m19, [hmulA]
- pmulld m20, m20, [hmulB]
- pmulld m19, m19, [hmulC]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
mova m18, [pb_mask]
vpbroadcastd m17, [pb_128]
vpbroadcastd m16, [pw_4096]
%define pbshuf [pb_4x0_4x4_4x8_4x12]
%define pbmask m18
%define pb128 m17
- %xdefine m31 m21
- %xdefine m30 m20
- %xdefine m29 m19
add l_strideq, l_strideq
.loop:
@@ -1549,7 +1521,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, h
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
.end:
lea lq, [lq+l_strideq*8]
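The recurring change in the loopfilter hunks above is the mask expansion: the old pand/pcmpeqd/vpmovm2d/vpmovb2m/kandq sequence that turns per-block vmask bits into byte-granular k-masks is replaced by vptestmd plus a merge-masked vptestmb, which is two instructions shorter per expansion and keeps the final AND inside the mask unit. The following C-intrinsics sketch is an illustration only (it is not code from this patch) and assumes pb_mask holds a single bit per dword lane, which is what makes the old equality test and the new nonzero test interchangeable:

    /* illustration only -- not part of the patch */
    #include <immintrin.h>

    /* before: (vmask & bit) == bit per dword, widen to bytes, then AND into k */
    static __mmask64 expand_old(__mmask64 k, __m512i vmask, __m512i bit) {
        const __mmask16 kd = _mm512_cmpeq_epi32_mask(_mm512_and_si512(vmask, bit), bit);
        const __m512i bytes = _mm512_movm_epi32(kd);            /* 0 / -1 per dword */
        return _kand_mask64(k, _mm512_movepi8_mask(bytes));
    }

    /* after: vptestmd yields the dword mask directly, and the AND with the
     * previous k happens inside the merge-masked vptestmb */
    static __mmask64 expand_new(__mmask64 k, __m512i vmask, __m512i bit) {
        const __mmask16 kd = _mm512_test_epi32_mask(vmask, bit); /* (vmask & bit) != 0 */
        const __m512i bytes = _mm512_movm_epi32(kd);
        return _mm512_mask_test_epi8_mask(k, bytes, bytes);
    }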
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h
new file mode 100644
index 00000000000..de23be8866c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#include "common/intops.h"
+
+#define decl_wiener_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \
+decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext))
+
+#define decl_sgr_filter_fns(ext) \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \
+decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext))
+
+decl_wiener_filter_fns(sse2);
+decl_wiener_filter_fns(ssse3);
+decl_wiener_filter_fns(avx2);
+decl_wiener_filter_fns(avx512icl);
+decl_sgr_filter_fns(ssse3);
+decl_sgr_filter_fns(avx2);
+decl_sgr_filter_fns(avx512icl);
+
+static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+#if BITDEPTH == 8
+ c->wiener[0] = BF(dav1d_wiener_filter7, sse2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, sse2);
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+ c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
+ c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
+ }
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
+ }
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+ c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl);
+#if BITDEPTH == 8
+ /* With VNNI we don't need a 5-tap version. */
+ c->wiener[1] = c->wiener[0];
+#else
+ c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl);
+#endif
+ if (BITDEPTH == 8 || bpc == 10) {
+ c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl);
+ c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl);
+ c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl);
+ }
+#endif
+}
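Like the other *_init.c-to-header conversions in this change, this file turns the former exported init function into a static ALWAYS_INLINE helper that the bitdepth-templated init is expected to include and call directly. A minimal sketch of the intended call site, assuming the usual dav1d template layout; names not present in this diff (wiener_c, sgr_*_c) are placeholders:

    /* sketch of src/looprestoration_tmpl.c (not part of this diff) */
    #include "src/looprestoration.h"

    #if HAVE_ASM && ARCH_X86
    #include "src/x86/looprestoration.h"   /* the header added above */
    #endif

    COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
                                                     const int bpc)
    {
        c->wiener[0] = c->wiener[1] = wiener_c;            /* placeholder C fallbacks */
        c->sgr[0] = sgr_5x5_c;                             /* placeholder C fallbacks */
        c->sgr[1] = sgr_3x3_c;
        c->sgr[2] = sgr_mix_c;

    #if HAVE_ASM && ARCH_X86
        /* inlined here rather than being a separate cross-TU
         * dav1d_loop_restoration_dsp_init_x86() call as before */
        loop_restoration_dsp_init_x86(c, bpc);
    #endif
    }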
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm
index 5669ce66d8f..1e571774caf 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/looprestoration_avx512.asm
@@ -329,11 +329,11 @@ ALIGN function_align
packuswb m2, m4
psrlw m2, 8
vpackuswb m2{k2}, m3, m5
- mova [dstq+r10], m2
- add r10, 64
- jl .hv_loop
- mov t6, t5
- mov t5, t4
+ movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
+ add r10, 64 ; function is used for chroma as well, and in some
+ jl .hv_loop ; esoteric edge cases chroma dst pointers may only
+ mov t6, t5 ; have a 32-byte alignment despite having a width
+ mov t5, t4 ; larger than 32, so use an unaligned store here.
mov t4, t3
mov t3, t2
mov t2, t1
@@ -379,7 +379,7 @@ ALIGN function_align
packuswb m0, m2
psrlw m0, 8
vpackuswb m0{k2}, m1, m3
- mova [dstq+r10], m0
+ movu [dstq+r10], m0
add r10, 64
jl .v_loop
mov t6, t5
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc.h b/chromium/third_party/dav1d/libdav1d/src/x86/mc.h
new file mode 100644
index 00000000000..65c607e180c
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2018-2021, VideoLAN and dav1d authors
+ * Copyright © 2018-2021, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/mc.h"
+
+#define decl_fn(type, name) \
+ decl_##type##_fn(BF(name, sse2)); \
+ decl_##type##_fn(BF(name, ssse3)); \
+ decl_##type##_fn(BF(name, avx2)); \
+ decl_##type##_fn(BF(name, avx512icl));
+#define init_mc_fn(type, name, suffix) \
+ c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+ c->mct[type] = BF(dav1d_prep_##name, suffix)
+#define init_mc_scaled_fn(type, name, suffix) \
+ c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_scaled_fn(type, name, suffix) \
+ c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
+
+decl_fn(mc, dav1d_put_8tap_regular);
+decl_fn(mc, dav1d_put_8tap_regular_smooth);
+decl_fn(mc, dav1d_put_8tap_regular_sharp);
+decl_fn(mc, dav1d_put_8tap_smooth);
+decl_fn(mc, dav1d_put_8tap_smooth_regular);
+decl_fn(mc, dav1d_put_8tap_smooth_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp);
+decl_fn(mc, dav1d_put_8tap_sharp_regular);
+decl_fn(mc, dav1d_put_8tap_sharp_smooth);
+decl_fn(mc, dav1d_put_bilin);
+
+decl_fn(mct, dav1d_prep_8tap_regular);
+decl_fn(mct, dav1d_prep_8tap_regular_smooth);
+decl_fn(mct, dav1d_prep_8tap_regular_sharp);
+decl_fn(mct, dav1d_prep_8tap_smooth);
+decl_fn(mct, dav1d_prep_8tap_smooth_regular);
+decl_fn(mct, dav1d_prep_8tap_smooth_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp);
+decl_fn(mct, dav1d_prep_8tap_sharp_regular);
+decl_fn(mct, dav1d_prep_8tap_sharp_smooth);
+decl_fn(mct, dav1d_prep_bilin);
+
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular);
+decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth);
+decl_fn(mc_scaled, dav1d_put_bilin_scaled);
+
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular);
+decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth);
+decl_fn(mct_scaled, dav1d_prep_bilin_scaled);
+
+decl_fn(avg, dav1d_avg);
+decl_fn(w_avg, dav1d_w_avg);
+decl_fn(mask, dav1d_mask);
+decl_fn(w_mask, dav1d_w_mask_420);
+decl_fn(w_mask, dav1d_w_mask_422);
+decl_fn(w_mask, dav1d_w_mask_444);
+decl_fn(blend, dav1d_blend);
+decl_fn(blend_dir, dav1d_blend_v);
+decl_fn(blend_dir, dav1d_blend_h);
+
+decl_fn(warp8x8, dav1d_warp_affine_8x8);
+decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
+decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
+decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
+
+decl_fn(emu_edge, dav1d_emu_edge);
+
+decl_fn(resize, dav1d_resize);
+
+static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
+ return;
+
+#if BITDEPTH == 8
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
+
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
+#endif
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+
+ c->avg = BF(dav1d_avg, ssse3);
+ c->w_avg = BF(dav1d_w_avg, ssse3);
+ c->mask = BF(dav1d_mask, ssse3);
+ c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
+ c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
+ c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
+ c->blend = BF(dav1d_blend, ssse3);
+ c->blend_v = BF(dav1d_blend_v, ssse3);
+ c->blend_h = BF(dav1d_blend_h, ssse3);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
+ c->emu_edge = BF(dav1d_emu_edge, ssse3);
+ c->resize = BF(dav1d_resize, ssse3);
+
+ if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
+ return;
+
+#if BITDEPTH == 8
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
+#endif
+
+#if ARCH_X86_64
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
+ return;
+
+ init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
+
+ c->avg = BF(dav1d_avg, avx2);
+ c->w_avg = BF(dav1d_w_avg, avx2);
+ c->mask = BF(dav1d_mask, avx2);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
+ c->blend = BF(dav1d_blend, avx2);
+ c->blend_v = BF(dav1d_blend_v, avx2);
+ c->blend_h = BF(dav1d_blend_h, avx2);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
+ c->emu_edge = BF(dav1d_emu_edge, avx2);
+ c->resize = BF(dav1d_resize, avx2);
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
+ return;
+
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
+ init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
+
+ c->avg = BF(dav1d_avg, avx512icl);
+ c->w_avg = BF(dav1d_w_avg, avx512icl);
+ c->mask = BF(dav1d_mask, avx512icl);
+ c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
+ c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
+ c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
+ c->blend = BF(dav1d_blend, avx512icl);
+ c->blend_v = BF(dav1d_blend_v, avx512icl);
+ c->blend_h = BF(dav1d_blend_h, avx512icl);
+ c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
+ c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
+ c->resize = BF(dav1d_resize, avx512icl);
+#endif
+}
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm
index e83b18ad969..585ba53e080 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm
@@ -1604,7 +1604,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
vpbroadcastd m11, [buf+ 4]
vpbroadcastd m12, [buf+ 8]
vpbroadcastd m13, [buf+12]
- cmp wd, 16
+ sub wd, 16
je .h_w16
jg .h_w32
.h_w8:
@@ -3615,32 +3615,32 @@ ALIGN function_align
.w4:
movq [dstq ], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq ], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq ], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm0, ym1, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ vextracti32x4 xm0, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq ], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
@@ -3860,33 +3860,33 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb m3, m15, m3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
mova [maskq], xm3
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8:
@@ -4090,32 +4090,32 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
@@ -4249,32 +4249,32 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm
index eb3ca1c427d..7897f1decc1 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc_avx512.asm
@@ -449,9 +449,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshufb ym0, ym4
pmaddubsw ym0, ym5
pmulhrsw ym0, ym3
- vpmovuswb xmm0, ym0
- movq [dstq+dsq*0], xmm0
- movhps [dstq+dsq*1], xmm0
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
@@ -755,9 +755,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pmulhw ym1, ym6
paddw ym1, ym2
pmulhrsw ym1, ym7
- vpmovuswb xmm1, ym1
- movq [dstq+dsq*0], xmm1
- movhps [dstq+dsq*1], xmm1
+ vpmovuswb xm1, ym1
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
@@ -1588,13 +1588,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_w4_loop
RET
.h_w8:
- movu xmm0, [srcq+ssq*0]
- vinserti32x4 ym0, ymm0, [srcq+ssq*1], 1
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
- vpmovuswb xmm0, ym0
- movq [dstq+dsq*0], xmm0
- movhps [dstq+dsq*1], xmm0
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
@@ -3308,17 +3308,17 @@ ALIGN function_align
cmp hd, 8
jg .w4_h16
WRAP_YMM %1 0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq ], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_ret
lea dstq, [dstq+strideq*4]
pextrd [dstq ], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_ret:
RET
.w4_h16:
@@ -3332,29 +3332,29 @@ ALIGN function_align
cmp hd, 4
jne .w8_h8
WRAP_YMM %1 0
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq ], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
%1_INC_PTR 2
lea dstq, [dstq+strideq*4]
.w8_h8:
%1 0
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq ], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq ], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3415,8 +3415,8 @@ ALIGN function_align
paddw m0, [tmp2q+(%1+0)*mmsize]
mova m1, [tmp1q+(%1+1)*mmsize]
paddw m1, [tmp2q+(%1+1)*mmsize]
- pmulhrsw m0, m2
- pmulhrsw m1, m2
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
packuswb m0, m1
%endmacro
@@ -3425,13 +3425,13 @@ ALIGN function_align
add tmp2q, %1*mmsize
%endmacro
-cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
lea r6, [avg_avx512icl_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, dword [r6+wq*4]
- vpbroadcastd m2, [base+pw_1024]
+ vpbroadcastd m4, [base+pw_1024]
add wq, r6
BIDIR_FN AVG
@@ -3573,17 +3573,17 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym5, [wm_420_perm4+32], 1
vpermb ym4, ym5, ym4
vpdpbusd ym8, ym4, ym9
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
vpermb ym8, ym10, ym8
movq [maskq], xm8
@@ -3609,11 +3609,11 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpdpbusd ym8, ym4, ym9
vpermb m8, m10, m8
mova [maskq], xm8
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3627,18 +3627,18 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpdpbusd m1, m4, m9
vpermb m1, m10, m1
mova [maskq], xm1
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3766,17 +3766,17 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
movhps xm10, [wm_422_mask+16]
vpdpwssd ym8, ym4, ym9
vpermb ym8, ym10, ym8
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
pand xm8, xm11
mova [maskq], xm8
@@ -3801,11 +3801,11 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb ym8, ym10, ym8
pand xm8, xm11
mova [maskq], xm8
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3819,18 +3819,18 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb m1, m10, m1
pand ym1, ym11
mova [maskq], ym1
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3936,17 +3936,17 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym8, [wm_444_mask+32], 1
vpermb ym4, ym8, ym4
mova [maskq], ym4
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
RET
.w4_h16:
@@ -3965,11 +3965,11 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym8, [wm_444_mask+32], 1
vpermb ym4, ym8, ym4
mova [maskq], ym4
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3980,18 +3980,18 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
W_MASK 0, 4, 0, 1, 1
vpermb m4, m8, m4
mova [maskq], m4
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac.h b/chromium/third_party/dav1d/libdav1d/src/x86/msac.h
index e11cd08c8a4..0bb632fb314 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/msac.h
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/msac.h
@@ -28,21 +28,21 @@
#ifndef DAV1D_SRC_X86_MSAC_H
#define DAV1D_SRC_X86_MSAC_H
+#include "src/cpu.h"
+
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
-/* Needed for checkasm */
-unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-
#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
@@ -55,10 +55,21 @@ unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
#if ARCH_X86_64
#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
+
+static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+ }
+
+ if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+ }
+}
+
#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#endif
-void dav1d_msac_init_x86(MsacContext *const s);
-
#endif /* DAV1D_SRC_X86_MSAC_H */
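With dav1d_msac_init_x86() removed (see the deleted msac_init.c below), the runtime selection of symbol_adapt16 is expected to happen inline in the generic MSAC initializer that includes this header. A minimal sketch of that call site, assuming the initializer keeps its current dav1d_msac_init() shape; the body is elided:

    /* sketch of the call site in src/msac.c (not part of this diff) */
    #include "src/msac.h"   /* pulls in src/x86/msac.h on x86 builds */

    void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
                         const size_t sz, const int disable_cdf_update_flag)
    {
        /* ... regular buffer/window setup ... */
    #if HAVE_ASM && ARCH_X86_64
        msac_init_x86(s);   /* picks the SSE2 or AVX2 symbol_adapt16 at run time */
    #endif
    }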
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c b/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c
deleted file mode 100644
index a634da27c4e..00000000000
--- a/chromium/third_party/dav1d/libdav1d/src/x86/msac_init.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright © 2020, VideoLAN and dav1d authors
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "src/cpu.h"
-#include "src/msac.h"
-#include "src/x86/msac.h"
-
-#if ARCH_X86_64
-void dav1d_msac_init_x86(MsacContext *const s) {
- const unsigned flags = dav1d_get_cpu_flags();
-
- if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
- s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
- }
-
- if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
- s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
- }
-}
-#endif
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c b/chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h
index e3575ba4da7..de4124c436e 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/refmvs_init.c
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/refmvs.h
@@ -32,7 +32,7 @@ decl_splat_mv_fn(dav1d_splat_mv_sse2);
decl_splat_mv_fn(dav1d_splat_mv_avx2);
decl_splat_mv_fn(dav1d_splat_mv_avx512icl);
-COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
+static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;