Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm')
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm  912
1 file changed, 912 insertions, 0 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm
new file mode 100644
index 00000000000..b7bc3aa106f
--- /dev/null
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter16_avx512.asm
@@ -0,0 +1,912 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+
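+; the l_shuf_v bytes occupy the even dwords of this block; the odd
+; dwords double as the pw_* constants (movsldup only picks up the even
+; dwords when the shuffle index is loaded)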
+l_shuf_v: times 2 db 0, 32
+pw_1: times 2 dw 1
+ times 2 db 4, 36
+pw_3: times 2 dw 3
+ times 2 db 8, 40
+pw_4: times 2 dw 4
+ times 2 db 12, 44
+pw_16: times 2 dw 16
+ times 2 db 16, 48
+pw_4096: times 2 dw 4096
+ times 2 db 20, 52
+pw_16384: times 2 dw 16384
+ times 2 db 24, 56
+pw_32767: times 2 dw 32767
+ times 2 db 28, 60
+ times 2 dw 0
+filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128
+stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25
+l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1
+clip_max: dw 511, 511, 2047, 2047
+clip_min: dw -512, -512, -2048, -2048
+
+SECTION .text
+
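+; in-register 8x8 word transpose: three rounds of punpckl/punpckh at
+; word, dword and qword granularity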
+%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp
+ punpckhwd m%9, m%5, m%6
+ punpcklwd m%5, m%6
+ punpckhwd m%6, m%1, m%2
+ punpcklwd m%1, m%2
+ punpckhwd m%2, m%7, m%8
+ punpcklwd m%7, m%8
+ punpckhwd m%8, m%3, m%4
+ punpcklwd m%3, m%4
+ punpckhdq m%4, m%1, m%3
+ punpckldq m%1, m%3
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%5, m%7
+ punpckhdq m%7, m%6, m%8
+ punpckldq m%6, m%8
+ punpckldq m%8, m%9, m%2
+ punpckhdq m%9, m%2
+ punpckhqdq m%2, m%1, m%3
+ punpcklqdq m%1, m%3
+ punpcklqdq m%3, m%4, m%5
+ punpckhqdq m%4, m%5
+ punpcklqdq m%5, m%6, m%8
+ punpckhqdq m%6, m%8
+ punpckhqdq m%8, m%7, m%9
+ punpcklqdq m%7, m%9
+%endmacro
+
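+; FILTER performs one pass of the loop filter for a group of edges:
+; load the p/q pixels (transposing or gathering them first in the
+; horizontal case), derive the E/I/H thresholds from the per-block L
+; values, build the fm/flat8/flat16 decision masks, then apply the
+; short filter plus the wider flat filters under those masks and store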
+%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%ifidn %2, v
+%if %1 == 16
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1 ] ; p6
+ mova m1, [tmpq+strideq*2 ] ; p5
+ mova m2, [tmpq+stride3q ] ; p4
+ mova m3, [tmpq+strideq*4 ] ; p3
+ mova m4, [tmpq+stride5q ] ; p2
+%elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+%if %1 == 8
+ mova m3, [tmpq+strideq*0 ] ; p3
+%endif
+ mova m4, [tmpq+strideq*1 ] ; p2
+%endif
+ mova m5, [dstq+mstrideq*2] ; p1
+ mova m6, [dstq+mstrideq*1] ; p0
+ mova m7, [dstq+strideq*0 ] ; q0
+ mova m8, [dstq+strideq*1 ] ; q1
+%if %1 != 4
+ mova m9, [dstq+strideq*2 ] ; q2
+%endif
+%if %1 == 8 || %1 == 16
+ mova m10, [dstq+stride3q ] ; q3
+%endif
+%if %1 == 16
+ mova m11, [dstq+strideq*4 ] ; q4
+ mova m22, [dstq+stride5q ] ; q5
+ mova m23, [dstq+stride3q*2] ; q6
+%endif
+%else ; h
+%if %1 == 16
+ movu ym16, [dstq+strideq*0 -16]
+ movu ym17, [dstq+strideq*1 -16]
+ movu ym18, [dstq+strideq*2 -16]
+ movu ym19, [dstq+stride3q -16]
+ movu ym20, [dstq+strideq*4 -16]
+ movu ym22, [dstq+stride5q -16]
+ movu ym23, [dstq+stride3q*2-16]
+ movu ym28, [dstq+stride7q -16]
+ lea tmpq, [dstq+strideq*8 -16]
+ vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m10, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1
+ vinserti32x8 m22, m22, [tmpq+stride5q ], 1
+ vinserti32x8 m23, m23, [tmpq+stride3q*2], 1
+ vinserti32x8 m28, m28, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8]
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27
+ movu ym16, [tmpq+strideq*0 ]
+ movu ym17, [tmpq+strideq*1 ]
+ movu ym18, [tmpq+strideq*2 ]
+ movu ym19, [tmpq+stride3q ]
+ movu ym24, [tmpq+strideq*4 ]
+ movu ym25, [tmpq+stride5q ]
+ movu ym26, [tmpq+stride3q*2]
+ movu ym20, [tmpq+stride7q ]
+ lea tmpq, [tmpq+strideq*8]
+ vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1
+ vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1
+ vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1
+ vinserti32x8 m3, m19, [tmpq+stride3q ], 1
+ vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1
+ vinserti32x8 m5, m25, [tmpq+stride5q ], 1
+ vinserti32x8 m6, m26, [tmpq+stride3q*2], 1
+ vinserti32x8 m20, m20, [tmpq+stride7q ], 1
+ TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27
+ vshufi32x4 m27, m7, m0, q2020
+ vshufi32x4 m7, m0, q3131
+ vshufi32x4 m0, m8, m1, q2020
+ vshufi32x4 m8, m1, q3131
+ vshufi32x4 m1, m9, m2, q2020
+ vshufi32x4 m9, m2, q3131
+ vshufi32x4 m2, m10, m3, q2020
+ vshufi32x4 m10, m3, q3131
+ vshufi32x4 m3, m11, m4, q2020
+ vshufi32x4 m11, m4, q3131
+ vshufi32x4 m4, m22, m5, q2020
+ vshufi32x4 m22, m5, q3131
+ vshufi32x4 m5, m23, m6, q2020
+ vshufi32x4 m23, m6, q3131
+ vshufi32x4 m6, m28, m20, q2020
+ vshufi32x4 m28, m20, q3131
+%elif %1 == 6 || %1 == 8
+%if %1 == 8
+ sub dstq, 8
+ movu xm16, [dstq+strideq*0 ]
+ movu xm17, [dstq+strideq*1 ]
+ movu xm18, [dstq+strideq*2 ]
+ movu xm19, [dstq+stride3q ]
+ movu xm24, [dstq+strideq*4 ]
+ movu xm25, [dstq+stride5q ]
+ movu xm26, [dstq+stride3q*2]
+ movu xm27, [dstq+stride7q ]
+ lea tmpq, [dstq+strideq*8 ]
+ vinserti128 ym16, [tmpq+strideq*0 ], 1
+ vinserti128 ym17, [tmpq+strideq*1 ], 1
+ vinserti128 ym18, [tmpq+strideq*2 ], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ vinserti128 ym24, [tmpq+strideq*4 ], 1
+ vinserti128 ym25, [tmpq+stride5q ], 1
+ vinserti128 ym26, [tmpq+stride3q*2], 1
+ vinserti128 ym27, [tmpq+stride7q ], 1
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2
+ vinserti32x4 m9, m25, [tmpq+stride5q ], 2
+ vinserti32x4 m3, m26, [tmpq+stride3q*2], 2
+ vinserti32x4 m4, m27, [tmpq+stride7q ], 2
+ lea tmpq, [tmpq+strideq*8 ]
+ vinserti32x4 m10, [tmpq+strideq*0 ], 3
+ vinserti32x4 m8, [tmpq+strideq*1 ], 3
+ vinserti32x4 m5, [tmpq+strideq*2 ], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ vinserti32x4 m2, [tmpq+strideq*4 ], 3
+ vinserti32x4 m9, [tmpq+stride5q ], 3
+ vinserti32x4 m3, [tmpq+stride3q*2], 3
+ vinserti32x4 m4, [tmpq+stride7q ], 3
+%else ; %1 == 6
+ movu xm16, [dstq+strideq*0-8]
+ movu xm17, [dstq+strideq*1-8]
+ movu xm18, [dstq+strideq*2-8]
+ movu xm19, [dstq+stride3q -8]
+ lea tmpq, [dstq+strideq*4-8]
+ movu xm2, [tmpq+strideq*0]
+ movu xm9, [tmpq+strideq*1]
+ movu xm3, [tmpq+strideq*2]
+ movu xm4, [tmpq+stride3q ]
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym16, [tmpq+strideq*0], 1
+ vinserti128 ym17, [tmpq+strideq*1], 1
+ vinserti128 ym18, [tmpq+strideq*2], 1
+ vinserti128 ym19, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti128 ym2, [tmpq+strideq*0], 1
+ vinserti128 ym9, [tmpq+strideq*1], 1
+ vinserti128 ym3, [tmpq+strideq*2], 1
+ vinserti128 ym4, [tmpq+stride3q ], 1
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, m16, [tmpq+strideq*0], 2
+ vinserti32x4 m8, m17, [tmpq+strideq*1], 2
+ vinserti32x4 m5, m18, [tmpq+strideq*2], 2
+ vinserti32x4 m7, m19, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 2
+ vinserti32x4 m9, [tmpq+strideq*1], 2
+ vinserti32x4 m3, [tmpq+strideq*2], 2
+ vinserti32x4 m4, [tmpq+stride3q ], 2
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m10, [tmpq+strideq*0], 3
+ vinserti32x4 m8, [tmpq+strideq*1], 3
+ vinserti32x4 m5, [tmpq+strideq*2], 3
+ vinserti32x4 m7, [tmpq+stride3q ], 3
+ lea tmpq, [tmpq+strideq*4]
+ vinserti32x4 m2, [tmpq+strideq*0], 3
+ vinserti32x4 m9, [tmpq+strideq*1], 3
+ vinserti32x4 m3, [tmpq+strideq*2], 3
+ vinserti32x4 m4, [tmpq+stride3q ], 3
+%endif
+ punpcklwd m6, m10, m8
+ punpckhwd m10, m8
+ punpcklwd m8, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m2, m9
+ punpckhwd m2, m9
+ punpcklwd m9, m3, m4
+ punpckhwd m3, m4
+ punpckldq m4, m6, m8
+ punpckhdq m6, m8
+ punpckldq m8, m10, m5
+ punpckhdq m10, m5
+ punpckldq m5, m7, m9
+ punpckhdq m7, m9
+ punpckldq m9, m2, m3
+ punpckhdq m2, m3
+%if %1 == 8
+ punpcklqdq m3, m4, m5
+%endif
+ punpckhqdq m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m9
+ punpckhqdq m8, m9
+ punpcklqdq m9, m10, m2
+%if %1 == 8
+ punpckhqdq m10, m2
+%endif
+%else ; %1 == 4
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdq m7{k1}, [dstq+ym12-4]
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpgatherdq m4{k2}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpgatherdq m5{k1}, [tmpq+ym12]
+ lea tmpq, [tmpq+strideq*2]
+ vpgatherdq m6{k2}, [tmpq+ym12]
+ punpcklwd m8, m7, m4
+ punpckhwd m7, m4
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m6, m8, m7
+ punpckhwd m8, m7
+ punpcklwd m7, m4, m5
+ punpckhwd m4, m5
+ punpcklqdq m5, m6, m7
+ punpckhqdq m6, m7
+ punpcklqdq m7, m8, m4
+ punpckhqdq m8, m4
+%endif
+%endif
+
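+ ; the per-edge filter level L (from the l[] plane) is turned into the
+ ; three thresholds: H = L>>4 (high edge variance), I (inner limit,
+ ; clamped by the sharpness lut) and E (edge limit); all three are then
+ ; scaled by m13 for the current bitdepth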
+ ; load L/E/I/H
+%ifidn %2, v
+ movu ym16, [lq+l_strideq*1]
+ movsldup m17, [l_shuf_v]
+ vptestnmb k1, ym16, ym16
+ vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][]
+ vpermb m16, m17, m16 ; l[x][1]
+%else
+ movq xm16, [lq+l_strideq*0]
+ movq xm17, [lq+l_strideq*1]
+ vinserti128 ym16, [lq+l_strideq*2], 1
+ vinserti128 ym17, [lq+l_stride3q ], 1
+ lea tmpq, [lq+l_strideq*4]
+ vinserti32x4 m16, [tmpq+l_strideq*0], 2
+ vinserti32x4 m17, [tmpq+l_strideq*1], 2
+ vinserti32x4 m16, [tmpq+l_strideq*2], 3
+ vinserti32x4 m17, [tmpq+l_stride3q ], 3
+ punpcklqdq m16, m17
+ vbroadcasti32x4 m17, [l_shuf_h]
+ vptestnmb k1, m16, m16
+ vpalignr m16{k1}, m16, 12
+ pshufb m16, m17 ; l[x][1]
+%endif
+ vpbroadcastd m20, [pw_32767]
+ psubw m17, m5, m6 ; p1-p0
+ psubw m18, m7, m8 ; q1-q0
+ vptestmw k1, m16, m16 ; L
+ pabsw m17, m17
+ pabsw m18, m18
+ vpmaxuw m20{k1}, m17, m18
+ vpbroadcastw m17, [lutq+136]
+ psrlw m18, m16, [lutq+128]
+ vpbroadcastd m19, [pw_1]
+ pminuw m18, m17
+ psrlw m17, m16, 4 ; H
+ paddw m16, m16
+ pmaxuw m18, m19 ; I
+ vpaddd m16, [pw_4] {1to16}
+ paddw m16, m18 ; E
+ REPX {pmullw x, m13}, m17, m18, m16
+ vpcmpw k4, m20, m17, 6 ; hev
+%if %1 != 4
+ psubw m19, m4, m5 ; p2-p1
+ pabsw m19, m19
+%if %1 == 8 || %1 == 16
+ psubw m17, m3, m4 ; p3-p2
+ pabsw m17, m17
+ pmaxuw m19, m17
+ psubw m17, m9, m10 ; q3-q2
+ pabsw m17, m17
+ pmaxuw m19, m17
+%endif
+ psubw m17, m9, m8 ; q2-q1
+ pabsw m17, m17
+ pmaxuw m19, m17
+%if %1 == 16
+ vpbroadcastd ym17, [maskq+4]
+ vpord ym17, [maskq+8] {1to8}
+ vptestmd k1, ym17, ym21
+%else
+ vptestmd k1, ym21, [maskq+4] {1to8}
+%endif
+ pmaxuw m19, m20
+ psubw m17, m4, m6 ; p2-p0
+ pabsw m17, m17
+ pmaxuw m17, m20
+ vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks
+%if %1 == 8 || %1 == 16
+ psubw m19, m3, m6 ; p3-p0
+ pabsw m19, m19
+ pmaxuw m17, m19
+ psubw m19, m7, m10 ; q3-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ psubw m19, m7, m9 ; q2-q0
+ pabsw m19, m19
+ pmaxuw m17, m19
+%endif
+ vpcmpw k1, m20, m18, 2
+ psubw m18, m5, m8 ; p1-q1
+ psubw m19, m6, m7 ; p0-q0
+ pabsw m18, m18
+ pabsw m19, m19
+ psrlw m18, 1
+ paddw m19, m19
+ paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
+%if %1 != 4
+ vpcmpw k2{k1}, m17, m13, 2 ; flat8in
+%endif
+%if %1 == 16
+ psubw m20, m0, m6
+ psubw m16, m1, m6
+ pabsw m20, m20
+ psubw m17, m2, m6
+ pabsw m16, m16
+ psubw m18, m11, m7
+ pabsw m17, m17
+ psubw m19, m22, m7
+ pabsw m18, m18
+ pmaxuw m20, m16
+ psubw m16, m23, m7
+ pabsw m19, m19
+ pmaxuw m17, m18
+ pabsw m16, m16
+ vpandd ym18, ym21, [maskq+8] {1to8}
+ pmaxuw m20, m17
+ pmaxuw m19, m16
+ pcmpeqd ym16, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8
+ pmaxuw m20, m19
+ pcmpeqd ym17, ym21, ym18
+ vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8
+ vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out
+ pcmpeqd ym18, ym21
+ vptestmb k3{k3}, ym16, ym16 ; flat8 & fm
+ vptestmb k2{k2}, ym17, ym17 ; flat8in
+ vptestmb k1{k1}, ym18, ym18
+ kandnd k1, k2, k1 ; fm & !flat8 & !flat16
+ kandnd k2, k3, k2 ; flat8 & !flat16
+%elif %1 == 6 || %1 == 8
+ vpandd ym17, ym21, [maskq+4] {1to8}
+ pcmpeqd ym16, ym21, ym17
+ vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8
+ pcmpeqd ym17, ym21
+ vptestmb k2{k2}, ym16, ym16 ; flat8 & fm
+ vptestmb k1{k1}, ym17, ym17
+ kandnd k1, k2, k1 ; fm & !flat8
+%else ; %1 == 4
+ vpandd ym16, ym21, [maskq+0] {1to8}
+ pcmpeqd ym16, ym21
+ vptestmb k1{k1}, ym16, ym16
+%endif
+
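+ ; short (4-tap) filter, applied wherever fm (k1) is set: p0 += f2 and
+ ; q0 -= f1, and on non-hev edges p1/q1 additionally move by (f1+1)>>1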
+ ; short filter
+ psubw m16, m7, m6
+ vpbroadcastd m17, [pw_3]
+ paddw m18, m16, m16
+ paddw m18, m16
+ psubw m16, m5, m8 ; iclip_diff(p1-q1)
+ pminsw m16, m14
+ vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev
+ knotd k4, k4 ; !hev
+ paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f)
+ vpbroadcastd m18, [pw_4]
+ pminsw m16, m14
+ vpmaxsw m16{k1}{z}, m15 ; f&=fm
+ paddw m17, m16
+ paddw m16, m18
+ vpbroadcastd m18, [pw_16384]
+ pminsw m17, m14
+ pminsw m16, m14
+ psraw m17, 3 ; f2
+ psraw m16, 3 ; f1
+ paddw m6, m17
+ psubw m7, m16
+ vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev
+ psubw m17, m14, m15 ; 1023 or 4095
+ pxor m18, m18
+ paddw m5, m16
+ psubw m8, m16
+ REPX {pminsw x, m17}, m6, m7, m5, m8
+ REPX {pmaxsw x, m18}, m6, m7, m5, m8
+
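+; wide filters: each output is a rounded box average over the edge,
+; built incrementally by sliding the tap window; flat16 shifts a
+; 16-weight sum right by 4, flat8/flat6 round an 8-weight sum via
+; pmulhrsw with 4096 (equivalent to (x+4)>>3)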
+%if %1 == 16 ; flat16 filter
+ vpaddd m19, m0, [pw_1] {1to16}
+ paddw m16, m1, m2 ; p5+p4
+ paddw m26, m1, m6 ; p5+p0
+ paddw m24, m2, m7 ; p4+q0
+ paddw m16, m4 ; p5+p4+p3
+ paddw m17, m3, m5 ; p2+p1
+ psllw m19, 3
+ paddw m16, m26 ; p5*2+p4+p3+p0
+ paddw m17, m24 ; p4+p2+p1+q0
+ psubw m19, m0 ; p6*7+8
+ paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0
+ paddw m18, m3, m8
+ paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0
+ paddw m25, m1, m0
+ paddw m16, m0, m0
+ psrlw m1{k3}, m19, 4
+ paddw m19, m18
+ psubw m19, m16 ; +p3+q1-p6*2
+ paddw m16, m2, m0
+ psrlw m2{k3}, m19, 4
+ psubw m19, m25
+ paddw m25, m4, m9
+ paddw m20, m10, m5
+ paddw m19, m25 ; +p2+q2-p6-p5
+ paddw m17, m0, m3
+ psubw m16, m20, m16
+ psrlw m3{k3}, m19, 4
+ paddw m19, m16 ; +p1+q3-p6-p4
+ paddw m16, m11, m6
+ psubw m16, m17
+ paddw m17, m0, m4
+ psrlw m4{k3}, m19, 4
+ paddw m19, m16 ; +p0+q4-p6-p3
+ paddw m16, m22, m7
+ psubw m16, m17
+ paddw m17, m0, m5
+ psrlw m5{k3}, m19, 4
+ paddw m19, m16 ; +q0+q5-p6-p2
+ paddw m16, m23, m8
+ psrlw m6{k3}, m19, 4
+ psubw m16, m17
+ paddw m19, m16 ; +q1+q6-p6-p1
+ paddw m16, m23, m9
+ psrlw m7{k3}, m19, 4
+ psubw m16, m26
+ paddw m19, m16 ; +q2+q6-p5-p0
+ paddw m16, m23, m10
+ psrlw m8{k3}, m19, 4
+ psubw m16, m24
+ paddw m19, m16 ; +q3+q6-p4-p0
+ paddw m16, m23, m11
+ psrlw m9{k3}, m19, 4
+ psubw m16, m18
+ paddw m19, m16 ; +q4+q6-p3-q1
+ paddw m16, m23, m22
+ psrlw m10{k3}, m19, 4
+ psubw m16, m25
+ paddw m19, m16 ; +q5+q6-p2-q2
+ paddw m16, m23, m23
+ psrlw m11{k3}, m19, 4
+ psubw m16, m20
+ paddw m19, m16 ; +q6*2-p1-q3
+ psrlw m22{k3}, m19, 4
+%endif
+%if %1 == 8 || %1 == 16 ; flat8 filter
+ vpbroadcastd m20, [pw_4096]
+ paddw m16, m3, m4 ; p3+p2
+ paddw m19, m5, m6 ; p1+p0
+ paddw m17, m16, m16 ; 2*(p3+p2)
+ paddw m19, m3 ; p1+p0+p3
+ paddw m17, m7 ; 2*(p3+p2)+q0
+ paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0
+ paddw m18, m4, m7
+ pmulhrsw m4{k2}, m19, m20
+ psubw m19, m16
+ paddw m17, m5, m8
+ paddw m16, m3, m5
+ paddw m19, m17
+ pmulhrsw m5{k2}, m19, m20
+ psubw m19, m16
+ paddw m16, m6, m9
+ paddw m19, m16
+ paddw m16, m3, m6
+ pmulhrsw m6{k2}, m19, m20
+ paddw m19, m10
+ psubw m16, m7, m16
+ paddw m19, m16
+ psubw m16, m10, m18
+ pmulhrsw m7{k2}, m19, m20
+ paddw m16, m8
+ paddw m19, m16
+ psubw m16, m10, m17
+ pmulhrsw m8{k2}, m19, m20
+ paddw m16, m9
+ paddw m19, m16
+ pmulhrsw m9{k2}, m19, m20
+%elif %1 == 6 ; flat6 filter
+ vpbroadcastd m10, [pw_4096]
+ paddw m2, m5, m6
+ paddw m0, m4, m7
+ paddw m1, m2, m4 ; p2+p1+p0
+ paddw m3, m4, m4
+ paddw m1, m1
+ paddw m4, m5
+ paddw m1, m0 ; p2+2*(p2+p1+p0)+q0
+ psubw m3, m7, m3
+ pmulhrsw m5{k2}, m1, m10
+ paddw m3, m8
+ psubw m4, m8, m4
+ paddw m1, m3
+ pmulhrsw m6{k2}, m1, m10
+ paddw m4, m9
+ paddw m9, m9
+ paddw m1, m4
+ pmulhrsw m7{k2}, m1, m10
+ psubw m9, m2
+ paddw m1, m9
+ pmulhrsw m8{k2}, m1, m10
+%endif
+
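+; store the filtered pixels; in the horizontal case the registers are
+; transposed back to rows (or scattered with vpscatterdq for wd <= 6)
+; before writing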
+%ifidn %2, v
+%if %1 == 16
+ mova [tmpq+strideq*2 ], m1 ; p5
+ mova [tmpq+stride3q ], m2 ; p4
+ mova [tmpq+strideq*4 ], m3 ; p3
+ mova [tmpq+stride5q ], m4 ; p2
+%elif %1 == 8
+ mova [tmpq+strideq*1 ], m4 ; p2
+%endif
+ mova [dstq+mstrideq*2], m5 ; p1
+ mova [dstq+mstrideq ], m6 ; p0
+ mova [dstq+strideq*0 ], m7 ; q0
+ mova [dstq+strideq*1 ], m8 ; q1
+%if %1 == 8 || %1 == 16
+ mova [dstq+strideq*2 ], m9 ; q2
+%endif
+%if %1 == 16
+ mova [dstq+stride3q ], m10 ; q3
+ mova [dstq+strideq*4 ], m11 ; q4
+ mova [dstq+stride5q ], m22 ; q5
+%endif
+%else
+%if %1 == 16
+ TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20
+ TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20
+ mova [dstq+strideq*0 -16], xm27
+ mova [dstq+strideq*0 ], xm7
+ mova [dstq+strideq*1 -16], xm0
+ mova [dstq+strideq*1 ], xm8
+ mova [dstq+strideq*2 -16], xm1
+ mova [dstq+strideq*2 ], xm9
+ mova [dstq+stride3q -16], xm2
+ mova [dstq+stride3q ], xm10
+ mova [dstq+strideq*4 -16], xm3
+ mova [dstq+strideq*4 ], xm11
+ mova [dstq+stride5q -16], xm4
+ mova [dstq+stride5q ], xm22
+ mova [dstq+stride3q*2-16], xm5
+ mova [dstq+stride3q*2 ], xm23
+ mova [dstq+stride7q -16], xm6
+ mova [dstq+stride7q ], xm28
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 -16], ym27, 1
+ vextracti128 [dstq+strideq*0 ], ym7, 1
+ vextracti128 [dstq+strideq*1 -16], ym0, 1
+ vextracti128 [dstq+strideq*1 ], ym8, 1
+ vextracti128 [dstq+strideq*2 -16], ym1, 1
+ vextracti128 [dstq+strideq*2 ], ym9, 1
+ vextracti128 [dstq+stride3q -16], ym2, 1
+ vextracti128 [dstq+stride3q ], ym10, 1
+ vextracti128 [dstq+strideq*4 -16], ym3, 1
+ vextracti128 [dstq+strideq*4 ], ym11, 1
+ vextracti128 [dstq+stride5q -16], ym4, 1
+ vextracti128 [dstq+stride5q ], ym22, 1
+ vextracti128 [dstq+stride3q*2-16], ym5, 1
+ vextracti128 [dstq+stride3q*2 ], ym23, 1
+ vextracti128 [dstq+stride7q -16], ym6, 1
+ vextracti128 [dstq+stride7q ], ym28, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 2
+ vextracti32x4 [dstq+strideq*0 ], m7, 2
+ vextracti32x4 [dstq+strideq*1 -16], m0, 2
+ vextracti32x4 [dstq+strideq*1 ], m8, 2
+ vextracti32x4 [dstq+strideq*2 -16], m1, 2
+ vextracti32x4 [dstq+strideq*2 ], m9, 2
+ vextracti32x4 [dstq+stride3q -16], m2, 2
+ vextracti32x4 [dstq+stride3q ], m10, 2
+ vextracti32x4 [dstq+strideq*4 -16], m3, 2
+ vextracti32x4 [dstq+strideq*4 ], m11, 2
+ vextracti32x4 [dstq+stride5q -16], m4, 2
+ vextracti32x4 [dstq+stride5q ], m22, 2
+ vextracti32x4 [dstq+stride3q*2-16], m5, 2
+ vextracti32x4 [dstq+stride3q*2 ], m23, 2
+ vextracti32x4 [dstq+stride7q -16], m6, 2
+ vextracti32x4 [dstq+stride7q ], m28, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 -16], m27, 3
+ vextracti32x4 [dstq+strideq*0 ], m7, 3
+ vextracti32x4 [dstq+strideq*1 -16], m0, 3
+ vextracti32x4 [dstq+strideq*1 ], m8, 3
+ vextracti32x4 [dstq+strideq*2 -16], m1, 3
+ vextracti32x4 [dstq+strideq*2 ], m9, 3
+ vextracti32x4 [dstq+stride3q -16], m2, 3
+ vextracti32x4 [dstq+stride3q ], m10, 3
+ vextracti32x4 [dstq+strideq*4 -16], m3, 3
+ vextracti32x4 [dstq+strideq*4 ], m11, 3
+ vextracti32x4 [dstq+stride5q -16], m4, 3
+ vextracti32x4 [dstq+stride5q ], m22, 3
+ vextracti32x4 [dstq+stride3q*2-16], m5, 3
+ vextracti32x4 [dstq+stride3q*2 ], m23, 3
+ vextracti32x4 [dstq+stride7q -16], m6, 3
+ vextracti32x4 [dstq+stride7q ], m28, 3
+%elif %1 == 8
+ TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2
+ movu [dstq+strideq*0 ], xm3
+ movu [dstq+strideq*1 ], xm4
+ movu [dstq+strideq*2 ], xm5
+ movu [dstq+stride3q ], xm6
+ movu [dstq+strideq*4 ], xm7
+ movu [dstq+stride5q ], xm8
+ movu [dstq+stride3q*2], xm9
+ movu [dstq+stride7q ], xm10
+ lea dstq, [dstq+strideq*8]
+ vextracti128 [dstq+strideq*0 ], ym3, 1
+ vextracti128 [dstq+strideq*1 ], ym4, 1
+ vextracti128 [dstq+strideq*2 ], ym5, 1
+ vextracti128 [dstq+stride3q ], ym6, 1
+ vextracti128 [dstq+strideq*4 ], ym7, 1
+ vextracti128 [dstq+stride5q ], ym8, 1
+ vextracti128 [dstq+stride3q*2], ym9, 1
+ vextracti128 [dstq+stride7q ], ym10, 1
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 2
+ vextracti32x4 [dstq+strideq*1 ], m4, 2
+ vextracti32x4 [dstq+strideq*2 ], m5, 2
+ vextracti32x4 [dstq+stride3q ], m6, 2
+ vextracti32x4 [dstq+strideq*4 ], m7, 2
+ vextracti32x4 [dstq+stride5q ], m8, 2
+ vextracti32x4 [dstq+stride3q*2], m9, 2
+ vextracti32x4 [dstq+stride7q ], m10, 2
+ lea dstq, [dstq+strideq*8]
+ vextracti32x4 [dstq+strideq*0 ], m3, 3
+ vextracti32x4 [dstq+strideq*1 ], m4, 3
+ vextracti32x4 [dstq+strideq*2 ], m5, 3
+ vextracti32x4 [dstq+stride3q ], m6, 3
+ vextracti32x4 [dstq+strideq*4 ], m7, 3
+ vextracti32x4 [dstq+stride5q ], m8, 3
+ vextracti32x4 [dstq+stride3q*2], m9, 3
+ vextracti32x4 [dstq+stride7q ], m10, 3
+ lea dstq, [dstq+strideq*8+8]
+%else ; %1 == 4 || %1 == 6
+ punpcklwd m9, m5, m6
+ punpckhwd m5, m6
+ kxnorb k1, k1, k1
+ punpcklwd m6, m7, m8
+ punpckhwd m7, m8
+ kmovb k2, k1
+ punpckldq m8, m9, m6
+ vpscatterdq [dstq+ym12-4]{k1}, m8
+ punpckhdq m9, m6
+ lea tmpq, [dstq+strideq*2-4]
+ kmovb k1, k2
+ vpscatterdq [tmpq+ym12]{k2}, m9
+ punpckldq m6, m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ kmovb k2, k1
+ vpscatterdq [tmpq+ym12]{k1}, m6
+ punpckhdq m5, m7
+ lea tmpq, [tmpq+strideq*2]
+ vpscatterdq [tmpq+ym12]{k2}, m5
+%endif
+%endif
+%endmacro
+
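+; entry points: lpf_{v,h}_sb_y_16bpc filter luma superblock edges
+; (4/8/16-wide), lpf_{v,h}_sb_uv_16bpc filter chroma (4/6-wide); each
+; loop iteration below handles eight 4-pixel edge segments selected by
+; mask_bits and ym21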
+INIT_ZMM avx512icl
+cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \
+ lut, w, stride3, mstride, tmp, \
+ mask_bits, stride5
+%define base tmpq-filter_mask
+ SWAP 12, 26 ; avoids clobbering xmm10 on WIN64
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, v
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call .v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.v4: ; called by both luma and chroma
+ FILTER 4, v
+ ret
+
+cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \
+ lut, h, stride3, l_stride3, tmp, \
+ mask_bits, stride5, stride7
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ lea stride3q, [strideq*3]
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ lea stride5q, [strideq*5]
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride7q, [strideq+stride3q*2]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+8], mask_bitsd ; vmask[2]
+ jz .no_flat16
+ FILTER 16, h
+ jmp .end
+.no_flat16:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 8, h
+ jmp .end2
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .no_filter
+ call .h4
+.no_filter:
+ lea dstq, [dstq+stride3q*8]
+.end:
+ lea dstq, [dstq+strideq*8]
+.end2:
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+ALIGN function_align
+.h4: ; called by both luma and chroma
+ FILTER 4, h
+ ret
+
+cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ shl l_strideq, 2
+ lea stride3q, [strideq*3]
+ shr r6d, 11 ; is_12bpc
+ mova ym21, [base+filter_mask]
+ mov mstrideq, strideq
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ sub lq, l_strideq
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ neg mstrideq
+ mov wd, wm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, v
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4
+.end:
+ shl mask_bitsd, 8
+ add dstq, 64
+ pslld ym21, 8
+ add lq, 32
+ sub wd, 8
+ jg .loop
+ RET
+
+cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+ lea tmpq, [filter_mask]
+ mov r6d, r7m ; bitdepth_max
+ vpbroadcastd ym12, strided
+ shl l_strideq, 2
+ shr r6d, 11 ; is_12bpc
+ pmulld ym12, [base+stride_mul]
+ lea stride3q, [strideq*3]
+ mova ym21, [base+filter_mask]
+ mov mask_bitsd, 0xff
+ vpbroadcastd m13, [base+pw_4+r6*8]
+ sub lq, 4
+ vpbroadcastd m14, [base+clip_max+r6*4]
+ lea l_stride3q, [l_strideq*3]
+ vpbroadcastd m15, [base+clip_min+r6*4]
+ mov hd, hm
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ jz .no_flat
+ FILTER 6, h
+ jmp .end
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ jz .end
+ call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4
+.end:
+ lea tmpq, [strideq+stride3q]
+ shl mask_bitsd, 8
+ pslld ym21, 8
+ lea dstq, [dstq+tmpq*8]
+ lea lq, [lq+l_strideq*8]
+ sub hd, 8
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64