Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm')
-rw-r--r--  chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm  718
1 file changed, 345 insertions(+), 373 deletions(-)
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
index c09dced418b..0218b624d3c 100644
--- a/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
+++ b/chromium/third_party/dav1d/libdav1d/src/x86/loopfilter_avx512.asm
@@ -80,25 +80,24 @@ SECTION .text
punpckhwd m%1, m%3
kmovw k1, k6
lea t0, [dstq+strideq*4]
- vpscatterdd [dstq+m29-2]{k1}, m%4
+ vpscatterdd [dstq+m19-2]{k1}, m%4
kmovw k1, k6
lea t1, [dstq+strideq*8]
- vpscatterdd [t0 +m29-2]{k1}, m%5
+ vpscatterdd [t0 +m19-2]{k1}, m%5
kmovw k1, k6
lea t2, [t0 +strideq*8]
- vpscatterdd [t1 +m29-2]{k1}, m%2
+ vpscatterdd [t1 +m19-2]{k1}, m%2
kmovw k1, k6
- vpscatterdd [t2 +m29-2]{k1}, m%1
+ vpscatterdd [t2 +m19-2]{k1}, m%1
%endmacro
%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
- SWAP m16, m15
+ SWAP m16, m22
%endif
- ; input in m0-15
- punpcklbw m15, m0, m1
- punpckhbw m0, m1
- punpcklbw m1, m2, m3
+ punpcklbw m22, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m2, m3
punpckhbw m2, m3
punpcklbw m3, m4, m5
punpckhbw m4, m5
@@ -108,21 +107,21 @@ SECTION .text
punpckhbw m8, m9
punpcklbw m9, m10, m11
punpckhbw m10, m11
- punpcklbw m11, m12, m13
- punpckhbw m12, m13
+ punpcklbw m11, m25, m13
+ punpckhbw m25, m13
%if %1 == 0
SWAP m13, m16
%else
mova m13, %3
%endif
- SWAP m16, m12
- punpcklbw m12, m14, m13
+ SWAP m16, m25
+ punpcklbw m25, m14, m13
punpckhbw m13, m14, m13
- ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
- punpcklwd m14, m15, m1
- punpckhwd m15, m1
- punpcklwd m1, m0, m2
- punpckhwd m0, m2
+ ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
+ punpcklwd m14, m22, m26
+ punpckhwd m22, m26
+ punpcklwd m26, m24, m2
+ punpckhwd m24, m2
punpcklwd m2, m3, m5
punpckhwd m3, m5
punpcklwd m5, m4, m6
@@ -131,58 +130,58 @@ SECTION .text
punpckhwd m7, m9
punpcklwd m9, m8, m10
punpckhwd m8, m10
- punpcklwd m10, m11, m12
- punpckhwd m11, m12
- SWAP m12, m16, m11
- punpcklwd m11, m12, m13
- punpckhwd m12, m13
- ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
+ punpcklwd m10, m11, m25
+ punpckhwd m11, m25
+ SWAP m25, m16, m11
+ punpcklwd m11, m25, m13
+ punpckhwd m25, m13
+ ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
punpckldq m13, m14, m2
punpckhdq m14, m2
- punpckldq m2, m15, m3
- punpckhdq m15, m3
- punpckldq m3, m1, m5
- punpckhdq m1, m5
- punpckldq m5, m0, m4
- punpckhdq m0, m4
+ punpckldq m2, m22, m3
+ punpckhdq m22, m3
+ punpckldq m3, m26, m5
+ punpckhdq m26, m5
+ punpckldq m5, m24, m4
+ punpckhdq m24, m4
punpckldq m4, m6, m10
punpckhdq m6, m10
punpckldq m10, m9, m11
punpckhdq m9, m11
- punpckldq m11, m8, m12
- punpckhdq m8, m12
- SWAP m12, m16, m8
- punpckldq m8, m7, m12
- punpckhdq m7, m12
- ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
- punpcklqdq m12, m13, m4
+ punpckldq m11, m8, m25
+ punpckhdq m8, m25
+ SWAP m25, m16, m8
+ punpckldq m8, m7, m25
+ punpckhdq m7, m25
+ ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
+ punpcklqdq m25, m13, m4
punpckhqdq m13, m4
punpcklqdq m4, m14, m6
punpckhqdq m14, m6
punpcklqdq m6, m2, m8
punpckhqdq m2, m8
- punpcklqdq m8, m15, m7
- punpckhqdq m15, m7
+ punpcklqdq m8, m22, m7
+ punpckhqdq m22, m7
punpcklqdq m7, m3, m10
punpckhqdq m3, m10
- punpcklqdq m10, m1, m9
- punpckhqdq m1, m9
+ punpcklqdq m10, m26, m9
+ punpckhqdq m26, m9
punpcklqdq m9, m5, m11
punpckhqdq m5, m11
SWAP m11, m16
%if %2 == 0
- SWAP m16, m12
+ SWAP m16, m25
%else
- mova %3, m12
+ mova %3, m25
%endif
- punpcklqdq m12, m0, m11
- punpckhqdq m0, m11
+ punpcklqdq m25, m24, m11
+ punpckhqdq m24, m11
%if %2 == 0
SWAP m11, m16
%endif
- ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
- SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
- SWAP 3, 14, 12, 9
+ ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
+ SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
+ SWAP 3, 14, 25, 9
%endmacro
%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
@@ -205,7 +204,7 @@ SECTION .text
%endif
lea t0, [dstq+mstrideq*4]
%if %1 != 6
- mova m12, [t0 +strideq*0]
+ mova m25, [t0 +strideq*0]
%endif
mova m13, [t0 +strideq*1]
mova m3, [t0 +strideq*2]
@@ -214,13 +213,13 @@ SECTION .text
mova m6, [dstq+strideq*1]
mova m14, [dstq+strideq*2]
%if %1 != 6
- mova m15, [dstq+stride3q ]
+ mova m22, [dstq+stride3q ]
%endif
%if %1 == 16
lea t0, [dstq+strideq*4]
- mova m19, [t0 +strideq*0]
- mova m20, [t0 +strideq*1]
- mova m21, [t0 +strideq*2]
+ mova m29, [t0 +strideq*0]
+ mova m30, [t0 +strideq*1]
+ mova m31, [t0 +strideq*2]
%endif
%endif
%else ; h
@@ -230,15 +229,15 @@ SECTION .text
vbroadcasti32x4 m0, [hshuf4]
kmovw k1, k6
lea t0, [dstq+strideq*4]
- vpgatherdd m3{k1}, [dstq+m29-2]
+ vpgatherdd m3{k1}, [dstq+m19-2]
kmovw k1, k6
lea t1, [dstq+strideq*8]
- vpgatherdd m4{k1}, [t0 +m29-2]
+ vpgatherdd m4{k1}, [t0 +m19-2]
kmovw k1, k6
lea t2, [t0 +strideq*8]
- vpgatherdd m5{k1}, [t1 +m29-2]
+ vpgatherdd m5{k1}, [t1 +m19-2]
kmovw k1, k6
- vpgatherdd m6{k1}, [t2 +m29-2]
+ vpgatherdd m6{k1}, [t2 +m19-2]
pshufb m3, m0
pshufb m4, m0
pshufb m5, m0
@@ -257,16 +256,16 @@ SECTION .text
%elif %1 == 6 || %1 == 8
kmovb k1, k7
lea t0, [dstq+strideq*1]
- vpgatherdq m3{k1}, [dstq+ym31-%1/2]
+ vpgatherdq m3{k1}, [dstq+ym21-%1/2]
kmovb k1, k7
lea t1, [dstq+strideq*2]
- vpgatherdq m4{k1}, [t0 +ym31-%1/2]
+ vpgatherdq m4{k1}, [t0 +ym21-%1/2]
kmovb k1, k7
lea t2, [dstq+stride3q ]
- vpgatherdq m5{k1}, [t1 +ym31-%1/2]
+ vpgatherdq m5{k1}, [t1 +ym21-%1/2]
kmovb k1, k7
- vextracti32x8 ym0, m31, 1
- vpgatherdq m6{k1}, [t2 +ym31-%1/2]
+ vextracti32x8 ym0, m21, 1
+ vpgatherdq m6{k1}, [t2 +ym21-%1/2]
kmovb k1, k7
vpgatherdq m12{k1}, [dstq+ym0 -%1/2]
kmovb k1, k7
@@ -344,7 +343,7 @@ SECTION .text
punpckhqdq m13, m5, m13
%if %1 == 8
punpcklqdq m5, m7, m12
- punpckhqdq m12, m7, m12
+ punpckhqdq m25, m7, m12
; xm3: A0-15
; xm14: B0-15
; xm15: C0-15
@@ -352,10 +351,11 @@ SECTION .text
; xm4: E0-15
; xm13: F0-15
; xm5: G0-15
- ; xm12: H0-15
- SWAP 12, 3, 15
+ ; xm25: H0-15
+ SWAP 25, 3, 15
SWAP 13, 14, 5, 4, 6
- ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
+ SWAP 15, 22
+ ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
%else
SWAP 13, 3, 14
SWAP 6, 4, 15, 5
@@ -364,8 +364,8 @@ SECTION .text
%else ; 16, h
; load and 16x16 transpose. We only use 14 pixels but we'll need the
; remainder at the end for the second transpose
- movu xm0, [dstq+strideq*0-8]
- movu xm1, [dstq+strideq*1-8]
+ movu xm24, [dstq+strideq*0-8]
+ movu xm26, [dstq+strideq*1-8]
movu xm2, [dstq+strideq*2-8]
movu xm3, [dstq+stride3q -8]
lea t0, [dstq+strideq*4]
@@ -379,13 +379,13 @@ SECTION .text
movu xm10, [t0 +strideq*2-8]
movu xm11, [t0 +stride3q -8]
lea t0, [t0 +strideq*4]
- movu xm12, [t0 +strideq*0-8]
+ movu xm25, [t0 +strideq*0-8]
movu xm13, [t0 +strideq*1-8]
movu xm14, [t0 +strideq*2-8]
- movu xm15, [t0 +stride3q -8]
+ movu xm22, [t0 +stride3q -8]
lea t0, [t0 +strideq*4]
- vinserti32x4 ym0, [t0 +strideq*0-8], 1
- vinserti32x4 ym1, [t0 +strideq*1-8], 1
+ vinserti32x4 ym24, [t0 +strideq*0-8], 1
+ vinserti32x4 ym26, [t0 +strideq*1-8], 1
vinserti32x4 ym2, [t0 +strideq*2-8], 1
vinserti32x4 ym3, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
@@ -399,13 +399,13 @@ SECTION .text
vinserti32x4 ym10, [t0 +strideq*2-8], 1
vinserti32x4 ym11, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
- vinserti32x4 ym12, [t0 +strideq*0-8], 1
+ vinserti32x4 ym25, [t0 +strideq*0-8], 1
vinserti32x4 ym13, [t0 +strideq*1-8], 1
vinserti32x4 ym14, [t0 +strideq*2-8], 1
- vinserti32x4 ym15, [t0 +stride3q -8], 1
+ vinserti32x4 ym22, [t0 +stride3q -8], 1
lea t0, [t0 +strideq*4]
- vinserti32x4 m0, [t0 +strideq*0-8], 2
- vinserti32x4 m1, [t0 +strideq*1-8], 2
+ vinserti32x4 m24, [t0 +strideq*0-8], 2
+ vinserti32x4 m26, [t0 +strideq*1-8], 2
vinserti32x4 m2, [t0 +strideq*2-8], 2
vinserti32x4 m3, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
@@ -419,13 +419,13 @@ SECTION .text
vinserti32x4 m10, [t0 +strideq*2-8], 2
vinserti32x4 m11, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
- vinserti32x4 m12, [t0 +strideq*0-8], 2
+ vinserti32x4 m25, [t0 +strideq*0-8], 2
vinserti32x4 m13, [t0 +strideq*1-8], 2
vinserti32x4 m14, [t0 +strideq*2-8], 2
- vinserti32x4 m15, [t0 +stride3q -8], 2
+ vinserti32x4 m22, [t0 +stride3q -8], 2
lea t0, [t0 +strideq*4]
- vinserti32x4 m0, [t0 +strideq*0-8], 3
- vinserti32x4 m1, [t0 +strideq*1-8], 3
+ vinserti32x4 m24, [t0 +strideq*0-8], 3
+ vinserti32x4 m26, [t0 +strideq*1-8], 3
vinserti32x4 m2, [t0 +strideq*2-8], 3
vinserti32x4 m3, [t0 +stride3q -8], 3
lea t0, [t0 +strideq*4]
@@ -439,41 +439,38 @@ SECTION .text
vinserti32x4 m10, [t0 +strideq*2-8], 3
vinserti32x4 m11, [t0 +stride3q -8], 3
lea t0, [t0 +strideq*4]
- vinserti32x4 m12, [t0 +strideq*0-8], 3
+ vinserti32x4 m25, [t0 +strideq*0-8], 3
vinserti32x4 m13, [t0 +strideq*1-8], 3
vinserti32x4 m14, [t0 +strideq*2-8], 3
- vinserti32x4 m15, [t0 +stride3q -8], 3
+ vinserti32x4 m22, [t0 +stride3q -8], 3
;
TRANSPOSE_16X16B 0, 1, [rsp+0*64]
- SWAP m16, m1
+ SWAP m16, m26
SWAP m17, m2
SWAP m18, m3
- SWAP m19, m12
- SWAP m20, m13
- SWAP m21, m14
- mova [rsp+4*64], m15
- ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
- SWAP 12, 4, 7
+ SWAP m29, m25
+ SWAP m30, m13
+ SWAP m31, m14
+ mova [rsp+4*64], m22
+ ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
+ SWAP 25, 4, 7
SWAP 13, 5, 8
SWAP 3, 6, 9
SWAP 10, 14
- SWAP 11, 15
+ SWAP 11, 22
%endif
%endif
; load L/E/I/H
-%if is_uv
- SWAP m22, m15
-%endif
- vpbroadcastd m22, [pb_1]
+ vpbroadcastd m15, [pb_1]
%ifidn %2, v
movu m1, [lq]
movu m0, [lq+l_strideq]
%else
kmovw k1, k6
- vpgatherdd m0{k1}, [lq+m30+4]
+ vpgatherdd m0{k1}, [lq+m20+4]
kmovw k1, k6
- vpgatherdd m1{k1}, [lq+m30+0]
+ vpgatherdd m1{k1}, [lq+m20+0]
%endif
pxor m2, m2
pcmpeqb k1, m0, m2
@@ -484,7 +481,7 @@ SECTION .text
pand m2, [pb_63]{bcstd}
vpbroadcastb m1, [lutq+136]
pminub m2, m1
- pmaxub m2, m22 ; I
+ pmaxub m2, m15 ; I
pand m1, m0, [pb_240]{bcstd}
psrlq m1, 4 ; H
paddd m0, [pb_2]{bcstd}
@@ -500,7 +497,7 @@ SECTION .text
ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
pmaxub m9, m8
%else
- ABSSUB m9, m12, m4, m10 ; abs(p3-p0)
+ ABSSUB m9, m25, m4, m10 ; abs(p3-p0)
pmaxub m9, m8
ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
pmaxub m9, m10
@@ -508,17 +505,17 @@ SECTION .text
ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
pmaxub m9, m10
%if %1 != 6
- ABSSUB m10, m5, m15, m11 ; abs(q3-q0)
+ ABSSUB m10, m5, m22, m11 ; abs(q3-q0)
pmaxub m9, m10
%endif
- vpcmpub k2{k3}, m9, m22, 2 ; le ; flat8in
+ vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in
%if %1 == 6
ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
%else
- ABSSUB m10, m12, m13, m11 ; abs(p3-p2)
+ ABSSUB m10, m25, m13, m11 ; abs(p3-p2)
ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
pmaxub m10, m11
- ABSSUB m11, m14, m15, m1 ; abs(q3-q2)
+ ABSSUB m11, m14, m22, m1 ; abs(q3-q2)
pmaxub m10, m11
%endif
ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
@@ -526,16 +523,10 @@ SECTION .text
%if %1 == 16
vpbroadcastd m11, [maskq+8]
por m11, [maskq+4]{bcstd}
- pand m11, pbmask
%else
- %if !is_h || %1 == 6
- pand m11, pbmask, [maskq+4]{bcstd}
- %else
vpbroadcastd m11, [maskq+4]
- pand m11, pbmask
- %endif
%endif
- pcmpeqd k4, m11, pbmask
+ vptestmd k4, m11, pbmask
vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks
pmaxub m8, m10
%endif
@@ -554,77 +545,58 @@ SECTION .text
pmaxub m1, m2
ABSSUB m2, m18, m4, m10
pmaxub m1, m2
- ABSSUB m2, m19, m5, m10
+ ABSSUB m2, m29, m5, m10
pmaxub m1, m2
- ABSSUB m2, m20, m5, m10
+ ABSSUB m2, m30, m5, m10
pmaxub m1, m2
- ABSSUB m2, m21, m5, m10
+ ABSSUB m2, m31, m5, m10
pmaxub m1, m2
- ;
- vpcmpub k4, m1, m22, 2 ; flat8out
- kandq k4, k4, k2 ; flat8in & flat8out
-
+ kandq k2, k2, k3
+ vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out
vpbroadcastd m2, [maskq+8]
- pand m10, m2, pbmask
- pcmpeqd k5, m10, pbmask
+ vptestmd k5, m2, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k4, k4, k5 ; flat16
- kandq k4, k3, k4 ; flat16 & fm
+ vptestmb k4{k4}, m7, m7 ; flat16 & fm
por m10, m2, [maskq+4]{bcstd}
- pand m2, m10, pbmask
- pcmpeqd k5, m2, pbmask
+ vptestmd k5, m10, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k2, k2, k5 ; flat8in
- kandq k2, k3, k2
+ vptestmb k2{k2}, m7, m7 ; flat8in
por m2, m10, [maskq+0]{bcstd}
- pand m2, pbmask
- pcmpeqd k5, m2, pbmask
+ vptestmd k5, m2, pbmask
vpmovm2d m7, k5
- vpmovb2m k5, m7
- kandq k3, k3, k5
+ vptestmb k3{k3}, m7, m7
kandnq k3, k2, k3 ; fm & !flat8 & !flat16
kandnq k2, k4, k2 ; flat8 & !flat16
%elif %1 != 4
vpbroadcastd m0, [maskq+4]
- pand m2, m0, pbmask
- pcmpeqd k4, m2, pbmask
+ vptestmd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k2, k2, k4
+ vptestmb k2{k2}, m7, m7
kandq k2, k2, k3 ; flat8 & fm
por m0, [maskq+0]{bcstd}
- pand m0, pbmask
- pcmpeqd k4, m0, pbmask
+ vptestmd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k3, k3, k4
+ vptestmb k3{k3}, m7, m7
kandnq k3, k2, k3 ; fm & !flat8
%else
%ifidn %2, v
- pand m0, pbmask, [maskq+0]{bcstd}
+ vptestmd k4, pbmask, [maskq+0]{bcstd}
%else
vpbroadcastd m0, [maskq+0]
- pand m0, pbmask
+ vptestmd k4, m0, pbmask
%endif
- pcmpeqd k4, m0, pbmask
vpmovm2d m7, k4
- vpmovb2m k4, m7
- kandq k3, k3, k4 ; fm
+ vptestmb k3{k3}, m7, m7 ; fm
%endif
; short filter
-%if is_uv
- SWAP m23, m22
- SWAP m24, m0
- SWAP m25, m12
- SWAP m26, m1
+%if %1 >= 8
+ SWAP m23, m15
%endif
- vpbroadcastd m23, [pb_3]
- vpbroadcastd m24, [pb_4]
- vpbroadcastd m25, [pb_16]
- vpbroadcastd m26, [pb_64]
+ vpbroadcastd m15, [pb_3]
+ vpbroadcastd m0, [pb_4]
+ vpbroadcastd m12, [pb_16]
+ vpbroadcastd m1, [pb_64]
pxor m3, pb128
pxor m6, pb128
psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev
@@ -634,16 +606,16 @@ SECTION .text
paddsb m10, m11
paddsb m10, m11
paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
- paddsb m8, m10, m23
- paddsb m10, m24
+ paddsb m8, m10, m15
+ paddsb m10, m0
pand m8, [pb_248]{bcstd}
pand m10, [pb_248]{bcstd}
psrlq m8, 3
psrlq m10, 3
- pxor m8, m25
- pxor m10, m25
- psubb m8, m25 ; f2
- psubb m10, m25 ; f1
+ pxor m8, m12
+ pxor m10, m12
+ psubb m8, m12 ; f2
+ psubb m10, m12 ; f1
paddsb m4, m8
psubsb m5, m10
pxor m4, pb128
@@ -652,7 +624,7 @@ SECTION .text
pxor m10, pb128
pxor m8, m8
pavgb m8, m10 ; f=(f1+1)>>1
- psubb m8, m26
+ psubb m8, m1
knotq k1, k1
paddsb m3{k1}, m3, m8
psubsb m6{k1}, m6, m8
@@ -664,40 +636,40 @@ SECTION .text
%ifidn %2, v
lea t0, [dstq+mstrideq*8]
%endif
- SWAP m0, m16, m14
- SWAP m2, m17, m15
+ SWAP m24, m16, m14
+ SWAP m2, m17, m22
SWAP m7, m18
; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
; write -6
- vpbroadcastd m26, [pb_7_1]
- vpbroadcastd m25, [pb_2]
- punpcklbw m14, m0, m12
- punpckhbw m15, m0, m12
- pmaddubsw m10, m14, m26
- pmaddubsw m11, m15, m26 ; p6*7+p3
+ vpbroadcastd m1, [pb_7_1]
+ vpbroadcastd m12, [pb_2]
+ punpcklbw m14, m24, m25
+ punpckhbw m22, m24, m25
+ pmaddubsw m10, m14, m1
+ pmaddubsw m11, m22, m1 ; p6*7+p3
punpcklbw m8, m2, m7
punpckhbw m9, m2, m7
- pmaddubsw m8, m25
- pmaddubsw m9, m25
+ pmaddubsw m8, m12
+ pmaddubsw m9, m12
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3
%ifidn %2, h
vpbroadcastd m27, [pw_2048]
- vpbroadcastd m26, [pb_m1_1]
+ vpbroadcastd m1, [pb_m1_1]
%define pw2048 m27
- %define pbm1_1 m26
+ %define pbm1_1 m1
%endif
punpcklbw m8, m13, m3
punpckhbw m9, m13, m3
- pmaddubsw m8, m22
- pmaddubsw m9, m22
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
punpcklbw m8, m4, m5
punpckhbw m9, m4, m5
- pmaddubsw m8, m22
- pmaddubsw m9, m22
+ pmaddubsw m8, m23
+ pmaddubsw m9, m23
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
pmulhrsw m8, m10, pw2048
@@ -713,17 +685,17 @@ SECTION .text
; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
; write -5
pmaddubsw m14, pbm1_1
- pmaddubsw m15, pbm1_1
+ pmaddubsw m22, pbm1_1
paddw m10, m14
- paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
- punpcklbw m8, m0, m6
- punpckhbw m9, m0, m6
+ paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
+ punpcklbw m8, m24, m6
+ punpckhbw m9, m24, m6
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
SWAP m18, m8
- SWAP m22, m9
+ SWAP m23, m9
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
@@ -737,8 +709,8 @@ SECTION .text
; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
; write -4
SWAP m14, m16
- punpcklbw m8, m0, m13
- punpckhbw m9, m0, m13
+ punpcklbw m8, m24, m13
+ punpckhbw m9, m24, m13
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -756,21 +728,21 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3
%else
- vpblendmb m8{k4}, m12, m8
+ vpblendmb m8{k4}, m25, m8
mova [rsp+3*64], m8
%endif
; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
; write -3
- SWAP m15, m17
- punpcklbw m8, m0, m3
- punpckhbw m9, m0, m3
+ SWAP m22, m17
+ punpcklbw m8, m24, m3
+ punpckhbw m9, m24, m3
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
- punpcklbw m8, m7, m15
- punpckhbw m7, m15
+ punpcklbw m8, m7, m22
+ punpckhbw m7, m22
pmaddubsw m8, pbm1_1
pmaddubsw m7, pbm1_1
paddw m10, m8
@@ -779,69 +751,69 @@ SECTION .text
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
- vpblendmb m23{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
+ vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F
; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
; write -2
%ifidn %2, v
lea t0, [dstq+strideq*4]
%endif
- punpcklbw m8, m0, m4
- punpckhbw m9, m0, m4
+ punpcklbw m8, m24, m4
+ punpckhbw m9, m24, m4
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
- punpcklbw m8, m12, m19
- punpckhbw m9, m12, m19
- SWAP m1, m19
+ punpcklbw m8, m25, m29
+ punpckhbw m9, m25, m29
+ SWAP m26, m29
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
- SWAP m19, m8
- SWAP m24, m9
+ SWAP m29, m8
+ SWAP m0, m9
pmulhrsw m8, m10, pw2048
pmulhrsw m9, m11, pw2048
packuswb m8, m9
- vpblendmb m25{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
+ vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G
; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
; write -1
%ifidn %2, h
- SWAP m28, m0
+ SWAP m28, m24
punpcklbw m8, m28, m5
- punpckhbw m0, m28, m5
+ punpckhbw m24, m28, m5
%else
- punpcklbw m8, m0, m5
- punpckhbw m0, m5
+ punpcklbw m8, m24, m5
+ punpckhbw m24, m5
%endif
pmaddubsw m8, pbm1_1
- pmaddubsw m0, pbm1_1
+ pmaddubsw m24, pbm1_1
paddw m10, m8
- paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
- punpcklbw m0, m13, m20
- punpckhbw m9, m13, m20
+ paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+ punpcklbw m24, m13, m30
+ punpckhbw m9, m13, m30
%ifidn %2, h
- SWAP m27, m20
+ SWAP m27, m30
%endif
- SWAP m13, m23
- pmaddubsw m0, pbm1_1
+ SWAP m13, m15
+ pmaddubsw m24, pbm1_1
pmaddubsw m9, pbm1_1
- paddw m10, m0
+ paddw m10, m24
paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
- SWAP m20, m0
- SWAP m23, m9
+ SWAP m30, m24
+ SWAP m15, m9
%ifidn %2, h
- SWAP m9, m0
+ SWAP m9, m24
%define pw2048 m9
%endif
- pmulhrsw m0, m10, pw2048
+ pmulhrsw m24, m10, pw2048
pmulhrsw m8, m11, pw2048
paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
- paddw m11, m22
- packuswb m0, m8
- punpcklbw m8, m3, m21
+ paddw m11, m23
+ packuswb m24, m8
+ punpcklbw m8, m3, m31
pmaddubsw m8, pbm1_1
paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
SWAP m18, m8
@@ -851,34 +823,34 @@ SECTION .text
SWAP m16, m9
%define pw2048 m16
%endif
- punpckhbw m9, m3, m21
- SWAP m3, m25
+ punpckhbw m9, m3, m31
+ SWAP m3, m12
pmaddubsw m9, pbm1_1
paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
- SWAP m22, m9
+ SWAP m23, m9
pmulhrsw m9, m11, pw2048
paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
%ifidn %2, h
- SWAP m2, m26
+ SWAP m2, m1
%define pbm1_1 m2
%endif
- vpblendmb m26{k4}, m4, m0 ; don't clobber p0/m4 since we need it in H
+ vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H
; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
; write +0
- SWAP m0, m21 ; q6
+ SWAP m24, m31 ; q6
packuswb m8, m9
%ifidn %2, h
- SWAP m21, m2
- %define pbm1_1 m21
+ SWAP m31, m2
+ %define pbm1_1 m31
%endif
- vpblendmb m25{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
+ vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I
; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
; write +1
- punpcklbw m8, m4, m0
- punpckhbw m2, m4, m0
- SWAP m4, m26
+ punpcklbw m8, m4, m24
+ punpckhbw m2, m4, m24
+ SWAP m4, m1
pmaddubsw m8, pbm1_1
pmaddubsw m2, pbm1_1
paddw m10, m8
@@ -892,9 +864,9 @@ SECTION .text
; write +2
paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
paddw m11, m7
- punpcklbw m8, m5, m0
- punpckhbw m9, m5, m0
- SWAP m5, m25
+ punpcklbw m8, m5, m24
+ punpckhbw m9, m5, m24
+ SWAP m5, m12
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -906,10 +878,10 @@ SECTION .text
; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
; write +3
- paddw m10, m19 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
- paddw m11, m24
- punpcklbw m8, m6, m0
- punpckhbw m9, m6, m0
+ paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ paddw m11, m0
+ punpcklbw m8, m6, m24
+ punpckhbw m9, m6, m24
SWAP 2, 6
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
@@ -921,20 +893,20 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+mstrideq]{k4}, m8
%else
- SWAP m19, m16
- %define pw2048 m19
- vpblendmb m16{k4}, m15, m8
+ SWAP m29, m16
+ %define pw2048 m29
+ vpblendmb m16{k4}, m22, m8
%endif
; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
; write +4
- paddw m10, m20 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
- paddw m11, m23
+ paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ paddw m11, m15
%ifidn %2, h
- SWAP m23, m8
+ SWAP m15, m8
%endif
- punpcklbw m8, m14, m0
- punpckhbw m9, m14, m0
+ punpcklbw m8, m14, m24
+ punpckhbw m9, m14, m24
SWAP 14, 7
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
@@ -946,16 +918,16 @@ SECTION .text
%ifidn %2, v
vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4
%else
- vpblendmb m17{k4}, m1, m8
+ vpblendmb m17{k4}, m26, m8
%endif
; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
; write +5
paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
- paddw m11, m22
- punpcklbw m8, m15, m0
- punpckhbw m9, m15, m0
- SWAP m20, m0
+ paddw m11, m23
+ punpcklbw m8, m22, m24
+ punpckhbw m9, m22, m24
+ SWAP m30, m24
pmaddubsw m8, pbm1_1
pmaddubsw m9, pbm1_1
paddw m10, m8
@@ -979,26 +951,26 @@ SECTION .text
vpbroadcastd m9, [pb_3_1]
vpbroadcastd m10, [pb_2_1]
%if %1 == 16
- vpbroadcastd m22, [pb_1]
- vpbroadcastd m24, [pb_4]
+ vpbroadcastd m23, [pb_1]
+ vpbroadcastd m0, [pb_4]
%elifidn %2, h
- vpbroadcastd m21, [pb_m1_1]
- %define pbm1_1 m21
+ vpbroadcastd m31, [pb_m1_1]
+ %define pbm1_1 m31
%endif
- punpcklbw m0, m12, m3
- punpckhbw m1, m12, m3
- pmaddubsw m2, m0, m9
- pmaddubsw m7, m1, m9 ; 3 * p3 + p1
+ punpcklbw m24, m25, m3
+ punpckhbw m26, m25, m3
+ pmaddubsw m2, m24, m9
+ pmaddubsw m7, m26, m9 ; 3 * p3 + p1
punpcklbw m8, m13, m4
punpckhbw m11, m13, m4
pmaddubsw m8, m10
pmaddubsw m11, m10
paddw m2, m8
paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
- punpcklbw m8, m5, m24
- punpckhbw m11, m5, m24
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ punpcklbw m8, m5, m0
+ punpckhbw m11, m5, m0
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
paddw m2, m8
paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
psrlw m8, m2, 3
@@ -1015,8 +987,8 @@ SECTION .text
%endif
%endif
- pmaddubsw m8, m0, pbm1_1
- pmaddubsw m11, m1, pbm1_1
+ pmaddubsw m8, m24, pbm1_1
+ pmaddubsw m11, m26, pbm1_1
paddw m2, m8
paddw m7, m11
punpcklbw m8, m13, m6
@@ -1035,14 +1007,14 @@ SECTION .text
SWAP m18, m8
%endif
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- psubw m2, m0
- psubw m7, m1
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
punpcklbw m8, m4, m14
punpckhbw m11, m4, m14
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
paddw m2, m8
paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
psrlw m8, m2, 3
@@ -1052,19 +1024,19 @@ SECTION .text
%ifidn %2, v
mova [t0+stride3q], m8
%else
- SWAP m19, m8
+ SWAP m29, m8
%endif
- punpcklbw m0, m5, m15
- punpckhbw m1, m5, m15
- pmaddubsw m8, m0, m22
- pmaddubsw m11, m1, m22
+ punpcklbw m24, m5, m22
+ punpckhbw m26, m5, m22
+ pmaddubsw m8, m24, m23
+ pmaddubsw m11, m26, m23
paddw m2, m8
paddw m7, m11
- punpcklbw m8, m4, m12
- punpckhbw m11, m4, m12
- pmaddubsw m8, m22
- pmaddubsw m11, m22
+ punpcklbw m8, m4, m25
+ punpckhbw m11, m4, m25
+ pmaddubsw m8, m23
+ pmaddubsw m11, m23
psubw m2, m8
psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
psrlw m8, m2, 3
@@ -1075,10 +1047,10 @@ SECTION .text
mova [dstq+strideq*0], m11
%endif
- pmaddubsw m0, pbm1_1
- pmaddubsw m1, pbm1_1
- paddw m2, m0
- paddw m7, m1
+ pmaddubsw m24, pbm1_1
+ pmaddubsw m26, pbm1_1
+ paddw m2, m24
+ paddw m7, m26
punpcklbw m8, m13, m6
punpckhbw m13, m6
pmaddubsw m8, pbm1_1
@@ -1093,18 +1065,18 @@ SECTION .text
mova [dstq+strideq*1], m13
%endif
- punpcklbw m0, m3, m6
- punpckhbw m1, m3, m6
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- psubw m2, m0
- psubw m7, m1
- punpcklbw m0, m14, m15
- punpckhbw m1, m14, m15
- pmaddubsw m0, m22
- pmaddubsw m1, m22
- paddw m2, m0
- paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
+ punpcklbw m24, m3, m6
+ punpckhbw m26, m3, m6
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ psubw m2, m24
+ psubw m7, m26
+ punpcklbw m24, m14, m22
+ punpckhbw m26, m14, m22
+ pmaddubsw m24, m23
+ pmaddubsw m26, m23
+ paddw m2, m24
+ paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
psrlw m2, 3
psrlw m7, 3
packuswb m2, m7
@@ -1120,36 +1092,36 @@ SECTION .text
%endif
%ifidn %2, h
- SWAP m0, m18
- SWAP m1, m19
+ SWAP m24, m18
+ SWAP m26, m29
%if %1 == 8
; 16x8 transpose
- punpcklbw m3, m12, m10
- punpckhbw m12, m10
- punpcklbw m10, m0, m1
- punpckhbw m0, m1
- punpcklbw m1, m11, m13
+ punpcklbw m3, m25, m10
+ punpckhbw m25, m10
+ punpcklbw m10, m24, m26
+ punpckhbw m24, m26
+ punpcklbw m26, m11, m13
punpckhbw m11, m13
- punpcklbw m13, m2, m15
- punpckhbw m2, m15
+ punpcklbw m13, m2, m22
+ punpckhbw m2, m22
;
- punpcklwd m15, m3, m10
+ punpcklwd m22, m3, m10
punpckhwd m3, m10
- punpcklwd m10, m12, m0
- punpckhwd m12, m0
- punpcklwd m0, m1, m13
- punpckhwd m1, m13
+ punpcklwd m10, m25, m24
+ punpckhwd m25, m24
+ punpcklwd m24, m26, m13
+ punpckhwd m26, m13
punpcklwd m13, m11, m2
punpckhwd m11, m2
;
- punpckldq m2, m15, m0
- punpckhdq m15, m0
- punpckldq m0, m3, m1
- punpckhdq m3, m1
- punpckldq m1, m10, m13
+ punpckldq m2, m22, m24
+ punpckhdq m22, m24
+ punpckldq m24, m3, m26
+ punpckhdq m3, m26
+ punpckldq m26, m10, m13
punpckhdq m10, m13
- punpckldq m13, m12, m11
- punpckhdq m12, m11
+ punpckldq m13, m25, m11
+ punpckhdq m25, m11
; write 8x32
vpbroadcastd ym16, strided
pmulld ym16, [hmulD]
@@ -1162,8 +1134,8 @@ SECTION .text
kmovb k3, k6
kmovb k4, k6
vpscatterdq [dstq+ym16-4]{k1}, m2
- vpscatterdq [t1 +ym16-4]{k2}, m15
- vpscatterdq [t2 +ym16-4]{k3}, m0
+ vpscatterdq [t1 +ym16-4]{k2}, m22
+ vpscatterdq [t2 +ym16-4]{k3}, m24
vpscatterdq [t3 +ym16-4]{k4}, m3
lea t1, [t0+strideq*2]
lea t2, [t0+strideq*4]
@@ -1172,29 +1144,29 @@ SECTION .text
kmovb k2, k6
kmovb k3, k6
kmovb k4, k6
- vpscatterdq [t0+ym16-4]{k1}, m1
+ vpscatterdq [t0+ym16-4]{k1}, m26
vpscatterdq [t1+ym16-4]{k2}, m10
vpscatterdq [t2+ym16-4]{k3}, m13
- vpscatterdq [t3+ym16-4]{k4}, m12
+ vpscatterdq [t3+ym16-4]{k4}, m25
%else
; 16x16 transpose and store
SWAP 5, 10, 2
- SWAP 6, 0
- SWAP 7, 1
+ SWAP 6, 24
+ SWAP 7, 26
SWAP 8, 11
SWAP 9, 13
- mova m0, [rsp+0*64]
- SWAP m1, m28
+ mova m24, [rsp+0*64]
+ SWAP m26, m28
mova m2, [rsp+1*64]
mova m3, [rsp+2*64]
mova m4, [rsp+3*64]
SWAP m11, m16
- SWAP m12, m17
+ SWAP m25, m17
SWAP m13, m27
- SWAP m14, m20
+ SWAP m14, m30
TRANSPOSE_16X16B 1, 0, [rsp+4*64]
- movu [dstq+strideq*0-8], xm0
- movu [dstq+strideq*1-8], xm1
+ movu [dstq+strideq*0-8], xm24
+ movu [dstq+strideq*1-8], xm26
movu [dstq+strideq*2-8], xm2
movu [dstq+stride3q -8], xm3
lea t0, [dstq+strideq*4]
@@ -1208,13 +1180,13 @@ SECTION .text
movu [t0+strideq*2-8], xm10
movu [t0+stride3q -8], xm11
lea t0, [t0+strideq*4]
- movu [t0+strideq*0-8], xm12
+ movu [t0+strideq*0-8], xm25
movu [t0+strideq*1-8], xm13
movu [t0+strideq*2-8], xm14
- movu [t0+stride3q -8], xm15
+ movu [t0+stride3q -8], xm22
lea t0, [t0+strideq*4]
- vextracti128 [t0+strideq*0-8], ym0, 1
- vextracti128 [t0+strideq*1-8], ym1, 1
+ vextracti128 [t0+strideq*0-8], ym24, 1
+ vextracti128 [t0+strideq*1-8], ym26, 1
vextracti128 [t0+strideq*2-8], ym2, 1
vextracti128 [t0+stride3q -8], ym3, 1
lea t0, [t0+strideq*4]
@@ -1228,13 +1200,13 @@ SECTION .text
vextracti128 [t0+strideq*2-8], ym10, 1
vextracti128 [t0+stride3q -8], ym11, 1
lea t0, [t0+strideq*4]
- vextracti128 [t0+strideq*0-8], ym12, 1
+ vextracti128 [t0+strideq*0-8], ym25, 1
vextracti128 [t0+strideq*1-8], ym13, 1
vextracti128 [t0+strideq*2-8], ym14, 1
- vextracti128 [t0+stride3q -8], ym15, 1
+ vextracti128 [t0+stride3q -8], ym22, 1
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m0, 2
- vextracti32x4 [t0+strideq*1-8], m1, 2
+ vextracti32x4 [t0+strideq*0-8], m24, 2
+ vextracti32x4 [t0+strideq*1-8], m26, 2
vextracti32x4 [t0+strideq*2-8], m2, 2
vextracti32x4 [t0+stride3q -8], m3, 2
lea t0, [t0+strideq*4]
@@ -1248,13 +1220,13 @@ SECTION .text
vextracti32x4 [t0+strideq*2-8], m10, 2
vextracti32x4 [t0+stride3q -8], m11, 2
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m12, 2
+ vextracti32x4 [t0+strideq*0-8], m25, 2
vextracti32x4 [t0+strideq*1-8], m13, 2
vextracti32x4 [t0+strideq*2-8], m14, 2
- vextracti32x4 [t0+stride3q -8], m15, 2
+ vextracti32x4 [t0+stride3q -8], m22, 2
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m0, 3
- vextracti32x4 [t0+strideq*1-8], m1, 3
+ vextracti32x4 [t0+strideq*0-8], m24, 3
+ vextracti32x4 [t0+strideq*1-8], m26, 3
vextracti32x4 [t0+strideq*2-8], m2, 3
vextracti32x4 [t0+stride3q -8], m3, 3
lea t0, [t0+strideq*4]
@@ -1268,19 +1240,15 @@ SECTION .text
vextracti32x4 [t0+strideq*2-8], m10, 3
vextracti32x4 [t0+stride3q -8], m11, 3
lea t0, [t0+strideq*4]
- vextracti32x4 [t0+strideq*0-8], m12, 3
+ vextracti32x4 [t0+strideq*0-8], m25, 3
vextracti32x4 [t0+strideq*1-8], m13, 3
vextracti32x4 [t0+strideq*2-8], m14, 3
- vextracti32x4 [t0+stride3q -8], m15, 3
+ vextracti32x4 [t0+stride3q -8], m22, 3
%endif
%endif
%elif %1 == 6
; flat6 filter
- SWAP m15, m23
- SWAP m0, m24
- SWAP m12, m25
- SWAP m1, m26
vpbroadcastd m15, [pb_3_1]
vpbroadcastd m12, [pb_2]
punpcklbw m8, m13, m5
@@ -1381,17 +1349,16 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
mov mstrideq, strideq
neg mstrideq
lea stride3q, [strideq*3]
- mova m31, [pb_4x0_4x4_4x8_4x12]
- mova m30, [pb_mask]
- vpbroadcastd m29, [pb_128]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
vpbroadcastd m28, [pb_m1_1]
vpbroadcastd m27, [pw_2048]
- %define pbshuf m31
- %define pbmask m30
- %define pb128 m29
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
%define pbm1_1 m28
%define pw2048 m27
- %define is_uv 0
.loop:
cmp word [maskq+8], 0 ; vmask[2]
@@ -1411,7 +1378,7 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call .v4
.end:
add lq, 64
@@ -1420,6 +1387,11 @@ cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
sub wd, 16
jg .loop
RET
+ALIGN function_align
+RESET_MM_PERMUTATION
+.v4:
+ FILTER 4, v
+ ret
cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
lut, h, stride3, stride8
@@ -1429,11 +1401,11 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
lea stride3q, [strideq*3]
lea stride8q, [strideq*8]
kxnorw k6, k6, k6
- vpbroadcastd m29, strided
- vpbroadcastd m30, l_strided
- pmulld m31, m29, [hmulA]
- pmulld m30, m30, [hmulB]
- pmulld m29, m29, [hmulC]
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
%define pbshuf [pb_4x0_4x4_4x8_4x12]
%define pbmask [pb_mask]
%define pb128 [pb_128]{bcstd}
@@ -1457,7 +1429,7 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, h
+ call .h4
.end:
lea lq, [lq+l_strideq*8]
@@ -1466,9 +1438,13 @@ cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
sub hd, 16
jg .loop
RET
+ALIGN function_align
RESET_MM_PERMUTATION
+.h4:
+ FILTER 4, h
+ ret
-cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
+cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
lut, w, stride3, mstride
DECLARE_REG_TMP 9
shl l_strideq, 2
@@ -1476,16 +1452,15 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
mov mstrideq, strideq
neg mstrideq
lea stride3q, [strideq*3]
- mova m20, [pb_4x0_4x4_4x8_4x12]
- mova m19, [pb_mask]
- vpbroadcastd m18, [pb_128]
+ mova m21, [pb_4x0_4x4_4x8_4x12]
+ mova m20, [pb_mask]
+ vpbroadcastd m19, [pb_128]
vpbroadcastd m17, [pb_m1_1]
vpbroadcastd m16, [pw_4096]
- %define pbshuf m20
- %define pbmask m19
- %define pb128 m18
+ %define pbshuf m21
+ %define pbmask m20
+ %define pb128 m19
%define pbm1_1 m17
- %define is_uv 1
.loop:
cmp word [maskq+4], 0 ; vmask[1]
@@ -1498,7 +1473,7 @@ cglobal lpf_v_sb_uv_8bpc, 7, 10, 21, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, v
+ call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4
.end:
add lq, 64
@@ -1525,17 +1500,14 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
vpbroadcastd m19, strided
vpbroadcastd m20, l_strided
pmulld m21, m19, [hmulA]
- pmulld m20, m20, [hmulB]
- pmulld m19, m19, [hmulC]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
mova m18, [pb_mask]
vpbroadcastd m17, [pb_128]
vpbroadcastd m16, [pw_4096]
%define pbshuf [pb_4x0_4x4_4x8_4x12]
%define pbmask m18
%define pb128 m17
- %xdefine m31 m21
- %xdefine m30 m20
- %xdefine m29 m19
add l_strideq, l_strideq
.loop:
@@ -1549,7 +1521,7 @@ cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
cmp word [maskq+0], 0 ; vmask[0]
je .end
- FILTER 4, h
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
.end:
lea lq, [lq+l_strideq*8]