diff options
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm')
-rw-r--r-- | chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm | 146 |
1 files changed, 73 insertions, 73 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm index e83b18ad969..585ba53e080 100644 --- a/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm +++ b/chromium/third_party/dav1d/libdav1d/src/x86/mc16_avx512.asm @@ -1604,7 +1604,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my vpbroadcastd m11, [buf+ 4] vpbroadcastd m12, [buf+ 8] vpbroadcastd m13, [buf+12] - cmp wd, 16 + sub wd, 16 je .h_w16 jg .h_w32 .h_w8: @@ -3615,32 +3615,32 @@ ALIGN function_align .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq ], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm0, ym1, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + vextracti32x4 xm0, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq ], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: @@ -3860,33 +3860,33 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb m3, m15, m3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 mova [maskq], xm3 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8: @@ -4090,32 +4090,32 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: @@ -4249,32 +4249,32 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: |