summaryrefslogtreecommitdiff
path: root/vp10/common/x86/postproc_sse2.asm
diff options
context:
space:
mode:
Diffstat (limited to 'vp10/common/x86/postproc_sse2.asm')
-rw-r--r--vp10/common/x86/postproc_sse2.asm694
1 files changed, 0 insertions, 694 deletions
diff --git a/vp10/common/x86/postproc_sse2.asm b/vp10/common/x86/postproc_sse2.asm
deleted file mode 100644
index d5f8e927b..000000000
--- a/vp10/common/x86/postproc_sse2.asm
+++ /dev/null
@@ -1,694 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp10_post_proc_down_and_across_xmm
-;(
-; unsigned char *src_ptr, - source; rows -2..+2 around each row are read
-; unsigned char *dst_ptr, - destination; also filtered horizontally in place
-; int src_pixels_per_line, - source pitch in bytes
-; int dst_pixels_per_line, - destination pitch in bytes
-; int rows, - row counter (counted down to zero)
-; int cols, - pixels per row, processed in groups of 8
-; int flimit - threshold on the absolute pixel difference
-;)
-global sym(vp10_post_proc_down_and_across_xmm) PRIVATE
-sym(vp10_post_proc_down_and_across_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-; Per row: (1) vertical 5-tap blur src -> dst, (2) horizontal 5-tap blur of dst in place.
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- ALIGN_STACK 16, rax
- ; copy the rd42 rounding constant onto the stack, since we don't have
- ; enough registers to do PIC addressing inside the loops
- movdqa xmm0, [GLOBAL(rd42)]
- sub rsp, 16
- movdqa [rsp], xmm0
-%define RD42 [rsp]
-%else
-%define RD42 [GLOBAL(rd42)]
-%endif
-; Both passes compute (4*p + four neighbours + rd42) >> 3 and keep the original
-; pixel p whenever any abs(p - neighbour) is greater than flimit.
- movd xmm2, dword ptr arg(6) ;flimit
- punpcklwd xmm2, xmm2
- punpckldq xmm2, xmm2
- punpcklqdq xmm2, xmm2 ; xmm2 = low word of flimit in all 8 words
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
- movsxd rcx, DWORD PTR arg(4) ;rows
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line (sign-extended source pitch)
- pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
-
-.nextrow:
-
- xor rdx, rdx ; rdx = column offset within the row
-.nextcol:
- movq xmm3, QWORD PTR [rsi] ; xmm3 = r0 p0..p7 (bytes)
- punpcklbw xmm3, xmm0 ; xmm3 = r0 p0..p7 (words)
- movdqa xmm1, xmm3 ; xmm1 = r0 p0..p7, kept for thresholding and select
- psllw xmm3, 2 ; xmm3 = 4 * r0
-
- movq xmm5, QWORD PTR [rsi + rax] ; xmm5 = r1 p0..p7 (bytes)
- punpcklbw xmm5, xmm0 ; xmm5 = r1 p0..p7 (words)
- paddusw xmm3, xmm5 ; xmm3 += r1
-
- ; thresholding
- movdqa xmm7, xmm1 ; xmm7 = r0
- psubusw xmm7, xmm5 ; xmm7 = max(r0 - r1, 0)
- psubusw xmm5, xmm1 ; xmm5 = max(r1 - r0, 0)
- paddusw xmm7, xmm5 ; xmm7 = abs(r0 - r1)
- pcmpgtw xmm7, xmm2 ; xmm7 = mask where abs(r0 - r1) > flimit
-
- movq xmm5, QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
- punpcklbw xmm5, xmm0 ; xmm5 = r2 p0..p7 (words)
- paddusw xmm3, xmm5 ; xmm3 += r2
-
- ; thresholding
- movdqa xmm6, xmm1 ; xmm6 = r0
- psubusw xmm6, xmm5 ; xmm6 = max(r0 - r2, 0)
- psubusw xmm5, xmm1 ; xmm5 = max(r2 - r0, 0)
- paddusw xmm6, xmm5 ; xmm6 = abs(r0 - r2)
- pcmpgtw xmm6, xmm2 ; xmm6 = mask where abs(r0 - r2) > flimit
- por xmm7, xmm6 ; accumulate thresholds
-
-
- neg rax ; rax = -pitch, to address the rows above
- movq xmm5, QWORD PTR [rsi+2*rax] ; xmm5 = r-2 p0..p7 (bytes)
- punpcklbw xmm5, xmm0 ; xmm5 = r-2 p0..p7 (words)
- paddusw xmm3, xmm5 ; xmm3 += r-2
-
- ; thresholding
- movdqa xmm6, xmm1 ; xmm6 = r0
- psubusw xmm6, xmm5 ; xmm6 = max(r0 - r-2, 0)
- psubusw xmm5, xmm1 ; xmm5 = max(r-2 - r0, 0)
- paddusw xmm6, xmm5 ; xmm6 = abs(r0 - r-2)
- pcmpgtw xmm6, xmm2 ; xmm6 = mask where abs(r0 - r-2) > flimit
- por xmm7, xmm6 ; accumulate thresholds
-
- movq xmm4, QWORD PTR [rsi+rax] ; xmm4 = r-1 p0..p7 (bytes)
- punpcklbw xmm4, xmm0 ; xmm4 = r-1 p0..p7 (words)
- paddusw xmm3, xmm4 ; xmm3 += r-1
-
- ; thresholding
- movdqa xmm6, xmm1 ; xmm6 = r0
- psubusw xmm6, xmm4 ; xmm6 = max(r0 - r-1, 0)
- psubusw xmm4, xmm1 ; xmm4 = max(r-1 - r0, 0)
- paddusw xmm6, xmm4 ; xmm6 = abs(r0 - r-1)
- pcmpgtw xmm6, xmm2 ; xmm6 = mask where abs(r0 - r-1) > flimit
- por xmm7, xmm6 ; accumulate thresholds
-
-
- paddusw xmm3, RD42 ; xmm3 += round value (4 per word)
- psraw xmm3, 3 ; xmm3 /= 8
-
- pand xmm1, xmm7 ; xmm1 = source where some difference > flimit
- pandn xmm7, xmm3 ; xmm7 = blurred result where no difference > flimit
- paddusw xmm1, xmm7 ; combination (the two selections are disjoint)
-
- packuswb xmm1, xmm0 ; pack to bytes
- movq QWORD PTR [rdi], xmm1 ; store 8 output pixels
-
- neg rax ; rax = +pitch again
- add rsi, 8
- add rdi, 8
-
- add rdx, 8
- cmp edx, dword arg(5) ;cols
-
- jl .nextcol
-
- ; done with all the cols, start the across filtering in place
- sub rsi, rdx ; rewind both pointers to the start of the row
- sub rdi, rdx
-
- xor rdx, rdx
- movq mm0, QWORD PTR [rdi-8]; mm0 = the 8 bytes left of the row; the first delayed store rewrites them
-; NOTE(review): mm0 is an MMX register and no emms is issued in this routine - confirm callers reset x87/MMX state.
-.acrossnextcol:
- movq xmm7, QWORD PTR [rdi +rdx -2] ; xmm7 = p-2..p5
- movd xmm4, DWORD PTR [rdi +rdx +6] ; xmm4 = p6..p9
-
- pslldq xmm4, 8 ; move p6..p9 up to bytes 8..11
- por xmm4, xmm7 ; xmm4 = p-2..p9
-
- movdqa xmm3, xmm4
- psrldq xmm3, 2
- punpcklbw xmm3, xmm0 ; xmm3 = p0..p7 (words)
- movdqa xmm1, xmm3 ; xmm1 = p0..p7, kept for thresholding and select
- psllw xmm3, 2 ; xmm3 = 4 * p0..p7
-
-
- movdqa xmm5, xmm4
- psrldq xmm5, 3
- punpcklbw xmm5, xmm0 ; xmm5 = p1..p8
- paddusw xmm3, xmm5 ; xmm3 += p1..p8
-
- ; thresholding
- movdqa xmm7, xmm1 ; xmm7 = p0..p7
- psubusw xmm7, xmm5 ; xmm7 = max(p0..p7 - p1..p8, 0)
- psubusw xmm5, xmm1 ; xmm5 = max(p1..p8 - p0..p7, 0)
- paddusw xmm7, xmm5 ; xmm7 = abs(p0..p7 - p1..p8)
- pcmpgtw xmm7, xmm2 ; xmm7 = mask where the difference > flimit
-
- movdqa xmm5, xmm4
- psrldq xmm5, 4
- punpcklbw xmm5, xmm0 ; xmm5 = p2..p9
- paddusw xmm3, xmm5 ; xmm3 += p2..p9
-
- ; thresholding
- movdqa xmm6, xmm1 ; xmm6 = p0..p7
- psubusw xmm6, xmm5 ; xmm6 = max(p0..p7 - p2..p9, 0)
- psubusw xmm5, xmm1 ; xmm5 = max(p2..p9 - p0..p7, 0)
- paddusw xmm6, xmm5 ; xmm6 = abs(p0..p7 - p2..p9)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
-
- movdqa xmm5, xmm4 ; xmm5 = p-2..p9 (bytes)
- punpcklbw xmm5, xmm0 ; xmm5 = p-2..p5 (words)
- paddusw xmm3, xmm5 ; xmm3 += p-2..p5
-
- ; thresholding
- movdqa xmm6, xmm1 ; xmm6 = p0..p7
- psubusw xmm6, xmm5 ; xmm6 = max(p0..p7 - p-2..p5, 0)
- psubusw xmm5, xmm1 ; xmm5 = max(p-2..p5 - p0..p7, 0)
- paddusw xmm6, xmm5 ; xmm6 = abs(p0..p7 - p-2..p5)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- psrldq xmm4, 1 ; xmm4 = p-1..p9 (bytes)
- punpcklbw xmm4, xmm0 ; xmm4 = p-1..p6 (words)
- paddusw xmm3, xmm4 ; xmm3 += p-1..p6
-
- ; thresholding
- movdqa xmm6, xmm1 ; xmm6 = p0..p7
- psubusw xmm6, xmm4 ; xmm6 = max(p0..p7 - p-1..p6, 0)
- psubusw xmm4, xmm1 ; xmm4 = max(p-1..p6 - p0..p7, 0)
- paddusw xmm6, xmm4 ; xmm6 = abs(p0..p7 - p-1..p6)
- pcmpgtw xmm6, xmm2
- por xmm7, xmm6 ; accumulate thresholds
-
- paddusw xmm3, RD42 ; xmm3 += round value (4 per word)
- psraw xmm3, 3 ; xmm3 /= 8
-
- pand xmm1, xmm7 ; xmm1 = source where some difference > flimit
- pandn xmm7, xmm3 ; xmm7 = blurred result where no difference > flimit
- paddusw xmm1, xmm7 ; combination (the two selections are disjoint)
-
- packuswb xmm1, xmm0 ; pack to bytes
- movq QWORD PTR [rdi+rdx-8], mm0 ; store the previous eight bytes (delayed so p-2/p-1 above are read unfiltered)
- movdq2q mm0, xmm1 ; hold this group's result until the next iteration
-
- add rdx, 8
- cmp edx, dword arg(5) ;cols
- jl .acrossnextcol;
-
- ; last 8 pixels
- movq QWORD PTR [rdi+rdx-8], mm0
-
- ; done with this row
- add rsi,rax ; next source line (rax = +source pitch here)
- movsxd rax, dword arg(3) ;dst_pixels_per_line, sign-extended so a negative pitch steps backwards
- add rdi,rax ; next destination line
- movsxd rax, dword arg(2) ;src_pixels_per_line, sign-extended exactly like the load before .nextrow
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- add rsp,16
- pop rsp
-%endif
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef RD42
-
-
-;void vp10_mbpost_proc_down_xmm(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
-extern sym(vp10_rv)
-global sym(vp10_mbpost_proc_down_xmm) PRIVATE
-sym(vp10_mbpost_proc_down_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-; Vertical filter, in place, on strips of 8 columns, driven by a 15-row running sum and sum of squares.
- ALIGN_STACK 16, rax
- sub rsp, 128+16
-
- ; unsigned char d[16][8] at [rsp]: ring buffer of filtered rows awaiting write-back
- ; create flimit4 (flimit in four dwords) at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
- mov [rsp+128+8], eax
- mov [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vp10_rv))]
-%endif
-
- ;rows +=8; (eight extra iterations flush the delayed write-back below)
- add dword arg(2), 8
-
- ;for(c=0; c<cols; c+=8)
-.loop_col:
- mov rsi, arg(0) ; s
- pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes/words
-
- movsxd rax, dword ptr arg(1) ;pitch ;
- neg rax ; rax = -pitch
-
- lea rsi, [rsi + rax*8]; ; rsi = s - 8*pitch
- neg rax ; rax = +pitch
-
-
- pxor xmm5, xmm5 ; xmm5 = per-column sum (8 words)
- pxor xmm6, xmm6 ; xmm6 = sum of squares, columns 0..3 (dwords)
-
- pxor xmm7, xmm7 ; xmm7 = sum of squares, columns 4..7 (dwords)
- mov rdi, rsi
-
- mov rcx, 15 ; accumulate the 15 rows s-8*pitch .. s+6*pitch
-
-.loop_initvar:
- movq xmm1, QWORD PTR [rdi];
- punpcklbw xmm1, xmm0 ;
-
- paddw xmm5, xmm1 ;
- pmullw xmm1, xmm1 ;
-
- movdqa xmm2, xmm1 ;
- punpcklwd xmm1, xmm0 ;
-
- punpckhwd xmm2, xmm0 ;
- paddd xmm6, xmm1 ;
-
- paddd xmm7, xmm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ; xmm5 / xmm6:xmm7 now hold the sum / sum of squares; rdi = s + 7*pitch
- xor rdx, rdx ; rdx = row counter
-.loop_row:
- movq xmm1, QWORD PTR [rsi] ; [s-pitch*8], row leaving the window
- movq xmm2, QWORD PTR [rdi] ; [s+pitch*7], row entering the window
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- paddw xmm5, xmm2 ; sum += entering row
- psubw xmm5, xmm1 ; sum -= leaving row
-
- pmullw xmm2, xmm2
- movdqa xmm4, xmm2
-
- punpcklwd xmm2, xmm0
- punpckhwd xmm4, xmm0
-
- paddd xmm6, xmm2 ; sumsq += entering^2
- paddd xmm7, xmm4
-
- pmullw xmm1, xmm1
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm0
- psubd xmm6, xmm1 ; sumsq -= leaving^2
-
- punpckhwd xmm2, xmm0
- psubd xmm7, xmm2
-
-
- movdqa xmm3, xmm6
- pslld xmm3, 4
-
- psubd xmm3, xmm6 ; xmm3 = 15 * sumsq (columns 0..3)
- movdqa xmm1, xmm5
-
- movdqa xmm4, xmm5
- pmullw xmm1, xmm1 ; low 16 bits of sum*sum
-
- pmulhw xmm4, xmm4 ; high 16 bits of sum*sum
- movdqa xmm2, xmm1
-
- punpcklwd xmm1, xmm4 ; xmm1 = sum*sum as dwords (columns 0..3)
- punpckhwd xmm2, xmm4 ; xmm2 = sum*sum as dwords (columns 4..7)
-
- movdqa xmm4, xmm7
- pslld xmm4, 4
-
- psubd xmm4, xmm7 ; xmm4 = 15 * sumsq (columns 4..7)
-
- psubd xmm3, xmm1 ; 15*sumsq - sum*sum
- psubd xmm4, xmm2
-
- psubd xmm3, flimit4
- psubd xmm4, flimit4
-
- psrad xmm3, 31 ; all ones where 15*sumsq - sum*sum < flimit
- psrad xmm4, 31
-
- packssdw xmm3, xmm4
- packsswb xmm3, xmm0 ; xmm3 = per-column byte mask
-
- movq xmm1, QWORD PTR [rsi+rax*8] ; current row (rsi is 8 rows above it)
-
- movq xmm2, xmm1 ; xmm2 = unfiltered copy
- punpcklbw xmm1, xmm0
-
- paddw xmm1, xmm5 ; pixel + sum
- mov rcx, rdx
-
- and rcx, 127 ; rcx = row & 127, word index into vp10_rv
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax
- lea rax, [GLOBAL(sym(vp10_rv))]
- movdqu xmm4, [rax + rcx*2] ;vp10_rv[rcx*2]
- pop rax
-%elif ABI_IS_32BIT=0
- movdqu xmm4, [r8 + rcx*2] ;vp10_rv[rcx*2]
-%else
- movdqu xmm4, [sym(vp10_rv) + rcx*2]
-%endif
-
- paddw xmm1, xmm4 ; + eight words taken from vp10_rv
- ;paddw xmm1, eight8s
- psraw xmm1, 4 ; (pixel + sum + rv) >> 4
-
- packuswb xmm1, xmm0
- pand xmm1, xmm3 ; filtered value where the mask is set
-
- pandn xmm3, xmm2 ; original pixel elsewhere
- por xmm1, xmm3
-
- and rcx, 15
- movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8], slot row & 15
-
- mov rcx, rdx
- sub rcx, 8
-
- and rcx, 15
- movq mm0, [rsp + rcx*8] ;d[rcx*8], slot (row - 8) & 15
-; NOTE(review): for row < 8 that slot has not been written yet, so rows s-8*pitch..s-pitch receive stack contents - confirm they are scratch.
- movq [rsi], mm0 ; write back the row 8 above the current one
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows
- jl .loop_row
- mov rax, 8 ; rax is free here: .loop_col reloads it from arg(1)
- add arg(0), rax ; s += 8 at full pointer width (a dword add drops the carry on 64-bit)
- sub dword arg(3), 8 ; cols -= 8
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 128+16
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit4
-
-
-;void vp10_mbpost_proc_across_ip_xmm(unsigned char *src,
-; int pitch, int rows, int cols,int flimit)
-global sym(vp10_mbpost_proc_across_ip_xmm) PRIVATE
-sym(vp10_mbpost_proc_across_ip_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-; Horizontal filter, in place, 4 pixels per step, driven by a 15-pixel running sum and sum of squares.
- ALIGN_STACK 16, rax
- sub rsp, 16
-
- ; create flimit4 (flimit in four dwords) at [rsp]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp], eax
- mov [rsp+4], eax
- mov [rsp+8], eax
- mov [rsp+12], eax
-%define flimit4 [rsp]
-; A pixel becomes (pixel + sum + 8) >> 4 where 15*sumsq - sum*sum < flimit,
-; otherwise it is left unchanged.
- ;for(r=0;r<rows;r++)
-.ip_row_loop:
-
- xor rdx, rdx ;sumsq=0;
- xor rcx, rcx ;sum=0;
- mov rsi, arg(0); s
- mov rdi, -8
-.ip_var_loop:
- ;for(i=-8;i<=6;i++)
- ;{
- ; sumsq += s[i]*s[i];
- ; sum += s[i];
- ;}
- movzx eax, byte [rsi+rdi]
- add ecx, eax
- mul al
- add edx, eax
- add rdi, 1
- cmp rdi, 6
- jle .ip_var_loop
-
-
- ;mov rax, sumsq
- ;movd xmm7, rax
- movd xmm7, edx
-
- ;mov rax, sum
- ;movd xmm6, rax
- movd xmm6, ecx
-
- mov rsi, arg(0) ;s
- xor rcx, rcx
-
- movsxd rdx, dword arg(3) ;cols
- add rdx, 8
- pxor mm0, mm0
- pxor mm1, mm1
-; mm1 -> mm0 form a two-step delay line for the stores; rdx = cols + 8 lets the loop drain it.
- pxor xmm0, xmm0
-.nextcol4:
-
- movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 (pixels leaving the window)
- movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 (pixels entering the window)
-
- punpcklbw xmm1, xmm0 ; expanding
- punpcklbw xmm2, xmm0 ; expanding
-
- punpcklwd xmm1, xmm0 ; expanding to dwords
- punpcklwd xmm2, xmm0 ; expanding to dwords
-
- psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5 (sum deltas)
- paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
-
- paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
- pmaddwd xmm1, xmm2 ; (new+old)*(new-old) = new^2 - old^2 (sumsq deltas)
-
- paddd xmm6, xmm2
- paddd xmm7, xmm1
-
- pshufd xmm6, xmm6, 0 ; broadcast lane 0: sum for the first pixel of this group
- pshufd xmm7, xmm7, 0 ; broadcast lane 0: sumsq for the first pixel of this group
-
- psrldq xmm1, 4 ; 8--7 9--6 10--5 0000 (sumsq deltas)
- psrldq xmm2, 4 ; 8--7 9--6 10--5 0000 (sum deltas)
-
- pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 (sumsq delta)
- pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 (sum delta)
-
- paddd xmm6, xmm4
- paddd xmm7, xmm3
-
- pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 (sumsq delta)
- pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 (sum delta)
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- pshufd xmm3, xmm1, 10111111b ; 0000 0000 0000 10--5 (sumsq delta)
- pshufd xmm4, xmm2, 10111111b ; 0000 0000 0000 10--5 (sum delta)
-
- paddd xmm7, xmm3
- paddd xmm6, xmm4
-
- movdqa xmm3, xmm6
- pmaddwd xmm3, xmm3
-
- movdqa xmm5, xmm7
- pslld xmm5, 4
-
- psubd xmm5, xmm7
- psubd xmm5, xmm3
-
- psubd xmm5, flimit4
- psrad xmm5, 31
-
- packssdw xmm5, xmm0
- packsswb xmm5, xmm0
-
- movd xmm1, DWORD PTR [rsi+rcx]
- movq xmm2, xmm1
-
- punpcklbw xmm1, xmm0
- punpcklwd xmm1, xmm0
-
- paddd xmm1, xmm6
- paddd xmm1, [GLOBAL(four8s)]
-
- psrad xmm1, 4
- packssdw xmm1, xmm0
-
- packuswb xmm1, xmm0
- pand xmm1, xmm5
-
- pandn xmm5, xmm2
- por xmm5, xmm1
-
- movd [rsi+rcx-8], mm0
- movq mm0, mm1
-
- movdq2q mm1, xmm5
- psrldq xmm7, 12
-
- psrldq xmm6, 12
- add rcx, 4
-
- cmp rcx, rdx
- jl .nextcol4
-
- ;s+=pitch;
- movsxd rax, dword arg(1)
- add arg(0), rax
-
- sub dword arg(2), 1 ;rows-=1
- cmp dword arg(2), 0
- jg .ip_row_loop
-
- add rsp, 16
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit4
-
-
-;void vp10_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-; unsigned char blackclamp[16],
-; unsigned char whiteclamp[16],
-; unsigned char bothclamp[16],
-; unsigned int width, unsigned int height, int pitch)
-global sym(vp10_plane_add_noise_wmt) PRIVATE
-sym(vp10_plane_add_noise_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-; For every row: clamp the pixels, then add 16 noise bytes at a time from a random offset into noise[].
-.addnoise_loop:
- call sym(LIBVPX_RAND) WRT_PLT
- mov rcx, arg(1) ;noise
- and rax, 0xff
- add rcx, rax
-; rcx = noise + (low 8 bits of the value the call returned in rax)
- ; we rely on the fact that the clamping vectors are stored contiguously
- ; in black/white/both order. Note that we have to reload this here because
- ; rdx could be trashed by rand()
- mov rdx, arg(2) ; blackclamp
-
-
- mov rdi, rcx
- movsxd rcx, dword arg(5) ;[Width]
- mov rsi, arg(0) ;Pos
- xor rax,rax
-; rdi = noise for this row, rsi = row start, rcx = width, rax = byte offset within the row
-.addnoise_nextset:
- movdqu xmm1,[rsi+rax] ; get the source
-
- psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
- paddusb xmm1, [rdx+32] ;bothclamp
- psubusb xmm1, [rdx+16] ;whiteclamp
-
- movdqu xmm2,[rdi+rax] ; get the noise for this line
- paddb xmm1,xmm2 ; add it in
- movdqu [rsi+rax],xmm1 ; store the result
-
- add rax,16 ; move to the next 16 pixels
-
- cmp rax, rcx
- jl .addnoise_nextset
-
- movsxd rax, dword arg(7) ; Pitch
- add arg(0), rax ; Start += Pitch
- sub dword arg(6), 1 ; Height -= 1
- jg .addnoise_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-rd42: ; eight words of 4: rounding term added before the >> 3 in post_proc_down_and_across
- times 8 dw 0x04
-four8s: ; four dwords of 8: rounding term added before the >> 4 in mbpost_proc_across_ip
- times 4 dd 8