summaryrefslogtreecommitdiff
path: root/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm')
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm40
1 files changed, 20 insertions, 20 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
index 5b0238272bb..0a472ec7402 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -11,6 +11,7 @@
%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
@@ -22,14 +23,14 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
pxor m4, m4 ; sse accumulator
pxor m6, m6 ; ssz accumulator
pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*2]
- lea dqcq, [dqcq+sizeq*2]
- neg sizeq
.loop:
- mova m2, [uqcq+sizeq*2]
- mova m0, [dqcq+sizeq*2]
- mova m3, [uqcq+sizeq*2+mmsize]
- mova m1, [dqcq+sizeq*2+mmsize]
+ LOAD_TRAN_LOW 2, uqcq, 0
+ LOAD_TRAN_LOW 0, dqcq, 0
+ LOAD_TRAN_LOW 3, uqcq, 8
+ LOAD_TRAN_LOW 1, dqcq, 8
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
@@ -55,8 +56,7 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
punpckhdq m3, m5
paddq m6, m7
paddq m6, m3
- add sizeq, mmsize
- jl .loop
+ jg .loop
; accumulate horizontally and store in return value
movhlps m5, m4
@@ -75,22 +75,23 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
%endif
RET
-; Compute the sum of squared difference between two int16_t vectors.
-; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; Compute the sum of squared difference between two tran_low_t vectors.
+; Vectors are converted (if necessary) to int16_t for calculations.
+; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
; intptr_t block_size)
INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
pxor m4, m4 ; sse accumulator
pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*2]
- lea dqcq, [dqcq+sizeq*2]
- neg sizeq
.loop:
- mova m2, [uqcq+sizeq*2]
- mova m0, [dqcq+sizeq*2]
- mova m3, [uqcq+sizeq*2+mmsize]
- mova m1, [dqcq+sizeq*2+mmsize]
+ LOAD_TRAN_LOW 2, uqcq, 0
+ LOAD_TRAN_LOW 0, dqcq, 0
+ LOAD_TRAN_LOW 3, uqcq, 8
+ LOAD_TRAN_LOW 1, dqcq, 8
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
@@ -106,8 +107,7 @@ cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
punpckhdq m1, m5
paddq m4, m3
paddq m4, m1
- add sizeq, mmsize
- jl .loop
+ jnz .loop
; accumulate horizontally and store in return value
movhlps m5, m4