diff options
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm')
-rw-r--r-- | chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm | 40 |
1 files changed, 20 insertions, 20 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm index 5b0238272bb..0a472ec7402 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -22,14 +23,14 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pxor m4, m4 ; sse accumulator pxor m6, m6 ; ssz accumulator pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq .loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 15bit+sign, so squares are 30bit, and @@ -55,8 +56,7 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz punpckhdq m3, m5 paddq m6, m7 paddq m6, m3 - add sizeq, mmsize - jl .loop + jg .loop ; accumulate horizontally and store in return value movhlps m5, m4 @@ -75,22 +75,23 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz %endif RET -; Compute the sum of squared difference between two int16_t vectors. -; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff, +; Compute the sum of squared difference between two tran_low_t vectors. +; Vectors are converted (if necessary) to int16_t for calculations. +; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff, ; intptr_t block_size) INIT_XMM sse2 cglobal block_error_fp, 3, 3, 6, uqc, dqc, size pxor m4, m4 ; sse accumulator pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq .loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 15bit+sign, so squares are 30bit, and @@ -106,8 +107,7 @@ cglobal block_error_fp, 3, 3, 6, uqc, dqc, size punpckhdq m1, m5 paddq m4, m3 paddq m4, m1 - add sizeq, mmsize - jl .loop + jnz .loop ; accumulate horizontally and store in return value movhlps m5, m4 |