summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Clément Bœsch <u@pkh.me> 2014-08-23 20:03:10 +0200
committer: Clément Bœsch <u@pkh.me> 2014-08-23 20:12:56 +0200
commit: 554d8190624f25cefe079bd7b9ad61a2ade8541a (patch)
tree: f4c2d420952d1d6780c14ee962f588028d8b58ab
parent: f4dec0dba0faa46758554ff6229a57c0bf833404 (diff)
download: ffmpeg-554d8190624f25cefe079bd7b9ad61a2ade8541a.tar.gz
avutil/pixelutils: faster pixelutils_sad_16x16
501 to 439 decicycles. See 45c7f3997ea11c3d1007b2126b1c0049a8c27105.
-rw-r--r-- libavutil/x86/pixelutils.asm 16
1 files changed, 11 insertions, 5 deletions
diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 15213d92d8..7522f24a42 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -109,18 +109,24 @@ cglobal pixelutils_sad_16x16, 4,4,0, src1, stride1, src2, stride2
;-------------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
- pxor m4, m4
-%rep 8
- movu m0, [src1q]
+ movu m4, [src1q]
+ movu m2, [src2q]
movu m1, [src1q + stride1q]
+ movu m3, [src2q + stride2q]
+ psadbw m4, m2
+ psadbw m1, m3
+ paddw m4, m1
+%rep 7
+ lea src1q, [src1q + 2*stride1q]
+ lea src2q, [src2q + 2*stride2q]
+ movu m0, [src1q]
movu m2, [src2q]
+ movu m1, [src1q + stride1q]
movu m3, [src2q + stride2q]
psadbw m0, m2
psadbw m1, m3
paddw m4, m0
paddw m4, m1
- lea src1q, [src1q + 2*stride1q]
- lea src2q, [src2q + 2*stride2q]
%endrep
movhlps m0, m4
paddw m4, m0