summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Christophe Gisquet <christophe.gisquet@gmail.com> 2014-12-02 14:31:49 +0100
committer Michael Niedermayer <michaelni@gmx.at> 2014-12-03 11:56:22 +0100
commit 9fa056ba75c089b5120366ab7c5ce8cc4c5bd67a (patch)
tree f05f5ff297ed51c916d5e7fdb16c6805c4ba4ca5
parent 242f1152bf906a4a3164a9a8e40bd52723bd5afe (diff)
download ffmpeg-9fa056ba75c089b5120366ab7c5ce8cc4c5bd67a.tar.gz
pngdsp x86: use unaligned access
For test images manually generated to contain only up prediction, timing results:

          8380x3032   255x185
before:      138635      1992
after:       139232      1996

Actually jumping to the proper version depending on the alignment:
8380x3032: 138767

A 0.5% speed improvement for gigantic images is not worth the code duplication.

Fixes ticket #4148

Signed-off-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Tested-by: Benoit Fouet <benoit.fouet@free.fr>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- libavcodec/pngdsp.h 4
-rw-r--r-- libavcodec/x86/pngdsp.asm 12
2 files changed, 8 insertions, 8 deletions
diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h
index 1475b0cbe9..fbc1a508e7 100644
--- a/libavcodec/pngdsp.h
+++ b/libavcodec/pngdsp.h
@@ -25,9 +25,9 @@
#include <stdint.h>
typedef struct PNGDSPContext {
- void (*add_bytes_l2)(uint8_t *dst /* align 16 */,
+ void (*add_bytes_l2)(uint8_t *dst,
uint8_t *src1 /* align 16 */,
- uint8_t *src2 /* align 16 */, int w);
+ uint8_t *src2, int w);
/* this might write to dst[w] */
void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index 8e23ccfbc6..678a032521 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
and waq, ~(mmsize*2-1)
jmp .end_v
.loop_v:
- mova m0, [src1q+iq]
- mova m1, [src1q+iq+mmsize]
- paddb m0, [src2q+iq]
- paddb m1, [src2q+iq+mmsize]
- mova [dstq+iq ], m0
- mova [dstq+iq+mmsize], m1
+ movu m0, [src2q+iq]
+ movu m1, [src2q+iq+mmsize]
+ paddb m0, [src1q+iq]
+ paddb m1, [src1q+iq+mmsize]
+ movu [dstq+iq ], m0
+ movu [dstq+iq+mmsize], m1
add iq, mmsize*2
.end_v:
cmp iq, waq