diff options
Diffstat (limited to 'libavcodec/arm')
-rw-r--r-- | libavcodec/arm/Makefile | 5 | ||||
-rw-r--r-- | libavcodec/arm/dsputil_init_neon.c | 10 | ||||
-rw-r--r-- | libavcodec/arm/dsputil_init_vfp.c | 4 | ||||
-rw-r--r-- | libavcodec/arm/dsputil_neon.S | 365 | ||||
-rw-r--r-- | libavcodec/arm/dsputil_vfp.S | 55 | ||||
-rw-r--r-- | libavcodec/arm/fmtconvert_init_arm.c | 48 | ||||
-rw-r--r-- | libavcodec/arm/fmtconvert_neon.S | 391 | ||||
-rw-r--r-- | libavcodec/arm/fmtconvert_vfp.S | 77 |
8 files changed, 521 insertions, 434 deletions
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 4c30e0ab9f..014456ee32 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o OBJS += arm/dsputil_init_arm.o \ arm/dsputil_arm.o \ arm/fft_init_arm.o \ + arm/fmtconvert_init_arm.o \ arm/jrevdct_arm.o \ arm/mpegvideo_arm.o \ arm/simple_idct_arm.o \ @@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \ arm/dsputil_armv6.o \ arm/simple_idct_armv6.o \ +VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \ + OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \ arm/dsputil_init_vfp.o \ + $(VFP-OBJS-yes) OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \ arm/mpegvideo_iwmmxt.o \ @@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \ OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \ arm/dsputil_neon.o \ + arm/fmtconvert_neon.o \ arm/int_neon.o \ arm/mpegvideo_neon.o \ arm/simple_idct_neon.o \ diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c index 67982048f9..76ae632273 100644 --- a/libavcodec/arm/dsputil_init_neon.c +++ b/libavcodec/arm/dsputil_init_neon.c @@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul, int len); void ff_butterflies_float_neon(float *v1, float *v2, int len); float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len); -void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, - float mul, int len); void ff_vector_fmul_reverse_neon(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, @@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, int len); -void ff_float_to_int16_neon(int16_t *, const float *, long); -void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize); @@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) c->vector_fmul_scalar = ff_vector_fmul_scalar_neon; c->butterflies_float = ff_butterflies_float_neon; c->scalarproduct_float = ff_scalarproduct_float_neon; - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; c->vector_fmul_reverse = ff_vector_fmul_reverse_neon; c->vector_fmul_add = ff_vector_fmul_add_neon; c->vector_clipf = ff_vector_clipf_neon; @@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon; c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon; - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { - c->float_to_int16 = ff_float_to_int16_neon; - c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; - } - if (CONFIG_VORBIS_DECODER) c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c index 76ef6b4171..bd52315934 100644 --- a/libavcodec/arm/dsputil_init_vfp.c +++ b/libavcodec/arm/dsputil_init_vfp.c @@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len); -void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx) { c->vector_fmul = ff_vector_fmul_vfp; c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; -#if HAVE_ARMV6 - c->float_to_int16 = ff_float_to_int16_vfp; -#endif } diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S index 8329f6cc57..05a911502b 100644 --- a/libavcodec/arm/dsputil_neon.S +++ b/libavcodec/arm/dsputil_neon.S @@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1 bx lr endfunc -function ff_float_to_int16_neon, export=1 - subs r2, r2, #8 - vld1.64 {d0-d1}, [r1,:128]! - vcvt.s32.f32 q8, q0, #16 - vld1.64 {d2-d3}, [r1,:128]! - vcvt.s32.f32 q9, q1, #16 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vshrn.s32 d4, q8, #16 - vld1.64 {d0-d1}, [r1,:128]! - vcvt.s32.f32 q0, q0, #16 - vshrn.s32 d5, q9, #16 - vld1.64 {d2-d3}, [r1,:128]! - vcvt.s32.f32 q1, q1, #16 - vshrn.s32 d6, q0, #16 - vst1.64 {d4-d5}, [r0,:128]! - vshrn.s32 d7, q1, #16 - vld1.64 {d16-d17},[r1,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r1,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.64 {d6-d7}, [r0,:128]! - bne 1b - ands r2, r2, #15 - beq 3f -2: vld1.64 {d0-d1}, [r1,:128]! - vshrn.s32 d4, q8, #16 - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r1,:128]! - vshrn.s32 d5, q9, #16 - vcvt.s32.f32 q1, q1, #16 - vshrn.s32 d6, q0, #16 - vst1.64 {d4-d5}, [r0,:128]! - vshrn.s32 d7, q1, #16 - vst1.64 {d6-d7}, [r0,:128]! - bx lr -3: vshrn.s32 d4, q8, #16 - vshrn.s32 d5, q9, #16 - vst1.64 {d4-d5}, [r0,:128]! - bx lr -endfunc - -function ff_float_to_int16_interleave_neon, export=1 - cmp r3, #2 - ldrlt r1, [r1] - blt ff_float_to_int16_neon - bne 4f - - ldr r3, [r1] - ldr r1, [r1, #4] - - subs r2, r2, #8 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q8, q0, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q9, q1, #16 - vld1.64 {d20-d21},[r1,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r1,:128]! - vcvt.s32.f32 q11, q11, #16 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 q10, q8, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q1, q1, #16 - vld1.64 {d24-d25},[r1,:128]! - vcvt.s32.f32 q12, q12, #16 - vld1.64 {d26-d27},[r1,:128]! - vsri.32 q11, q9, #16 - vst1.64 {d20-d21},[r0,:128]! - vcvt.s32.f32 q13, q13, #16 - vst1.64 {d22-d23},[r0,:128]! - vsri.32 q12, q0, #16 - vld1.64 {d16-d17},[r3,:128]! - vsri.32 q13, q1, #16 - vst1.64 {d24-d25},[r0,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r3,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r1,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r1,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.64 {d26-d27},[r0,:128]! - bne 1b - ands r2, r2, #15 - beq 3f -2: vsri.32 q10, q8, #16 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q1, q1, #16 - vld1.64 {d24-d25},[r1,:128]! - vcvt.s32.f32 q12, q12, #16 - vsri.32 q11, q9, #16 - vld1.64 {d26-d27},[r1,:128]! - vcvt.s32.f32 q13, q13, #16 - vst1.64 {d20-d21},[r0,:128]! - vsri.32 q12, q0, #16 - vst1.64 {d22-d23},[r0,:128]! - vsri.32 q13, q1, #16 - vst1.64 {d24-d27},[r0,:128]! - bx lr -3: vsri.32 q10, q8, #16 - vsri.32 q11, q9, #16 - vst1.64 {d20-d23},[r0,:128]! - bx lr - -4: push {r4-r8,lr} - cmp r3, #4 - lsl ip, r3, #1 - blt 4f - - @ 4 channels -5: ldmia r1!, {r4-r7} - mov lr, r2 - mov r8, r0 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r6,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r7,:128]! - vcvt.s32.f32 q11, q11, #16 -6: subs lr, lr, #8 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 q9, q8, #16 - vld1.64 {d2-d3}, [r5,:128]! - vcvt.s32.f32 q1, q1, #16 - vsri.32 q11, q10, #16 - vld1.64 {d4-d5}, [r6,:128]! - vcvt.s32.f32 q2, q2, #16 - vzip.32 d18, d22 - vld1.64 {d6-d7}, [r7,:128]! - vcvt.s32.f32 q3, q3, #16 - vzip.32 d19, d23 - vst1.64 {d18}, [r8], ip - vsri.32 q1, q0, #16 - vst1.64 {d22}, [r8], ip - vsri.32 q3, q2, #16 - vst1.64 {d19}, [r8], ip - vzip.32 d2, d6 - vst1.64 {d23}, [r8], ip - vzip.32 d3, d7 - beq 7f - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.64 {d2}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.64 {d6}, [r8], ip - vld1.64 {d20-d21},[r6,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.64 {d3}, [r8], ip - vld1.64 {d22-d23},[r7,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.64 {d7}, [r8], ip - b 6b -7: vst1.64 {d2}, [r8], ip - vst1.64 {d6}, [r8], ip - vst1.64 {d3}, [r8], ip - vst1.64 {d7}, [r8], ip - subs r3, r3, #4 - popeq {r4-r8,pc} - cmp r3, #4 - add r0, r0, #8 - bge 5b - - @ 2 channels -4: cmp r3, #2 - blt 4f - ldmia r1!, {r4-r5} - mov lr, r2 - mov r8, r0 - tst lr, #8 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 - beq 6f - subs lr, lr, #8 - beq 7f - vsri.32 d18, d16, #16 - vsri.32 d19, d17, #16 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vst1.32 {d19[1]}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.32 {d22[0]}, [r8], ip - vst1.32 {d22[1]}, [r8], ip - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 -6: subs lr, lr, #16 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 d18, d16, #16 - vld1.64 {d2-d3}, [r5,:128]! - vcvt.s32.f32 q1, q1, #16 - vsri.32 d19, d17, #16 - vld1.64 {d4-d5}, [r4,:128]! - vcvt.s32.f32 q2, q2, #16 - vld1.64 {d6-d7}, [r5,:128]! - vcvt.s32.f32 q3, q3, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vsri.32 d2, d0, #16 - vst1.32 {d19[1]}, [r8], ip - vsri.32 d3, d1, #16 - vst1.32 {d22[0]}, [r8], ip - vsri.32 d6, d4, #16 - vst1.32 {d22[1]}, [r8], ip - vsri.32 d7, d5, #16 - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip - beq 6f - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.32 {d2[0]}, [r8], ip - vst1.32 {d2[1]}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.32 {d3[0]}, [r8], ip - vst1.32 {d3[1]}, [r8], ip - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.32 {d6[0]}, [r8], ip - vst1.32 {d6[1]}, [r8], ip - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.32 {d7[0]}, [r8], ip - vst1.32 {d7[1]}, [r8], ip - bgt 6b -6: vst1.32 {d2[0]}, [r8], ip - vst1.32 {d2[1]}, [r8], ip - vst1.32 {d3[0]}, [r8], ip - vst1.32 {d3[1]}, [r8], ip - vst1.32 {d6[0]}, [r8], ip - vst1.32 {d6[1]}, [r8], ip - vst1.32 {d7[0]}, [r8], ip - vst1.32 {d7[1]}, [r8], ip - b 8f -7: vsri.32 d18, d16, #16 - vsri.32 d19, d17, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vst1.32 {d19[1]}, [r8], ip - vst1.32 {d22[0]}, [r8], ip - vst1.32 {d22[1]}, [r8], ip - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip -8: subs r3, r3, #2 - add r0, r0, #4 - popeq {r4-r8,pc} - - @ 1 channel -4: ldr r4, [r1],#4 - tst r2, #8 - mov lr, r2 - mov r5, r0 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 - bne 8f -6: subs lr, lr, #16 - vld1.64 {d4-d5}, [r4,:128]! - vcvt.s32.f32 q2, q2, #16 - vld1.64 {d6-d7}, [r4,:128]! - vcvt.s32.f32 q3, q3, #16 - vst1.16 {d0[1]}, [r5,:16], ip - vst1.16 {d0[3]}, [r5,:16], ip - vst1.16 {d1[1]}, [r5,:16], ip - vst1.16 {d1[3]}, [r5,:16], ip - vst1.16 {d2[1]}, [r5,:16], ip - vst1.16 {d2[3]}, [r5,:16], ip - vst1.16 {d3[1]}, [r5,:16], ip - vst1.16 {d3[3]}, [r5,:16], ip - beq 7f - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 -7: vst1.16 {d4[1]}, [r5,:16], ip - vst1.16 {d4[3]}, [r5,:16], ip - vst1.16 {d5[1]}, [r5,:16], ip - vst1.16 {d5[3]}, [r5,:16], ip - vst1.16 {d6[1]}, [r5,:16], ip - vst1.16 {d6[3]}, [r5,:16], ip - vst1.16 {d7[1]}, [r5,:16], ip - vst1.16 {d7[3]}, [r5,:16], ip - bgt 6b - pop {r4-r8,pc} -8: subs lr, lr, #8 - vst1.16 {d0[1]}, [r5,:16], ip - vst1.16 {d0[3]}, [r5,:16], ip - vst1.16 {d1[1]}, [r5,:16], ip - vst1.16 {d1[3]}, [r5,:16], ip - vst1.16 {d2[1]}, [r5,:16], ip - vst1.16 {d2[3]}, [r5,:16], ip - vst1.16 {d3[1]}, [r5,:16], ip - vst1.16 {d3[3]}, [r5,:16], ip - popeq {r4-r8,pc} - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 - b 6b -endfunc - function ff_vector_fmul_neon, export=1 subs r3, r3, #8 vld1.64 {d0-d3}, [r1,:128]! @@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0] bx lr endfunc -function ff_int32_to_float_fmul_scalar_neon, export=1 -VFP vdup.32 q0, d0[0] -VFP len .req r2 -NOVFP vdup.32 q0, r2 -NOVFP len .req r3 - - vld1.32 {q1},[r1,:128]! - vcvt.f32.s32 q3, q1 - vld1.32 {q2},[r1,:128]! - vcvt.f32.s32 q8, q2 -1: subs len, len, #8 - pld [r1, #16] - vmul.f32 q9, q3, q0 - vmul.f32 q10, q8, q0 - beq 2f - vld1.32 {q1},[r1,:128]! - vcvt.f32.s32 q3, q1 - vld1.32 {q2},[r1,:128]! - vcvt.f32.s32 q8, q2 - vst1.32 {q9}, [r0,:128]! - vst1.32 {q10},[r0,:128]! - b 1b -2: vst1.32 {q9}, [r0,:128]! - vst1.32 {q10},[r0,:128]! - bx lr - .unreq len -endfunc - function ff_vector_fmul_reverse_neon, export=1 add r2, r2, r3, lsl #2 sub r2, r2, #32 diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S index a65b69e20a..197d500819 100644 --- a/libavcodec/arm/dsputil_vfp.S +++ b/libavcodec/arm/dsputil_vfp.S @@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1 vpop {d8-d15} bx lr endfunc - -#if HAVE_ARMV6 -/** - * ARM VFP optimized float to int16 conversion. - * Assume that len is a positive number and is multiple of 8, destination - * buffer is at least 4 bytes aligned (8 bytes alignment is better for - * performance), little endian byte sex - */ -@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) -function ff_float_to_int16_vfp, export=1 - push {r4-r8,lr} - vpush {d8-d11} - vldmia r1!, {s16-s23} - vcvt.s32.f32 s0, s16 - vcvt.s32.f32 s1, s17 - vcvt.s32.f32 s2, s18 - vcvt.s32.f32 s3, s19 - vcvt.s32.f32 s4, s20 - vcvt.s32.f32 s5, s21 - vcvt.s32.f32 s6, s22 - vcvt.s32.f32 s7, s23 -1: - subs r2, r2, #8 - vmov r3, r4, s0, s1 - vmov r5, r6, s2, s3 - vmov r7, r8, s4, s5 - vmov ip, lr, s6, s7 - vldmiagt r1!, {s16-s23} - ssat r4, #16, r4 - ssat r3, #16, r3 - ssat r6, #16, r6 - ssat r5, #16, r5 - pkhbt r3, r3, r4, lsl #16 - pkhbt r4, r5, r6, lsl #16 - vcvtgt.s32.f32 s0, s16 - vcvtgt.s32.f32 s1, s17 - vcvtgt.s32.f32 s2, s18 - vcvtgt.s32.f32 s3, s19 - vcvtgt.s32.f32 s4, s20 - vcvtgt.s32.f32 s5, s21 - vcvtgt.s32.f32 s6, s22 - vcvtgt.s32.f32 s7, s23 - ssat r8, #16, r8 - ssat r7, #16, r7 - ssat lr, #16, lr - ssat ip, #16, ip - pkhbt r5, r7, r8, lsl #16 - pkhbt r6, ip, lr, lsl #16 - stmia r0!, {r3-r6} - bgt 1b - - vpop {d8-d11} - pop {r4-r8,pc} -endfunc -#endif diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c new file mode 100644 index 0000000000..4b6e3939f5 --- /dev/null +++ b/libavcodec/arm/fmtconvert_init_arm.c @@ -0,0 +1,48 @@ +/* + * ARM optimized Format Conversion Utils + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavcodec/avcodec.h" +#include "libavcodec/fmtconvert.h" + +void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src, + float mul, int len); + +void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); +void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); + +void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); + +void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx) +{ + if (HAVE_ARMVFP && HAVE_ARMV6) { + c->float_to_int16 = ff_float_to_int16_vfp; + } + + if (HAVE_NEON) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; + + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->float_to_int16 = ff_float_to_int16_neon; + c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; + } + } +} diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S new file mode 100644 index 0000000000..359e57e40b --- /dev/null +++ b/libavcodec/arm/fmtconvert_neon.S @@ -0,0 +1,391 @@ +/* + * ARM NEON optimised Format Conversion Utils + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + + preserve8 + .text + +function ff_float_to_int16_neon, export=1 + subs r2, r2, #8 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q9, q1, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vshrn.s32 d4, q8, #16 + vld1.64 {d0-d1}, [r1,:128]! + vcvt.s32.f32 q0, q0, #16 + vshrn.s32 d5, q9, #16 + vld1.64 {d2-d3}, [r1,:128]! + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vld1.64 {d16-d17},[r1,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r1,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6-d7}, [r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vld1.64 {d0-d1}, [r1,:128]! + vshrn.s32 d4, q8, #16 + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r1,:128]! + vshrn.s32 d5, q9, #16 + vcvt.s32.f32 q1, q1, #16 + vshrn.s32 d6, q0, #16 + vst1.64 {d4-d5}, [r0,:128]! + vshrn.s32 d7, q1, #16 + vst1.64 {d6-d7}, [r0,:128]! + bx lr +3: vshrn.s32 d4, q8, #16 + vshrn.s32 d5, q9, #16 + vst1.64 {d4-d5}, [r0,:128]! + bx lr +endfunc + +function ff_float_to_int16_interleave_neon, export=1 + cmp r3, #2 + ldrlt r1, [r1] + blt ff_float_to_int16_neon + bne 4f + + ldr r3, [r1] + ldr r1, [r1, #4] + + subs r2, r2, #8 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q8, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q9, q1, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 3f + bics ip, r2, #15 + beq 2f +1: subs ip, ip, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q10, q8, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vld1.64 {d26-d27},[r1,:128]! + vsri.32 q11, q9, #16 + vst1.64 {d20-d21},[r0,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q12, q0, #16 + vld1.64 {d16-d17},[r3,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d25},[r0,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r3,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r1,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r1,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d26-d27},[r0,:128]! + bne 1b + ands r2, r2, #15 + beq 3f +2: vsri.32 q10, q8, #16 + vld1.64 {d0-d1}, [r3,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r3,:128]! + vcvt.s32.f32 q1, q1, #16 + vld1.64 {d24-d25},[r1,:128]! + vcvt.s32.f32 q12, q12, #16 + vsri.32 q11, q9, #16 + vld1.64 {d26-d27},[r1,:128]! + vcvt.s32.f32 q13, q13, #16 + vst1.64 {d20-d21},[r0,:128]! + vsri.32 q12, q0, #16 + vst1.64 {d22-d23},[r0,:128]! + vsri.32 q13, q1, #16 + vst1.64 {d24-d27},[r0,:128]! + bx lr +3: vsri.32 q10, q8, #16 + vsri.32 q11, q9, #16 + vst1.64 {d20-d23},[r0,:128]! + bx lr + +4: push {r4-r8,lr} + cmp r3, #4 + lsl ip, r3, #1 + blt 4f + + @ 4 channels +5: ldmia r1!, {r4-r7} + mov lr, r2 + mov r8, r0 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #8 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 q9, q8, #16 + vld1.64 {d2-d3}, [r5,:128]! + vcvt.s32.f32 q1, q1, #16 + vsri.32 q11, q10, #16 + vld1.64 {d4-d5}, [r6,:128]! + vcvt.s32.f32 q2, q2, #16 + vzip.32 d18, d22 + vld1.64 {d6-d7}, [r7,:128]! + vcvt.s32.f32 q3, q3, #16 + vzip.32 d19, d23 + vst1.64 {d18}, [r8], ip + vsri.32 q1, q0, #16 + vst1.64 {d22}, [r8], ip + vsri.32 q3, q2, #16 + vst1.64 {d19}, [r8], ip + vzip.32 d2, d6 + vst1.64 {d23}, [r8], ip + vzip.32 d3, d7 + beq 7f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.64 {d2}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.64 {d6}, [r8], ip + vld1.64 {d20-d21},[r6,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.64 {d3}, [r8], ip + vld1.64 {d22-d23},[r7,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.64 {d7}, [r8], ip + b 6b +7: vst1.64 {d2}, [r8], ip + vst1.64 {d6}, [r8], ip + vst1.64 {d3}, [r8], ip + vst1.64 {d7}, [r8], ip + subs r3, r3, #4 + popeq {r4-r8,pc} + cmp r3, #4 + add r0, r0, #8 + bge 5b + + @ 2 channels +4: cmp r3, #2 + blt 4f + ldmia r1!, {r4-r5} + mov lr, r2 + mov r8, r0 + tst lr, #8 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + beq 6f + subs lr, lr, #8 + beq 7f + vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 +6: subs lr, lr, #16 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vsri.32 d18, d16, #16 + vld1.64 {d2-d3}, [r5,:128]! + vcvt.s32.f32 q1, q1, #16 + vsri.32 d19, d17, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r5,:128]! + vcvt.s32.f32 q3, q3, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vsri.32 d2, d0, #16 + vst1.32 {d19[1]}, [r8], ip + vsri.32 d3, d1, #16 + vst1.32 {d22[0]}, [r8], ip + vsri.32 d6, d4, #16 + vst1.32 {d22[1]}, [r8], ip + vsri.32 d7, d5, #16 + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip + beq 6f + vld1.64 {d16-d17},[r4,:128]! + vcvt.s32.f32 q8, q8, #16 + vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vld1.64 {d18-d19},[r5,:128]! + vcvt.s32.f32 q9, q9, #16 + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vld1.64 {d20-d21},[r4,:128]! + vcvt.s32.f32 q10, q10, #16 + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vld1.64 {d22-d23},[r5,:128]! + vcvt.s32.f32 q11, q11, #16 + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + bgt 6b +6: vst1.32 {d2[0]}, [r8], ip + vst1.32 {d2[1]}, [r8], ip + vst1.32 {d3[0]}, [r8], ip + vst1.32 {d3[1]}, [r8], ip + vst1.32 {d6[0]}, [r8], ip + vst1.32 {d6[1]}, [r8], ip + vst1.32 {d7[0]}, [r8], ip + vst1.32 {d7[1]}, [r8], ip + b 8f +7: vsri.32 d18, d16, #16 + vsri.32 d19, d17, #16 + vst1.32 {d18[0]}, [r8], ip + vsri.32 d22, d20, #16 + vst1.32 {d18[1]}, [r8], ip + vsri.32 d23, d21, #16 + vst1.32 {d19[0]}, [r8], ip + vst1.32 {d19[1]}, [r8], ip + vst1.32 {d22[0]}, [r8], ip + vst1.32 {d22[1]}, [r8], ip + vst1.32 {d23[0]}, [r8], ip + vst1.32 {d23[1]}, [r8], ip +8: subs r3, r3, #2 + add r0, r0, #4 + popeq {r4-r8,pc} + + @ 1 channel +4: ldr r4, [r1],#4 + tst r2, #8 + mov lr, r2 + mov r5, r0 + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + bne 8f +6: subs lr, lr, #16 + vld1.64 {d4-d5}, [r4,:128]! + vcvt.s32.f32 q2, q2, #16 + vld1.64 {d6-d7}, [r4,:128]! + vcvt.s32.f32 q3, q3, #16 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + beq 7f + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 +7: vst1.16 {d4[1]}, [r5,:16], ip + vst1.16 {d4[3]}, [r5,:16], ip + vst1.16 {d5[1]}, [r5,:16], ip + vst1.16 {d5[3]}, [r5,:16], ip + vst1.16 {d6[1]}, [r5,:16], ip + vst1.16 {d6[3]}, [r5,:16], ip + vst1.16 {d7[1]}, [r5,:16], ip + vst1.16 {d7[3]}, [r5,:16], ip + bgt 6b + pop {r4-r8,pc} +8: subs lr, lr, #8 + vst1.16 {d0[1]}, [r5,:16], ip + vst1.16 {d0[3]}, [r5,:16], ip + vst1.16 {d1[1]}, [r5,:16], ip + vst1.16 {d1[3]}, [r5,:16], ip + vst1.16 {d2[1]}, [r5,:16], ip + vst1.16 {d2[3]}, [r5,:16], ip + vst1.16 {d3[1]}, [r5,:16], ip + vst1.16 {d3[3]}, [r5,:16], ip + popeq {r4-r8,pc} + vld1.64 {d0-d1}, [r4,:128]! + vcvt.s32.f32 q0, q0, #16 + vld1.64 {d2-d3}, [r4,:128]! + vcvt.s32.f32 q1, q1, #16 + b 6b +endfunc + +function ff_int32_to_float_fmul_scalar_neon, export=1 +VFP vdup.32 q0, d0[0] +VFP len .req r2 +NOVFP vdup.32 q0, r2 +NOVFP len .req r3 + + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 +1: subs len, len, #8 + pld [r1, #16] + vmul.f32 q9, q3, q0 + vmul.f32 q10, q8, q0 + beq 2f + vld1.32 {q1},[r1,:128]! + vcvt.f32.s32 q3, q1 + vld1.32 {q2},[r1,:128]! + vcvt.f32.s32 q8, q2 + vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + b 1b +2: vst1.32 {q9}, [r0,:128]! + vst1.32 {q10},[r0,:128]! + bx lr + .unreq len +endfunc diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S new file mode 100644 index 0000000000..1d19e7758b --- /dev/null +++ b/libavcodec/arm/fmtconvert_vfp.S @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "asm.S" + + .syntax unified + +/** + * ARM VFP optimized float to int16 conversion. + * Assume that len is a positive number and is multiple of 8, destination + * buffer is at least 4 bytes aligned (8 bytes alignment is better for + * performance), little endian byte sex + */ +@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len) +function ff_float_to_int16_vfp, export=1 + push {r4-r8,lr} + vpush {d8-d11} + vldmia r1!, {s16-s23} + vcvt.s32.f32 s0, s16 + vcvt.s32.f32 s1, s17 + vcvt.s32.f32 s2, s18 + vcvt.s32.f32 s3, s19 + vcvt.s32.f32 s4, s20 + vcvt.s32.f32 s5, s21 + vcvt.s32.f32 s6, s22 + vcvt.s32.f32 s7, s23 +1: + subs r2, r2, #8 + vmov r3, r4, s0, s1 + vmov r5, r6, s2, s3 + vmov r7, r8, s4, s5 + vmov ip, lr, s6, s7 + vldmiagt r1!, {s16-s23} + ssat r4, #16, r4 + ssat r3, #16, r3 + ssat r6, #16, r6 + ssat r5, #16, r5 + pkhbt r3, r3, r4, lsl #16 + pkhbt r4, r5, r6, lsl #16 + vcvtgt.s32.f32 s0, s16 + vcvtgt.s32.f32 s1, s17 + vcvtgt.s32.f32 s2, s18 + vcvtgt.s32.f32 s3, s19 + vcvtgt.s32.f32 s4, s20 + vcvtgt.s32.f32 s5, s21 + vcvtgt.s32.f32 s6, s22 + vcvtgt.s32.f32 s7, s23 + ssat r8, #16, r8 + ssat r7, #16, r7 + ssat lr, #16, lr + ssat ip, #16, ip + pkhbt r5, r7, r8, lsl #16 + pkhbt r6, ip, lr, lsl #16 + stmia r0!, {r3-r6} + bgt 1b + + vpop {d8-d11} + pop {r4-r8,pc} +endfunc |