diff options
Diffstat (limited to 'libavutil')
-rw-r--r-- | libavutil/arm/float_dsp_init_neon.c | 4 | ||||
-rw-r--r-- | libavutil/arm/float_dsp_init_vfp.c | 4 | ||||
-rw-r--r-- | libavutil/arm/float_dsp_neon.S | 24 | ||||
-rw-r--r-- | libavutil/arm/float_dsp_vfp.S | 69 | ||||
-rw-r--r-- | libavutil/float_dsp.c | 11 | ||||
-rw-r--r-- | libavutil/float_dsp.h | 19 | ||||
-rw-r--r-- | libavutil/ppc/float_dsp_altivec.c | 29 | ||||
-rw-r--r-- | libavutil/ppc/float_dsp_altivec.h | 3 | ||||
-rw-r--r-- | libavutil/ppc/float_dsp_init.c | 1 | ||||
-rw-r--r-- | libavutil/x86/float_dsp.asm | 39 | ||||
-rw-r--r-- | libavutil/x86/float_dsp_init.c | 7 |
11 files changed, 210 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c index 41e513fcdc..c6f02bd2c5 100644 --- a/libavutil/arm/float_dsp_init_neon.c +++ b/libavutil/arm/float_dsp_init_neon.c @@ -38,6 +38,9 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0, void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1, const float *src2, int len); +void ff_vector_fmul_reverse_neon(float *dst, const float *src0, + const float *src1, int len); + void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) { fdsp->vector_fmul = ff_vector_fmul_neon; @@ -45,4 +48,5 @@ void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp) fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon; fdsp->vector_fmul_window = ff_vector_fmul_window_neon; fdsp->vector_fmul_add = ff_vector_fmul_add_neon; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon; } diff --git a/libavutil/arm/float_dsp_init_vfp.c b/libavutil/arm/float_dsp_init_vfp.c index 7abc3322cf..f7e2f54601 100644 --- a/libavutil/arm/float_dsp_init_vfp.c +++ b/libavutil/arm/float_dsp_init_vfp.c @@ -25,10 +25,14 @@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, + const float *src1, int len); + void ff_float_dsp_init_vfp(AVFloatDSPContext *fdsp) { int cpu_flags = av_get_cpu_flags(); if (!have_vfpv3(cpu_flags)) fdsp->vector_fmul = ff_vector_fmul_vfp; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_vfp; } diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S index 100eb02455..d00e59de8f 100644 --- a/libavutil/arm/float_dsp_neon.S +++ b/libavutil/arm/float_dsp_neon.S @@ -220,3 +220,27 @@ function ff_vector_fmul_add_neon, export=1 2: vst1.32 {q12-q13},[r0,:128]! bx lr endfunc + +function ff_vector_fmul_reverse_neon, export=1 + add r2, r2, r3, lsl #2 + sub r2, r2, #32 + mov r12, #-32 + vld1.32 {q0-q1}, [r1,:128]! 
+ vld1.32 {q2-q3}, [r2,:128], r12 +1: pld [r1, #32] + vrev64.32 q3, q3 + vmul.f32 d16, d0, d7 + vmul.f32 d17, d1, d6 + pld [r2, #-32] + vrev64.32 q2, q2 + vmul.f32 d18, d2, d5 + vmul.f32 d19, d3, d4 + subs r3, r3, #8 + beq 2f + vld1.32 {q0-q1}, [r1,:128]! + vld1.32 {q2-q3}, [r2,:128], r12 + vst1.32 {q8-q9}, [r0,:128]! + b 1b +2: vst1.32 {q8-q9}, [r0,:128]! + bx lr +endfunc diff --git a/libavutil/arm/float_dsp_vfp.S b/libavutil/arm/float_dsp_vfp.S index db63e5a675..8695fbd981 100644 --- a/libavutil/arm/float_dsp_vfp.S +++ b/libavutil/arm/float_dsp_vfp.S @@ -66,3 +66,72 @@ function ff_vector_fmul_vfp, export=1 vpop {d8-d15} bx lr endfunc + +/** + * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. + * Assume that len is a positive number and is multiple of 8 + */ +@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, +@ const float *src1, int len) +function ff_vector_fmul_reverse_vfp, export=1 + vpush {d8-d15} + add r2, r2, r3, lsl #2 + vldmdb r2!, {s0-s3} + vldmia r1!, {s8-s11} + vldmdb r2!, {s4-s7} + vldmia r1!, {s12-s15} + vmul.f32 s8, s3, s8 + vmul.f32 s9, s2, s9 + vmul.f32 s10, s1, s10 + vmul.f32 s11, s0, s11 +1: + subs r3, r3, #16 + it ge + vldmdbge r2!, {s16-s19} + vmul.f32 s12, s7, s12 + it ge + vldmiage r1!, {s24-s27} + vmul.f32 s13, s6, s13 + it ge + vldmdbge r2!, {s20-s23} + vmul.f32 s14, s5, s14 + it ge + vldmiage r1!, {s28-s31} + vmul.f32 s15, s4, s15 + it ge + vmulge.f32 s24, s19, s24 + it gt + vldmdbgt r2!, {s0-s3} + it ge + vmulge.f32 s25, s18, s25 + vstmia r0!, {s8-s13} + it ge + vmulge.f32 s26, s17, s26 + it gt + vldmiagt r1!, {s8-s11} + itt ge + vmulge.f32 s27, s16, s27 + vmulge.f32 s28, s23, s28 + it gt + vldmdbgt r2!, {s4-s7} + it ge + vmulge.f32 s29, s22, s29 + vstmia r0!, {s14-s15} + ittt ge + vmulge.f32 s30, s21, s30 + vmulge.f32 s31, s20, s31 + vmulge.f32 s8, s3, s8 + it gt + vldmiagt r1!, {s12-s15} + itttt ge + vmulge.f32 s9, s2, s9 + vmulge.f32 s10, s1, s10 + vstmiage r0!, {s24-s27} + vmulge.f32 s11, s0, s11 + 
it ge + vstmiage r0!, {s28-s31} + bgt 1b + + vpop {d8-d15} + bx lr +endfunc diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c index 119fa650a3..810acfe26d 100644 --- a/libavutil/float_dsp.c +++ b/libavutil/float_dsp.c @@ -82,6 +82,16 @@ static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, dst[i] = src0[i] * src1[i] + src2[i]; } +static void vector_fmul_reverse_c(float *dst, const float *src0, + const float *src1, int len) +{ + int i; + + src1 += len-1; + for (i = 0; i < len; i++) + dst[i] = src0[i] * src1[-i]; +} + void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact) { fdsp->vector_fmul = vector_fmul_c; @@ -90,6 +100,7 @@ void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact) fdsp->vector_dmul_scalar = vector_dmul_scalar_c; fdsp->vector_fmul_window = vector_fmul_window_c; fdsp->vector_fmul_add = vector_fmul_add_c; + fdsp->vector_fmul_reverse = vector_fmul_reverse_c; #if ARCH_ARM ff_float_dsp_init_arm(fdsp); diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h index d2109a0af9..3ee4ca269c 100644 --- a/libavutil/float_dsp.h +++ b/libavutil/float_dsp.h @@ -118,6 +118,25 @@ typedef struct AVFloatDSPContext { */ void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len); + + /** + * Calculate the product of two vectors of floats, and store the result + * in a vector of floats. The second vector of floats is iterated over + * in reverse order. 
+ * + * @param dst output vector + * constraints: 32-byte aligned + * @param src0 first input vector + * constraints: 32-byte aligned + * @param src1 second input vector; it is read from its last + * element down to its first, i.e. + * dst[i] = src0[i] * src1[len - 1 - i] + * constraints: 32-byte aligned + * @param len number of elements in the input + * constraints: multiple of 16 + */ + void (*vector_fmul_reverse)(float *dst, const float *src0, + const float *src1, int len); } AVFloatDSPContext; /** diff --git a/libavutil/ppc/float_dsp_altivec.c b/libavutil/ppc/float_dsp_altivec.c index fa66d5c1ca..8cee82c1c7 100644 --- a/libavutil/ppc/float_dsp_altivec.c +++ b/libavutil/ppc/float_dsp_altivec.c @@ -93,3 +93,32 @@ void ff_vector_fmul_add_altivec(float *dst, const float *src0, vec_st(t0, 0, dst + i); } } + +void ff_vector_fmul_reverse_altivec(float *dst, const float *src0, + const float *src1, int len) +{ + int i; + vector float d, s0, s1, h0, l0, + s2, s3, zero = (vector float) vec_splat_u32(0); + + src1 += len-4; + for(i = 0; i < len - 7; i += 8) { + s1 = vec_ld(0, src1 - i); // [a,b,c,d] + s0 = vec_ld(0, src0 + i); + l0 = vec_mergel(s1, s1); // [c,c,d,d] + s3 = vec_ld(-16, src1 - i); + h0 = vec_mergeh(s1, s1); // [a,a,b,b] + s2 = vec_ld(16, src0 + i); + s1 = vec_mergeh(vec_mergel(l0, h0), // [d,b,d,b] + vec_mergeh(l0, h0)); // [c,a,c,a] + // [d,c,b,a] + l0 = vec_mergel(s3, s3); + d = vec_madd(s0, s1, zero); + h0 = vec_mergeh(s3, s3); + vec_st(d, 0, dst + i); + s3 = vec_mergeh(vec_mergel(l0, h0), + vec_mergeh(l0, h0)); + d = vec_madd(s2, s3, zero); + vec_st(d, 16, dst + i); + } +} diff --git a/libavutil/ppc/float_dsp_altivec.h b/libavutil/ppc/float_dsp_altivec.h index 2bb87cd281..b262a83548 100644 --- a/libavutil/ppc/float_dsp_altivec.h +++ b/libavutil/ppc/float_dsp_altivec.h @@ -32,4 +32,7 @@ extern void ff_vector_fmul_add_altivec(float *dst, const float *src0, const float *src1, const float *src2, int len); +extern void ff_vector_fmul_reverse_altivec(float *dst, const float *src0, + const float 
*src1, int len); + #endif /* AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H */ diff --git a/libavutil/ppc/float_dsp_init.c b/libavutil/ppc/float_dsp_init.c index f00cae0487..d9ca53eeec 100644 --- a/libavutil/ppc/float_dsp_init.c +++ b/libavutil/ppc/float_dsp_init.c @@ -33,6 +33,7 @@ void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int bit_exact) fdsp->vector_fmul = ff_vector_fmul_altivec; fdsp->vector_fmul_add = ff_vector_fmul_add_altivec; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_altivec; if (!bit_exact) { fdsp->vector_fmul_window = ff_vector_fmul_window_altivec; diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index f69fc6b00a..3e5e91ad07 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -198,3 +198,42 @@ VECTOR_FMUL_ADD INIT_YMM avx VECTOR_FMUL_ADD %endif + +;----------------------------------------------------------------------------- +; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, +; int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_REVERSE 0 +cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len + lea lenq, [lend*4 - 2*mmsize] +ALIGN 16 +.loop: +%if cpuflag(avx) + vmovaps xmm0, [src1q + 16] + vinsertf128 m0, m0, [src1q], 1 + vshufps m0, m0, m0, q0123 + vmovaps xmm1, [src1q + mmsize + 16] + vinsertf128 m1, m1, [src1q + mmsize], 1 + vshufps m1, m1, m1, q0123 +%else + mova m0, [src1q] + mova m1, [src1q + mmsize] + shufps m0, m0, q0123 + shufps m1, m1, q0123 +%endif + mulps m0, m0, [src0q + lenq + mmsize] + mulps m1, m1, [src0q + lenq] + mova [dstq + lenq + mmsize], m0 + mova [dstq + lenq], m1 + add src1q, 2*mmsize + sub lenq, 2*mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_REVERSE +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_FMUL_REVERSE +%endif
\ No newline at end of file diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index e8620449ef..9c58e2bc30 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -46,6 +46,11 @@ void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1, void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1, const float *src2, int len); +void ff_vector_fmul_reverse_sse(float *dst, const float *src0, + const float *src1, int len); +void ff_vector_fmul_reverse_avx(float *dst, const float *src0, + const float *src1, int len); + #if HAVE_6REGS && HAVE_INLINE_ASM static void vector_fmul_window_3dnowext(float *dst, const float *src0, const float *src1, const float *win, @@ -129,6 +134,7 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse; fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse; fdsp->vector_fmul_add = ff_vector_fmul_add_sse; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_sse; } if (EXTERNAL_SSE2(mm_flags)) { fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2; @@ -138,5 +144,6 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx; fdsp->vector_fmul_add = ff_vector_fmul_add_avx; + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; } } |