diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-12-09 02:33:42 -0500 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-12-09 15:26:58 -0500 |
commit | 57cd849cf71e6abdfedfea1d381d4e06581015d5 (patch) | |
tree | c08ca3adc7ae370c550034e9ba1ac8f1d09b46d7 | |
parent | ff072009fe5bdd3540ac6ac331e9961e83da722a (diff) | |
download | opus-57cd849cf71e6abdfedfea1d381d4e06581015d5.tar.gz |
Defining celt_inner_prod() and using it instead of explicit loops.
Also adds an SSE-optimized celt_inner_prod().
-rw-r--r-- | celt/celt_encoder.c | 14 | ||||
-rw-r--r-- | celt/pitch.c | 22 | ||||
-rw-r--r-- | celt/pitch.h | 12 | ||||
-rw-r--r-- | celt/vq.c | 21 | ||||
-rw-r--r-- | celt/x86/pitch_sse.h | 26 |
5 files changed, 59 insertions, 36 deletions
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index db183430..592ae250 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -576,7 +576,7 @@ static int tf_analysis(const CELTMode *m, int len, int isTransient, *tf_sum = 0; for (i=0;i<len;i++) { - int j, k, N; + int k, N; int narrow; opus_val32 L1, best_L1; int best_level=0; @@ -768,10 +768,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X, /* Compute inter-channel correlation for low frequencies */ for (i=0;i<8;i++) { - int j; - opus_val32 partial = 0; - for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++) - partial = MAC16_16(partial, X[j], X[N0+j]); + opus_val32 partial; + partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM); sum = ADD16(sum, EXTRACT16(SHR32(partial, 18))); } sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum); @@ -779,10 +777,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X, minXC = sum; for (i=8;i<intensity;i++) { - int j; - opus_val32 partial = 0; - for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++) - partial = MAC16_16(partial, X[j], X[N0+j]); + opus_val32 partial; + partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM); minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18)))); } minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC)); diff --git a/celt/pitch.c b/celt/pitch.c index d2b30544..93db58d7 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -252,7 +252,7 @@ void #endif celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch) { - int i,j; + int i; /*The EDSP version requires that max_pitch is at least 1, and that _x is 32-bit aligned. Since it's hard to put asserts in assembly, put them here.*/ @@ -279,9 +279,8 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */ for (;i<max_pitch;i++) { - opus_val32 sum = 0; - for (j=0;j<len;j++) - sum = MAC16_16(sum, _x[j],_y[i+j]); + opus_val32 sum; + sum = celt_inner_prod(_x, _y+i, len); xcorr[i] = sum; #ifdef FIXED_POINT maxcorr = MAX32(maxcorr, sum); @@ -361,12 +360,17 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR #endif for (i=0;i<max_pitch>>1;i++) { - opus_val32 sum=0; + opus_val32 sum; xcorr[i] = 0; if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2) continue; +#ifdef FIXED_POINT + sum = 0; for (j=0;j<len>>1;j++) sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift); +#else + sum = celt_inner_prod(x_lp, y+i, len>>1); +#endif xcorr[i] = MAX32(-1, sum); #ifdef FIXED_POINT maxcorr = MAX32(maxcorr, sum); @@ -513,13 +517,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, pg = SHR32(frac_div32(best_xy,best_yy+1),16); for (k=0;k<3;k++) - { - int T1 = T+k-1; - xy = 0; - for (i=0;i<N;i++) - xy = MAC16_16(xy, x[i], x[i-T1]); - xcorr[k] = xy; - } + xcorr[k] = celt_inner_prod(x, x-(T+k-1), N); if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0])) offset = 1; else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2])) diff --git a/celt/pitch.h b/celt/pitch.h index df317ecc..ec55acae 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -141,6 +141,18 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y } #endif +#ifndef OVERRIDE_CELT_INNER_PROD +static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y, + int N) +{ + int i; + opus_val32 xy=0; + for (i=0;i<N;i++) + xy = MAC16_16(xy, x[i], y[i]); + return xy; +} +#endif + #ifdef FIXED_POINT opus_val32 #else @@ -37,6 +37,7 @@ #include "os_support.h" #include "bands.h" #include "rate.h" +#include "pitch.h" static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s) { @@ -350,15 +351,11 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain) #ifdef FIXED_POINT int k; #endif - opus_val32 E = EPSILON; + opus_val32 E; opus_val16 g; opus_val32 t; - celt_norm *xptr = X; - for (i=0;i<N;i++) - { - E = MAC16_16(E, *xptr, *xptr); - xptr++; - } + celt_norm *xptr; + E = EPSILON + celt_inner_prod(X, X, N); #ifdef FIXED_POINT k = celt_ilog2(E)>>1; #endif @@ -393,14 +390,8 @@ int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N) Eside = MAC16_16(Eside, s, s); } } else { - for (i=0;i<N;i++) - { - celt_norm m, s; - m = X[i]; - s = Y[i]; - Emid = MAC16_16(Emid, m, m); - Eside = MAC16_16(Eside, s, s); - } + Emid += celt_inner_prod(X, X, N); + Eside += celt_inner_prod(Y, Y, N); } mid = celt_sqrt(Emid); side = celt_sqrt(Eside); diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index 695122a5..58f83246 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -101,6 +101,32 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y } } +#define OVERRIDE_CELT_INNER_PROD +static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y, + int N) +{ + int i; + float xy; + __m128 sum; + sum = _mm_setzero_ps(); + /* FIXME: We should probably go 8-way and use 2 sums. */ + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 yi = _mm_loadu_ps(y+i); + sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); + } + /* Horizontal sum */ + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); + _mm_store_ss(&xy, sum); + for (;i<N;i++) + { + xy = MAC16_16(xy, x[i], y[i]); + } + return xy; +} + #define OVERRIDE_COMB_FILTER_CONST static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12) |