diff options
Diffstat (limited to 'celt')
-rw-r--r-- | celt/_kiss_fft_guts.h | 40 | ||||
-rw-r--r-- | celt/celt.c | 57 | ||||
-rw-r--r-- | celt/fixed_generic.h | 72 | ||||
-rw-r--r-- | celt/kiss_fft.c | 18 | ||||
-rw-r--r-- | celt/kiss_fft.h | 28 | ||||
-rw-r--r-- | celt/mdct.c | 30 | ||||
-rw-r--r-- | celt/pitch.h | 126 | ||||
-rw-r--r-- | celt/vq.c | 27 |
8 files changed, 314 insertions, 84 deletions
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h index 9dfb2dbe..f0840655 100644 --- a/celt/_kiss_fft_guts.h +++ b/celt/_kiss_fft_guts.h @@ -61,10 +61,50 @@ do{ (m).r = SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \ (m).i = ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0) + +#undef C_MUL +# define C_MUL(m,a,b) (m=C_MUL_fun(a,b)) +static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) { + kiss_fft_cpx m; + long long ac1 = ((long long)a.r * (long long)b.r); + long long ac2 = ((long long)a.i * (long long)b.i); + ac1 = ac1 - ac2; + ac1 = ac1 >> 15; + m.r = ac1; + + ac1 = ((long long)a.r * (long long)b.i); + ac2 = ((long long)a.i * (long long)b.r); + ac1 = ac1 + ac2; + ac1 = ac1 >> 15; + m.i = ac1; + + return m; +} + # define C_MULC(m,a,b) \ do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \ (m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0) + +#undef C_MULC +# define C_MULC(m,a,b) (m=C_MULC_fun(a,b)) +static inline kiss_fft_cpx C_MULC_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) { + kiss_fft_cpx m; + + long long ac1 = ((long long)a.r * (long long)b.r); + long long ac2 = ((long long)a.i * (long long)b.i); + ac1 = ac1 + ac2; + ac1 = ac1 >> 15; + m.r = ac1; + + ac1 = ((long long)a.i * (long long)b.r); + ac2 = ((long long)a.r * (long long)b.i); + ac1 = ac1 - ac2; + ac1 = ac1 >> 15; + m.i = ac1; + + return m; +} # define C_MULBYSCALAR( c, s ) \ do{ (c).r = S_MUL( (c).r , s ) ;\ (c).i = S_MUL( (c).i , s ) ; }while(0) diff --git a/celt/celt.c b/celt/celt.c index 7e47ea49..9a448b96 100644 --- a/celt/celt.c +++ b/celt/celt.c @@ -205,22 +205,38 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, for (i=0;i<overlap;i++) { opus_val16 f; - x0=x[i-T1+2]; + opus_val32 res; f = MULT16_16_Q15(window[i],window[i]); - y[i] = x[i] - + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g00),x[i-T0]) - + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),ADD32(x[i-T0+1],x[i-T0-1])) - + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),ADD32(x[i-T0+2],x[i-T0-2])) - + MULT16_32_Q15(MULT16_16_Q15(f,g10),x2) - + MULT16_32_Q15(MULT16_16_Q15(f,g11),ADD32(x1,x3)) - + MULT16_32_Q15(MULT16_16_Q15(f,g12),ADD32(x0,x4)); + x0= x[i-T1+2]; + + { + long long ac1 = 0; + ac1 = ((long long)MULT16_16_Q15((Q15ONE-f),g00)) * ((long long )x[i-T0]); + ac1 += ( ((long long)MULT16_16_Q15((Q15ONE-f),g01)) * ((long long)ADD32(x[i-T0-1],x[i-T0+1])) ); + ac1 += ( ((long long)MULT16_16_Q15((Q15ONE-f),g02)) * ((long long)ADD32(x[i-T0-2],x[i-T0+2])) ); + ac1 += ( ((long long)MULT16_16_Q15(f,g10)) * ((long long)x2) ); + ac1 += ( ((long long)MULT16_16_Q15(f,g11)) * ((long long)ADD32(x3,x1)) ); + ac1 += ( ((long long)MULT16_16_Q15(f,g12)) * ((long long)ADD32(x4,x0)) ); + + ac1 = ac1 >> 15; + res = ac1; + } + + y[i] = x[i] + res; + x4=x3; x3=x2; x2=x1; x1=x0; - } - if (g1==0) + + + x4 = x[i-T1-2]; + x3 = x[i-T1-1]; + x2 = x[i-T1]; + x1 = x[i-T1+1]; + + if (g1==0) { /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */ if (x!=y) @@ -228,8 +244,25 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, return; } - /* Compute the part with the constant filter. */ - comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12); + for (i=overlap;i<N;i++) { + + opus_val32 res; + x0=x[i-T1+2]; + { + long long ac1 = 0; + ac1 = ( ((long long)g10) * ((long long)x2) ); + ac1 += ( ((long long)g11) * ((long long)ADD32(x3,x1)) ); + ac1 += ( ((long long)g12) * ((long long)ADD32(x4,x0))); + ac1 = ac1 >> 15; + res = ac1; + } + + y[i] = x[i] + res; + x4=x3; + x3=x2; + x2=x1; + x1=x0; + } } const signed char tf_select_table[4][8] = { diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index 5ea1c7ba..7bd60e60 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -33,21 +33,82 @@ #ifndef FIXED_GENERIC_H #define FIXED_GENERIC_H +static inline int MULT16_16_Q15_ADD(int a, int b, int c, int d) { + int m; + long long ac1 = ((long long)a * (long long)b); + long long ac2 = ((long long)c * (long long)d); + ac1 += ac2; + ac1 = ac1>>15; + m = (int )(ac1); + return m; +} + +static inline int MULT16_16_Q15_SUB(int a, int b, int c, int d) { + int m; + long long ac1 = ((long long)a * (long long)b); + long long ac2 = ((long long)c * (long long)d); + ac1 -= ac2; + ac1 = ac1>>15; + m = (int )(ac1); + return m; +} + /** Multiply a 16-bit signed value by a 16-bit unsigned value. The result is a 32-bit signed value */ #define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b)) /** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ #define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)) +#undef MULT16_32_Q16 +static inline int MULT16_32_Q16(int a, int b) +{ + int c; + long long ac1 = ((long long)a * (long long)b); + ac1 = ac1>>16; + c =(int)(ac1); + return c; +} + /** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Results fits in 32 bits */ #define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16SU((a),((b)&0x0000ffff)),16)) +#undef MULT16_32_P16 +static inline int MULT16_32_P16(int a, int b) +{ + int c; + long long ac1 = ((long long)a * (long long)b); + ac1 += 32768; + ac1 = ac1>>16; + c =(int)(ac1); + return c; +} + /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */ #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15)) +#undef MULT16_32_Q15 +static inline int MULT16_32_Q15(int a, int b) +{ + int c; + long long ac1 = ((long long)a * (long long)b); + ac1 = ac1>>15; + c =(int)(ac1); + return c; +} + /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */ #define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15)) +#undef MULT32_32_Q31 +static inline int MULT32_32_Q31(int a, int b) +{ + int c; + long long ac1 = ((long long)a * (long long)b); + ac1 = ac1>>31; + c =(int)(ac1); + return c; +} + /** Compile-time conversion of float constant to 16-bit value */ #define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits)))) @@ -129,6 +190,17 @@ #define MULT16_16_P14(a,b) (SHR(ADD32(8192,MULT16_16((a),(b))),14)) #define MULT16_16_P15(a,b) (SHR(ADD32(16384,MULT16_16((a),(b))),15)) +#undef MULT16_16_P15 +static inline int MULT16_16_P15(int a, int b) +{ + int r; + int ac1 = a*b; + ac1 += 16384; + ac1 = ac1 >> 15; + r = (int)(ac1); + return r; +} + /** Divide a 32-bit value by a 16-bit value. Result fits in 16 bits */ #define DIV32_16(a,b) ((opus_val16)(((opus_val32)(a))/((opus_val16)(b)))) diff --git a/celt/kiss_fft.c b/celt/kiss_fft.c index 0bf058c2..a49b99ef 100644 --- a/celt/kiss_fft.c +++ b/celt/kiss_fft.c @@ -283,20 +283,20 @@ static void kf_bfly5( Fout0->r += scratch[7].r + scratch[8].r; Fout0->i += scratch[7].i + scratch[8].i; + scratch[5].r = scratch[0].r + S_MUL_ADD(scratch[7].r,ya.r,scratch[8].r,yb.r); + scratch[5].i = scratch[0].i + S_MUL_ADD(scratch[7].i,ya.r,scratch[8].i,yb.r); - scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r); - scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r); - - scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i); - scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i); + scratch[6].r = S_MUL_ADD(scratch[10].i,ya.i,scratch[9].i,yb.i); + scratch[6].i = -S_MUL_ADD(scratch[10].r,ya.i,scratch[9].r,yb.i); C_SUB(*Fout1,scratch[5],scratch[6]); C_ADD(*Fout4,scratch[5],scratch[6]); - scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r); - scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r); - scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i); - scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i); + scratch[11].r = scratch[0].r + S_MUL_ADD(scratch[7].r,yb.r,scratch[8].r,ya.r); + scratch[11].i = scratch[0].i + S_MUL_ADD(scratch[7].i,yb.r,scratch[8].i,ya.r); + + scratch[12].r = S_MUL_SUB(scratch[9].i,ya.i,scratch[10].i,yb.i); + scratch[12].i = S_MUL_SUB(scratch[10].r,yb.i,scratch[9].r,ya.i); C_ADD(*Fout2,scratch[11],scratch[12]); C_SUB(*Fout3,scratch[11],scratch[12]); diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h index 390b54d9..af62f75b 100644 --- a/celt/kiss_fft.h +++ b/celt/kiss_fft.h @@ -114,6 +114,34 @@ typedef struct kiss_fft_state{ * buffer size in *lenmem. * */ + +#define S_MUL_ADD(a, b, c, d) (S_MUL(a,b)+S_MUL(c,d)) +#define S_MUL_SUB(a, b, c, d) (S_MUL(a,b)-S_MUL(c,d)) + +#undef S_MUL_ADD +static inline int S_MUL_ADD(int a, int b, int c, int d) { + int m; + long long ac1 = ((long long)a * (long long)b); + long long ac2 = ((long long)c * (long long)d); + ac1 += ac2; + ac1 = ac1>>15; + m = (int )(ac1); + return m; +} + + +#undef S_MUL_SUB +static inline int S_MUL_SUB(int a, int b, int c, int d) { + int m; + long long ac1 = ((long long)a * (long long)b); + long long ac2 = ((long long)c * (long long)d); + ac1 -= ac2; + ac1 = ac1>>15; + m = (int )(ac1); + return m; +} + + kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base); kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem); diff --git a/celt/mdct.c b/celt/mdct.c index fa5098cd..a9ffef0b 100644 --- a/celt/mdct.c +++ b/celt/mdct.c @@ -153,8 +153,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar for(i=0;i<((overlap+3)>>2);i++) { /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ - *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2); - *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]); + *yp++ = S_MUL_ADD(*wp2, xp1[N2],*wp1,*xp2); + *yp++ = S_MUL_SUB(*wp1, *xp1,*wp2, xp2[-N2]); xp1+=2; xp2-=2; wp1+=2; @@ -173,8 +173,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar for(;i<N4;i++) { /* Real part arranged as a-bR, Imag part arranged as -c-dR */ - *yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2); - *yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]); + *yp++ = S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]); + *yp++ = S_MUL_ADD(*wp2, *xp1, *wp1, xp2[N2]); xp1+=2; xp2-=2; wp1+=2; @@ -194,8 +194,10 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar t1 = t[N4+i]; re = *yp++; im = *yp++; - yr = S_MUL(re,t0) - S_MUL(im,t1); - yi = S_MUL(im,t0) + S_MUL(re,t1); + + yr = S_MUL_SUB(re,t0,im,t1); + yi = S_MUL_ADD(im,t0,re,t1); + yc.r = yr; yc.i = yi; yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift); @@ -218,8 +220,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar for(i=0;i<N4;i++) { kiss_fft_scalar yr, yi; - yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]); - yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]); + yr = S_MUL_SUB(fp->i,t[N4+i] , fp->r,t[i]); + yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]); *yp1 = yr; *yp2 = yi; fp++; @@ -260,8 +262,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala int rev; kiss_fft_scalar yr, yi; rev = *bitrev++; - yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]); - yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]); + yr = S_MUL_ADD(*xp2, t[i] , *xp1, t[N4+i]); + yi = S_MUL_SUB(*xp1, t[i] , *xp2, t[N4+i]); /* We swap real and imag because we use an FFT instead of an IFFT. */ yp[2*rev+1] = yr; yp[2*rev] = yi; @@ -291,8 +293,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala t0 = t[i]; t1 = t[N4+i]; /* We'd scale up by 2 here, but instead it's done when mixing the windows */ - yr = S_MUL(re,t0) + S_MUL(im,t1); - yi = S_MUL(re,t1) - S_MUL(im,t0); + yr = S_MUL_ADD(re,t0 , im,t1); + yi = S_MUL_SUB(re,t1 , im,t0); /* We swap real and imag because we're using an FFT instead of an IFFT. */ re = yp1[1]; im = yp1[0]; @@ -302,8 +304,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala t0 = t[(N4-i-1)]; t1 = t[(N2-i-1)]; /* We'd scale up by 2 here, but instead it's done when mixing the windows */ - yr = S_MUL(re,t0) + S_MUL(im,t1); - yi = S_MUL(re,t1) - S_MUL(im,t0); + yr = S_MUL_ADD(re,t0,im,t1); + yi = S_MUL_SUB(re,t1,im,t0); yp1[0] = yr; yp0[1] = yi; yp0 += 2; diff --git a/celt/pitch.h b/celt/pitch.h index ec55acae..4b19754d 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -61,66 +61,91 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, { int j; opus_val16 y_0, y_1, y_2, y_3; - celt_assert(len>=3); - y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ - y_0=*y++; - y_1=*y++; - y_2=*y++; - for (j=0;j<len-3;j+=4) - { - opus_val16 tmp; - tmp = *x++; - y_3=*y++; - sum[0] = MAC16_16(sum[0],tmp,y_0); - sum[1] = MAC16_16(sum[1],tmp,y_1); - sum[2] = MAC16_16(sum[2],tmp,y_2); - sum[3] = MAC16_16(sum[3],tmp,y_3); - tmp=*x++; - y_0=*y++; - sum[0] = MAC16_16(sum[0],tmp,y_1); - sum[1] = MAC16_16(sum[1],tmp,y_2); - sum[2] = MAC16_16(sum[2],tmp,y_3); - sum[3] = MAC16_16(sum[3],tmp,y_0); - tmp=*x++; - y_1=*y++; - sum[0] = MAC16_16(sum[0],tmp,y_2); - sum[1] = MAC16_16(sum[1],tmp,y_3); - sum[2] = MAC16_16(sum[2],tmp,y_0); - sum[3] = MAC16_16(sum[3],tmp,y_1); + + opus_int64 sum_0, sum_1, sum_2, sum_3; + sum_0 = (opus_int64)sum[0]; + sum_1 = (opus_int64)sum[1]; + sum_2 = (opus_int64)sum[2]; + sum_3 = (opus_int64)sum[3]; + + y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ + y_0=*y++; + y_1=*y++; + y_2=*y++; + for (j=0;j<len-3;j+=4) + { + opus_val16 tmp; + tmp = *x++; + y_3=*y++; + + sum_0 += ( ((long long)tmp) * ((long long)y_0) ); + sum_1 += ( ((long long)tmp) * ((long long)y_1) ); + sum_2 += ( ((long long)tmp) * ((long long)y_2) ); + sum_3 += ( ((long long)tmp) * ((long long)y_3) ); + + tmp=*x++; + y_0=*y++; + + sum_0 += ( ((long long)tmp) * ((long long)y_1) ); + sum_1 += ( ((long long)tmp) * ((long long)y_2) ); + sum_2 += ( ((long long)tmp) * ((long long)y_3) ); + sum_3 += ( ((long long)tmp) * ((long long)y_0) ); + + tmp=*x++; + y_1=*y++; + + sum_0 += ( ((long long)tmp) * ((long long)y_2) ); + sum_1 += ( ((long long)tmp) * ((long long)y_3) ); + sum_2 += ( ((long long)tmp) * ((long long)y_0) ); + sum_3 += ( ((long long)tmp) * ((long long)y_1) ); + + tmp=*x++; y_2=*y++; - sum[0] = MAC16_16(sum[0],tmp,y_3); - sum[1] = MAC16_16(sum[1],tmp,y_0); - sum[2] = MAC16_16(sum[2],tmp,y_1); - sum[3] = MAC16_16(sum[3],tmp,y_2); + + sum_0 += ( ((long long)tmp) * ((long long)y_3) ); + sum_1 += ( ((long long)tmp) * ((long long)y_0) ); + sum_2 += ( ((long long)tmp) * ((long long)y_1) ); + sum_3 += ( ((long long)tmp) * ((long long)y_2) ); + } if (j++<len) { opus_val16 tmp = *x++; y_3=*y++; - sum[0] = MAC16_16(sum[0],tmp,y_0); - sum[1] = MAC16_16(sum[1],tmp,y_1); - sum[2] = MAC16_16(sum[2],tmp,y_2); - sum[3] = MAC16_16(sum[3],tmp,y_3); + + sum_0 += ( ((long long)tmp) * ((long long)y_0) ); + sum_1 += ( ((long long)tmp) * ((long long)y_1) ); + sum_2 += ( ((long long)tmp) * ((long long)y_2) ); + sum_3 += ( ((long long)tmp) * ((long long)y_3) ); } + if (j++<len) { opus_val16 tmp=*x++; y_0=*y++; - sum[0] = MAC16_16(sum[0],tmp,y_1); - sum[1] = MAC16_16(sum[1],tmp,y_2); - sum[2] = MAC16_16(sum[2],tmp,y_3); - sum[3] = MAC16_16(sum[3],tmp,y_0); + + sum_0 += ( ((long long)tmp) * ((long long)y_1) ); + sum_1 += ( ((long long)tmp) * ((long long)y_2) ); + sum_2 += ( ((long long)tmp) * ((long long)y_3) ); + sum_3 += ( ((long long)tmp) * ((long long)y_0) ); } + if (j<len) { opus_val16 tmp=*x++; y_1=*y++; - sum[0] = MAC16_16(sum[0],tmp,y_2); - sum[1] = MAC16_16(sum[1],tmp,y_3); - sum[2] = MAC16_16(sum[2],tmp,y_0); - sum[3] = MAC16_16(sum[3],tmp,y_1); + + sum_0 += ( ((long long)tmp) * ((long long)y_2) ); + sum_1 += ( ((long long)tmp) * ((long long)y_3) ); + sum_2 += ( ((long long)tmp) * ((long long)y_0) ); + sum_3 += ( ((long long)tmp) * ((long long)y_1) ); } + + sum[0] = (opus_val32)sum_0; + sum[1] = (opus_val32)sum_1; + sum[2] = (opus_val32)sum_2; + sum[3] = (opus_val32)sum_3; } #endif /* OVERRIDE_XCORR_KERNEL */ @@ -128,14 +153,23 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) { - int i; + int j; opus_val32 xy01=0; opus_val32 xy02=0; - for (i=0;i<N;i++) + long long ac1 = 0; + long long ac2 = 0; + + /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */ + for (j=0;j<N;j++) { - xy01 = MAC16_16(xy01, x[i], y01[i]); - xy02 = MAC16_16(xy02, x[i], y02[i]); + ac1 += ( ((long long)x[j]) * ((long long)y01[j]) ); + ac2 += ( ((long long)x[j]) * ((long long)y02[j]) ); + ++j; + ac1 += ( ((long long)x[j]) * ((long long)y01[j]) ); + ac2 += ( ((long long)x[j]) * ((long long)y02[j]) ); } + xy01 = ac1; + xy02 = ac2; *xy1 = xy01; *xy2 = xy02; } @@ -349,11 +349,32 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain) #ifdef FIXED_POINT int k; #endif - opus_val32 E; + opus_val32 E = EPSILON; opus_val16 g; opus_val32 t; - celt_norm *xptr; - E = EPSILON + celt_inner_prod(X, X, N); + celt_norm *xptr = X; + + int X0, X2, X3, X1; + { + long long ac1 = ((long long)E); + /*if(N %4) + printf("error");*/ + for (i=0;i<N-2;i+=2) + { + X0 = (int)*xptr++; + ac1 += ( ((long long)X0) * ((long long)X0) ); + + X1 = (int)*xptr++; + ac1 += ( ((long long)X1) * ((long long)X1) ); + } + for (;i<N;i++) + { + X0 = (int)*xptr++; + ac1 += ( ((long long)X0) * ((long long)X0) ); + } + E = ac1; + } + #ifdef FIXED_POINT k = celt_ilog2(E)>>1; #endif |