author    Jean-Marc Valin <jmvalin@jmvalin.ca>  2014-06-19 00:24:14 -0400
committer Jean-Marc Valin <jmvalin@jmvalin.ca>  2014-06-19 00:24:14 -0400
commit    9618b5cb6308961861ffffdf3794fc0d07bea2f3 (patch)
tree      5ddbb6b85af75a83f8bd8e48986484f354be9fab
parent    a88d8365d42e5369777afa496b43ea88a1aa9106 (diff)
download  opus-exp_mips_alt_jun03.tar.gz
C equivalent of the MIPS code (exp_mips_alt_jun03)
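
The patch replaces the macro-based fixed-point primitives (C_MUL, MULT16_32_Q15, silk_SMULWB, xcorr_kernel and friends) with plain C that widens to a 64-bit long long, accumulates, and shifts back down, mirroring the 64-bit accumulator usage of the hand-written MIPS DSP code. A minimal standalone sketch of the pattern, assuming nothing beyond standard C99; the function name and the main() driver below are illustrative only, not part of the patch:

    #include <stdio.h>

    /* Q15 multiply of a 16-bit coefficient by a 32-bit sample:
       one widening multiply into a 64-bit accumulator, then a
       15-bit arithmetic shift right, instead of splitting the
       32-bit operand into high/low halves as the generic macros do. */
    static inline int mult16_32_q15(int a, int b)
    {
        long long acc = (long long)a * (long long)b;
        return (int)(acc >> 15);
    }

    int main(void)
    {
        int half_q15 = 16384;      /* 0.5 in Q15 */
        int sample   = 2000000;    /* arbitrary 32-bit value */
        printf("%d\n", mult16_32_q15(half_q15, sample));  /* prints 1000000 */
        return 0;
    }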
-rw-r--r--  Makefile.unix                                6
-rw-r--r--  celt/_kiss_fft_guts.h                       40
-rw-r--r--  celt/celt.c                                 57
-rw-r--r--  celt/fixed_generic.h                        72
-rw-r--r--  celt/kiss_fft.c                             18
-rw-r--r--  celt/kiss_fft.h                             28
-rw-r--r--  celt/mdct.c                                 30
-rw-r--r--  celt/pitch.h                               126
-rw-r--r--  celt/vq.c                                   27
-rw-r--r--  silk/NSQ_del_dec.c                         115
-rw-r--r--  silk/fixed/warped_autocorrelation_FIX.c     90
-rw-r--r--  silk/macros.h                               46
12 files changed, 526 insertions, 129 deletions
diff --git a/Makefile.unix b/Makefile.unix
index 82b3d13b..9295601b 100644
--- a/Makefile.unix
+++ b/Makefile.unix
@@ -1,7 +1,7 @@
#################### COMPILE OPTIONS #######################
# Uncomment this for fixed-point build
-#FIXED_POINT=1
+FIXED_POINT=1
# It is strongly recommended to uncomment one of these
# VAR_ARRAYS: Use C99 variable-length arrays for stack allocation
@@ -45,7 +45,9 @@ ldflags-from-ldlibdirs = $(addprefix -L,$(1))
ldlibs-from-libs = $(addprefix -l,$(1))
WARNINGS = -Wall -W -Wstrict-prototypes -Wextra -Wcast-align -Wnested-externs -Wshadow
-CFLAGS += -O2 -g $(WARNINGS) -DOPUS_BUILD
+
+CFLAGS += -mips32r2 -mno-mips16 -std=gnu99 -O2 -g $(WARNINGS) -DENABLE_ASSERTIONS -DOPUS_BUILD -mdspr2 -march=74kc -mtune=74kc -mmt -mgp32
+
CINCLUDES = include silk celt
ifdef FIXED_POINT
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index 9dfb2dbe..f0840655 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -61,10 +61,50 @@
do{ (m).r = SUB32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
(m).i = ADD32(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0)
+
+#undef C_MUL
+# define C_MUL(m,a,b) (m=C_MUL_fun(a,b))
+static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
+ kiss_fft_cpx m;
+ long long ac1 = ((long long)a.r * (long long)b.r);
+ long long ac2 = ((long long)a.i * (long long)b.i);
+ ac1 = ac1 - ac2;
+ ac1 = ac1 >> 15;
+ m.r = ac1;
+
+ ac1 = ((long long)a.r * (long long)b.i);
+ ac2 = ((long long)a.i * (long long)b.r);
+ ac1 = ac1 + ac2;
+ ac1 = ac1 >> 15;
+ m.i = ac1;
+
+ return m;
+}
+
# define C_MULC(m,a,b) \
do{ (m).r = ADD32(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
(m).i = SUB32(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)
+
+#undef C_MULC
+# define C_MULC(m,a,b) (m=C_MULC_fun(a,b))
+static inline kiss_fft_cpx C_MULC_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
+ kiss_fft_cpx m;
+
+ long long ac1 = ((long long)a.r * (long long)b.r);
+ long long ac2 = ((long long)a.i * (long long)b.i);
+ ac1 = ac1 + ac2;
+ ac1 = ac1 >> 15;
+ m.r = ac1;
+
+ ac1 = ((long long)a.i * (long long)b.r);
+ ac2 = ((long long)a.r * (long long)b.i);
+ ac1 = ac1 - ac2;
+ ac1 = ac1 >> 15;
+ m.i = ac1;
+
+ return m;
+}
# define C_MULBYSCALAR( c, s ) \
do{ (c).r = S_MUL( (c).r , s ) ;\
(c).i = S_MUL( (c).i , s ) ; }while(0)
diff --git a/celt/celt.c b/celt/celt.c
index 7e47ea49..9a448b96 100644
--- a/celt/celt.c
+++ b/celt/celt.c
@@ -205,22 +205,38 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
for (i=0;i<overlap;i++)
{
opus_val16 f;
- x0=x[i-T1+2];
+ opus_val32 res;
f = MULT16_16_Q15(window[i],window[i]);
- y[i] = x[i]
- + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g00),x[i-T0])
- + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),ADD32(x[i-T0+1],x[i-T0-1]))
- + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),ADD32(x[i-T0+2],x[i-T0-2]))
- + MULT16_32_Q15(MULT16_16_Q15(f,g10),x2)
- + MULT16_32_Q15(MULT16_16_Q15(f,g11),ADD32(x1,x3))
- + MULT16_32_Q15(MULT16_16_Q15(f,g12),ADD32(x0,x4));
+ x0= x[i-T1+2];
+
+ {
+ long long ac1 = 0;
+ ac1 = ((long long)MULT16_16_Q15((Q15ONE-f),g00)) * ((long long )x[i-T0]);
+ ac1 += ( ((long long)MULT16_16_Q15((Q15ONE-f),g01)) * ((long long)ADD32(x[i-T0-1],x[i-T0+1])) );
+ ac1 += ( ((long long)MULT16_16_Q15((Q15ONE-f),g02)) * ((long long)ADD32(x[i-T0-2],x[i-T0+2])) );
+ ac1 += ( ((long long)MULT16_16_Q15(f,g10)) * ((long long)x2) );
+ ac1 += ( ((long long)MULT16_16_Q15(f,g11)) * ((long long)ADD32(x3,x1)) );
+ ac1 += ( ((long long)MULT16_16_Q15(f,g12)) * ((long long)ADD32(x4,x0)) );
+
+ ac1 = ac1 >> 15;
+ res = ac1;
+ }
+
+ y[i] = x[i] + res;
+
x4=x3;
x3=x2;
x2=x1;
x1=x0;
-
}
- if (g1==0)
+
+
+ x4 = x[i-T1-2];
+ x3 = x[i-T1-1];
+ x2 = x[i-T1];
+ x1 = x[i-T1+1];
+
+ if (g1==0)
{
/* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
if (x!=y)
@@ -228,8 +244,25 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
return;
}
- /* Compute the part with the constant filter. */
- comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
+ for (i=overlap;i<N;i++) {
+
+ opus_val32 res;
+ x0=x[i-T1+2];
+ {
+ long long ac1 = 0;
+ ac1 = ( ((long long)g10) * ((long long)x2) );
+ ac1 += ( ((long long)g11) * ((long long)ADD32(x3,x1)) );
+ ac1 += ( ((long long)g12) * ((long long)ADD32(x4,x0)));
+ ac1 = ac1 >> 15;
+ res = ac1;
+ }
+
+ y[i] = x[i] + res;
+ x4=x3;
+ x3=x2;
+ x2=x1;
+ x1=x0;
+ }
}
const signed char tf_select_table[4][8] = {
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 5ea1c7ba..7bd60e60 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -33,21 +33,82 @@
#ifndef FIXED_GENERIC_H
#define FIXED_GENERIC_H
+static inline int MULT16_16_Q15_ADD(int a, int b, int c, int d) {
+ int m;
+ long long ac1 = ((long long)a * (long long)b);
+ long long ac2 = ((long long)c * (long long)d);
+ ac1 += ac2;
+ ac1 = ac1>>15;
+ m = (int )(ac1);
+ return m;
+}
+
+static inline int MULT16_16_Q15_SUB(int a, int b, int c, int d) {
+ int m;
+ long long ac1 = ((long long)a * (long long)b);
+ long long ac2 = ((long long)c * (long long)d);
+ ac1 -= ac2;
+ ac1 = ac1>>15;
+ m = (int )(ac1);
+ return m;
+}
+
/** Multiply a 16-bit signed value by a 16-bit unsigned value. The result is a 32-bit signed value */
#define MULT16_16SU(a,b) ((opus_val32)(opus_val16)(a)*(opus_val32)(opus_uint16)(b))
/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
#define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
+#undef MULT16_32_Q16
+static inline int MULT16_32_Q16(int a, int b)
+{
+ int c;
+ long long ac1 = ((long long)a * (long long)b);
+ ac1 = ac1>>16;
+ c =(int)(ac1);
+ return c;
+}
+
/** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Results fits in 32 bits */
#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
+#undef MULT16_32_P16
+static inline int MULT16_32_P16(int a, int b)
+{
+ int c;
+ long long ac1 = ((long long)a * (long long)b);
+ ac1 += 32768;
+ ac1 = ac1>>16;
+ c =(int)(ac1);
+ return c;
+}
+
/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
+#undef MULT16_32_Q15
+static inline int MULT16_32_Q15(int a, int b)
+{
+ int c;
+ long long ac1 = ((long long)a * (long long)b);
+ ac1 = ac1>>15;
+ c =(int)(ac1);
+ return c;
+}
+
/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
#define MULT32_32_Q31(a,b) ADD32(ADD32(SHL(MULT16_16(SHR((a),16),SHR((b),16)),1), SHR(MULT16_16SU(SHR((a),16),((b)&0x0000ffff)),15)), SHR(MULT16_16SU(SHR((b),16),((a)&0x0000ffff)),15))
+#undef MULT32_32_Q31
+static inline int MULT32_32_Q31(int a, int b)
+{
+ int c;
+ long long ac1 = ((long long)a * (long long)b);
+ ac1 = ac1>>31;
+ c =(int)(ac1);
+ return c;
+}
+
/** Compile-time conversion of float constant to 16-bit value */
#define QCONST16(x,bits) ((opus_val16)(.5+(x)*(((opus_val32)1)<<(bits))))
@@ -129,6 +190,17 @@
#define MULT16_16_P14(a,b) (SHR(ADD32(8192,MULT16_16((a),(b))),14))
#define MULT16_16_P15(a,b) (SHR(ADD32(16384,MULT16_16((a),(b))),15))
+#undef MULT16_16_P15
+static inline int MULT16_16_P15(int a, int b)
+{
+ int r;
+ int ac1 = a*b;
+ ac1 += 16384;
+ ac1 = ac1 >> 15;
+ r = (int)(ac1);
+ return r;
+}
+
/** Divide a 32-bit value by a 16-bit value. Result fits in 16 bits */
#define DIV32_16(a,b) ((opus_val16)(((opus_val32)(a))/((opus_val16)(b))))
diff --git a/celt/kiss_fft.c b/celt/kiss_fft.c
index 0bf058c2..a49b99ef 100644
--- a/celt/kiss_fft.c
+++ b/celt/kiss_fft.c
@@ -283,20 +283,20 @@ static void kf_bfly5(
Fout0->r += scratch[7].r + scratch[8].r;
Fout0->i += scratch[7].i + scratch[8].i;
+ scratch[5].r = scratch[0].r + S_MUL_ADD(scratch[7].r,ya.r,scratch[8].r,yb.r);
+ scratch[5].i = scratch[0].i + S_MUL_ADD(scratch[7].i,ya.r,scratch[8].i,yb.r);
- scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
- scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
-
- scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
- scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
+ scratch[6].r = S_MUL_ADD(scratch[10].i,ya.i,scratch[9].i,yb.i);
+ scratch[6].i = -S_MUL_ADD(scratch[10].r,ya.i,scratch[9].r,yb.i);
C_SUB(*Fout1,scratch[5],scratch[6]);
C_ADD(*Fout4,scratch[5],scratch[6]);
- scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
- scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
- scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
- scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
+ scratch[11].r = scratch[0].r + S_MUL_ADD(scratch[7].r,yb.r,scratch[8].r,ya.r);
+ scratch[11].i = scratch[0].i + S_MUL_ADD(scratch[7].i,yb.r,scratch[8].i,ya.r);
+
+ scratch[12].r = S_MUL_SUB(scratch[9].i,ya.i,scratch[10].i,yb.i);
+ scratch[12].i = S_MUL_SUB(scratch[10].r,yb.i,scratch[9].r,ya.i);
C_ADD(*Fout2,scratch[11],scratch[12]);
C_SUB(*Fout3,scratch[11],scratch[12]);
diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h
index 390b54d9..af62f75b 100644
--- a/celt/kiss_fft.h
+++ b/celt/kiss_fft.h
@@ -114,6 +114,34 @@ typedef struct kiss_fft_state{
* buffer size in *lenmem.
* */
+
+#define S_MUL_ADD(a, b, c, d) (S_MUL(a,b)+S_MUL(c,d))
+#define S_MUL_SUB(a, b, c, d) (S_MUL(a,b)-S_MUL(c,d))
+
+#undef S_MUL_ADD
+static inline int S_MUL_ADD(int a, int b, int c, int d) {
+ int m;
+ long long ac1 = ((long long)a * (long long)b);
+ long long ac2 = ((long long)c * (long long)d);
+ ac1 += ac2;
+ ac1 = ac1>>15;
+ m = (int )(ac1);
+ return m;
+}
+
+
+#undef S_MUL_SUB
+static inline int S_MUL_SUB(int a, int b, int c, int d) {
+ int m;
+ long long ac1 = ((long long)a * (long long)b);
+ long long ac2 = ((long long)c * (long long)d);
+ ac1 -= ac2;
+ ac1 = ac1>>15;
+ m = (int )(ac1);
+ return m;
+}
+
+
kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem, const kiss_fft_state *base);
kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem);
diff --git a/celt/mdct.c b/celt/mdct.c
index fa5098cd..a9ffef0b 100644
--- a/celt/mdct.c
+++ b/celt/mdct.c
@@ -153,8 +153,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
for(i=0;i<((overlap+3)>>2);i++)
{
/* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
- *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
- *yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]);
+ *yp++ = S_MUL_ADD(*wp2, xp1[N2],*wp1,*xp2);
+ *yp++ = S_MUL_SUB(*wp1, *xp1,*wp2, xp2[-N2]);
xp1+=2;
xp2-=2;
wp1+=2;
@@ -173,8 +173,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
for(;i<N4;i++)
{
/* Real part arranged as a-bR, Imag part arranged as -c-dR */
- *yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
- *yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]);
+ *yp++ = S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]);
+ *yp++ = S_MUL_ADD(*wp2, *xp1, *wp1, xp2[N2]);
xp1+=2;
xp2-=2;
wp1+=2;
@@ -194,8 +194,10 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
t1 = t[N4+i];
re = *yp++;
im = *yp++;
- yr = S_MUL(re,t0) - S_MUL(im,t1);
- yi = S_MUL(im,t0) + S_MUL(re,t1);
+
+ yr = S_MUL_SUB(re,t0,im,t1);
+ yi = S_MUL_ADD(im,t0,re,t1);
+
yc.r = yr;
yc.i = yi;
yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
@@ -218,8 +220,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
for(i=0;i<N4;i++)
{
kiss_fft_scalar yr, yi;
- yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
- yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
+ yr = S_MUL_SUB(fp->i,t[N4+i] , fp->r,t[i]);
+ yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]);
*yp1 = yr;
*yp2 = yi;
fp++;
@@ -260,8 +262,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
int rev;
kiss_fft_scalar yr, yi;
rev = *bitrev++;
- yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
- yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
+ yr = S_MUL_ADD(*xp2, t[i] , *xp1, t[N4+i]);
+ yi = S_MUL_SUB(*xp1, t[i] , *xp2, t[N4+i]);
/* We swap real and imag because we use an FFT instead of an IFFT. */
yp[2*rev+1] = yr;
yp[2*rev] = yi;
@@ -291,8 +293,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
t0 = t[i];
t1 = t[N4+i];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
- yr = S_MUL(re,t0) + S_MUL(im,t1);
- yi = S_MUL(re,t1) - S_MUL(im,t0);
+ yr = S_MUL_ADD(re,t0 , im,t1);
+ yi = S_MUL_SUB(re,t1 , im,t0);
/* We swap real and imag because we're using an FFT instead of an IFFT. */
re = yp1[1];
im = yp1[0];
@@ -302,8 +304,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
t0 = t[(N4-i-1)];
t1 = t[(N2-i-1)];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
- yr = S_MUL(re,t0) + S_MUL(im,t1);
- yi = S_MUL(re,t1) - S_MUL(im,t0);
+ yr = S_MUL_ADD(re,t0,im,t1);
+ yi = S_MUL_SUB(re,t1,im,t0);
yp1[0] = yr;
yp0[1] = yi;
yp0 += 2;
diff --git a/celt/pitch.h b/celt/pitch.h
index ec55acae..4b19754d 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -61,66 +61,91 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y,
{
int j;
opus_val16 y_0, y_1, y_2, y_3;
- celt_assert(len>=3);
- y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
- y_0=*y++;
- y_1=*y++;
- y_2=*y++;
- for (j=0;j<len-3;j+=4)
- {
- opus_val16 tmp;
- tmp = *x++;
- y_3=*y++;
- sum[0] = MAC16_16(sum[0],tmp,y_0);
- sum[1] = MAC16_16(sum[1],tmp,y_1);
- sum[2] = MAC16_16(sum[2],tmp,y_2);
- sum[3] = MAC16_16(sum[3],tmp,y_3);
- tmp=*x++;
- y_0=*y++;
- sum[0] = MAC16_16(sum[0],tmp,y_1);
- sum[1] = MAC16_16(sum[1],tmp,y_2);
- sum[2] = MAC16_16(sum[2],tmp,y_3);
- sum[3] = MAC16_16(sum[3],tmp,y_0);
- tmp=*x++;
- y_1=*y++;
- sum[0] = MAC16_16(sum[0],tmp,y_2);
- sum[1] = MAC16_16(sum[1],tmp,y_3);
- sum[2] = MAC16_16(sum[2],tmp,y_0);
- sum[3] = MAC16_16(sum[3],tmp,y_1);
+
+ opus_int64 sum_0, sum_1, sum_2, sum_3;
+ sum_0 = (opus_int64)sum[0];
+ sum_1 = (opus_int64)sum[1];
+ sum_2 = (opus_int64)sum[2];
+ sum_3 = (opus_int64)sum[3];
+
+ y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
+ y_0=*y++;
+ y_1=*y++;
+ y_2=*y++;
+ for (j=0;j<len-3;j+=4)
+ {
+ opus_val16 tmp;
+ tmp = *x++;
+ y_3=*y++;
+
+ sum_0 += ( ((long long)tmp) * ((long long)y_0) );
+ sum_1 += ( ((long long)tmp) * ((long long)y_1) );
+ sum_2 += ( ((long long)tmp) * ((long long)y_2) );
+ sum_3 += ( ((long long)tmp) * ((long long)y_3) );
+
+ tmp=*x++;
+ y_0=*y++;
+
+ sum_0 += ( ((long long)tmp) * ((long long)y_1) );
+ sum_1 += ( ((long long)tmp) * ((long long)y_2) );
+ sum_2 += ( ((long long)tmp) * ((long long)y_3) );
+ sum_3 += ( ((long long)tmp) * ((long long)y_0) );
+
+ tmp=*x++;
+ y_1=*y++;
+
+ sum_0 += ( ((long long)tmp) * ((long long)y_2) );
+ sum_1 += ( ((long long)tmp) * ((long long)y_3) );
+ sum_2 += ( ((long long)tmp) * ((long long)y_0) );
+ sum_3 += ( ((long long)tmp) * ((long long)y_1) );
+
+
tmp=*x++;
y_2=*y++;
- sum[0] = MAC16_16(sum[0],tmp,y_3);
- sum[1] = MAC16_16(sum[1],tmp,y_0);
- sum[2] = MAC16_16(sum[2],tmp,y_1);
- sum[3] = MAC16_16(sum[3],tmp,y_2);
+
+ sum_0 += ( ((long long)tmp) * ((long long)y_3) );
+ sum_1 += ( ((long long)tmp) * ((long long)y_0) );
+ sum_2 += ( ((long long)tmp) * ((long long)y_1) );
+ sum_3 += ( ((long long)tmp) * ((long long)y_2) );
+
}
if (j++<len)
{
opus_val16 tmp = *x++;
y_3=*y++;
- sum[0] = MAC16_16(sum[0],tmp,y_0);
- sum[1] = MAC16_16(sum[1],tmp,y_1);
- sum[2] = MAC16_16(sum[2],tmp,y_2);
- sum[3] = MAC16_16(sum[3],tmp,y_3);
+
+ sum_0 += ( ((long long)tmp) * ((long long)y_0) );
+ sum_1 += ( ((long long)tmp) * ((long long)y_1) );
+ sum_2 += ( ((long long)tmp) * ((long long)y_2) );
+ sum_3 += ( ((long long)tmp) * ((long long)y_3) );
}
+
if (j++<len)
{
opus_val16 tmp=*x++;
y_0=*y++;
- sum[0] = MAC16_16(sum[0],tmp,y_1);
- sum[1] = MAC16_16(sum[1],tmp,y_2);
- sum[2] = MAC16_16(sum[2],tmp,y_3);
- sum[3] = MAC16_16(sum[3],tmp,y_0);
+
+ sum_0 += ( ((long long)tmp) * ((long long)y_1) );
+ sum_1 += ( ((long long)tmp) * ((long long)y_2) );
+ sum_2 += ( ((long long)tmp) * ((long long)y_3) );
+ sum_3 += ( ((long long)tmp) * ((long long)y_0) );
}
+
if (j<len)
{
opus_val16 tmp=*x++;
y_1=*y++;
- sum[0] = MAC16_16(sum[0],tmp,y_2);
- sum[1] = MAC16_16(sum[1],tmp,y_3);
- sum[2] = MAC16_16(sum[2],tmp,y_0);
- sum[3] = MAC16_16(sum[3],tmp,y_1);
+
+ sum_0 += ( ((long long)tmp) * ((long long)y_2) );
+ sum_1 += ( ((long long)tmp) * ((long long)y_3) );
+ sum_2 += ( ((long long)tmp) * ((long long)y_0) );
+ sum_3 += ( ((long long)tmp) * ((long long)y_1) );
}
+
+ sum[0] = (opus_val32)sum_0;
+ sum[1] = (opus_val32)sum_1;
+ sum[2] = (opus_val32)sum_2;
+ sum[3] = (opus_val32)sum_3;
}
#endif /* OVERRIDE_XCORR_KERNEL */
@@ -128,14 +153,23 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y,
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
{
- int i;
+ int j;
opus_val32 xy01=0;
opus_val32 xy02=0;
- for (i=0;i<N;i++)
+ long long ac1 = 0;
+ long long ac2 = 0;
+
+ /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
+ for (j=0;j<N;j++)
{
- xy01 = MAC16_16(xy01, x[i], y01[i]);
- xy02 = MAC16_16(xy02, x[i], y02[i]);
+ ac1 += ( ((long long)x[j]) * ((long long)y01[j]) );
+ ac2 += ( ((long long)x[j]) * ((long long)y02[j]) );
+ ++j;
+ ac1 += ( ((long long)x[j]) * ((long long)y01[j]) );
+ ac2 += ( ((long long)x[j]) * ((long long)y02[j]) );
}
+ xy01 = ac1;
+ xy02 = ac2;
*xy1 = xy01;
*xy2 = xy02;
}
diff --git a/celt/vq.c b/celt/vq.c
index 6bf9b2b0..3c10ec27 100644
--- a/celt/vq.c
+++ b/celt/vq.c
@@ -349,11 +349,32 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
#ifdef FIXED_POINT
int k;
#endif
- opus_val32 E;
+ opus_val32 E = EPSILON;
opus_val16 g;
opus_val32 t;
- celt_norm *xptr;
- E = EPSILON + celt_inner_prod(X, X, N);
+ celt_norm *xptr = X;
+
+ int X0, X2, X3, X1;
+ {
+ long long ac1 = ((long long)E);
+ /*if(N %4)
+ printf("error");*/
+ for (i=0;i<N-2;i+=2)
+ {
+ X0 = (int)*xptr++;
+ ac1 += ( ((long long)X0) * ((long long)X0) );
+
+ X1 = (int)*xptr++;
+ ac1 += ( ((long long)X1) * ((long long)X1) );
+ }
+ for (;i<N;i++)
+ {
+ X0 = (int)*xptr++;
+ ac1 += ( ((long long)X0) * ((long long)X0) );
+ }
+ E = ac1;
+ }
+
#ifdef FIXED_POINT
k = celt_ilog2(E)>>1;
#endif
diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index 522be406..32993c94 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -339,13 +339,44 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
- VARDECL( NSQ_sample_pair, psSampleState );
+ NSQ_sample_struct psSampleState[ MAX_DEL_DEC_STATES ][ 2 ];
NSQ_del_dec_struct *psDD;
NSQ_sample_struct *psSS;
- SAVE_STACK;
+ opus_int16 b_Q14_0, b_Q14_1, b_Q14_2, b_Q14_3, b_Q14_4;
+ opus_int16 a_Q12_0, a_Q12_1, a_Q12_2, a_Q12_3, a_Q12_4, a_Q12_5, a_Q12_6;
+ opus_int16 a_Q12_7, a_Q12_8, a_Q12_9, a_Q12_10, a_Q12_11, a_Q12_12, a_Q12_13;
+ opus_int16 a_Q12_14, a_Q12_15;
+
+ opus_int32 cur, prev, next;
+
+ //Initialize b_Q14 variables
+ b_Q14_0 = b_Q14[ 0 ];
+ b_Q14_1 = b_Q14[ 1 ];
+ b_Q14_2 = b_Q14[ 2 ];
+ b_Q14_3 = b_Q14[ 3 ];
+ b_Q14_4 = b_Q14[ 4 ];
+
+ //Initialize a_Q12 variables
+ a_Q12_0 = a_Q12[0];
+ a_Q12_1 = a_Q12[1];
+ a_Q12_2 = a_Q12[2];
+ a_Q12_3 = a_Q12[3];
+ a_Q12_4 = a_Q12[4];
+ a_Q12_5 = a_Q12[5];
+ a_Q12_6 = a_Q12[6];
+ a_Q12_7 = a_Q12[7];
+ a_Q12_8 = a_Q12[8];
+ a_Q12_9 = a_Q12[9];
+ a_Q12_10 = a_Q12[10];
+ a_Q12_11 = a_Q12[11];
+ a_Q12_12 = a_Q12[12];
+ a_Q12_13 = a_Q12[13];
+ a_Q12_14 = a_Q12[14];
+ a_Q12_15 = a_Q12[15];
+
+ long long temp64;
silk_assert( nStatesDelayedDecision > 0 );
- ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
@@ -358,12 +389,15 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
if( signalType == TYPE_VOICED ) {
/* Unrolled loop */
/* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
- LTP_pred_Q14 = 2;
- LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ 0 ], b_Q14[ 0 ] );
- LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
- LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
- LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
- LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
+ temp64 = ( ((long long)pred_lag_ptr[ 0 ]) * ((long long)b_Q14_0) );
+ temp64 += ( ((long long)pred_lag_ptr[ -1 ]) * ((long long)b_Q14_1) );
+ temp64 += ( ((long long)pred_lag_ptr[ -2 ]) * ((long long)b_Q14_2) );
+ temp64 += ( ((long long)pred_lag_ptr[ -3 ]) * ((long long)b_Q14_3) );
+ temp64 += ( ((long long)pred_lag_ptr[ -4 ]) * ((long long)b_Q14_4) );
+ temp64 += 32768;
+ temp64 = temp64 >> 16;
+ LTP_pred_Q14 = temp64;
+
LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
pred_lag_ptr++;
} else {
@@ -395,26 +429,29 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
/* Short-term prediction */
silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
- /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
- LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ 0 ], a_Q12[ 0 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -1 ], a_Q12[ 1 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -2 ], a_Q12[ 2 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -3 ], a_Q12[ 3 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -4 ], a_Q12[ 4 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -5 ], a_Q12[ 5 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -6 ], a_Q12[ 6 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -7 ], a_Q12[ 7 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
+ temp64 = ( ((long long)psLPC_Q14[ 0 ]) * ((long long)a_Q12_0) );
+ temp64 += ( ((long long)psLPC_Q14[ -1 ]) * ((long long)a_Q12_1) );
+ temp64 += ( ((long long)psLPC_Q14[ -2 ]) * ((long long)a_Q12_2) );
+ temp64 += ( ((long long)psLPC_Q14[ -3 ]) * ((long long)a_Q12_3) );
+ temp64 += ( ((long long)psLPC_Q14[ -4 ]) * ((long long)a_Q12_4) );
+ temp64 += ( ((long long)psLPC_Q14[ -5 ]) * ((long long)a_Q12_5) );
+ temp64 += ( ((long long)psLPC_Q14[ -6 ]) * ((long long)a_Q12_6) );
+ temp64 += ( ((long long)psLPC_Q14[ -7 ]) * ((long long)a_Q12_7) );
+ temp64 += ( ((long long)psLPC_Q14[ -8 ]) * ((long long)a_Q12_8) );
+ temp64 += ( ((long long)psLPC_Q14[ -9 ]) * ((long long)a_Q12_9) );
if( predictLPCOrder == 16 ) {
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -10 ], a_Q12[ 10 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -11 ], a_Q12[ 11 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -12 ], a_Q12[ 12 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -13 ], a_Q12[ 13 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -14 ], a_Q12[ 14 ] );
- LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -15 ], a_Q12[ 15 ] );
+ temp64 += ( ((long long)psLPC_Q14[ -10 ]) * ((long long)a_Q12_10) );
+ temp64 += ( ((long long)psLPC_Q14[ -11 ]) * ((long long)a_Q12_11) );
+ temp64 += ( ((long long)psLPC_Q14[ -12 ]) * ((long long)a_Q12_12) );
+ temp64 += ( ((long long)psLPC_Q14[ -13 ]) * ((long long)a_Q12_13) );
+ temp64 += ( ((long long)psLPC_Q14[ -14 ]) * ((long long)a_Q12_14) );
+ temp64 += ( ((long long)psLPC_Q14[ -15 ]) * ((long long)a_Q12_15) );
}
+ temp64 += 32768;
+ temp64 = temp64 >> 16;
+ LPC_pred_Q14 = temp64;
+
+
LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
/* Noise shape feedback */
@@ -424,21 +461,31 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
/* Output of allpass section */
tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
psDD->sAR2_Q14[ 0 ] = tmp2;
- n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
- n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
+
+ temp64 = ( ((long long)tmp2) * ((long long)AR_shp_Q13[ 0 ]) );
+
+ prev = psDD->sAR2_Q14[ 1 ];
+
/* Loop over allpass sections */
for( j = 2; j < shapingLPCOrder; j += 2 ) {
+ cur = psDD->sAR2_Q14[ j ];
+ next = psDD->sAR2_Q14[ j+1 ];
/* Output of allpass section */
- tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
+ tmp2 = silk_SMLAWB( prev, cur - tmp1, warping_Q16 );
psDD->sAR2_Q14[ j - 1 ] = tmp1;
- n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
+ temp64 += ( ((long long)tmp1) * ((long long)AR_shp_Q13[ j - 1 ]) );
+ temp64 += ( ((long long)tmp2) * ((long long)AR_shp_Q13[ j ]) );
/* Output of allpass section */
- tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
+ tmp1 = silk_SMLAWB( cur, next - tmp2, warping_Q16 );
psDD->sAR2_Q14[ j + 0 ] = tmp2;
- n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
+ prev = next;
}
psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
- n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
+ temp64 += ( ((long long)tmp1) * ((long long)AR_shp_Q13[ shapingLPCOrder - 1 ]) );
+ temp64 += 32768;
+ temp64 = temp64 >> 16;
+ n_AR_Q14 = temp64;
+
n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */
n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */
diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c
index a4a579b1..47671346 100644
--- a/silk/fixed/warped_autocorrelation_FIX.c
+++ b/silk/fixed/warped_autocorrelation_FIX.c
@@ -45,44 +45,116 @@ void silk_warped_autocorrelation_FIX(
)
{
opus_int n, i, lsh;
- opus_int32 tmp1_QS, tmp2_QS;
+ opus_int32 tmp1_QS=0, tmp2_QS=0, tmp3_QS=0, tmp4_QS=0, tmp5_QS=0, tmp6_QS=0, tmp7_QS=0, tmp8_QS=0,start_1=0, start_2=0, start_3=0;
opus_int32 state_QS[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+ opus_int64 temp64;
+
+ opus_int64 val;
+ val = (opus_int64)(2 * QS - QC);
+
/* Order must be even */
silk_assert( ( order & 1 ) == 0 );
silk_assert( 2 * QS - QC >= 0 );
/* Loop over samples */
- for( n = 0; n < length; n++ ) {
+ for( n = 0; n < length; n=n+4 ) {
+
+ tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+ start_1 = tmp1_QS;
+ tmp3_QS = silk_LSHIFT32( (opus_int32)input[ n+1], QS );
+ start_2 = tmp3_QS;
+ tmp5_QS = silk_LSHIFT32( (opus_int32)input[ n+2], QS );
+ start_3 = tmp5_QS;
+ tmp7_QS = silk_LSHIFT32( (opus_int32)input[ n+3], QS );
+
+ /* Loop over allpass sections */
+ for( i = 0; i < order; i += 2 ) {
+
+ /* Output of allpass section */
+ tmp2_QS = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 );
+ corr_QC[ i ] += ( ((long long)tmp1_QS) * ((long long)start_1) );
+
+ tmp4_QS = silk_SMLAWB( tmp1_QS, tmp2_QS - tmp3_QS, warping_Q16 );
+ corr_QC[ i ] += ( ((long long)tmp3_QS) * ((long long)start_2) );
+
+ tmp6_QS = silk_SMLAWB( tmp3_QS, tmp4_QS - tmp5_QS, warping_Q16 );
+ corr_QC[ i ] += ( ((long long)tmp5_QS) * ((long long)start_3) );
+
+ tmp8_QS = silk_SMLAWB( tmp5_QS, tmp6_QS - tmp7_QS, warping_Q16 );
+ state_QS[ i ] = tmp7_QS;
+ corr_QC[ i ] += ( ((long long)tmp7_QS) * ((long long)state_QS[0]) );
+
+
+ /* Output of allpass section */
+ tmp1_QS = silk_SMLAWB( state_QS[ i + 1 ], state_QS[ i + 2 ] - tmp2_QS, warping_Q16 );
+ corr_QC[ i +1] += ( ((long long)tmp2_QS) * ((long long)start_1) );
+
+ tmp3_QS = silk_SMLAWB( tmp2_QS, tmp1_QS - tmp4_QS, warping_Q16 );
+ corr_QC[ i +1] += ( ((long long)tmp4_QS) * ((long long)start_2) );
+
+ tmp5_QS = silk_SMLAWB( tmp4_QS, tmp3_QS - tmp6_QS, warping_Q16 );
+ corr_QC[ i +1] += ( ((long long)tmp6_QS) * ((long long)start_3) );
+
+ tmp7_QS = silk_SMLAWB( tmp6_QS, tmp5_QS - tmp8_QS, warping_Q16 );
+ state_QS[ i + 1 ] = tmp8_QS;
+ corr_QC[ i +1] += ( ((long long)tmp8_QS) * ((long long)state_QS[ 0 ]) );
+
+ }
+ state_QS[ order ] = tmp7_QS;
+
+ corr_QC[order] += ( ((long long)tmp1_QS) * ((long long)start_1) );
+ corr_QC[order] += ( ((long long)tmp3_QS) * ((long long)start_2) );
+ corr_QC[order] += ( ((long long)tmp5_QS) * ((long long)start_3) );
+ corr_QC[order] += ( ((long long)tmp7_QS) * ((long long)state_QS[ 0 ]) );
+ }
+
+ for(;n< length; n++ ) {
+
tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+
/* Loop over allpass sections */
for( i = 0; i < order; i += 2 ) {
+
/* Output of allpass section */
tmp2_QS = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 );
- state_QS[ i ] = tmp1_QS;
- corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS, state_QS[ 0 ] ), 2 * QS - QC );
+ state_QS[ i ] = tmp1_QS;
+ corr_QC[i] += ( ((long long)tmp1_QS) * ((long long)state_QS[ 0 ]) );
+
/* Output of allpass section */
tmp1_QS = silk_SMLAWB( state_QS[ i + 1 ], state_QS[ i + 2 ] - tmp2_QS, warping_Q16 );
state_QS[ i + 1 ] = tmp2_QS;
- corr_QC[ i + 1 ] += silk_RSHIFT64( silk_SMULL( tmp2_QS, state_QS[ 0 ] ), 2 * QS - QC );
+ corr_QC[i+1] += ( ((long long)tmp2_QS) * ((long long)state_QS[ 0 ]) );
}
state_QS[ order ] = tmp1_QS;
- corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS, state_QS[ 0 ] ), 2 * QS - QC );
+ corr_QC[order] += ( ((long long)tmp1_QS) * ((long long)state_QS[ 0 ]) );
}
- lsh = silk_CLZ64( corr_QC[ 0 ] ) - 35;
+ temp64 = corr_QC[ 0 ];
+ temp64 = (val >= 0) ? (temp64 >> val) : (temp64 << -val);
+
+ lsh = silk_CLZ64( temp64 ) - 35;
lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
*scale = -( QC + lsh );
silk_assert( *scale >= -30 && *scale <= 12 );
if( lsh >= 0 ) {
for( i = 0; i < order + 1; i++ ) {
- corr[ i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) );
+ temp64 = corr_QC[ i ];
+ temp64 = (val >= 0) ? (temp64 >> val) : (temp64 << -val);
+ corr[ i ] = (opus_int32)silk_CHECK_FIT32( ((-lsh) >= 0) ? (temp64 >> (-lsh)) : (temp64 << -(-lsh)) );
}
} else {
for( i = 0; i < order + 1; i++ ) {
- corr[ i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) );
+ temp64 = corr_QC[ i ];
+ temp64 = (val >= 0) ? (temp64 >> val) : (temp64 << -val);
+ corr[ i ] = (opus_int32)silk_CHECK_FIT32( ((-lsh) >= 0) ? (temp64 >> (-lsh)) : (temp64 << -(-lsh)) );
}
}
+
+ temp64 = (corr_QC[ 0 ]>=0) ? corr_QC[ 0 ] : (-corr_QC[ 0 ]);
+ temp64 = (val >= 0) ? (temp64 >> val) : (temp64 << -val);
+ corr_QC[ 0 ] = (opus_int64)temp64;
+
silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/
}
diff --git a/silk/macros.h b/silk/macros.h
index a84e5a5d..c5030703 100644
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -40,9 +40,26 @@ POSSIBILITY OF SUCH DAMAGE.
/* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
#define silk_SMULWB(a32, b32) ((((a32) >> 16) * (opus_int32)((opus_int16)(b32))) + ((((a32) & 0x0000FFFF) * (opus_int32)((opus_int16)(b32))) >> 16))
+#undef silk_SMULWB
+static inline int silk_SMULWB(int a, int b)
+{
+ long long ac;
+ int c;
+
+ ac = ((long long) a * (long long)(opus_int16)b);
+ ac = ac >> 16;
+ c = ac;
+
+ return c;
+}
+
+
/* a32 + (b32 * (opus_int32)((opus_int16)(c32))) >> 16 output have to be 32bit int */
#define silk_SMLAWB(a32, b32, c32) ((a32) + ((((b32) >> 16) * (opus_int32)((opus_int16)(c32))) + ((((b32) & 0x0000FFFF) * (opus_int32)((opus_int16)(c32))) >> 16)))
+#undef silk_SMLAWB
+#define silk_SMLAWB(a32, b32, c32) ((a32) + silk_SMULWB(b32, c32))
+
/* (a32 * (b32 >> 16)) >> 16 */
#define silk_SMULWT(a32, b32) (((a32) >> 16) * ((b32) >> 16) + ((((a32) & 0x0000FFFF) * ((b32) >> 16)) >> 16))
@@ -67,9 +84,38 @@ POSSIBILITY OF SUCH DAMAGE.
/* (a32 * b32) >> 16 */
#define silk_SMULWW(a32, b32) silk_MLA(silk_SMULWB((a32), (b32)), (a32), silk_RSHIFT_ROUND((b32), 16))
+
+#undef silk_SMULWW
+static inline int silk_SMULWW(int a, int b)
+{
+ long long ac;
+ int c;
+
+ ac = ((long long) a * (long long)b);
+ ac = ac >> 16;
+ c = ac;
+
+ return c;
+}
+
/* a32 + ((b32 * c32) >> 16) */
#define silk_SMLAWW(a32, b32, c32) silk_MLA(silk_SMLAWB((a32), (b32), (c32)), (b32), silk_RSHIFT_ROUND((c32), 16))
+#undef silk_SMLAWW
+static inline int silk_SMLAWW(int a, int b, int c)
+{
+ long long ac;
+ int res;
+
+ ac = ((long long)b * (long long)c);
+ ac = ac >> 16;
+ res = ac;
+ res += a;
+
+ return res;
+}
+
+
/* add/subtract with output saturated */
#define silk_ADD_SAT32(a, b) ((((opus_uint32)(a) + (opus_uint32)(b)) & 0x80000000) == 0 ? \
((((a) & (b)) & 0x80000000) != 0 ? silk_int32_MIN : (a)+(b)) : \