summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@jmvalin.ca>2014-01-21 11:07:57 -0500
committerJean-Marc Valin <jmvalin@jmvalin.ca>2014-01-21 11:07:57 -0500
commit17b197837fb5bf6361e4cae7fbe0d0163e74b8a8 (patch)
tree3f253af8b9e9cb8d872db33a8ae58eb034d7fec1
parent29354ff6e05c9ead9454981a7404a9b9ea203d2e (diff)
downloadopus-17b197837fb5bf6361e4cae7fbe0d0163e74b8a8.tar.gz
Speed up the comb filter on ARM by using MAC16_32_Q16()
-rw-r--r--celt/arch.h1
-rw-r--r--celt/arm/fixed_armv4.h4
-rw-r--r--celt/arm/fixed_armv5e.h17
-rw-r--r--celt/celt.c28
-rw-r--r--celt/fixed_debug.h1
-rw-r--r--celt/fixed_generic.h6
6 files changed, 56 insertions, 1 deletions
diff --git a/celt/arch.h b/celt/arch.h
index 8c79a66e..c910c807 100644
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -208,6 +208,7 @@ static OPUS_INLINE int celt_isnan(float x)
#define MULT32_32_Q31(a,b) ((a)*(b))
#define MAC16_32_Q15(c,a,b) ((c)+(a)*(b))
+#define MAC16_32_Q16(c,a,b) ((c)+(a)*(b))
#define MULT16_16_Q11_32(a,b) ((a)*(b))
#define MULT16_16_Q11(a,b) ((a)*(b))
diff --git a/celt/arm/fixed_armv4.h b/celt/arm/fixed_armv4.h
index b690bc8c..efb3b189 100644
--- a/celt/arm/fixed_armv4.h
+++ b/celt/arm/fixed_armv4.h
@@ -68,6 +68,10 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
#undef MAC16_32_Q15
#define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
+ Result fits in 32 bits. */
+#undef MAC16_32_Q16
+#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b))
/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
#undef MULT32_32_Q31
diff --git a/celt/arm/fixed_armv5e.h b/celt/arm/fixed_armv5e.h
index 1194a7d3..36d6bed0 100644
--- a/celt/arm/fixed_armv5e.h
+++ b/celt/arm/fixed_armv5e.h
@@ -82,6 +82,23 @@ static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
}
#define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
+/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add, issued as
+ one smlawb instruction (a is opus_val16; b and c are opus_val32). Result fits in 32 bits. */
+#undef MAC16_32_Q16 /* drop any earlier definition before the ARMv5E one is installed below */
+static OPUS_INLINE opus_val32 MAC16_32_Q16_armv5e(opus_val32 c, opus_val16 a,
+ opus_val32 b)
+{
+ int res;
+ __asm__(
+ "#MAC16_32_Q16\n\t" /* tag line in the emitted assembly; presumably treated as an assembler comment */
+ "smlawb %0, %1, %2, %3;\n" /* %0=res, %1=b, %2=a, %3=c; NOTE(review): smlawb presumably uses only the bottom 16 bits of %2 - confirm in the ARM ARM */
+ : "=r"(res) /* single output operand, written to a general register */
+ : "r"(b), "r"(a), "r"(c) /* inputs in the order the instruction template numbers them */
+ );
+ return res;
+}
+#define MAC16_32_Q16(c, a, b) (MAC16_32_Q16_armv5e(c, a, b)) /* route the generic macro name to the asm helper */
+
/** 16x16 multiply-add where the result fits in 32 bits */
#undef MAC16_16
static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
diff --git a/celt/celt.c b/celt/celt.c
index 4f9c9e05..5741b6b5 100644
--- a/celt/celt.c
+++ b/celt/celt.c
@@ -86,6 +86,33 @@ int resampling_factor(opus_int32 rate)
}
#ifndef OVERRIDE_COMB_FILTER_CONST
+/* This version should be faster on ARM */
+#ifdef OPUS_ARM_ASM
+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+ opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{ /* y[i] = x[i] + g10*x[i-T] + g11*(x[i-T-1]+x[i-T+1]) + g12*(x[i-T-2]+x[i-T+2]), each product taken via MAC16_32_Q16 on a doubled tap */
+ opus_val32 x0, x1, x2, x3, x4; /* five-sample window: x0 is x[i-T+2] ... x4 is x[i-T-2], all pre-shifted left by 1 */
+ int i;
+ x4 = SHL32(x[-T-2], 1); /* prime the window with the four samples needed before i=0 */
+ x3 = SHL32(x[-T-1], 1);
+ x2 = SHL32(x[-T], 1); /* NOTE(review): the <<1 presumably compensates for the Q16 shift so Q15 gains keep their scale - confirm */
+ x1 = SHL32(x[-T+1], 1);
+ for (i=0;i<N;i++)
+ {
+ opus_val32 t;
+ x0=SHL32(x[i-T+2],1); /* only one new delayed sample is read from x per iteration */
+ t = MAC16_32_Q16(x[i], g10, x2); /* centre tap x[i-T], accumulated onto the undelayed input x[i] */
+ t = MAC16_32_Q16(t, g11, ADD32(x1,x3)); /* taps at distance 1 from the centre share gain g11 */
+ t = MAC16_32_Q16(t, g12, ADD32(x0,x4)); /* taps at distance 2 from the centre share gain g12 */
+ y[i] = t;
+ x4=x3; /* slide the window forward by one sample; order matters, oldest first */
+ x3=x2;
+ x2=x1;
+ x1=x0;
+ }
+
+}
+#else
static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
@@ -110,6 +137,7 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
}
#endif
+#endif
void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index 80bc9491..5d7120d4 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -496,6 +496,7 @@ static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int
#define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15)
#define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b))))
+#define MAC16_32_Q16(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q16((a),(b))))
static OPUS_INLINE int SATURATE(int a, int b)
{
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index ecf018a2..8d13fde7 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -113,7 +113,11 @@
/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
b must fit in 31 bits.
Result fits in 32 bits. */
-#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+#define MAC16_32_Q15(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+
+/** 16x32 multiplication, followed by a 16-bit shift right and 32-bit add.
+ Result fits in 32 bits. */
+#define MAC16_32_Q16(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)))
#define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
#define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))