diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2016-08-06 01:01:13 -0400 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2016-08-06 15:09:39 -0400 |
commit | c7bbc3e31fa67da78a41868576672ef01f888ae1 (patch) | |
tree | 4cddb731850db74b754af4c7d9b84b9c5a3c1685 | |
parent | 09b53529280fd132438d24d39dd0f1614318abfd (diff) | |
download | opus-c7bbc3e31fa67da78a41868576672ef01f888ae1.tar.gz |
Speeding up PVQ using unlikely() and moving first position out of the loop
Speeds up encoding by another ~1-2%
-rw-r--r-- | celt/arch.h | 8 | ||||
-rw-r--r-- | celt/vq.c | 33 | ||||
-rw-r--r-- | silk/macros.h | 8 |
3 files changed, 32 insertions, 17 deletions
diff --git a/celt/arch.h b/celt/arch.h index 6a328ea0..9eb37d8f 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -46,6 +46,14 @@ # endif # endif +#if OPUS_GNUC_PREREQ(3, 0) +#define opus_likely(x) (__builtin_expect(!!(x), 1)) +#define opus_unlikely(x) (__builtin_expect(!!(x), 0)) +#else +#define opus_likely(x) (!!(x)) +#define opus_unlikely(x) (!!(x)) +#endif + #define CELT_SIG_SCALE 32768.f #define celt_fatal(str) _celt_fatal(str, __FILE__, __LINE__); @@ -165,7 +165,6 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, VARDECL(int, iy); VARDECL(opus_val16, signx); int i, j; - opus_val16 s; int pulsesLeft; opus_val32 sum; opus_val32 xy; @@ -251,12 +250,12 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, pulsesLeft=0; } - s = 1; for (i=0;i<pulsesLeft;i++) { + opus_val16 Rxy, Ryy; int best_id; - opus_val32 best_num = -VERY_LARGE16; - opus_val16 best_den = 0; + opus_val32 best_num; + opus_val16 best_den; #ifdef FIXED_POINT int rshift; #endif @@ -267,9 +266,22 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, /* The squared magnitude term gets added anyway, so we might as well add it outside the loop */ yy = ADD16(yy, 1); - j=0; + + /* Calculations for position 0 are out of the loop, in part to reduce + mispredicted branches (since the if condition is usually false) + in the loop. */ + /* Temporary sums of the new pulse(s) */ + Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[0])),rshift)); + /* We're multiplying y[j] by two so we don't have to do it here */ + Ryy = ADD16(yy, y[0]); + + /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that + Rxy is positive because the sign is pre-computed) */ + Rxy = MULT16_16_Q15(Rxy,Rxy); + best_den = Ryy; + best_num = Rxy; + j=1; do { - opus_val16 Rxy, Ryy; /* Temporary sums of the new pulse(s) */ Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[j])),rshift)); /* We're multiplying y[j] by two so we don't have to do it here */ @@ -280,8 +292,11 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, Rxy = MULT16_16_Q15(Rxy,Rxy); /* The idea is to check for num/den >= best_num/best_den, but that way we can do it without any division */ - /* OPT: Make sure to use conditional moves here */ - if (MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num)) + /* OPT: It's not clear whether a cmov is faster than a branch here + since the condition is more often false than true and using + a cmov introduces data dependencies across iterations. The optimal + choice may be architecture-dependent. */ + if (opus_unlikely(MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num))) { best_den = Ryy; best_num = Rxy; @@ -296,7 +311,7 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, /* Only now that we've made the final choice, update y/iy */ /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ - y[best_id] += 2*s; + y[best_id] += 2; iy[best_id]++; } diff --git a/silk/macros.h b/silk/macros.h index d3ca3475..3c67b6e5 100644 --- a/silk/macros.h +++ b/silk/macros.h @@ -36,14 +36,6 @@ POSSIBILITY OF SUCH DAMAGE. #include "opus_defines.h" #include "arch.h" -#if OPUS_GNUC_PREREQ(3, 0) -#define opus_likely(x) (__builtin_expect(!!(x), 1)) -#define opus_unlikely(x) (__builtin_expect(!!(x), 0)) -#else -#define opus_likely(x) (!!(x)) -#define opus_unlikely(x) (!!(x)) -#endif - /* This is an OPUS_INLINE header file for general platform. */ /* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ |