From 76674feae22db03848a40446beb2fcec70d2180d Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Tue, 9 Aug 2016 23:22:27 -0400 Subject: SSE2 implementation of the PVQ search We used the SSE reciprocal square root instruction to vectorize the serch rather than compare one at a time with multiplies. Speeds up the entire encoder by 8-10%. --- celt/bands.c | 2 +- celt/tests/test_unit_mathops.c | 1 + celt/tests/test_unit_rotation.c | 1 + celt/vq.c | 34 +++++-- celt/vq.h | 12 ++- celt/x86/vq_sse.h | 50 +++++++++ celt/x86/vq_sse2.c | 217 ++++++++++++++++++++++++++++++++++++++++ celt/x86/x86_celt_map.c | 13 +++ celt_headers.mk | 1 + celt_sources.mk | 2 +- 10 files changed, 320 insertions(+), 13 deletions(-) create mode 100644 celt/x86/vq_sse.h create mode 100644 celt/x86/vq_sse2.c diff --git a/celt/bands.c b/celt/bands.c index 87791b4a..868a98dc 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -1036,7 +1036,7 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X, /* Finally do the actual quantization */ if (encode) { - cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth); + cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth, ctx->arch); } else { cm = alg_unquant(X, N, K, spread, B, ec, gain); } diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index fd3319da..d49a2bd2 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -57,6 +57,7 @@ # endif # if defined(OPUS_X86_MAY_HAVE_SSE2) # include "x86/pitch_sse2.c" +# include "x86/vq_sse2.c" # endif # if defined(OPUS_X86_MAY_HAVE_SSE4_1) # include "x86/pitch_sse4_1.c" diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 1080c208..571ed12d 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -55,6 +55,7 @@ # endif # if defined(OPUS_X86_MAY_HAVE_SSE2) # include "x86/pitch_sse2.c" +# include "x86/vq_sse2.c" # endif # if defined(OPUS_X86_MAY_HAVE_SSE4_1) # include "x86/pitch_sse4_1.c" diff --git a/celt/vq.c b/celt/vq.c index e6895952..8d49d804 100644 --- a/celt/vq.c +++ b/celt/vq.c @@ -158,29 +158,21 @@ static unsigned extract_collapse_mask(int *iy, int N, int B) return collapse_mask; } -unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, - opus_val16 gain, int resynth) +opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch) { VARDECL(celt_norm, y); - VARDECL(int, iy); VARDECL(int, signx); int i, j; int pulsesLeft; opus_val32 sum; opus_val32 xy; opus_val16 yy; - unsigned collapse_mask; SAVE_STACK; - celt_assert2(K>0, "alg_quant() needs at least one pulse"); - celt_assert2(N>1, "alg_quant() needs at least two dimensions"); - + (void)arch; ALLOC(y, N, celt_norm); - ALLOC(iy, N, int); ALLOC(signx, N, int); - exp_rotation(X, N, 1, B, K, spread); - /* Get rid of the sign */ sum = 0; j=0; do { @@ -322,6 +314,28 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, but has the same performance otherwise. */ iy[j] = (iy[j]^-signx[j]) + signx[j]; } while (++j0, "alg_quant() needs at least one pulse"); + celt_assert2(N>1, "alg_quant() needs at least two dimensions"); + + /* Covers vectorization by up to 4. */ + ALLOC(iy, N+3, int); + + exp_rotation(X, N, 1, B, K, spread); + + yy = op_pvq_search(X, iy, K, N, arch); + encode_pulses(iy, N, K, enc); if (resynth) diff --git a/celt/vq.h b/celt/vq.h index c2fa0245..5dbf9ce0 100644 --- a/celt/vq.h +++ b/celt/vq.h @@ -37,10 +37,20 @@ #include "entdec.h" #include "modes.h" +#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)) +#include "x86/vq_sse.h" +#endif + #if defined(MIPSr1_ASM) #include "mips/vq_mipsr1.h" #endif +opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch); + +#if !defined(OVERRIDE_OP_PVQ_SEARCH) +#define op_pvq_search(x, iy, K, N, arch) \ + (op_pvq_search_c(x, iy, K, N, arch)) +#endif /** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of * the pitch and a combination of pulses such that its norm is still equal @@ -52,7 +62,7 @@ * @ret A mask indicating which blocks in the band received pulses */ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, - opus_val16 gain, int resynth); + opus_val16 gain, int resynth, int arch); /** Algebraic pulse decoder * @param X Decoded normalised spectrum (returned) diff --git a/celt/x86/vq_sse.h b/celt/x86/vq_sse.h new file mode 100644 index 00000000..b4efe8f2 --- /dev/null +++ b/celt/x86/vq_sse.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef VQ_SSE_H +#define VQ_SSE_H + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) +#define OVERRIDE_OP_PVQ_SEARCH + +opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch); + +#if defined(OPUS_X86_PRESUME_SSE2) +#define op_pvq_search(x, iy, K, N, arch) \ + (op_pvq_search_sse2(x, iy, K, N, arch)) + +#else + +extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( + celt_norm *_X, int *iy, int K, int N, int arch); + +# define op_pvq_search(X, iy, K, N, arch) \ + ((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch)) + +#endif +#endif + +#endif diff --git a/celt/x86/vq_sse2.c b/celt/x86/vq_sse2.c new file mode 100644 index 00000000..0891a5ba --- /dev/null +++ b/celt/x86/vq_sse2.c @@ -0,0 +1,217 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2007-2016 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "vq.h" +#include "x86cpu.h" + + +#ifndef FIXED_POINT + +opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch) +{ + int i, j; + int pulsesLeft; + float xy, yy; + VARDECL(celt_norm, y); + VARDECL(celt_norm, X); + VARDECL(float, signy); + __m128 signmask; + __m128 sums; + __m128i fours; + SAVE_STACK; + + (void)arch; + /* All bits set to zero, except for the sign bit. */ + signmask = _mm_set_ps1(-0.f); + fours = _mm_set_epi32(4, 4, 4, 4); + ALLOC(y, N+3, celt_norm); + ALLOC(X, N+3, celt_norm); + ALLOC(signy, N+3, float); + + OPUS_COPY(X, _X, N); + X[N] = X[N+1] = X[N+2] = 0; + sums = _mm_setzero_ps(); + for (j=0;j (N>>1)) + { + __m128i pulses_sum; + __m128 yy4, xy4; + __m128 rcp4; + opus_val32 sum = _mm_cvtss_f32(sums); + /* If X is too small, just replace it with a pulse at 0 */ + /* Prevents infinities and NaNs from causing too many pulses + to be allocated. 64 is an approximation of infinity here. */ + if (!(sum > EPSILON && sum < 64)) + { + X[0] = QCONST16(1.f,14); + j=1; do + X[j]=0; + while (++j=1, "Allocated too many pulses in the quick pass"); + + /* This should never happen, but just in case it does (e.g. on silence) + we fill the first bin with pulses. */ + if (pulsesLeft > N+3) + { + opus_val16 tmp = (opus_val16)pulsesLeft; + yy = MAC16_16(yy, tmp, tmp); + yy = MAC16_16(yy, tmp, y[0]); + iy[0] += pulsesLeft; + pulsesLeft=0; + } + + for (i=0;i