From 8e19536baa5064a89abe93570fcc99412b6c68a1 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Wed, 13 Jul 2016 16:40:48 -0700 Subject: Optimize silk_warped_autocorrelation_FIX() for ARM NEON The optimization is bit exact with C function. This optimization speeds up fixed-point SILK encoder on NEON about 5% to 8%. (Tested on Acer Chromebook, ARMv7 Processor rev 3 (v7l).) Change-Id: I582f6f3585b7946149e16a2ad3084ebc0ae79a4f Signed-off-by: Jean-Marc Valin --- Makefile.am | 7 +- silk/arm/arm_silk_map.c | 19 ++ silk/fixed/arm/warped_autocorrelation_FIX_arm.h | 68 ++++++ .../arm/warped_autocorrelation_FIX_neon_intr.c | 260 +++++++++++++++++++++ silk/fixed/main_FIX.h | 15 +- silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h | 2 +- .../fixed/mips/warped_autocorrelation_FIX_mipsr1.h | 3 +- silk/fixed/noise_shape_analysis_FIX.c | 2 +- silk/fixed/warped_autocorrelation_FIX.c | 7 +- silk_headers.mk | 1 + silk_sources.mk | 3 + 11 files changed, 376 insertions(+), 11 deletions(-) create mode 100644 silk/fixed/arm/warped_autocorrelation_FIX_arm.h create mode 100644 silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c diff --git a/Makefile.am b/Makefile.am index 018a7c1a..67453e4a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,9 @@ SILK_SOURCES += $(SILK_SOURCES_FIXED) if HAVE_SSE4_1 SILK_SOURCES += $(SILK_SOURCES_SSE4_1) $(SILK_SOURCES_FIXED_SSE4_1) endif +if HAVE_ARM_NEON_INTR +SILK_SOURCES += $(SILK_SOURCES_FIXED_ARM_NEON_INTR) +endif else SILK_SOURCES += $(SILK_SOURCES_FLOAT) if HAVE_SSE4_1 @@ -321,7 +324,9 @@ $(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) endif if HAVE_ARM_NEON_INTR -ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) $(SILK_SOURCES_ARM_NEON_INTR:.c=.lo) +ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \ + $(SILK_SOURCES_ARM_NEON_INTR:.c=.lo) \ + $(SILK_SOURCES_FIXED_ARM_NEON_INTR:.c=.lo) $(ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += \ $(OPUS_ARM_NEON_INTR_CFLAGS) $(NE10_CFLAGS) endif diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c index 0966cfde..7ac9ad38 100644 --- a/silk/arm/arm_silk_map.c +++ b/silk/arm/arm_silk_map.c @@ -87,4 +87,23 @@ opus_int32 # endif +# if defined(FIXED_POINT) && \ + defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) + +void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +) = { + silk_warped_autocorrelation_FIX_c, /* ARMv4 */ + silk_warped_autocorrelation_FIX_c, /* EDSP */ + silk_warped_autocorrelation_FIX_c, /* Media */ + MAY_HAVE_NEON(silk_warped_autocorrelation_FIX), /* Neon */ +}; + +# endif + #endif /* OPUS_HAVE_RTCD */ diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_arm.h b/silk/fixed/arm/warped_autocorrelation_FIX_arm.h new file mode 100644 index 00000000..1992e432 --- /dev/null +++ b/silk/fixed/arm/warped_autocorrelation_FIX_arm.h @@ -0,0 +1,68 @@ +/*********************************************************************** +Copyright (c) 2017 Google Inc. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_WARPED_AUTOCORRELATION_FIX_ARM_H +# define SILK_WARPED_AUTOCORRELATION_FIX_ARM_H + +# include "celt/arm/armcpu.h" + +# if defined(FIXED_POINT) + +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +void silk_warped_autocorrelation_FIX_neon( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +); + +# if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON) +# define OVERRIDE_silk_warped_autocorrelation_FIX (1) +# define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order, arch) \ + ((void)(arch), PRESUME_NEON(silk_warped_autocorrelation_FIX)(corr, scale, input, warping_Q16, length, order)) +# endif +# endif + +# if !defined(OVERRIDE_silk_warped_autocorrelation_FIX) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK+1])(opus_int32*, opus_int*, const opus_int16*, const opus_int, const opus_int, const opus_int); +# define OVERRIDE_silk_warped_autocorrelation_FIX (1) +# define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order, arch) \ + ((*SILK_WARPED_AUTOCORRELATION_FIX_IMPL[(arch)&OPUS_ARCHMASK])(corr, scale, input, warping_Q16, length, order)) +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_silk_warped_autocorrelation_FIX (1) +# define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order, arch) \ + ((void)(arch), silk_warped_autocorrelation_FIX_neon(corr, scale, input, warping_Q16, length, order)) +# endif +# endif + +# endif /* end FIXED_POINT */ + +#endif /* end SILK_WARPED_AUTOCORRELATION_FIX_ARM_H */ diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c new file mode 100644 index 00000000..00a70cb5 --- /dev/null +++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c @@ -0,0 +1,260 @@ +/*********************************************************************** +Copyright (c) 2017 Google Inc., Jean-Marc Valin +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#ifdef OPUS_CHECK_ASM +# include +#endif +#include "stack_alloc.h" +#include "main_FIX.h" + +static OPUS_INLINE void calc_corr( const opus_int32 *const input_QS, opus_int64 *const corr_QC, const opus_int offset, const int32x4_t state_QS_s32x4 ) +{ + int64x2_t corr_QC_s64x2[ 2 ], t_s64x2[ 2 ]; + const int32x4_t input_QS_s32x4 = vld1q_s32( input_QS + offset ); + corr_QC_s64x2[ 0 ] = vld1q_s64( corr_QC + offset + 0 ); + corr_QC_s64x2[ 1 ] = vld1q_s64( corr_QC + offset + 2 ); + t_s64x2[ 0 ] = vmull_s32( vget_low_s32( state_QS_s32x4 ), vget_low_s32( input_QS_s32x4 ) ); + t_s64x2[ 1 ] = vmull_s32( vget_high_s32( state_QS_s32x4 ), vget_high_s32( input_QS_s32x4 ) ); + corr_QC_s64x2[ 0 ] = vsraq_n_s64( corr_QC_s64x2[ 0 ], t_s64x2[ 0 ], 2 * QS - QC ); + corr_QC_s64x2[ 1 ] = vsraq_n_s64( corr_QC_s64x2[ 1 ], t_s64x2[ 1 ], 2 * QS - QC ); + vst1q_s64( corr_QC + offset + 0, corr_QC_s64x2[ 0 ] ); + vst1q_s64( corr_QC + offset + 2, corr_QC_s64x2[ 1 ] ); +} + +static OPUS_INLINE int32x4_t calc_state( const int32x4_t state_QS0_s32x4, const int32x4_t state_QS0_1_s32x4, const int32x4_t state_QS1_1_s32x4, const int32x4_t warping_Q16_s32x4 ) +{ + int32x4_t t_s32x4 = vsubq_s32( state_QS0_s32x4, state_QS0_1_s32x4 ); + t_s32x4 = vqdmulhq_s32( t_s32x4, warping_Q16_s32x4 ); + return vaddq_s32( state_QS1_1_s32x4, t_s32x4 ); +} + +void silk_warped_autocorrelation_FIX_neon( + opus_int32 *corr, /* O Result [order + 1] */ + opus_int *scale, /* O Scaling of the correlation vector */ + const opus_int16 *input, /* I Input data to correlate */ + const opus_int warping_Q16, /* I Warping coefficient */ + const opus_int length, /* I Length of input */ + const opus_int order /* I Correlation order (even) */ +) +{ + if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) { + silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order ); + } else { + opus_int n, i, lsh; + opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */ + opus_int64 corr_QC_orderT; + int64x2_t lsh_s64x2; + const opus_int orderT = ( order + 3 ) & ~3; + opus_int64 *corr_QCT; + opus_int32 *input_QS; + VARDECL( opus_int32, input_QST ); + VARDECL( opus_int32, state ); + SAVE_STACK; + + /* Order must be even */ + silk_assert( ( order & 1 ) == 0 ); + silk_assert( 2 * QS - QC >= 0 ); + + ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 ); + + input_QS = input_QST; + /* input_QS has zero paddings in the beginning and end. */ + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + + /* Loop over samples */ + for( n = 0; n < length - 7; n += 8, input_QS += 8 ) { + const int16x8_t t0_s16x4 = vld1q_s16( input + n ); + vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) ); + vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) ); + } + for( ; n < length; n++, input_QS++ ) { + input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS ); + } + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT; + + /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */ + /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues. */ + /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */ + + /* Keep the C code here to help understand the intrinsics optimization. */ + /* + { + opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; + opus_int32 *state_QST[ 3 ]; + state_QST[ 0 ] = state_QS[ 0 ]; + state_QST[ 1 ] = state_QS[ 1 ]; + for( n = 0; n < length + order; n++, input_QS++ ) { + state_QST[ 0 ][ orderT ] = input_QS[ orderT ]; + for( i = 0; i < orderT; i++ ) { + corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC ); + state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 ); + } + state_QST[ 2 ] = state_QST[ 0 ]; + state_QST[ 0 ] = state_QST[ 1 ]; + state_QST[ 1 ] = state_QST[ 2 ]; + } + } + */ + + { + const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 ); + const opus_int32 *in = input_QS + orderT; + opus_int o = orderT; + int32x4_t state_QS_s32x4[ 3 ][ 2 ]; + + ALLOC( state, length + orderT, opus_int32 ); + state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 ); + + /* Calculate 8 taps of all inputs in each loop. */ + do { + state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] = + state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 ); + n = 0; + do { + calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] ); + calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] ); + state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n ); + vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 ); + state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 ); + state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 ); + state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); + state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 ); + state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; + state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ]; + } while( ++n < ( length + order ) ); + in = state; + o -= 8; + } while( o > 4 ); + + if( o ) { + /* Calculate the last 4 taps of all inputs. */ + opus_int32 *stateT = state; + silk_assert( o == 4 ); + state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 ); + n = length + order; + do { + calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] ); + state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT ); + vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 ); + state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 ); + state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); + state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; + input_QS++; + stateT++; + } while( --n ); + } + } + + { + const opus_int16 *inputT = input; + int32x4_t t_s32x4; + int64x1_t t_s64x1; + int64x2_t t_s64x2 = vdupq_n_s64( 0 ); + for( n = 0; n <= length - 8; n += 8 ) { + int16x8_t input_s16x8 = vld1q_s16( inputT ); + t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) ); + t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) ); + t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) ); + t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) ); + inputT += 8; + } + t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) ); + corr_QC_orderT = vget_lane_s64( t_s64x1, 0 ); + for( ; n < length; n++ ) { + corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] ); + } + corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC ); + corr_QC[ orderT ] = corr_QC_orderT; + } + + corr_QCT = corr_QC + orderT - order; + lsh = silk_CLZ64( corr_QC_orderT ) - 35; + lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); + *scale = -( QC + lsh ); + silk_assert( *scale >= -30 && *scale <= 12 ); + lsh_s64x2 = vdupq_n_s64( lsh ); + for( i = 0; i <= order - 3; i += 4 ) { + int32x4_t corr_s32x4; + int64x2_t corr_QC0_s64x2, corr_QC1_s64x2; + corr_QC0_s64x2 = vld1q_s64( corr_QCT + i ); + corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 ); + corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 ); + corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 ); + corr_s32x4 = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) ); + corr_s32x4 = vrev64q_s32( corr_s32x4 ); + vst1q_s32( corr + order - i - 3, corr_s32x4 ); + } + if( lsh >= 0 ) { + for( ; i < order + 1; i++ ) { + corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) ); + } + } else { + for( ; i < order + 1; i++ ) { + corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) ); + } + } + silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/ + RESTORE_STACK; + } + +#ifdef OPUS_CHECK_ASM + { + opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ]; + opus_int scale_c; + silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order ); + silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) ); + silk_assert( scale_c == *scale ); + } +#endif +} diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h index ddbf3772..780afa39 100644 --- a/silk/fixed/main_FIX.h +++ b/silk/fixed/main_FIX.h @@ -36,6 +36,11 @@ POSSIBILITY OF SUCH DAMAGE. #include "debug.h" #include "entenc.h" +#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \ + || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#include "fixed/arm/warped_autocorrelation_FIX_arm.h" +#endif + #ifndef FORCE_CPP_BUILD #ifdef __cplusplus extern "C" @@ -47,6 +52,9 @@ extern "C" #define silk_encode_do_VAD_Fxx silk_encode_do_VAD_FIX #define silk_encode_frame_Fxx silk_encode_frame_FIX +#define QC 10 +#define QS 13 + /*********************/ /* Encoder Functions */ /*********************/ @@ -99,7 +107,7 @@ void silk_noise_shape_analysis_FIX( ); /* Autocorrelations for a warped frequency axis */ -void silk_warped_autocorrelation_FIX( +void silk_warped_autocorrelation_FIX_c( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ @@ -108,6 +116,11 @@ void silk_warped_autocorrelation_FIX( const opus_int order /* I Correlation order (even) */ ); +#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX) +#define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order, arch) \ + ((void)(arch), silk_warped_autocorrelation_FIX_c(corr, scale, input, warping_Q16, length, order)) +#endif + /* Calculation of LTP state scaling */ void silk_LTP_scale_ctrl_FIX( silk_encoder_state_FIX *psEnc, /* I/O encoder state */ diff --git a/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h b/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h index f9b5473f..3999b5bd 100644 --- a/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h +++ b/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h @@ -169,7 +169,7 @@ void silk_noise_shape_analysis_FIX( if( psEnc->sCmn.warping_Q16 > 0 ) { /* Calculate warped auto correlation */ - silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); + silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder, arch ); } else { /* Calculate regular auto correlation */ silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, arch ); diff --git a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h index e803ef0f..fcbd96c8 100644 --- a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h +++ b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h @@ -48,7 +48,8 @@ void silk_warped_autocorrelation_FIX( const opus_int16 *input, /* I Input data to correlate */ const opus_int warping_Q16, /* I Warping coefficient */ const opus_int length, /* I Length of input */ - const opus_int order /* I Correlation order (even) */ + const opus_int order, /* I Correlation order (even) */ + int arch /* I Run-time architecture */ ) { opus_int n, i, lsh; diff --git a/silk/fixed/noise_shape_analysis_FIX.c b/silk/fixed/noise_shape_analysis_FIX.c index 8fe23777..85fea0bf 100644 --- a/silk/fixed/noise_shape_analysis_FIX.c +++ b/silk/fixed/noise_shape_analysis_FIX.c @@ -262,7 +262,7 @@ void silk_noise_shape_analysis_FIX( if( psEnc->sCmn.warping_Q16 > 0 ) { /* Calculate warped auto correlation */ - silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder ); + silk_warped_autocorrelation_FIX( auto_corr, &scale, x_windowed, warping_Q16, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder, arch ); } else { /* Calculate regular auto correlation */ silk_autocorr( auto_corr, &scale, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, arch ); diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c index af164029..994c299a 100644 --- a/silk/fixed/warped_autocorrelation_FIX.c +++ b/silk/fixed/warped_autocorrelation_FIX.c @@ -31,17 +31,13 @@ POSSIBILITY OF SUCH DAMAGE. #include "main_FIX.h" -#define QC 10 -#define QS 13 - #if defined(MIPSr1_ASM) #include "mips/warped_autocorrelation_FIX_mipsr1.h" #endif -#ifndef OVERRIDE_silk_warped_autocorrelation_FIX /* Autocorrelations for a warped frequency axis */ -void silk_warped_autocorrelation_FIX( +void silk_warped_autocorrelation_FIX_c( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ @@ -92,4 +88,3 @@ void silk_warped_autocorrelation_FIX( } silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/ } -#endif /* OVERRIDE_silk_warped_autocorrelation_FIX */ diff --git a/silk_headers.mk b/silk_headers.mk index cb104919..278500b5 100644 --- a/silk_headers.mk +++ b/silk_headers.mk @@ -32,6 +32,7 @@ silk/arm/NSQ_del_dec_arm.h \ silk/arm/NSQ_neon.h \ silk/fixed/main_FIX.h \ silk/fixed/structs_FIX.h \ +silk/fixed/arm/warped_autocorrelation_FIX_arm.h \ silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \ silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h \ silk/float/main_FLP.h \ diff --git a/silk_sources.mk b/silk_sources.mk index 0dcf671a..c0312145 100644 --- a/silk_sources.mk +++ b/silk_sources.mk @@ -117,6 +117,9 @@ silk/fixed/schur_FIX.c SILK_SOURCES_FIXED_SSE4_1 = silk/fixed/x86/vector_ops_FIX_sse.c \ silk/fixed/x86/burg_modified_FIX_sse.c +SILK_SOURCES_FIXED_ARM_NEON_INTR = \ +silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c + SILK_SOURCES_FLOAT = \ silk/float/apply_sine_window_FLP.c \ silk/float/corrMatrix_FLP.c \ -- cgit v1.2.1