diff options
author | Jonathan Lennox <jonathan@vidyo.com> | 2015-08-03 17:04:27 -0400 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2015-09-01 17:21:31 -0400 |
commit | 43120f00758219a784f952754f33b9788a8d731b (patch) | |
tree | b1dd751c49c18c6d7c3a65f30be04476cb2903b6 /silk | |
parent | cb0875e07d7cac701b465863f532dc5bb8b0eb59 (diff) | |
download | opus-43120f00758219a784f952754f33b9788a8d731b.tar.gz |
Reorganize x86 SSE intrinsics code.
Enable x86 intrinsics when building in floating-point mode.
Support SSE as an arch value.
Use RTCD to conditionally enable existing floating-point Celt SSE code.
Call functions directly (without RTCD) when their architecture can be presumed.
Use SSE4.1 intrinsics optimized code for Silk even in floating-point mode.
Diffstat (limited to 'silk')
-rw-r--r-- | silk/x86/SigProc_FIX_sse.h | 17 | ||||
-rw-r--r-- | silk/x86/main_sse.h | 48 | ||||
-rw-r--r-- | silk/x86/x86_silk_map.c | 25 |
3 files changed, 83 insertions, 7 deletions
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h index 9a0e0964..61efa8da 100644 --- a/silk/x86/SigProc_FIX_sse.h +++ b/silk/x86/SigProc_FIX_sse.h @@ -45,6 +45,12 @@ void silk_burg_modified_sse4_1( int arch /* I Run-time architecture */ ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ + ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) + +#else + extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( opus_int32 *res_nrg, /* O Residual energy */ opus_int *res_nrg_Q, /* O Residual energy Q value */ @@ -59,12 +65,22 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( # define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +#endif + opus_int64 silk_inner_prod16_aligned_64_sse4_1( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len ); + +#if defined(OPUS_X86_PRESUME_SSE4_1) + +#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ + ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len)) + +#else + extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( const opus_int16 *inVec1, const opus_int16 *inVec2, @@ -75,3 +91,4 @@ extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( #endif #endif +#endif diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h index f970632c..afd5ec26 100644 --- a/silk/x86/main_sse.h +++ b/silk/x86/main_sse.h @@ -50,6 +50,15 @@ void silk_VQ_WMat_EC_sse4_1( opus_int L /* I number of vectors in codebook */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ + mu_Q9, max_gain_Q7, L, arch) \ + ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ + mu_Q9, max_gain_Q7, L)) + +#else + extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( opus_int8 *ind, /* O index of best codebook vector */ opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ @@ -69,6 +78,8 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ mu_Q9, max_gain_Q7, L)) +#endif + # define OVERRIDE_silk_NSQ void silk_NSQ_sse4_1( @@ -89,6 +100,15 @@ void silk_NSQ_sse4_1( const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) + +#else + extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, /* I/O Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ @@ -112,6 +132,8 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +#endif + # define OVERRIDE_silk_NSQ_del_dec void silk_NSQ_del_dec_sse4_1( @@ -132,6 +154,15 @@ void silk_NSQ_del_dec_sse4_1( const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) + +#else + extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, /* I/O Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ @@ -155,6 +186,8 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +#endif + void silk_noise_shape_quantizer( silk_nsq_state *NSQ, /* I/O NSQ state */ opus_int signalType, /* I Signal type */ @@ -192,6 +225,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( const opus_int16 pIn[] ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn)) + +#else + # define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) @@ -201,6 +239,8 @@ extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( # define OVERRIDE_silk_warped_LPC_analysis_filter_FIX +#endif + void silk_warped_LPC_analysis_filter_FIX_sse4_1( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -211,6 +251,12 @@ void silk_warped_LPC_analysis_filter_FIX_sse4_1( const opus_int order /* I Filter order (even) */ ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ + ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) + +#else + extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -224,5 +270,7 @@ extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1]) # define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ ((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) & OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) +#endif + # endif #endif diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c index 6747d101..ad9fef2a 100644 --- a/silk/x86/x86_silk_map.c +++ b/silk/x86/x86_silk_map.c @@ -35,6 +35,10 @@ #include "pitch.h" #include "main.h" +#if !defined(OPUS_X86_PRESUME_SSE4_1) + +#if defined(FIXED_POINT) + opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( const opus_int16 *inVec1, const opus_int16 *inVec2, @@ -42,18 +46,20 @@ opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_inner_prod16_aligned_64_c, /* non-sse */ silk_inner_prod16_aligned_64_c, + silk_inner_prod16_aligned_64_c, MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */ - NULL }; +#endif + opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )( silk_encoder_state *psEncC, const opus_int16 pIn[] ) = { silk_VAD_GetSA_Q8_c, /* non-sse */ silk_VAD_GetSA_Q8_c, + silk_VAD_GetSA_Q8_c, MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */ - NULL }; void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -75,8 +81,8 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_NSQ_c, /* non-sse */ silk_NSQ_c, + silk_NSQ_c, MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */ - NULL }; void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -94,8 +100,8 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_VQ_WMat_EC_c, /* non-sse */ silk_VQ_WMat_EC_c, + silk_VQ_WMat_EC_c, MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */ - NULL }; void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -117,10 +123,12 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_NSQ_del_dec_c, /* non-sse */ silk_NSQ_del_dec_c, + silk_NSQ_del_dec_c, MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */ - NULL }; +#if defined(FIXED_POINT) + void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -132,8 +140,8 @@ void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_warped_LPC_analysis_filter_FIX_c, /* non-sse */ silk_warped_LPC_analysis_filter_FIX_c, + silk_warped_LPC_analysis_filter_FIX_c, MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */ - NULL }; void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -149,6 +157,9 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_burg_modified_c, /* non-sse */ silk_burg_modified_c, + silk_burg_modified_c, MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */ - NULL }; + +#endif +#endif |