From 843121b356685ff5a8c40211951f392f77f689cc Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin Date: Tue, 12 Feb 2019 19:33:26 -0500 Subject: Fixes analysis buffering for silence and complexity changes The previous code would go out of sync in those cases. --- src/analysis.c | 48 +++++++++++++++++++++++++++++++++++++++--------- src/analysis.h | 1 + src/opus_encoder.c | 20 +++++++++----------- src/opus_private.h | 1 + 4 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/analysis.c b/src/analysis.c index 23f6fa56..6907a631 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -249,6 +249,15 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int if (curr_lookahead<0) curr_lookahead += DETECT_SIZE; + tonal->read_subframe += len/(tonal->Fs/400); + while (tonal->read_subframe>=8) + { + tonal->read_subframe -= 8; + tonal->read_pos++; + } + if (tonal->read_pos>=DETECT_SIZE) + tonal->read_pos-=DETECT_SIZE; + /* On long frames, look at the second analysis window rather than the first. */ if (len > tonal->Fs/50 && pos != tonal->write_pos) { @@ -262,6 +271,8 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int pos = DETECT_SIZE-1; pos0 = pos; OPUS_COPY(info_out, &tonal->info[pos], 1); + if (!info_out->valid) + return; tonality_max = tonality_avg = info_out->tonality; tonality_count = 1; /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */ @@ -393,14 +404,6 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int info_out->music_prob_max = prob_max; /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */ - tonal->read_subframe += len/(tonal->Fs/400); - while (tonal->read_subframe>=8) - { - tonal->read_subframe -= 8; - tonal->read_pos++; - } - if (tonal->read_pos>=DETECT_SIZE) - tonal->read_pos-=DETECT_SIZE; } static const float std_feature_bias[9] = { @@ -420,6 +423,24 @@ static const float std_feature_bias[9] = { #define SCALE_ENER(e) (e) #endif +#ifdef FIXED_POINT +static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth) +{ + int silence = 0; + opus_val32 sample_max = 0; +#ifdef MLP_TRAINING + return 0; +#endif + sample_max = celt_maxabs32(pcm, frame_size*channels); + + silence = (sample_max == 0); + (void)lsb_depth; + return silence; +} +#else +#define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth) +#endif + static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix) { int i, b; @@ -464,8 +485,10 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt float layer_out[MAX_NEURONS]; float below_max_pitch; float above_max_pitch; + int is_silence; SAVE_STACK; + tonal->initialized = 1; alpha = 1.f/IMIN(10, 1+tonal->count); alphaE = 1.f/IMIN(25, 1+tonal->count); /* Noise floor related decay for bandwidth detection: -2.2 dB/second */ @@ -500,6 +523,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt if (tonal->write_pos>=DETECT_SIZE) tonal->write_pos-=DETECT_SIZE; + is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth); + ALLOC(in, 480, kiss_fft_cpx); ALLOC(out, 480, kiss_fft_cpx); ALLOC(tonality, 240, float); @@ -518,6 +543,12 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt &tonal->inmem[240], tonal->downmix_state, remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs); tonal->mem_fill = 240 + remaining; + if (is_silence) + { + info->valid = 0; + RESTORE_STACK; + return; + } opus_fft(kfft, in, out, tonal->arch); #ifndef FIXED_POINT /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */ @@ -938,7 +969,6 @@ void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co analysis->analysis_offset -= frame_size; } - analysis_info->valid = 0; tonality_get_info(analysis, analysis_info, frame_size); } diff --git a/src/analysis.h b/src/analysis.h index 289c845e..0b66555f 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -74,6 +74,7 @@ typedef struct { int read_pos; int read_subframe; float hp_ener_accum; + int initialized; float rnn_state[MAX_NEURONS]; opus_val32 downmix_state[3]; AnalysisInfo info[DETECT_SIZE]; diff --git a/src/opus_encoder.c b/src/opus_encoder.c index cbeb40ae..ab08bff8 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -837,7 +837,7 @@ static opus_int32 compute_equiv_rate(opus_int32 bitrate, int channels, #ifndef DISABLE_FLOAT_API -static int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth) +int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth) { int silence = 0; opus_val32 sample_max = 0; @@ -1140,21 +1140,19 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->silk_mode.complexity >= 7 && st->Fs>=16000) #endif { - if (is_digital_silence(pcm, frame_size, st->channels, lsb_depth)) - { - is_silence = 1; - } else { - analysis_read_pos_bak = st->analysis.read_pos; - analysis_read_subframe_bak = st->analysis.read_subframe; - run_analysis(&st->analysis, celt_mode, analysis_pcm, analysis_size, frame_size, - c1, c2, analysis_channels, st->Fs, - lsb_depth, downmix, &analysis_info); - } + is_silence = is_digital_silence(pcm, frame_size, st->channels, lsb_depth); + analysis_read_pos_bak = st->analysis.read_pos; + analysis_read_subframe_bak = st->analysis.read_subframe; + run_analysis(&st->analysis, celt_mode, analysis_pcm, analysis_size, frame_size, + c1, c2, analysis_channels, st->Fs, + lsb_depth, downmix, &analysis_info); /* Track the peak signal energy */ if (!is_silence && analysis_info.activity_probability > DTX_ACTIVITY_THRESHOLD) st->peak_signal_energy = MAX32(MULT16_32_Q15(QCONST16(0.999f, 15), st->peak_signal_energy), compute_frame_energy(pcm, frame_size, st->channels, st->arch)); + } else if (st->analysis.initialized) { + tonality_analysis_reset(&st->analysis); } #else (void)analysis_pcm; diff --git a/src/opus_private.h b/src/opus_private.h index 09783cee..5e2463f5 100644 --- a/src/opus_private.h +++ b/src/opus_private.h @@ -135,6 +135,7 @@ typedef void (*opus_copy_channel_out_func)( typedef void (*downmix_func)(const void *, opus_val32 *, int, int, int, int, int); void downmix_float(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C); void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C); +int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth); int encode_size(int size, unsigned char *data); -- cgit v1.2.1