From 843121b356685ff5a8c40211951f392f77f689cc Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Tue, 12 Feb 2019 19:33:26 -0500
Subject: Fixes analysis buffering for silence and complexity changes

The previous code would go out of sync in those cases.
---
 src/analysis.c     | 48 +++++++++++++++++++++++++++++++++++++++---------
 src/analysis.h     |  1 +
 src/opus_encoder.c | 20 +++++++++-----------
 src/opus_private.h |  1 +
 4 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/src/analysis.c b/src/analysis.c
index 23f6fa56..6907a631 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -249,6 +249,15 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
    if (curr_lookahead<0)
       curr_lookahead += DETECT_SIZE;
 
+   tonal->read_subframe += len/(tonal->Fs/400);
+   while (tonal->read_subframe>=8)
+   {
+      tonal->read_subframe -= 8;
+      tonal->read_pos++;
+   }
+   if (tonal->read_pos>=DETECT_SIZE)
+      tonal->read_pos-=DETECT_SIZE;
+
    /* On long frames, look at the second analysis window rather than the first. */
    if (len > tonal->Fs/50 && pos != tonal->write_pos)
    {
@@ -262,6 +271,8 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
       pos = DETECT_SIZE-1;
    pos0 = pos;
    OPUS_COPY(info_out, &tonal->info[pos], 1);
+   if (!info_out->valid)
+      return;
    tonality_max = tonality_avg = info_out->tonality;
    tonality_count = 1;
    /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
@@ -393,14 +404,6 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
    info_out->music_prob_max = prob_max;
 
    /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
-   tonal->read_subframe += len/(tonal->Fs/400);
-   while (tonal->read_subframe>=8)
-   {
-      tonal->read_subframe -= 8;
-      tonal->read_pos++;
-   }
-   if (tonal->read_pos>=DETECT_SIZE)
-      tonal->read_pos-=DETECT_SIZE;
 }
 
 static const float std_feature_bias[9] = {
@@ -420,6 +423,24 @@ static const float std_feature_bias[9] = {
 #define SCALE_ENER(e) (e)
 #endif
 
+#ifdef FIXED_POINT
+static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth)
+{
+   int silence = 0;
+   opus_val32 sample_max = 0;
+#ifdef MLP_TRAINING
+   return 0;
+#endif
+   sample_max = celt_maxabs32(pcm, frame_size*channels);
+
+   silence = (sample_max == 0);
+   (void)lsb_depth;
+   return silence;
+}
+#else
+#define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth)
+#endif
+
 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
 {
     int i, b;
@@ -464,8 +485,10 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
     float layer_out[MAX_NEURONS];
     float below_max_pitch;
     float above_max_pitch;
+    int is_silence;
     SAVE_STACK;
 
+    tonal->initialized = 1;
     alpha = 1.f/IMIN(10, 1+tonal->count);
     alphaE = 1.f/IMIN(25, 1+tonal->count);
     /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
@@ -500,6 +523,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
     if (tonal->write_pos>=DETECT_SIZE)
        tonal->write_pos-=DETECT_SIZE;
 
+    is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth);
+
     ALLOC(in, 480, kiss_fft_cpx);
     ALLOC(out, 480, kiss_fft_cpx);
     ALLOC(tonality, 240, float);
@@ -518,6 +543,12 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
           &tonal->inmem[240], tonal->downmix_state, remaining,
           offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
     tonal->mem_fill = 240 + remaining;
+    if (is_silence)
+    {
+       info->valid = 0;
+       RESTORE_STACK;
+       return;
+    }
     opus_fft(kfft, in, out, tonal->arch);
 #ifndef FIXED_POINT
     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
@@ -938,7 +969,6 @@ void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co
       analysis->analysis_offset -= frame_size;
    }
 
-   analysis_info->valid = 0;
    tonality_get_info(analysis, analysis_info, frame_size);
 }
 
diff --git a/src/analysis.h b/src/analysis.h
index 289c845e..0b66555f 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -74,6 +74,7 @@ typedef struct {
    int read_pos;
    int read_subframe;
    float hp_ener_accum;
+   int initialized;
    float rnn_state[MAX_NEURONS];
    opus_val32 downmix_state[3];
    AnalysisInfo info[DETECT_SIZE];
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index cbeb40ae..ab08bff8 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -837,7 +837,7 @@ static opus_int32 compute_equiv_rate(opus_int32 bitrate, int channels,
 
 #ifndef DISABLE_FLOAT_API
 
-static int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth)
+int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth)
 {
    int silence = 0;
    opus_val32 sample_max = 0;
@@ -1140,21 +1140,19 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
     if (st->silk_mode.complexity >= 7 && st->Fs>=16000)
 #endif
     {
-       if (is_digital_silence(pcm, frame_size, st->channels, lsb_depth))
-       {
-          is_silence = 1;
-       } else {
-          analysis_read_pos_bak = st->analysis.read_pos;
-          analysis_read_subframe_bak = st->analysis.read_subframe;
-          run_analysis(&st->analysis, celt_mode, analysis_pcm, analysis_size, frame_size,
-                c1, c2, analysis_channels, st->Fs,
-                lsb_depth, downmix, &analysis_info);
-       }
+       is_silence = is_digital_silence(pcm, frame_size, st->channels, lsb_depth);
+       analysis_read_pos_bak = st->analysis.read_pos;
+       analysis_read_subframe_bak = st->analysis.read_subframe;
+       run_analysis(&st->analysis, celt_mode, analysis_pcm, analysis_size, frame_size,
+             c1, c2, analysis_channels, st->Fs,
+             lsb_depth, downmix, &analysis_info);
 
        /* Track the peak signal energy */
        if (!is_silence && analysis_info.activity_probability > DTX_ACTIVITY_THRESHOLD)
           st->peak_signal_energy = MAX32(MULT16_32_Q15(QCONST16(0.999f, 15), st->peak_signal_energy),
                 compute_frame_energy(pcm, frame_size, st->channels, st->arch));
+    } else if (st->analysis.initialized) {
+       tonality_analysis_reset(&st->analysis);
     }
 #else
     (void)analysis_pcm;
diff --git a/src/opus_private.h b/src/opus_private.h
index 09783cee..5e2463f5 100644
--- a/src/opus_private.h
+++ b/src/opus_private.h
@@ -135,6 +135,7 @@ typedef void (*opus_copy_channel_out_func)(
 typedef void (*downmix_func)(const void *, opus_val32 *, int, int, int, int, int);
 void downmix_float(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
 void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
+int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth);
 
 int encode_size(int size, unsigned char *data);
 
-- 
cgit v1.2.1