diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2017-07-12 16:55:28 -0400 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2017-07-17 14:02:59 -0400 |
commit | bcd006b57f54a183bc91e0d0d37ea3d968a6be33 (patch) | |
tree | 56d030f18ebb9850f55f2a913e8bd6db000b0178 | |
parent | 2a4f49448f66f664f90edd220b8467d9b06938ab (diff) | |
download | opus-exp_rnn3.tar.gz |
Add RNN for VAD and speech/music classification (branch: exp_rnn3)
Based on two dense layers with a GRU layer in the middle
-rw-r--r-- | celt/celt.h | 3 | ||||
-rw-r--r-- | src/analysis.c | 271 | ||||
-rw-r--r-- | src/analysis.h | 15 | ||||
-rw-r--r-- | src/mlp.c | 168 | ||||
-rw-r--r-- | src/mlp.h | 35 | ||||
-rw-r--r-- | src/mlp_data.c | 325 | ||||
-rw-r--r-- | src/opus_encoder.c | 11 |
7 files changed, 473 insertions, 355 deletions
diff --git a/celt/celt.h b/celt/celt.h index 70175301..f73f29dd 100644 --- a/celt/celt.h +++ b/celt/celt.h @@ -59,7 +59,8 @@ typedef struct { float noisiness; float activity; float music_prob; - float vad_prob; + float music_prob_min; + float music_prob_max; int bandwidth; float activity_probability; /* Store as Q6 char to save space. */ diff --git a/src/analysis.c b/src/analysis.c index f4160e4b..1d6dd829 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -50,6 +50,8 @@ #ifndef DISABLE_FLOAT_API +#define TRANSITION_PENALTY 10 + static const float dct_table[128] = { 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, @@ -224,19 +226,22 @@ void tonality_analysis_reset(TonalityAnalysisState *tonal) /* Clear non-reusable fields. */ char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); - tonal->music_confidence = .9f; - tonal->speech_confidence = .1f; } void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) { int pos; int curr_lookahead; - float psum; float tonality_max; float tonality_avg; int tonality_count; int i; + int pos0; + float prob_avg; + float prob_count; + float prob_min, prob_max; + float vad_prob; + int mpos, vpos; pos = tonal->read_pos; curr_lookahead = tonal->write_pos-tonal->read_pos; @@ -254,6 +259,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int pos--; if (pos<0) pos = DETECT_SIZE-1; + pos0 = pos; OPUS_COPY(info_out, &tonal->info[pos], 1); tonality_max = tonality_avg = info_out->tonality; tonality_count = 1; @@ -270,6 +276,107 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int tonality_count++; } info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f); + + mpos = vpos = pos0; + /* If we have enough look-ahead, compensate for the 
~5-frame delay in the music prob and + ~1 frame delay in the VAD prob. */ + if (curr_lookahead > 15) + { + mpos += 5; + if (mpos>=DETECT_SIZE) + mpos -= DETECT_SIZE; + vpos += 1; + if (vpos>=DETECT_SIZE) + vpos -= DETECT_SIZE; + } + + /* The following calculations attempt to minimize a "badness function" + for the transition. When switching from speech to music, the badness + of switching at frame k is + b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T) + where + v_i is the activity probability (VAD) at frame i, + p_i is the music probability at frame i + T is the probability threshold for switching + S is the penalty for switching during active audio rather than silence + the current frame has index i=0 + + Rather than apply badness to directly decide when to switch, what we compute + instead is the threshold for which the optimal switching point is now. When + considering whether to switch now (frame 0) or at frame k, we have: + S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T) + which gives us: + T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i ) + We take the min threshold across all positive values of k (up to the maximum + amount of lookahead we have) to give us the threshold for which the current + frame is the optimal switch point. + + The last step is that we need to consider whether we want to switch at all. + For that we use the average of the music probability over the entire window. + If the threshold is higher than that average we're not going to + switch, so we compute a min with the average as well. The result of all these + min operations is music_prob_min, which gives the threshold for switching to music + if we're currently encoding for speech. + + We do the exact opposite to compute music_prob_max which is used for switching + from music to speech. 
+ */ + prob_min = 1.f; + prob_max = 0.f; + vad_prob = tonal->info[vpos].activity_probability; + prob_count = MAX16(.1f, vad_prob); + prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob; + while (1) + { + float pos_vad; + mpos++; + if (mpos==DETECT_SIZE) + mpos = 0; + if (mpos == tonal->write_pos) + break; + vpos++; + if (vpos==DETECT_SIZE) + vpos = 0; + if (vpos == tonal->write_pos) + break; + pos_vad = tonal->info[vpos].activity_probability; + prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min); + prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max); + prob_count += MAX16(.1f, pos_vad); + prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob; + } + info_out->music_prob = prob_avg/prob_count; + prob_min = MIN16(prob_avg/prob_count, prob_min); + prob_max = MAX16(prob_avg/prob_count, prob_max); + prob_min = MAX16(prob_min, 0.f); + prob_max = MIN16(prob_max, 1.f); + + /* If we don't have enough look-ahead, do our best to make a decent decision. */ + if (curr_lookahead < 10) + { + float pmin, pmax; + pmin = prob_min; + pmax = prob_max; + pos = pos0; + /* Look for min/max in the past. */ + for (i=0;i<IMIN(tonal->count-1, 15);i++) + { + pos--; + if (pos < 0) + pos = DETECT_SIZE-1; + pmin = MIN16(pmin, tonal->info[pos].music_prob); + pmax = MAX16(pmax, tonal->info[pos].music_prob); + } + /* Bias against switching on active audio. 
*/ + pmin = MAX16(0.f, pmin - .1f*vad_prob); + pmax = MIN16(1.f, pmax + .1f*vad_prob); + prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min); + prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max); + } + info_out->music_prob_min = prob_min; + info_out->music_prob_max = prob_max; + + /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */ tonal->read_subframe += len/(tonal->Fs/400); while (tonal->read_subframe>=8) { @@ -278,21 +385,6 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int } if (tonal->read_pos>=DETECT_SIZE) tonal->read_pos-=DETECT_SIZE; - - /* The -1 is to compensate for the delay in the features themselves. */ - curr_lookahead = IMAX(curr_lookahead-1, 0); - - psum=0; - /* Summing the probability of transition patterns that involve music at - time (DETECT_SIZE-curr_lookahead-1) */ - for (i=0;i<DETECT_SIZE-curr_lookahead;i++) - psum += tonal->pmusic[i]; - for (;i<DETECT_SIZE;i++) - psum += tonal->pspeech[i]; - psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; - /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/ - - info_out->music_prob = psum; } static const float std_feature_bias[9] = { @@ -352,6 +444,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt float band_log2[NB_TBANDS+1]; float leakage_from[NB_TBANDS+1]; float leakage_to[NB_TBANDS+1]; + float layer_out[MAX_NEURONS]; SAVE_STACK; alpha = 1.f/IMIN(10, 1+tonal->count); @@ -368,12 +461,6 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt offset = 3*offset/2; } - if (tonal->count<4) { - if (tonal->application == OPUS_APPLICATION_VOIP) - tonal->music_prob = .1f; - else - tonal->music_prob = .625f; - } kfft = celt_mode->mdct.kfft[0]; if (tonal->count==0) tonal->mem_fill = 240; @@ -761,139 +848,17 @@ static void 
tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt features[23] = info->tonality_slope + 0.069216f; features[24] = tonal->lowECount - 0.067930f; - mlp_process(&net, features, frame_probs); - frame_probs[0] = .5f*(frame_probs[0]+1); - /* Curve fitting between the MLP probability and the actual probability */ - /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/ - /* Probability of active audio (as opposed to silence) */ - frame_probs[1] = .5f*frame_probs[1]+.5f; - frame_probs[1] *= frame_probs[1]; + compute_dense(&layer0, layer_out, features); + compute_gru(&layer1, tonal->rnn_state, layer_out); + compute_dense(&layer2, frame_probs, tonal->rnn_state); /* Probability of speech or music vs noise */ info->activity_probability = frame_probs[1]; + /* It seems like the RNN tends to have a bias towards speech and this + warping of the probabilities compensates for it. */ + info->music_prob = frame_probs[0] * (2 - frame_probs[0]); - /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/ - { - /* Probability of state transition */ - float tau; - /* Represents independence of the MLP probabilities, where - beta=1 means fully independent. */ - float beta; - /* Denormalized probability of speech (p0) and music (p1) after update */ - float p0, p1; - /* Probabilities for "all speech" and "all music" */ - float s0, m0; - /* Probability sum for renormalisation */ - float psum; - /* Instantaneous probability of speech and music, with beta pre-applied. */ - float speech0; - float music0; - float p, q; - - /* More silence transitions for speech than for music. 
*/ - tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob); - p = MAX16(.05f,MIN16(.95f,frame_probs[1])); - q = MAX16(.05f,MIN16(.95f,tonal->vad_prob)); - beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); - /* p0 and p1 are the probabilities of speech and music at this frame - using only information from previous frame and applying the - state transition model */ - p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau; - p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau; - /* We apply the current probability with exponent beta to work around - the fact that the probability estimates aren't independent. */ - p0 *= (float)pow(1-frame_probs[1], beta); - p1 *= (float)pow(frame_probs[1], beta); - /* Normalise the probabilities to get the Marokv probability of music. */ - tonal->vad_prob = p1/(p0+p1); - info->vad_prob = tonal->vad_prob; - /* Consider that silence has a 50-50 probability of being speech or music. */ - frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f; - - /* One transition every 3 minutes of active audio */ - tau = .0001f; - /* Adapt beta based on how "unexpected" the new prob is */ - p = MAX16(.05f,MIN16(.95f,frame_probs[0])); - q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); - beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); - /* p0 and p1 are the probabilities of speech and music at this frame - using only information from previous frame and applying the - state transition model */ - p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; - p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; - /* We apply the current probability with exponent beta to work around - the fact that the probability estimates aren't independent. */ - p0 *= (float)pow(1-frame_probs[0], beta); - p1 *= (float)pow(frame_probs[0], beta); - /* Normalise the probabilities to get the Marokv probability of music. 
*/ - tonal->music_prob = p1/(p0+p1); - info->music_prob = tonal->music_prob; - - /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/ - /* This chunk of code deals with delayed decision. */ - psum=1e-20f; - /* Instantaneous probability of speech and music, with beta pre-applied. */ - speech0 = (float)pow(1-frame_probs[0], beta); - music0 = (float)pow(frame_probs[0], beta); - if (tonal->count==1) - { - if (tonal->application == OPUS_APPLICATION_VOIP) - tonal->pmusic[0] = .1f; - else - tonal->pmusic[0] = .625f; - tonal->pspeech[0] = 1-tonal->pmusic[0]; - } - /* Updated probability of having only speech (s0) or only music (m0), - before considering the new observation. */ - s0 = tonal->pspeech[0] + tonal->pspeech[1]; - m0 = tonal->pmusic [0] + tonal->pmusic [1]; - /* Updates s0 and m0 with instantaneous probability. */ - tonal->pspeech[0] = s0*(1-tau)*speech0; - tonal->pmusic [0] = m0*(1-tau)*music0; - /* Propagate the transition probabilities */ - for (i=1;i<DETECT_SIZE-1;i++) - { - tonal->pspeech[i] = tonal->pspeech[i+1]*speech0; - tonal->pmusic [i] = tonal->pmusic [i+1]*music0; - } - /* Probability that the latest frame is speech, when all the previous ones were music. */ - tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0; - /* Probability that the latest frame is music, when all the previous ones were speech. 
*/ - tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0; - - /* Renormalise probabilities to 1 */ - for (i=0;i<DETECT_SIZE;i++) - psum += tonal->pspeech[i] + tonal->pmusic[i]; - psum = 1.f/psum; - for (i=0;i<DETECT_SIZE;i++) - { - tonal->pspeech[i] *= psum; - tonal->pmusic [i] *= psum; - } - psum = tonal->pmusic[0]; - for (i=1;i<DETECT_SIZE;i++) - psum += tonal->pspeech[i]; - - /* Estimate our confidence in the speech/music decisions */ - if (frame_probs[1]>.75) - { - if (tonal->music_prob>.9) - { - float adapt; - adapt = 1.f/(++tonal->music_confidence_count); - tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500); - tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence); - } - if (tonal->music_prob<.1) - { - float adapt; - adapt = 1.f/(++tonal->speech_confidence_count); - tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500); - tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence); - } - } - } - tonal->last_music = tonal->music_prob>.5f; + /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/ #ifdef MLP_TRAINING for (i=0;i<25;i++) printf("%f ", features[i]); diff --git a/src/analysis.h b/src/analysis.h index cac51dfa..289c845e 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -30,6 +30,7 @@ #include "celt.h" #include "opus_private.h" +#include "mlp.h" #define NB_FRAMES 8 #define NB_TBANDS 18 @@ -64,28 +65,16 @@ typedef struct { float mem[32]; float cmean[8]; float std[9]; - float music_prob; - float vad_prob; float Etracker; float lowECount; int E_count; - int last_music; int count; int analysis_offset; - /** Probability of having speech for time i to DETECT_SIZE-1 (and music before). - pspeech[0] is the probability that all frames in the window are speech. */ - float pspeech[DETECT_SIZE]; - /** Probability of having music for time i to DETECT_SIZE-1 (and speech before). - pmusic[0] is the probability that all frames in the window are music. 
*/ - float pmusic[DETECT_SIZE]; - float speech_confidence; - float music_confidence; - int speech_confidence_count; - int music_confidence_count; int write_pos; int read_pos; int read_subframe; float hp_ener_accum; + float rnn_state[MAX_NEURONS]; opus_val32 downmix_state[3]; AnalysisInfo info[DETECT_SIZE]; } TonalityAnalysisState; @@ -1,5 +1,5 @@ /* Copyright (c) 2008-2011 Octasic Inc. - Written by Jean-Marc Valin */ + 2012-2017 Jean-Marc Valin */ /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -29,42 +29,13 @@ #include "config.h" #endif +#include <math.h> #include "opus_types.h" #include "opus_defines.h" - -#include <math.h> -#include "mlp.h" #include "arch.h" #include "tansig_table.h" -#define MAX_NEURONS 100 +#include "mlp.h" -#if 0 -static OPUS_INLINE opus_val16 tansig_approx(opus_val32 _x) /* Q19 */ -{ - int i; - opus_val16 xx; /* Q11 */ - /*double x, y;*/ - opus_val16 dy, yy; /* Q14 */ - /*x = 1.9073e-06*_x;*/ - if (_x>=QCONST32(8,19)) - return QCONST32(1.,14); - if (_x<=-QCONST32(8,19)) - return -QCONST32(1.,14); - xx = EXTRACT16(SHR32(_x, 8)); - /*i = lrint(25*x);*/ - i = SHR32(ADD32(1024,MULT16_16(25, xx)),11); - /*x -= .04*i;*/ - xx -= EXTRACT16(SHR32(MULT16_16(20972,i),8)); - /*x = xx*(1./2048);*/ - /*y = tansig_table[250+i];*/ - yy = tansig_table[250+i]; - /*y = yy*(1./16384);*/ - dy = 16384-MULT16_16_Q14(yy,yy); - yy = yy + MULT16_16_Q14(MULT16_16_Q11(xx,dy),(16384 - MULT16_16_Q11(yy,xx))); - return yy; -} -#else -/*extern const float tansig_table[501];*/ static OPUS_INLINE float tansig_approx(float x) { int i; @@ -92,54 +63,97 @@ static OPUS_INLINE float tansig_approx(float x) y = y + x*dy*(1 - y*x); return sign*y; } -#endif -#if 0 -void mlp_process(const MLP *m, const opus_val16 *in, opus_val16 *out) +static OPUS_INLINE float sigmoid_approx(float x) { - int j; - opus_val16 hidden[MAX_NEURONS]; - const opus_val16 *W = m->weights; - /* Copy to tmp_in */ - for 
(j=0;j<m->topo[1];j++) - { - int k; - opus_val32 sum = SHL32(EXTEND32(*W++),8); - for (k=0;k<m->topo[0];k++) - sum = MAC16_16(sum, in[k],*W++); - hidden[j] = tansig_approx(sum); - } - for (j=0;j<m->topo[2];j++) - { - int k; - opus_val32 sum = SHL32(EXTEND32(*W++),14); - for (k=0;k<m->topo[1];k++) - sum = MAC16_16(sum, hidden[k], *W++); - out[j] = tansig_approx(EXTRACT16(PSHR32(sum,17))); - } + return .5 + .5*tansig_approx(.5*x); } -#else -void mlp_process(const MLP *m, const float *in, float *out) + +void compute_dense(const DenseLayer *layer, float *output, const float *input) { - int j; - float hidden[MAX_NEURONS]; - const float *W = m->weights; - /* Copy to tmp_in */ - for (j=0;j<m->topo[1];j++) - { - int k; - float sum = *W++; - for (k=0;k<m->topo[0];k++) - sum = sum + in[k]**W++; - hidden[j] = tansig_approx(sum); - } - for (j=0;j<m->topo[2];j++) - { - int k; - float sum = *W++; - for (k=0;k<m->topo[1];k++) - sum = sum + hidden[k]**W++; - out[j] = tansig_approx(sum); - } + int i, j; + int N, M; + int stride; + M = layer->nb_inputs; + N = layer->nb_neurons; + stride = N; + for (i=0;i<N;i++) + { + /* Compute update gate. */ + float sum = layer->bias[i]; + for (j=0;j<M;j++) + sum += layer->input_weights[j*stride + i]*input[j]; + output[i] = WEIGHTS_SCALE*sum; + } + if (layer->sigmoid) { + for (i=0;i<N;i++) + output[i] = sigmoid_approx(output[i]); + } else { + for (i=0;i<N;i++) + output[i] = tansig_approx(output[i]); + } +} + +void compute_gru(const GRULayer *gru, float *state, const float *input) +{ + int i, j; + int N, M; + int stride; + float z[MAX_NEURONS]; + float r[MAX_NEURONS]; + float h[MAX_NEURONS]; + M = gru->nb_inputs; + N = gru->nb_neurons; + stride = 3*N; + for (i=0;i<N;i++) + { + /* Compute update gate. 
*/ + float sum = gru->bias[i]; + for (j=0;j<M;j++) + sum += gru->input_weights[j*stride + i]*input[j]; + for (j=0;j<N;j++) + sum += gru->recurrent_weights[j*stride + i]*state[j]; + z[i] = sigmoid_approx(WEIGHTS_SCALE*sum); + } + for (i=0;i<N;i++) + { + /* Compute reset gate. */ + float sum = gru->bias[N + i]; + for (j=0;j<M;j++) + sum += gru->input_weights[N + j*stride + i]*input[j]; + for (j=0;j<N;j++) + sum += gru->recurrent_weights[N + j*stride + i]*state[j]; + r[i] = sigmoid_approx(WEIGHTS_SCALE*sum); + } + for (i=0;i<N;i++) + { + /* Compute output. */ + float sum = gru->bias[2*N + i]; + for (j=0;j<M;j++) + sum += gru->input_weights[2*N + j*stride + i]*input[j]; + for (j=0;j<N;j++) + sum += gru->recurrent_weights[2*N + j*stride + i]*state[j]*r[j]; + h[i] = z[i]*state[i] + (1-z[i])*tansig_approx(WEIGHTS_SCALE*sum); + } + for (i=0;i<N;i++) + state[i] = h[i]; +} + +#if 0 +int main() { + float state[12] = {0}; + float input[25]; + float out0[16]; + float out[2]; + while (1) + { + int i; + for (i=0;i<25;i++) scanf("%f", &input[i]); + if (feof(stdin)) break; + compute_dense(&layer0, out0, input); + compute_gru(&layer1, state, out0); + compute_dense(&layer2, out, state); + printf("%f %f\n", out[0], out[1]); + } } #endif @@ -1,5 +1,4 @@ -/* Copyright (c) 2008-2011 Octasic Inc. 
- Written by Jean-Marc Valin */ +/* Copyright (c) 2017 Jean-Marc Valin */ /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -28,16 +27,34 @@ #ifndef _MLP_H_ #define _MLP_H_ -#include "arch.h" +#include "opus_types.h" + +#define WEIGHTS_SCALE (1.f/8192) + +#define MAX_NEURONS 20 typedef struct { - int layers; - const int *topo; - const float *weights; -} MLP; + const opus_int16 *bias; + const opus_int16 *input_weights; + int nb_inputs; + int nb_neurons; + int sigmoid; +} DenseLayer; + +typedef struct { + const opus_int16 *bias; + const opus_int16 *input_weights; + const opus_int16 *recurrent_weights; + int nb_inputs; + int nb_neurons; +} GRULayer; + +extern const DenseLayer layer0; +extern const GRULayer layer1; +extern const DenseLayer layer2; -extern const MLP net; +void compute_dense(const DenseLayer *layer, float *output, const float *input); -void mlp_process(const MLP *m, const float *in, float *out); +void compute_gru(const GRULayer *gru, float *state, const float *input); #endif /* _MLP_H_ */ diff --git a/src/mlp_data.c b/src/mlp_data.c index a819880b..5ddc94d5 100644 --- a/src/mlp_data.c +++ b/src/mlp_data.c @@ -1,112 +1,235 @@ +/*This file is automatically generated from a Keras model*/ + #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "mlp.h" -/* RMS error was 0.280492, seed was 1480478173 */ -/* 0.005976 0.031821 (0.280494 0.280492) done */ +static const opus_int16 layer0_weights[400] = { + 622, 853, -153, 75, -68, -498, -1936, -291, + -60, -293, 880, 977, -492, 245, -1111, -1622, + -6366, -362, 91, -1764, 1064, -1579, -2406, 696, + 216, -850, 316, -4033, -498, -2667, 509, 61, + 5334, -561, 1022, -3855, -228, -1117, -266, 326, + -1669, 262, 2970, 1810, -2451, -3331, -4970, -617, + 2669, 743, 717, 1942, 2858, 253, -2397, 1525, + -1665, -919, -945, -3356, 1598, 469, -5746, 1111, + -1328, 1331, -140, -1067, -4318, 461, 2235, 702, + 905, -45, -734, 779, -2457, 
-4860, -16, 979, + -1769, -1167, -1998, 1009, -6205, -2645, -2309, 2178, + 1951, 1433, -1456, 1238, -1195, 4550, -587, -1215, + -2388, 4203, 1051, 1118, -1861, 3513, -355, 1787, + 3133, -466, 4455, 1794, -167, -3224, 3442, 1458, + -9313, 414, -4165, -872, 2574, -3401, -5647, -861, + 2817, 1313, 192, 2431, 293, -1737, 354, -3257, + 1475, 2711, -991, -2767, 2806, 210, 964, 1269, + 2238, -385, 901, -1201, 1182, -4113, 861, -1525, + -6256, -12, -62, 1465, 1034, 595, -827, -849, + 1012, -1290, -2396, -2684, -503, 2473, -1457, 1528, + -2172, 2742, -972, -1949, -4060, -3066, -410, -779, + -594, 373, 1823, 197, -621, -191, -3124, -4822, + -2073, 351, -1115, 2442, -44, 172, -131, -1216, + 875, 94, 4502, 1186, 1008, 698, 351, 160, + -506, -1202, 1255, -1411, 1864, -2380, -332, -42, + 19, 1521, -2319, 634, 3691, 150, -1300, 2018, + 2745, 1845, 138, 1121, -430, 3005, 474, 1349, + -1484, -3281, 2309, 1758, 2206, 1506, -267, -187, + 2478, 6407, -1708, -1994, 741, 2246, -3388, -552, + 239, -559, 130, 854, 2832, -463, 304, 5351, + -1417, -1113, -5, -1782, 154, 1314, 1410, 284, + 1825, -383, 679, -2209, -946, -1933, -1300, 830, + 876, 1313, 1328, 1508, -301, 3985, -2731, 697, + -2527, -2002, -834, -236, 2619, 2201, -1857, -610, + -951, 1685, -1413, -4944, 1479, 2184, -4672, 172, + 39, 2138, 207, -509, 2, -364, -3368, 6137, + 483, 4936, -7439, -4670, -1214, -3259, 2538, -5904, + -166, -3714, -788, 1445, 6256, 908, 941, 6981, + -593, 1114, 2186, -2218, -348, -2502, 1961, 1182, + -742, 238, 926, 920, -2111, 517, 2210, 191, + -3382, -9810, -13597, -7181, 24299, -6002, 8258, 21229, + 15072, -19057, -3613, 14832, -15021, 12016, -3219, -11380, + -1944, 4180, -6248, -3509, 9254, -619, 6140, 1451, + 5216, -7914, -1158, 5757, -2007, 8602, -3177, -452, + 3664, -2979, -12114, 1019, -2574, -2650, 2774, -6778, + -821, 136, -1717, 665, 7233, -1536, -851, 434, + 2075, -14258, 23564, 14664, -15677, 12462, -2884, -8410, + -11996, 15482, 7153, -282, 5304, -9404, 12404, -16057, + 1198, -127, -7232, 
2624, 1463, 6303, 9577, 2998, + -12324, 4652, -4785, -3861, -630, -6777, 5040, 3212 +}; + +static const opus_int16 layer0_bias[16] = { + -1246, -4948, 74, -182, 1314, 1022, -968, 2021, + -428, 48, -76, -3614, -4573, 264, -335, -3753 +}; + +static const opus_int16 layer1_weights[576] = { + -1111, -1254, 1798, -2602, 3651, 9301, 5408, 1956, + 1478, 2304, -963, 320, 2738, 2543, -2005, -1085, + -114, -1571, 4910, 1068, 4171, 2313, 1606, 733, + -2610, -2959, 2290, -527, -1842, -646, -16, 2005, + 283, -1077, -1891, -131, 3992, -3736, -11009, 946, + -879, -2992, -728, 1714, -1299, -6849, -7889, 51, + 3311, -4404, 3362, -1589, -1069, -414, 2833, 51, + -3667, -1329, -444, -3046, 702, -1800, -1539, 2547, + 3632, 1717, -1586, -1469, -687, 2218, -236, 49, + -311, 1327, -971, -2230, 3053, 2176, 2819, 113, + 3450, -8814, -5903, 620, 3764, -2008, -889, 1287, + 702, 1576, 8289, 876, -187, -901, -602, 6363, + 141, -1538, 1008, -1399, 2652, 2342, -792, -229, + 4015, -339, 2396, 2358, -5957, -3011, -9989, -300, + -1311, 771, -346, -6502, 747, 1681, -15794, 6796, + -1067, 3718, -2932, -3243, -2861, -1526, 3501, 2016, + 3428, 1293, 26, -3254, -868, -820, 2181, -1091, + -489, -1773, 1598, -2704, 2712, 99, 1321, 72, + -2340, 5255, -6217, 2964, 3356, -1230, -3548, -2045, + -1352, 795, 3486, -5695, -2230, -1462, -2318, -3059, + -2158, 6277, 491, -543, 5419, -4878, -2874, -2366, + 974, 1686, -1541, -1632, -2494, 2066, 2744, 1565, + -4715, -2288, 653, 78, -1683, 5352, -102, 1683, + 4716, -6395, -3046, -629, 1665, 6384, -8447, 2067, + -1616, 6815, 2266, -1036, -5038, 2433, -1651, 1100, + -3259, 2064, 2361, -2265, 1324, 2891, -314, -2138, + -2988, 510, -2769, 2064, 1017, 393, 1768, 1454, + -8112, -5234, 5309, 1943, -5209, 7297, 3919, -6962, + -2801, 3106, 789, 6443, 1361, -1278, 1161, -4952, + 457, -601, -5225, -1984, -1369, 1295, 191, 882, + -651, 2795, 1339, 1014, 726, -1006, 3483, 290, + -1399, -1251, -2881, -1338, 3136, -5323, 633, -5421, + -6290, 3967, 3783, 4605, -2662, -295, -3887, 
-457, + 5213, 3721, 924, -1770, -2616, 3186, -3607, 1911, + 130, -3046, -7271, 1173, 5783, 1843, 1085, 3245, + -1263, 78, -1060, -1691, -3620, -2132, -209, -580, + 1209, -2759, -3882, -5831, -1829, -921, -5332, 1283, + -3190, 2349, 1728, -5752, -7430, -6203, 1696, -55, + 2174, -2204, 318, 690, -2819, -4307, 1395, 6894, + 1441, -1780, 3808, 569, 3798, 928, 1422, -339, + -1251, -1287, 2070, 2876, -961, 1005, 7303, 17, + -1773, 1397, 319, 3843, 1678, 6099, 6560, 3289, + 1865, -638, 732, -2911, 3968, 361, 422, -1089, + -1486, 6998, -1845, 2680, 293, 4466, 249, 637, + -1471, -1170, -4907, -106, 4637, 542, -2278, 1263, + -3205, -3427, -12921, -3277, -1577, -3644, -3593, 2914, + 3684, -482, -3260, -3842, -2185, 3918, -3654, -168, + -1301, -1121, -303, 1102, -6530, -163, 1887, 2298, + -33, -305, -407, -571, -904, 2380, -1370, -589, + 636, 851, -22, 1512, -9024, -5379, -653, -4918, + -3000, -3675, 3973, -5136, 6238, -3456, -1061, -969, + -449, -1220, -3767, -2634, -3361, -757, -3308, 6517, + 5625, -1183, -752, -3137, -401, 1344, 3681, -277, + 2478, 2315, 788, 3012, -240, 1288, 1235, 1606, + 847, 73, 1037, -491, 410, -3203, -1322, 2917, + 2233, 5982, -4473, -6050, -4147, 122, -30, -44, + -71, -144, -560, 1808, -3543, 1175, 2110, -2488, + -1972, -1154, -1688, 2224, -1458, 2123, -937, 2071, + 3042, -181, -3693, 1762, -4058, 389, 3015, 3460, + -371, -4471, -801, 6941, -1142, 914, -1497, -5451, + 1427, 363, -2305, 717, -101, -2243, 787, 2063, + 2094, 1753, -4824, -392, 642, -1595, 2284, -355, + 723, 704, 4422, 238, -1603, 4658, -261, -1049, + -5058, 1302, 8334, 300, 184, 2387, -4650, 920, + -1044, 4126, 2278, -1618, -1595, -3917, 3040, -1588, + 2545, -554, 4401, 1209, -1611, -4681, 1402, 157, + -2734, 1322, 2633, -89, -2124, -3775, -1074, 2343, + 653, -2387, -1463, 1026, 1146, 2433, -992, -89, + 390, -604, -4066, -3364, 2779, 1317, -3104, -2945, + 4261, 8309, 3272, 3126, 897, 1713, -135, 194, + -2696, 1554, -1179, -1107, -625, 233, -2899, 1175, + 729, 4034, 1992, -1057, -724, 
1125, -3964, -1280 +}; + +static const opus_int16 layer1_recur_weights[432] = { + -438, -838, -6192, 5411, -418, 2893, 284, 1692, + 724, -6694, 372, 2294, -2420, -986, -181, 3070, + -3303, 1708, 2409, 4537, 1035, -2341, 1559, 3677, + 6927, 19, 7018, -1246, -6, 764, 1216, 3250, + -1130, -4239, 4176, -1841, -364, -11096, 1627, -5613, + -5810, -2252, -3298, -4786, -1273, 1114, 4722, 4239, + -1604, -848, 534, -472, -3669, -2118, -2768, -1475, + 731, 3618, 1301, 262, -1884, 3715, 2816, -397, + -2884, -2069, -382, -778, -3494, -5716, 4715, 3827, + -5099, 259, -9518, -3708, -768, 600, 6425, -3923, + 820, 4019, 2664, 5603, -4372, 1172, -1589, 1831, + -874, -2241, 2583, 1217, -5199, -552, 2599, 5865, + 4130, 2308, 6881, -3955, 3300, -438, 2953, 2086, + -36, -5881, 4261, -737, -1528, -2968, 357, -808, + -4266, -5794, -2556, 4370, -3368, -6190, -7920, -3524, + -3430, 2304, -394, 3321, 3607, -885, -4667, -4856, + -7151, 1654, -1356, -2450, -3054, -2729, -6057, 3589, + 2660, 5931, 1632, -1200, -2062, 5428, -5080, -1625, + 4027, 258, -871, 2653, 6457, -3976, -1827, 3303, + 215, -9023, -6973, 688, 1128, -324, 13, 2964, + 1124, 2324, 1648, 1985, -2165, -859, -4202, 2908, + -2207, 2688, 314, 5358, 5148, 2579, -73, 248, + -1238, 2539, 520, -1776, 3805, 300, -3066, 1107, + -2935, 850, 1637, 3337, -406, -8662, -11909, -1224, + 5174, 2046, 955, -3673, -140, -1652, -1644, 2844, + 2741, 525, -4580, -2051, 2389, 167, -3123, -4217, + -3441, 4071, 1916, 6908, -1404, -938, -1956, -3821, + -3583, -1661, -9650, -4695, -2647, 3529, -1050, -1390, + -941, -8952, -8547, -5131, 1574, 3018, -1347, -3441, + 2818, 1877, 922, 203, 1547, -2540, -2669, -3568, + 3712, -1858, 1608, 4022, 1949, 1270, 5690, 4952, + -2924, -1852, -960, -6592, 4112, -4835, -6366, 947, + 1653, 3866, -3543, 424, -1011, -4746, 482, -5315, + -1291, -2193, 1034, -2216, -1676, 2701, 854, 2519, + 1207, -4291, -2353, -717, 3103, -546, 1223, -4721, + -235, -719, 2882, 2164, 866, -1741, -1255, -2969, + 4765, -2875, -4220, -3430, -4870, 
-4859, -2382, -3808, + -1145, 1523, -6688, 1423, 331, 824, -3213, 2206, + 1176, -6635, 1452, -3581, -4968, 3371, 6670, 478, + -896, -1936, -3446, 3845, -2542, -906, -3529, -4821, + 6980, 4467, -2353, 3978, 886, -1195, -3932, 3882, + 2825, -2174, -3966, 8341, 4275, 8445, -3631, -2451, + 4168, -122, -1558, -1961, 1739, -2608, -1198, -1021, + -3015, 2149, -3997, -1421, -5459, -33, -4203, 3328, + 12, 3219, 3345, 1329, 3197, 4859, -2998, 1177, + -2311, 4629, -5004, 513, 4744, 5323, 8186, -269, + 5114, -8890, -1964, 7982, -399, -1038, -1705, 777, + -326, -1578, 3215, 2023, -1201, 1188, -1852, 3234, + 1091, 1777, 3782, -1820, -2942, -954, -910, -1606, + 2469, -3312, 3235, 2541, -2422, -2059, 707, -1015, + -7480, -2569, -4303, -6153, -3864, 8265, 1891, 2087, + -1127, 1155, -2118, -3621, -3438, 1199, 1071, -1461, + -2744, 2638, 3131, 518, -434, 7176, -2115, -527, + -1903, -1662, -2805, -5871, 2314, -2244, 2819, 7768 +}; + +static const opus_int16 layer1_bias[36] = { + 3484, 1686, 8617, 3821, 2768, 4548, 5706, 5368, + 1998, 8007, 4605, 8417, 3054, 1436, 4327, 2667, + 913, 4302, 1496, 1808, 883, 922, -415, 4419, + 1156, -2037, 1373, -1083, 323, 1726, -668, -59, + -866, -3, -662, -2456 +}; + +static const opus_int16 layer2_weights[24] = { + 10570, 495, -6157, -20216, 8597, -3977, -23140, 5295, + -2893, 18700, 997, 8626, 2902, 434, -1866, 9536, + -830, -15077, -11656, 3090, 18331, 4166, -4320, -9123 +}; -static const float weights[450] = { +static const opus_int16 layer2_bias[2] = { + -1526, 7868 +}; -/* hidden layer */ --0.514624f, 0.0234227f, -0.14329f, -0.0878216f, -0.00187827f, --0.0257443f, 0.108524f, 0.00333881f, 0.00585017f, -0.0246132f, -0.142723f, -0.00436494f, 0.0101354f, -0.11124f, -0.0809367f, --0.0750772f, 0.0295524f, 0.00823944f, 0.150392f, 0.0320876f, --0.0710564f, -1.43818f, 0.652076f, 0.0650744f, -1.54821f, -0.168949f, -1.92724f, 0.0517976f, -0.0670737f, -0.0690121f, -0.00247528f, -0.0522024f, 0.0631368f, 0.0532776f, 0.047751f, --0.011715f, 0.142374f, 
-0.0290885f, -0.279263f, -0.433499f, --0.0795174f, -0.380458f, -0.051263f, 0.218537f, -0.322478f, -1.06667f, -0.104607f, -4.70108f, 0.312037f, 0.277397f, --2.71859f, 1.70037f, -0.141845f, 0.0115618f, 0.0629883f, -0.0403871f, 0.0139428f, -0.00430733f, -0.0429038f, -0.0590318f, --0.0501526f, -0.0284802f, -0.0415686f, -0.0438999f, 0.0822666f, -0.197194f, 0.0363275f, -0.0584307f, 0.0752364f, -0.0799796f, --0.146275f, 0.161661f, -0.184585f, 0.145568f, 0.442823f, -1.61221f, 1.11162f, 2.62177f, -2.482f, -0.112599f, --0.110366f, -0.140794f, -0.181694f, 0.0648674f, 0.0842248f, -0.0933993f, 0.150122f, 0.129171f, 0.176848f, 0.141758f, --0.271822f, 0.235113f, 0.0668579f, -0.433957f, 0.113633f, --0.169348f, -1.40091f, 0.62861f, -0.134236f, 0.402173f, -1.86373f, 1.53998f, -4.32084f, 0.735343f, 0.800214f, --0.00968415f, 0.0425904f, 0.0196811f, -0.018426f, -0.000343953f, --0.00416389f, 0.00111558f, 0.0173069f, -0.00998596f, -0.025898f, -0.00123764f, -0.00520373f, -0.0565033f, 0.0637394f, 0.0051213f, -0.0221361f, 0.00819962f, -0.0467061f, -0.0548258f, -0.00314063f, --1.18332f, 1.88091f, -0.41148f, -2.95727f, -0.521449f, --0.271641f, 0.124946f, -0.0532936f, 0.101515f, 0.000208564f, --0.0488748f, 0.0642388f, -0.0383848f, 0.0135046f, -0.0413592f, --0.0326402f, -0.0137421f, -0.0225219f, -0.0917294f, -0.277759f, --0.185418f, 0.0471128f, -0.125879f, 0.262467f, -0.212794f, --0.112931f, -1.99885f, -0.404787f, 0.224402f, 0.637962f, --0.27808f, -0.0723953f, -0.0537655f, -0.0336359f, -0.0906601f, --0.0641309f, -0.0713542f, 0.0524317f, 0.00608819f, 0.0754101f, --0.0488401f, -0.00671865f, 0.0418239f, 0.0536284f, -0.132639f, -0.0267648f, -0.248432f, -0.0104153f, 0.035544f, -0.212753f, --0.302895f, -0.0357854f, 0.376838f, 0.597025f, -0.664647f, -0.268422f, -0.376772f, -1.05472f, 0.0144178f, 0.179122f, -0.0360155f, 0.220262f, -0.0056381f, 0.0317197f, 0.0621066f, --0.00779298f, 0.00789378f, 0.00350605f, 0.0104809f, 0.0362871f, --0.157708f, -0.0659779f, -0.0926278f, 0.00770791f, 0.0631621f, 
-0.0817343f, -0.424295f, -0.0437727f, -0.24251f, 0.711217f, --0.736455f, -2.194f, -0.107612f, -0.175156f, -0.0366573f, --0.0123156f, -0.0628516f, -0.0218977f, -0.00693699f, 0.00695185f, -0.00507362f, 0.00359334f, 0.0052661f, 0.035561f, 0.0382701f, -0.0342179f, -0.00790271f, -0.0170925f, 0.047029f, 0.0197362f, --0.0153435f, 0.0644152f, -0.36862f, -0.0674876f, -2.82672f, -1.34122f, -0.0788029f, -3.47792f, 0.507246f, -0.816378f, --0.0142383f, -0.127349f, -0.106926f, -0.0359524f, 0.105045f, -0.291554f, 0.195413f, 0.0866214f, -0.066577f, -0.102188f, -0.0979466f, -0.12982f, 0.400181f, -0.409336f, -0.0593326f, --0.0656203f, -0.204474f, 0.179802f, 0.000509084f, 0.0995954f, --2.377f, -0.686359f, 0.934861f, 1.10261f, 1.3901f, --4.33616f, -0.00264017f, 0.00713045f, 0.106264f, 0.143726f, --0.0685305f, -0.054656f, -0.0176725f, -0.0772669f, -0.0264526f, --0.0103824f, -0.0269872f, -0.00687f, 0.225804f, 0.407751f, --0.0612611f, -0.0576863f, -0.180131f, -0.222772f, -0.461742f, -0.335236f, 1.03399f, 4.24112f, -0.345796f, -0.594549f, --76.1407f, -0.265276f, 0.0507719f, 0.0643044f, 0.0384832f, -0.0424459f, -0.0387817f, -0.0235996f, -0.0740556f, -0.0270029f, -0.00882177f, -0.0552371f, -0.00485851f, 0.314295f, 0.360431f, --0.0787085f, 0.110355f, -0.415958f, -0.385088f, -0.272224f, --1.55108f, -0.141848f, 0.448877f, -0.563447f, -2.31403f, --0.120077f, -1.49918f, -0.817726f, -0.0495854f, -0.0230782f, --0.0224014f, 0.117076f, 0.0393216f, 0.051997f, 0.0330763f, --0.110796f, 0.0211117f, -0.0197258f, 0.0187461f, 0.0125183f, -0.14876f, 0.0920565f, -0.342475f, 0.135272f, -0.168155f, --0.033423f, -0.0604611f, -0.128835f, 0.664947f, -0.144997f, -2.27649f, 1.28663f, 0.841217f, -2.42807f, 0.0230471f, -0.226709f, -0.0374803f, 0.155436f, 0.0400342f, -0.184686f, -0.128488f, -0.0939518f, -0.0578559f, 0.0265967f, -0.0999322f, --0.0322768f, -0.322994f, -0.189371f, -0.738069f, -0.0754914f, -0.214717f, -0.093728f, -0.695741f, 0.0899298f, -2.06188f, --0.273719f, -0.896977f, 0.130553f, 0.134638f, 1.29355f, 
-0.00520749f, -0.0324224f, 0.00530451f, 0.0192385f, 0.00328708f, -0.0250838f, 0.0053365f, -0.0177321f, 0.00618789f, 0.00525364f, -0.00104596f, -0.0360459f, 0.0402403f, -0.0406351f, 0.0136883f, -0.0880722f, -0.0197449f, 0.089938f, 0.0100456f, -0.0475638f, --0.73267f, 0.037433f, -0.146551f, -0.230221f, -3.06489f, --1.40194f, 0.0198483f, 0.0397953f, -0.0190239f, 0.0470715f, --0.131363f, -0.191721f, -0.0176224f, -0.0480352f, -0.221799f, --0.26794f, -0.0292615f, 0.0612127f, -0.129877f, 0.00628332f, --0.085918f, 0.0175379f, 0.0541011f, -0.0810874f, -0.380809f, --0.222056f, -0.508859f, -0.473369f, 0.484958f, -2.28411f, -0.0139516f, -/* output layer */ -3.90017f, 1.71789f, -1.43372f, -2.70839f, 1.77107f, -5.48006f, 1.44661f, 2.01134f, -1.88383f, -3.64958f, --1.26351f, 0.779421f, 2.11357f, 3.10409f, 1.68846f, --4.46197f, -1.61455f, 3.59832f, 2.43531f, -1.26458f, -0.417941f, 1.47437f, 2.16635f, -1.909f, -0.828869f, -1.38805f, -2.67975f, -0.110044f, 1.95596f, 0.697931f, --0.313226f, -0.889315f, 0.283236f, 0.946102f, }; +const DenseLayer layer0 = { + layer0_bias, + layer0_weights, + 25, 16, 0 +}; -static const int topo[3] = {25, 16, 2}; +const GRULayer layer1 = { + layer1_bias, + layer1_weights, + layer1_recur_weights, + 16, 12 +}; -const MLP net = { - 3, - topo, - weights +const DenseLayer layer2 = { + layer2_bias, + layer2_weights, + 12, 2, 1 }; + diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 3770fc64..0494170f 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -1189,7 +1189,16 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ { int analysis_bandwidth; if (st->signal_type == OPUS_AUTO) - st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); + { + float prob; + if (st->prev_mode == 0) + prob = analysis_info.music_prob; + else if (st->prev_mode == MODE_CELT_ONLY) + prob = analysis_info.music_prob_max; + else + prob = analysis_info.music_prob_min; + st->voice_ratio = (int)floor(.5+100*(1-prob)); + } 
analysis_bandwidth = analysis_info.bandwidth; if (analysis_bandwidth<=12) |