chromium/third_party/webrtc/modules/audio_processing/aec3/aec_state.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305

/*
 *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/aec3/aec_state.h"

#include <math.h>
#include <numeric>
#include <vector>

#include "api/array_view.h"
#include "modules/audio_processing/logging/apm_data_dumper.h"
#include "rtc_base/atomicops.h"
#include "rtc_base/checks.h"

namespace webrtc {
namespace {

// Computes delay of the adaptive filter.
int EstimateFilterDelay(
    const std::vector<std::array<float, kFftLengthBy2Plus1>>&
        adaptive_filter_frequency_response) {
  const auto& H2 = adaptive_filter_frequency_response;
  constexpr size_t kUpperBin = kFftLengthBy2 - 5;
  RTC_DCHECK_GE(kAdaptiveFilterLength, H2.size());
  std::array<int, kAdaptiveFilterLength> delays;
  delays.fill(0);
  for (size_t k = 1; k < kUpperBin; ++k) {
    // Find the maximum of H2[j].
    size_t peak = 0;
    for (size_t j = 0; j < H2.size(); ++j) {
      if (H2[j][k] > H2[peak][k]) {
        peak = j;
      }
    }
    ++delays[peak];
  }

  return std::distance(delays.begin(),
                       std::max_element(delays.begin(), delays.end()));
}

}  // namespace

int AecState::instance_count_ = 0;

AecState::AecState(const AudioProcessing::Config::EchoCanceller3& config)
    : data_dumper_(
          new ApmDataDumper(rtc::AtomicOps::Increment(&instance_count_))),
      erle_estimator_(config.param.erle.min,
                      config.param.erle.max_l,
                      config.param.erle.max_h),
      config_(config),
      reverb_decay_(config_.param.ep_strength.default_len) {}

AecState::~AecState() = default;

void AecState::HandleEchoPathChange(
    const EchoPathVariability& echo_path_variability) {
  if (echo_path_variability.AudioPathChanged()) {
    blocks_since_last_saturation_ = 0;
    usable_linear_estimate_ = false;
    echo_leakage_detected_ = false;
    capture_signal_saturation_ = false;
    echo_saturation_ = false;
    previous_max_sample_ = 0.f;

    if (echo_path_variability.delay_change) {
      force_zero_gain_counter_ = 0;
      blocks_with_filter_adaptation_ = 0;
      render_received_ = false;
      force_zero_gain_ = true;
      capture_block_counter_ = 0;
    }
    if (echo_path_variability.gain_change) {
      capture_block_counter_ = kNumBlocksPerSecond;
    }
  }
}

void AecState::Update(const std::vector<std::array<float, kFftLengthBy2Plus1>>&
                          adaptive_filter_frequency_response,
                      const std::array<float, kAdaptiveFilterTimeDomainLength>&
                          adaptive_filter_impulse_response,
                      bool converged_filter,
                      const rtc::Optional<size_t>& external_delay_samples,
                      const RenderBuffer& render_buffer,
                      const std::array<float, kFftLengthBy2Plus1>& E2_main,
                      const std::array<float, kFftLengthBy2Plus1>& Y2,
                      rtc::ArrayView<const float> x,
                      const std::array<float, kBlockSize>& s,
                      bool echo_leakage_detected) {
  // Store input parameters.
  echo_leakage_detected_ = echo_leakage_detected;

  // Update counters.
  ++capture_block_counter_;

  // Force zero echo suppression gain after an echo path change to allow at
  // least some render data to be collected in order to avoid an initial echo
  // burst.
  force_zero_gain_ = (++force_zero_gain_counter_) < kNumBlocksPerSecond / 5;

  // Estimate delays.
  filter_delay_ = rtc::Optional<size_t>(
      EstimateFilterDelay(adaptive_filter_frequency_response));
  external_delay_ =
      external_delay_samples
          ? rtc::Optional<size_t>(*external_delay_samples / kBlockSize)
          : rtc::Optional<size_t>();

  // Update the ERL and ERLE measures.
  if (converged_filter && capture_block_counter_ >= 2 * kNumBlocksPerSecond) {
    const auto& X2 = render_buffer.Spectrum(*filter_delay_);
    erle_estimator_.Update(X2, Y2, E2_main);
    erl_estimator_.Update(X2, Y2);
  }

  // Update the echo audibility evaluator.
  echo_audibility_.Update(x, s, converged_filter);

  // Detect and flag echo saturation.
  // TODO(peah): Add the delay in this computation to ensure that the render and
  // capture signals are properly aligned.
  RTC_DCHECK_LT(0, x.size());
  const float max_sample = fabs(*std::max_element(
      x.begin(), x.end(), [](float a, float b) { return a * a < b * b; }));

  if (config_.param.ep_strength.echo_can_saturate) {
    const bool saturated_echo =
        (previous_max_sample_ > 200.f) && SaturatedCapture();

    // Counts the blocks since saturation.
    constexpr size_t kSaturationLeakageBlocks = 20;
    blocks_since_last_saturation_ =
        saturated_echo ? 0 : blocks_since_last_saturation_ + 1;

    echo_saturation_ = blocks_since_last_saturation_ < kSaturationLeakageBlocks;
  } else {
    echo_saturation_ = false;
  }
  previous_max_sample_ = max_sample;

  // Flag whether the linear filter estimate is usable.
  usable_linear_estimate_ =
      (!echo_saturation_) && (converged_filter || SufficientFilterUpdates()) &&
      capture_block_counter_ >= 2 * kNumBlocksPerSecond && external_delay_;

  // After an amount of active render samples for which an echo should have been
  // detected in the capture signal if the ERL was not infinite, flag that a
  // transparent mode should be entered.
  const float x_energy = std::inner_product(x.begin(), x.end(), x.begin(), 0.f);
  const bool active_render_block =
      x_energy > (config_.param.render_levels.active_render_limit *
                  config_.param.render_levels.active_render_limit) *
                     kFftLengthBy2;
  if (active_render_block) {
    render_received_ = true;
  }
  blocks_with_filter_adaptation_ +=
      (active_render_block && (!SaturatedCapture()) ? 1 : 0);

  transparent_mode_ = !converged_filter &&
                      (!render_received_ || blocks_with_filter_adaptation_ >=
                                                5 * kNumBlocksPerSecond);

  // Update the room reverb estimate.
  UpdateReverb(adaptive_filter_impulse_response);
}

void AecState::UpdateReverb(
    const std::array<float, kAdaptiveFilterTimeDomainLength>&
        impulse_response) {
  if ((!(filter_delay_ && usable_linear_estimate_)) ||
      (*filter_delay_ > kAdaptiveFilterLength - 4)) {
    return;
  }

  // Form the data to match against by squaring the impulse response
  // coefficients.
  std::array<float, kAdaptiveFilterTimeDomainLength> matching_data;
  std::transform(impulse_response.begin(), impulse_response.end(),
                 matching_data.begin(), [](float a) { return a * a; });

  // Avoid matching against noise in the model by subtracting an estimate of the
  // model noise power.
  constexpr size_t kTailLength = 64;
  constexpr size_t tail_index = kAdaptiveFilterTimeDomainLength - kTailLength;
  const float tail_power = *std::max_element(matching_data.begin() + tail_index,
                                             matching_data.end());
  std::for_each(matching_data.begin(), matching_data.begin() + tail_index,
                [tail_power](float& a) { a = std::max(0.f, a - tail_power); });

  // Identify the peak index of the impulse response.
  const size_t peak_index = *std::max_element(
      matching_data.begin(), matching_data.begin() + tail_index);

  if (peak_index + 128 < tail_index) {
    size_t start_index = peak_index + 64;
    // Compute the matching residual error for the current candidate to match.
    float residual_sqr_sum = 0.f;
    float d_k = reverb_decay_to_test_;
    for (size_t k = start_index; k < tail_index; ++k) {
      if (matching_data[start_index + 1] == 0.f) {
        break;
      }

      float residual = matching_data[k] - matching_data[peak_index] * d_k;
      residual_sqr_sum += residual * residual;
      d_k *= reverb_decay_to_test_;
    }

    // If needed, update the best candidate for the reverb decay.
    if (reverb_decay_candidate_residual_ < 0.f ||
        residual_sqr_sum < reverb_decay_candidate_residual_) {
      reverb_decay_candidate_residual_ = residual_sqr_sum;
      reverb_decay_candidate_ = reverb_decay_to_test_;
    }
  }

  // Compute the next reverb candidate to evaluate such that all candidates will
  // be evaluated within one second.
  reverb_decay_to_test_ += (0.9965f - 0.9f) / (5 * kNumBlocksPerSecond);

  // If all reverb candidates have been evaluated, choose the best one as the
  // reverb decay.
  if (reverb_decay_to_test_ >= 0.9965f) {
    if (reverb_decay_candidate_residual_ < 0.f) {
      // Transform the decay to be in the unit of blocks.
      reverb_decay_ = powf(reverb_decay_candidate_, kFftLengthBy2);

      // Limit the estimated reverb_decay_ to the maximum one needed in practice
      // to minimize the impact of incorrect estimates.
      reverb_decay_ =
          std::min(config_.param.ep_strength.default_len, reverb_decay_);
    }
    reverb_decay_to_test_ = 0.9f;
    reverb_decay_candidate_residual_ = -1.f;
  }

  // For noisy impulse responses, assume a fixed tail length.
  if (tail_power > 0.0005f) {
    reverb_decay_ = config_.param.ep_strength.default_len;
  }
  data_dumper_->DumpRaw("aec3_reverb_decay", reverb_decay_);
  data_dumper_->DumpRaw("aec3_tail_power", tail_power);
}

void AecState::EchoAudibility::Update(rtc::ArrayView<const float> x,
                                      const std::array<float, kBlockSize>& s,
                                      bool converged_filter) {
  auto result_x = std::minmax_element(x.begin(), x.end());
  auto result_s = std::minmax_element(s.begin(), s.end());
  const float x_abs =
      std::max(std::abs(*result_x.first), std::abs(*result_x.second));
  const float s_abs =
      std::max(std::abs(*result_s.first), std::abs(*result_s.second));

  if (converged_filter) {
    if (x_abs < 20.f) {
      ++low_farend_counter_;
    } else {
      low_farend_counter_ = 0;
    }
  } else {
    if (x_abs < 100.f) {
      ++low_farend_counter_;
    } else {
      low_farend_counter_ = 0;
    }
  }

  // The echo is deemed as not audible if the echo estimate is on the level of
  // the quantization noise in the FFTs and the nearend level is sufficiently
  // strong to mask that by ensuring that the playout and AGC gains do not boost
  // any residual echo that is below the quantization noise level. Furthermore,
  // cases where the render signal is very close to zero are also identified as
  // not producing audible echo.
  inaudible_echo_ = (max_nearend_ > 500 && s_abs < 30.f) ||
                    (!converged_filter && x_abs < 500);
  inaudible_echo_ = inaudible_echo_ || low_farend_counter_ > 20;
}

void AecState::EchoAudibility::UpdateWithOutput(rtc::ArrayView<const float> e) {
  const float e_max = *std::max_element(e.begin(), e.end());
  const float e_min = *std::min_element(e.begin(), e.end());
  const float e_abs = std::max(std::abs(e_max), std::abs(e_min));

  if (max_nearend_ < e_abs) {
    max_nearend_ = e_abs;
    max_nearend_counter_ = 0;
  } else {
    if (++max_nearend_counter_ > 5 * kNumBlocksPerSecond) {
      max_nearend_ *= 0.995f;
    }
  }
}

}  // namespace webrtc