vad_audio_proc.cc revision 34be126c1b3ee60ecdb86b1de41a0648347450b2
1/*
2 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#include "webrtc/common_audio/fft4g.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
extern "C" {
#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
}
#include "webrtc/modules/interface/module_common_types.h"
27
28namespace webrtc {
29
30// The following structures are declared anonymous in iSAC's structs.h. To
31// forward declare them, we use this derived class trick.
32struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
33struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
34
35static const float kFrequencyResolution =
36    kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
37static const int kSilenceRms = 5;
38
39// TODO(turajs): Make a Create or Init for VadAudioProc.
40VadAudioProc::VadAudioProc()
41    : audio_buffer_(),
42      num_buffer_samples_(kNumPastSignalSamples),
43      log_old_gain_(-2),
44      old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
45      pitch_analysis_handle_(new PitchAnalysisStruct),
46      pre_filter_handle_(new PreFiltBankstr),
47      high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
48                                               kFilterOrder,
49                                               kCoeffDenominator,
50                                               kFilterOrder)) {
51  static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
52                    sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
53                "lpc analysis window incorrect size");
54  static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
55                "correlation weight incorrect size");
56
57  // TODO(turajs): Are we doing too much in the constructor?
58  float data[kDftSize];
59  // Make FFT to initialize.
60  ip_[0] = 0;
61  WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
62  // TODO(turajs): Need to initialize high-pass filter.
63
64  // Initialize iSAC components.
65  WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
66  WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
67}
68
69VadAudioProc::~VadAudioProc() {
70}
71
72void VadAudioProc::ResetBuffer() {
73  memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
74         sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
75  num_buffer_samples_ = kNumPastSignalSamples;
76}
77
78int VadAudioProc::ExtractFeatures(const int16_t* frame,
79                                  int length,
80                                  AudioFeatures* features) {
81  features->num_frames = 0;
82  if (length != kNumSubframeSamples) {
83    return -1;
84  }
85
86  // High-pass filter to remove the DC component and very low frequency content.
87  // We have experienced that this high-pass filtering improves voice/non-voiced
88  // classification.
89  if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
90                                &audio_buffer_[num_buffer_samples_]) != 0) {
91    return -1;
92  }
93
94  num_buffer_samples_ += kNumSubframeSamples;
95  if (num_buffer_samples_ < kBufferLength) {
96    return 0;
97  }
98  assert(num_buffer_samples_ == kBufferLength);
99  features->num_frames = kNum10msSubframes;
100  features->silence = false;
101
102  Rms(features->rms, kMaxNumFrames);
103  for (int i = 0; i < kNum10msSubframes; ++i) {
104    if (features->rms[i] < kSilenceRms) {
105      // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
106      // Bail out here instead.
107      features->silence = true;
108      ResetBuffer();
109      return 0;
110    }
111  }
112
113  PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
114                kMaxNumFrames);
115  FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
116  ResetBuffer();
117  return 0;
118}
119
120// Computes |kLpcOrder + 1| correlation coefficients.
121void VadAudioProc::SubframeCorrelation(double* corr,
122                                       int length_corr,
123                                       int subframe_index) {
124  assert(length_corr >= kLpcOrder + 1);
125  double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
126  int buffer_index = subframe_index * kNumSubframeSamples;
127
128  for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
129    windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
130
131  WebRtcIsac_AutoCorr(corr, windowed_audio,
132                      kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
133}
134
135// Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
136// The analysis window is 15 ms long and it is centered on the first half of
137// each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
138// first half of each 10 ms subframe.
139void VadAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {
140  assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));
141  double corr[kLpcOrder + 1];
142  double reflec_coeff[kLpcOrder];
143  for (int i = 0, offset_lpc = 0; i < kNum10msSubframes;
144       i++, offset_lpc += kLpcOrder + 1) {
145    SubframeCorrelation(corr, kLpcOrder + 1, i);
146    corr[0] *= 1.0001;
147    // This makes Lev-Durb a bit more stable.
148    for (int k = 0; k < kLpcOrder + 1; k++) {
149      corr[k] *= kCorrWeight[k];
150    }
151    WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
152  }
153}
154
// Fits a second order curve through three equally spaced points and returns
// the location of its extremum as a fractional offset from the middle point.
// The points are inverted before curve fitting, so the caller passes samples
// of |A(z)|^2 and the fit is done on |1 / A(z)|^2.
//
// |prev_val|, |curr_val| and |next_val| are the values at offsets -1, 0 and
// +1. When |curr_val| is a local minimum of the inputs the result lies in
// (-1, 1). If the inverted points have no curvature (for example because
// distinct inputs round to the same reciprocal), there is no extremum to
// locate and 0 is returned instead of dividing by zero.
static float QuadraticInterpolation(float prev_val,
                                    float curr_val,
                                    float next_val) {
  // Doing the interpolation in |1 / A(z)|^2.
  next_val = 1.0f / next_val;
  prev_val = 1.0f / prev_val;
  curr_val = 1.0f / curr_val;

  // Second difference of the inverted samples; zero means the three points
  // are collinear.
  const float curvature = next_val + prev_val - 2.f * curr_val;
  if (curvature == 0.f) {
    return 0.f;
  }

  const float fractional_index = -(next_val - prev_val) * 0.5f / curvature;
  assert(fabs(fractional_index) < 1);
  return fractional_index;
}
171
172// 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope
173// of the input signal. The local maximum of the spectral envelope corresponds
174// with the local minimum of A(z). It saves complexity, as we save one
175// inversion. Furthermore, we find the first local maximum of magnitude squared,
176// to save on one square root.
177void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {
178  assert(length_f_peak >= kNum10msSubframes);
179  double lpc[kNum10msSubframes * (kLpcOrder + 1)];
180  // For all sub-frames.
181  GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
182
183  const int kNumDftCoefficients = kDftSize / 2 + 1;
184  float data[kDftSize];
185
186  for (int i = 0; i < kNum10msSubframes; i++) {
187    // Convert to float with zero pad.
188    memset(data, 0, sizeof(data));
189    for (int n = 0; n < kLpcOrder + 1; n++) {
190      data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
191    }
192    // Transform to frequency domain.
193    WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
194
195    int index_peak = 0;
196    float prev_magn_sqr = data[0] * data[0];
197    float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
198    float next_magn_sqr;
199    bool found_peak = false;
200    for (int n = 2; n < kNumDftCoefficients - 1; n++) {
201      next_magn_sqr =
202          data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
203      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
204        found_peak = true;
205        index_peak = n - 1;
206        break;
207      }
208      prev_magn_sqr = curr_magn_sqr;
209      curr_magn_sqr = next_magn_sqr;
210    }
211    float fractional_index = 0;
212    if (!found_peak) {
213      // Checking if |kNumDftCoefficients - 1| is the local minimum.
214      next_magn_sqr = data[1] * data[1];
215      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
216        index_peak = kNumDftCoefficients - 1;
217      }
218    } else {
219      // A peak is found, do a simple quadratic interpolation to get a more
220      // accurate estimate of the peak location.
221      fractional_index =
222          QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
223    }
224    f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
225  }
226}
227
228// Using iSAC functions to estimate pitch gains & lags.
229void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
230                                 double* pitch_lags_hz,
231                                 int length) {
232  // TODO(turajs): This can be "imported" from iSAC & and the next two
233  // constants.
234  assert(length >= kNum10msSubframes);
235  const int kNumPitchSubframes = 4;
236  double gains[kNumPitchSubframes];
237  double lags[kNumPitchSubframes];
238
239  const int kNumSubbandFrameSamples = 240;
240  const int kNumLookaheadSamples = 24;
241
242  float lower[kNumSubbandFrameSamples];
243  float upper[kNumSubbandFrameSamples];
244  double lower_lookahead[kNumSubbandFrameSamples];
245  double upper_lookahead[kNumSubbandFrameSamples];
246  double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
247                                    kNumLookaheadSamples];
248
249  // Split signal to lower and upper bands
250  WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
251                                 upper, lower_lookahead, upper_lookahead,
252                                 pre_filter_handle_.get());
253  WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
254                           pitch_analysis_handle_.get(), lags, gains);
255
256  // Lags are computed on lower-band signal with sampling rate half of the
257  // input signal.
258  GetSubframesPitchParameters(
259      kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
260      &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
261}
262
263void VadAudioProc::Rms(double* rms, int length_rms) {
264  assert(length_rms >= kNum10msSubframes);
265  int offset = kNumPastSignalSamples;
266  for (int i = 0; i < kNum10msSubframes; i++) {
267    rms[i] = 0;
268    for (int n = 0; n < kNumSubframeSamples; n++, offset++)
269      rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
270    rms[i] = sqrt(rms[i] / kNumSubframeSamples);
271  }
272}
273
274}  // namespace webrtc
275