1ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs/*
2ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs *  Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
3ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs *
4ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs *  Use of this source code is governed by a BSD-style license
5ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs *  that can be found in the LICENSE file in the root of the source
6ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs *  tree. An additional intellectual property rights grant can be found
7ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs *  in the file PATENTS.  All contributing project authors may
8ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs *  be found in the AUTHORS file in the root of the source tree.
9ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs */
10ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
11ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
12ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
13ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs#include <algorithm>
14ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
15ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs#include "webrtc/base/checks.h"
16ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
17ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsnamespace webrtc {
18ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsnamespace {
19ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
// Largest chunk, in samples, that ProcessChunk() accepts (enforced there with a
// debug check).
const size_t kMaxLength = 320;
// Channel count handed to the resampler when it is (re)configured.
const size_t kNumChannels = 1;

// Value |last_voice_probability_| holds before any chunk has been processed.
const double kDefaultVoiceValue = 1.0;
// Starting probability written for every frame of a non-silent chunk before
// the VADs are run on it.
const double kNeutralProbability = 0.5;
// Arbitrary low probability assigned to every frame of a silent chunk.
const double kLowProbability = 0.01;
26ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
27ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs}  // namespace
28ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
29ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsVoiceActivityDetector::VoiceActivityDetector()
30ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs    : last_voice_probability_(kDefaultVoiceValue),
31ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs      standalone_vad_(StandaloneVad::Create()) {
32ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs}
33ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
34ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs// Because ISAC has a different chunk length, it updates
35ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs// |chunkwise_voice_probabilities_| and |chunkwise_rms_| when there is new data.
36ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs// Otherwise it clears them.
37ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsvoid VoiceActivityDetector::ProcessChunk(const int16_t* audio,
38dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting                                         size_t length,
39ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs                                         int sample_rate_hz) {
4091d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg  RTC_DCHECK_EQ(static_cast<int>(length), sample_rate_hz / 100);
4191d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg  RTC_DCHECK_LE(length, kMaxLength);
42ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  // Resample to the required rate.
43ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  const int16_t* resampled_ptr = audio;
44ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  if (sample_rate_hz != kSampleRateHz) {
4591d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg    RTC_CHECK_EQ(
46ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs        resampler_.ResetIfNeeded(sample_rate_hz, kSampleRateHz, kNumChannels),
47ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs        0);
48ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs    resampler_.Push(audio, length, resampled_, kLength10Ms, length);
49ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs    resampled_ptr = resampled_;
50ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  }
5191d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg  RTC_DCHECK_EQ(length, kLength10Ms);
52ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
53ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  // Each chunk needs to be passed into |standalone_vad_|, because internally it
54ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  // buffers the audio and processes it all at once when GetActivity() is
55ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  // called.
5691d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg  RTC_CHECK_EQ(standalone_vad_->AddAudio(resampled_ptr, length), 0);
57ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
58ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  audio_processing_.ExtractFeatures(resampled_ptr, length, &features_);
59ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
60ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  chunkwise_voice_probabilities_.resize(features_.num_frames);
61ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  chunkwise_rms_.resize(features_.num_frames);
62ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  std::copy(features_.rms, features_.rms + chunkwise_rms_.size(),
63ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs            chunkwise_rms_.begin());
64ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  if (features_.num_frames > 0) {
65ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs    if (features_.silence) {
66ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs      // The other features are invalid, so set the voice probabilities to an
67ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs      // arbitrary low value.
68ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs      std::fill(chunkwise_voice_probabilities_.begin(),
69ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs                chunkwise_voice_probabilities_.end(), kLowProbability);
70ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs    } else {
71ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs      std::fill(chunkwise_voice_probabilities_.begin(),
72ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs                chunkwise_voice_probabilities_.end(), kNeutralProbability);
7391d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg      RTC_CHECK_GE(
74ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs          standalone_vad_->GetActivity(&chunkwise_voice_probabilities_[0],
75ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs                                       chunkwise_voice_probabilities_.size()),
76ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs          0);
7791d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg      RTC_CHECK_GE(pitch_based_vad_.VoicingProbability(
7891d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg                       features_, &chunkwise_voice_probabilities_[0]),
7991d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg                   0);
80ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs    }
81ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs    last_voice_probability_ = chunkwise_voice_probabilities_.back();
82ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs  }
83ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs}
84ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs
85ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs}  // namespace webrtc
86