1ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs/* 2ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved. 3ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs * 4ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs * Use of this source code is governed by a BSD-style license 5ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs * that can be found in the LICENSE file in the root of the source 6ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs * tree. An additional intellectual property rights grant can be found 7ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs * in the file PATENTS. All contributing project authors may 8ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs * be found in the AUTHORS file in the root of the source tree. 9ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs */ 10ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 11ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h" 12ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 13ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs#include <algorithm> 14ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 15ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs#include "webrtc/base/checks.h" 16ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 17ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsnamespace webrtc { 18ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsnamespace { 19ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 20dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kastingconst size_t kMaxLength = 320; 216955870806624479723addfae6dcf5d13968796cPeter Kastingconst size_t kNumChannels = 1; 22ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 23ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsconst double kDefaultVoiceValue = 1.0; 24ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsconst double kNeutralProbability = 0.5; 25ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsconst double kLowProbability = 0.01; 26ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 27ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs} // namespace 28ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 29ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsVoiceActivityDetector::VoiceActivityDetector() 30ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs : last_voice_probability_(kDefaultVoiceValue), 31ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs standalone_vad_(StandaloneVad::Create()) { 32ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs} 33ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 34ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs// Because ISAC has a different chunk length, it updates 35ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs// |chunkwise_voice_probabilities_| and |chunkwise_rms_| when there is new data. 36ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs// Otherwise it clears them. 37ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebsvoid VoiceActivityDetector::ProcessChunk(const int16_t* audio, 38dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting size_t length, 39ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs int sample_rate_hz) { 4091d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg RTC_DCHECK_EQ(static_cast<int>(length), sample_rate_hz / 100); 4191d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg RTC_DCHECK_LE(length, kMaxLength); 42ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs // Resample to the required rate. 43ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs const int16_t* resampled_ptr = audio; 44ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs if (sample_rate_hz != kSampleRateHz) { 4591d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg RTC_CHECK_EQ( 46ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs resampler_.ResetIfNeeded(sample_rate_hz, kSampleRateHz, kNumChannels), 47ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 0); 48ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs resampler_.Push(audio, length, resampled_, kLength10Ms, length); 49ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs resampled_ptr = resampled_; 50ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs } 5191d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg RTC_DCHECK_EQ(length, kLength10Ms); 52ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 53ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs // Each chunk needs to be passed into |standalone_vad_|, because internally it 54ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs // buffers the audio and processes it all at once when GetActivity() is 55ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs // called. 5691d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg RTC_CHECK_EQ(standalone_vad_->AddAudio(resampled_ptr, length), 0); 57ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 58ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs audio_processing_.ExtractFeatures(resampled_ptr, length, &features_); 59ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 60ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs chunkwise_voice_probabilities_.resize(features_.num_frames); 61ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs chunkwise_rms_.resize(features_.num_frames); 62ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs std::copy(features_.rms, features_.rms + chunkwise_rms_.size(), 63ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs chunkwise_rms_.begin()); 64ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs if (features_.num_frames > 0) { 65ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs if (features_.silence) { 66ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs // The other features are invalid, so set the voice probabilities to an 67ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs // arbitrary low value. 68ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs std::fill(chunkwise_voice_probabilities_.begin(), 69ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs chunkwise_voice_probabilities_.end(), kLowProbability); 70ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs } else { 71ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs std::fill(chunkwise_voice_probabilities_.begin(), 72ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs chunkwise_voice_probabilities_.end(), kNeutralProbability); 7391d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg RTC_CHECK_GE( 74ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs standalone_vad_->GetActivity(&chunkwise_voice_probabilities_[0], 75ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs chunkwise_voice_probabilities_.size()), 76ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 0); 7791d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg RTC_CHECK_GE(pitch_based_vad_.VoicingProbability( 7891d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg features_, &chunkwise_voice_probabilities_[0]), 7991d6edef35e7275879c30ce16ecb8b6dc73c6e4ahenrikg 0); 80ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs } 81ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs last_voice_probability_ = chunkwise_voice_probabilities_.back(); 82ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs } 83ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs} 84ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs 85ecf6b81644af9823dbff5c24a3d5b9bb596c0d5baluebs} // namespace webrtc 86