1b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org/* 2b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 4b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Use of this source code is governed by a BSD-style license 5b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * that can be found in the LICENSE file in the root of the source 6b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * tree. An additional intellectual property rights grant can be found 7b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * in the file PATENTS. All contributing project authors may 8b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * be found in the AUTHORS file in the root of the source tree. 9b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org */ 10b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 11b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 12b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org/* 13b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * This header file includes the descriptions of the core VAD calls. 14b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org */ 15b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 16b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org#ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_ 17b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_ 18b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 19f24ac5923cbe5e806fac59a0d15e32567553ce8epbos@webrtc.org#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h" 20f24ac5923cbe5e806fac59a0d15e32567553ce8epbos@webrtc.org#include "webrtc/typedefs.h" 21b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 22b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.orgenum { kNumChannels = 6 }; // Number of frequency bands (named channels). 23b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.orgenum { kNumGaussians = 2 }; // Number of Gaussians per channel in the GMM. 24b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.orgenum { kTableSize = kNumChannels * kNumGaussians }; 25b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.orgenum { kMinEnergy = 10 }; // Minimum energy required to trigger audio signal. 26b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 27b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.orgtypedef struct VadInstT_ 28b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org{ 29b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 30b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int vad; 31b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int32_t downsampling_filter_states[4]; 32b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org WebRtcSpl_State48khzTo8khz state_48_to_8; 33b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t noise_means[kTableSize]; 34b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t speech_means[kTableSize]; 35b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t noise_stds[kTableSize]; 36b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t speech_stds[kTableSize]; 37b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org // TODO(bjornv): Change to |frame_count|. 38b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int32_t frame_counter; 39b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t over_hang; // Over Hang 40b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t num_of_speech; 41b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org // TODO(bjornv): Change to |age_vector|. 42b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t index_vector[16 * kNumChannels]; 43b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t low_value_vector[16 * kNumChannels]; 44b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org // TODO(bjornv): Change to |median|. 45b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t mean_value[kNumChannels]; 46b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t upper_state[5]; 47b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t lower_state[5]; 48b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t hp_filter_state[4]; 49b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t over_hang_max_1[3]; 50b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t over_hang_max_2[3]; 51b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t individual[3]; 52b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int16_t total[3]; 53b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 54b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int init_flag; 55b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 56b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org} VadInstT; 57b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 58b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org// Initializes the core VAD component. The default aggressiveness mode is 59b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org// controlled by |kDefaultMode| in vad_core.c. 60b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org// 61b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org// - self [i/o] : Instance that should be initialized 62b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org// 63b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org// returns : 0 (OK), -1 (NULL pointer in or if the default mode can't be 64b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org// set) 65b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.orgint WebRtcVad_InitCore(VadInstT* self); 66b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 67b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org/**************************************************************************** 68b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * WebRtcVad_set_mode_core(...) 69b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 70b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * This function changes the VAD settings 71b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 72b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Input: 73b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * - inst : VAD instance 74b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * - mode : Aggressiveness degree 75b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 0 (High quality) - 3 (Highly aggressive) 76b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 77b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Output: 78b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * - inst : Changed instance 79b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 80b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Return value : 0 - Ok 81b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * -1 - Error 82b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org */ 83b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 84b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.orgint WebRtcVad_set_mode_core(VadInstT* self, int mode); 85b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 86b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org/**************************************************************************** 87b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * WebRtcVad_CalcVad48khz(...) 88785c2fdd22c9ad2d172c0976d1224bbe44073bc7andrew@webrtc.org * WebRtcVad_CalcVad32khz(...) 89785c2fdd22c9ad2d172c0976d1224bbe44073bc7andrew@webrtc.org * WebRtcVad_CalcVad16khz(...) 90785c2fdd22c9ad2d172c0976d1224bbe44073bc7andrew@webrtc.org * WebRtcVad_CalcVad8khz(...) 91b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 92b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Calculate probability for active speech and make VAD decision. 93b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 94b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Input: 95b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * - inst : Instance that should be initialized 96b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * - speech_frame : Input speech frame 97b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * - frame_length : Number of input samples 98b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 99b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Output: 100b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * - inst : Updated filter states etc. 101b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 102b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * Return value : VAD decision 103b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 0 - No active speech 104b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org * 1-6 - Active speech 105b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org */ 106785c2fdd22c9ad2d172c0976d1224bbe44073bc7andrew@webrtc.orgint WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame, 107b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int frame_length); 108785c2fdd22c9ad2d172c0976d1224bbe44073bc7andrew@webrtc.orgint WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame, 109b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int frame_length); 110785c2fdd22c9ad2d172c0976d1224bbe44073bc7andrew@webrtc.orgint WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame, 111b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int frame_length); 112785c2fdd22c9ad2d172c0976d1224bbe44073bc7andrew@webrtc.orgint WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame, 113b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org int frame_length); 114b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org 115b015cbede88899f67a53fbbe581b02ce8e32794andrew@webrtc.org#endif // WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_ 116