intelligibility_enhancer.h revision b7553dfdbb1ca7779eb0d80b5f509523c9b00086
1030249dd247444687663c4969ff078dc0a4b24acekm/* 2030249dd247444687663c4969ff078dc0a4b24acekm * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3030249dd247444687663c4969ff078dc0a4b24acekm * 4030249dd247444687663c4969ff078dc0a4b24acekm * Use of this source code is governed by a BSD-style license 5030249dd247444687663c4969ff078dc0a4b24acekm * that can be found in the LICENSE file in the root of the source 6030249dd247444687663c4969ff078dc0a4b24acekm * tree. An additional intellectual property rights grant can be found 7030249dd247444687663c4969ff078dc0a4b24acekm * in the file PATENTS. All contributing project authors may 8030249dd247444687663c4969ff078dc0a4b24acekm * be found in the AUTHORS file in the root of the source tree. 9030249dd247444687663c4969ff078dc0a4b24acekm */ 10030249dd247444687663c4969ff078dc0a4b24acekm 11b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm// 12b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm// Specifies core class for intelligbility enhancement. 13b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm// 14b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 15030249dd247444687663c4969ff078dc0a4b24acekm#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 16030249dd247444687663c4969ff078dc0a4b24acekm#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 17030249dd247444687663c4969ff078dc0a4b24acekm 18030249dd247444687663c4969ff078dc0a4b24acekm#include <complex> 19030249dd247444687663c4969ff078dc0a4b24acekm 20b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm#include "webrtc/base/scoped_ptr.h" 21030249dd247444687663c4969ff078dc0a4b24acekm#include "webrtc/common_audio/lapped_transform.h" 22030249dd247444687663c4969ff078dc0a4b24acekm#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" 23030249dd247444687663c4969ff078dc0a4b24acekm 24030249dd247444687663c4969ff078dc0a4b24acekmstruct WebRtcVadInst; 25030249dd247444687663c4969ff078dc0a4b24acekmtypedef struct WebRtcVadInst VadInst; 26030249dd247444687663c4969ff078dc0a4b24acekm 27030249dd247444687663c4969ff078dc0a4b24acekmnamespace webrtc { 28030249dd247444687663c4969ff078dc0a4b24acekm 29030249dd247444687663c4969ff078dc0a4b24acekm// Speech intelligibility enhancement module. Reads render and capture 30030249dd247444687663c4969ff078dc0a4b24acekm// audio streams and modifies the render stream with a set of gains per 31030249dd247444687663c4969ff078dc0a4b24acekm// frequency bin to enhance speech against the noise background. 32b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm// Note: assumes speech and noise streams are already separated. 33030249dd247444687663c4969ff078dc0a4b24acekmclass IntelligibilityEnhancer { 34030249dd247444687663c4969ff078dc0a4b24acekm public: 35030249dd247444687663c4969ff078dc0a4b24acekm // Construct a new instance with the given filter bank resolution, 36030249dd247444687663c4969ff078dc0a4b24acekm // sampling rate, number of channels and analysis rates. 37030249dd247444687663c4969ff078dc0a4b24acekm // |analysis_rate| sets the number of input blocks (containing speech!) 38030249dd247444687663c4969ff078dc0a4b24acekm // to elapse before a new gain computation is made. |variance_rate| specifies 39030249dd247444687663c4969ff078dc0a4b24acekm // the number of gain recomputations after which the variances are reset. 40030249dd247444687663c4969ff078dc0a4b24acekm // |cv_*| are parameters for the VarianceArray constructor for the 41b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // clear speech stream. 42030249dd247444687663c4969ff078dc0a4b24acekm // TODO(bercic): the |cv_*|, |*_rate| and |gain_limit| parameters should 43030249dd247444687663c4969ff078dc0a4b24acekm // probably go away once fine tuning is done. They override the internal 44030249dd247444687663c4969ff078dc0a4b24acekm // constants in the class (kGainChangeLimit, kAnalyzeRate, kVarianceRate). 45b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm IntelligibilityEnhancer(int erb_resolution, 46b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm int sample_rate_hz, 47b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm int channels, 48b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm int cv_type, 49b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm float cv_alpha, 50b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm int cv_win, 51b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm int analysis_rate, 52b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm int variance_rate, 53030249dd247444687663c4969ff078dc0a4b24acekm float gain_limit); 54030249dd247444687663c4969ff078dc0a4b24acekm ~IntelligibilityEnhancer(); 55030249dd247444687663c4969ff078dc0a4b24acekm 56b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Reads and processes chunk of noise stream in time domain. 57030249dd247444687663c4969ff078dc0a4b24acekm void ProcessCaptureAudio(float* const* audio); 58030249dd247444687663c4969ff078dc0a4b24acekm 59b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Reads chunk of speech in time domain and updates with modified signal. 60b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm void ProcessRenderAudio(float* const* audio); 61b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 62030249dd247444687663c4969ff078dc0a4b24acekm private: 63030249dd247444687663c4969ff078dc0a4b24acekm enum AudioSource { 64b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm kRenderStream = 0, // Clear speech stream. 65b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm kCaptureStream, // Noise stream. 66030249dd247444687663c4969ff078dc0a4b24acekm }; 67030249dd247444687663c4969ff078dc0a4b24acekm 68b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Provides access point to the frequency domain. 69030249dd247444687663c4969ff078dc0a4b24acekm class TransformCallback : public LappedTransform::Callback { 70030249dd247444687663c4969ff078dc0a4b24acekm public: 71030249dd247444687663c4969ff078dc0a4b24acekm TransformCallback(IntelligibilityEnhancer* parent, AudioSource source); 72b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 73b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // All in frequency domain, receives input |in_block|, applies 74b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // intelligibility enhancement, and writes result to |out_block|. 75030249dd247444687663c4969ff078dc0a4b24acekm virtual void ProcessAudioBlock(const std::complex<float>* const* in_block, 76b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm int in_channels, 77b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm int frames, 78030249dd247444687663c4969ff078dc0a4b24acekm int out_channels, 79030249dd247444687663c4969ff078dc0a4b24acekm std::complex<float>* const* out_block); 80030249dd247444687663c4969ff078dc0a4b24acekm 81030249dd247444687663c4969ff078dc0a4b24acekm private: 82030249dd247444687663c4969ff078dc0a4b24acekm IntelligibilityEnhancer* parent_; 83030249dd247444687663c4969ff078dc0a4b24acekm AudioSource source_; 84030249dd247444687663c4969ff078dc0a4b24acekm }; 85030249dd247444687663c4969ff078dc0a4b24acekm friend class TransformCallback; 86030249dd247444687663c4969ff078dc0a4b24acekm 87b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source. 88b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm void DispatchAudio(AudioSource source, 89b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm const std::complex<float>* in_block, 90030249dd247444687663c4969ff078dc0a4b24acekm std::complex<float>* out_block); 91b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 92b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Updates variance computation and analysis with |in_block_|, 93b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // and writes modified speech to |out_block|. 94030249dd247444687663c4969ff078dc0a4b24acekm void ProcessClearBlock(const std::complex<float>* in_block, 95030249dd247444687663c4969ff078dc0a4b24acekm std::complex<float>* out_block); 96b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 97b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Computes and sets modified gains. 98030249dd247444687663c4969ff078dc0a4b24acekm void AnalyzeClearBlock(float power_target); 99b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 100b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Updates variance calculation for noise input with |in_block|. 101030249dd247444687663c4969ff078dc0a4b24acekm void ProcessNoiseBlock(const std::complex<float>* in_block, 102030249dd247444687663c4969ff078dc0a4b24acekm std::complex<float>* out_block); 103030249dd247444687663c4969ff078dc0a4b24acekm 104b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Returns number of ERB filters. 105030249dd247444687663c4969ff078dc0a4b24acekm static int GetBankSize(int sample_rate, int erb_resolution); 106b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 107b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Initializes ERB filterbank. 108030249dd247444687663c4969ff078dc0a4b24acekm void CreateErbBank(); 109b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 110b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Analytically solves quadratic for optimal gains given |lambda|. 111b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Negative gains are set to 0. Stores the results in |sols|. 112b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm void SolveForGainsGivenLambda(float lambda, int start_freq, float* sols); 113b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 114b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Computes variance across ERB filters from freq variance |var|. 115b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Stores in |result|. 116030249dd247444687663c4969ff078dc0a4b24acekm void FilterVariance(const float* var, float* result); 117b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 118b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Returns dot product of vectors specified by size |length| arrays |a|,|b|. 119030249dd247444687663c4969ff078dc0a4b24acekm static float DotProduct(const float* a, const float* b, int length); 120030249dd247444687663c4969ff078dc0a4b24acekm 121030249dd247444687663c4969ff078dc0a4b24acekm static const int kErbResolution; 122030249dd247444687663c4969ff078dc0a4b24acekm static const int kWindowSizeMs; 123030249dd247444687663c4969ff078dc0a4b24acekm static const int kChunkSizeMs; 124b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm static const int kAnalyzeRate; // Default for |analysis_rate_|. 125b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm static const int kVarianceRate; // Default for |variance_rate_|. 126030249dd247444687663c4969ff078dc0a4b24acekm static const float kClipFreq; 127b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm static const float kConfigRho; // Default production and interpretation SNR. 128030249dd247444687663c4969ff078dc0a4b24acekm static const float kKbdAlpha; 129030249dd247444687663c4969ff078dc0a4b24acekm static const float kGainChangeLimit; 130030249dd247444687663c4969ff078dc0a4b24acekm 131b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm const int freqs_; // Num frequencies in frequency domain. 132b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm const int window_size_; // Window size in samples; also the block size. 133b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm const int chunk_length_; // Chunk size in samples. 134b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm const int bank_size_; // Num ERB filters. 135030249dd247444687663c4969ff078dc0a4b24acekm const int sample_rate_hz_; 136030249dd247444687663c4969ff078dc0a4b24acekm const int erb_resolution_; 137b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm const int channels_; // Num channels. 138b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm const int analysis_rate_; // Num blocks before gains recalculated. 139b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm const int variance_rate_; // Num recalculations before history is cleared. 140030249dd247444687663c4969ff078dc0a4b24acekm 141030249dd247444687663c4969ff078dc0a4b24acekm intelligibility::VarianceArray clear_variance_; 142030249dd247444687663c4969ff078dc0a4b24acekm intelligibility::VarianceArray noise_variance_; 143b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<float[]> filtered_clear_var_; 144b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<float[]> filtered_noise_var_; 145b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm float** filter_bank_; // TODO(ekmeyerson): Switch to using ChannelBuffer. 146b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<float[]> center_freqs_; 147030249dd247444687663c4969ff078dc0a4b24acekm int start_freq_; 148b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR. 149b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // for each ERB band. 150b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains. 151030249dd247444687663c4969ff078dc0a4b24acekm intelligibility::GainApplier gain_applier_; 152030249dd247444687663c4969ff078dc0a4b24acekm 153030249dd247444687663c4969ff078dc0a4b24acekm // Destination buffer used to reassemble blocked chunks before overwriting 154030249dd247444687663c4969ff078dc0a4b24acekm // the original input array with modifications. 155b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // TODO(ekmeyerson): Switch to using ChannelBuffer. 156030249dd247444687663c4969ff078dc0a4b24acekm float** temp_out_buffer_; 157b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 158b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<float* []> input_audio_; 159b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<float[]> kbd_window_; 160030249dd247444687663c4969ff078dc0a4b24acekm TransformCallback render_callback_; 161030249dd247444687663c4969ff078dc0a4b24acekm TransformCallback capture_callback_; 162b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<LappedTransform> render_mangler_; 163b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<LappedTransform> capture_mangler_; 164030249dd247444687663c4969ff078dc0a4b24acekm int block_count_; 165030249dd247444687663c4969ff078dc0a4b24acekm int analysis_step_; 166030249dd247444687663c4969ff078dc0a4b24acekm 167030249dd247444687663c4969ff078dc0a4b24acekm // TODO(bercic): Quick stopgap measure for voice detection in the clear 168030249dd247444687663c4969ff078dc0a4b24acekm // and noise streams. 169b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm // Note: VAD currently does not affect anything in IntelligibilityEnhancer. 170030249dd247444687663c4969ff078dc0a4b24acekm VadInst* vad_high_; 171030249dd247444687663c4969ff078dc0a4b24acekm VadInst* vad_low_; 172b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm rtc::scoped_ptr<int16_t[]> vad_tmp_buffer_; 173b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm bool has_voice_low_; // Whether voice detected in speech stream. 174030249dd247444687663c4969ff078dc0a4b24acekm}; 175030249dd247444687663c4969ff078dc0a4b24acekm 176030249dd247444687663c4969ff078dc0a4b24acekm} // namespace webrtc 177030249dd247444687663c4969ff078dc0a4b24acekm 178030249dd247444687663c4969ff078dc0a4b24acekm#endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 179