1030249dd247444687663c4969ff078dc0a4b24acekm/* 2030249dd247444687663c4969ff078dc0a4b24acekm * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3030249dd247444687663c4969ff078dc0a4b24acekm * 4030249dd247444687663c4969ff078dc0a4b24acekm * Use of this source code is governed by a BSD-style license 5030249dd247444687663c4969ff078dc0a4b24acekm * that can be found in the LICENSE file in the root of the source 6030249dd247444687663c4969ff078dc0a4b24acekm * tree. An additional intellectual property rights grant can be found 7030249dd247444687663c4969ff078dc0a4b24acekm * in the file PATENTS. All contributing project authors may 8030249dd247444687663c4969ff078dc0a4b24acekm * be found in the AUTHORS file in the root of the source tree. 9030249dd247444687663c4969ff078dc0a4b24acekm */ 10030249dd247444687663c4969ff078dc0a4b24acekm 11db4fecfb01ac51e936e4b7496a4929e713080f07ekm// 12db4fecfb01ac51e936e4b7496a4929e713080f07ekm// Specifies core class for intelligbility enhancement. 13db4fecfb01ac51e936e4b7496a4929e713080f07ekm// 14db4fecfb01ac51e936e4b7496a4929e713080f07ekm 15030249dd247444687663c4969ff078dc0a4b24acekm#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 16030249dd247444687663c4969ff078dc0a4b24acekm#define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 17030249dd247444687663c4969ff078dc0a4b24acekm 18030249dd247444687663c4969ff078dc0a4b24acekm#include <complex> 1935b72fbceb09031cbd6039e0dbbd44ed24296509ekm#include <vector> 20030249dd247444687663c4969ff078dc0a4b24acekm 21db4fecfb01ac51e936e4b7496a4929e713080f07ekm#include "webrtc/base/scoped_ptr.h" 22030249dd247444687663c4969ff078dc0a4b24acekm#include "webrtc/common_audio/lapped_transform.h" 2360d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson#include "webrtc/common_audio/channel_buffer.h" 24030249dd247444687663c4969ff078dc0a4b24acekm#include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" 25030249dd247444687663c4969ff078dc0a4b24acekm 26030249dd247444687663c4969ff078dc0a4b24acekmnamespace webrtc { 27030249dd247444687663c4969ff078dc0a4b24acekm 28030249dd247444687663c4969ff078dc0a4b24acekm// Speech intelligibility enhancement module. Reads render and capture 29030249dd247444687663c4969ff078dc0a4b24acekm// audio streams and modifies the render stream with a set of gains per 30030249dd247444687663c4969ff078dc0a4b24acekm// frequency bin to enhance speech against the noise background. 31db4fecfb01ac51e936e4b7496a4929e713080f07ekm// Note: assumes speech and noise streams are already separated. 32030249dd247444687663c4969ff078dc0a4b24acekmclass IntelligibilityEnhancer { 33030249dd247444687663c4969ff078dc0a4b24acekm public: 3460d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson struct Config { 3560d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson // |var_*| are parameters for the VarianceArray constructor for the 3660d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson // clear speech stream. 3760d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson // TODO(bercic): the |var_*|, |*_rate| and |gain_limit| parameters should 3860d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson // probably go away once fine tuning is done. 3960d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson Config() 4060d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson : sample_rate_hz(16000), 4160d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson num_capture_channels(1), 4260d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson num_render_channels(1), 4360d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson var_type(intelligibility::VarianceArray::kStepDecaying), 4460d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson var_decay_rate(0.9f), 4560d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson var_window_size(10), 4660d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson analysis_rate(800), 4760d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson gain_change_limit(0.1f), 4860d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson rho(0.02f) {} 4960d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson int sample_rate_hz; 506955870806624479723addfae6dcf5d13968796cPeter Kasting size_t num_capture_channels; 516955870806624479723addfae6dcf5d13968796cPeter Kasting size_t num_render_channels; 5260d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson intelligibility::VarianceArray::StepType var_type; 5360d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson float var_decay_rate; 54dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting size_t var_window_size; 5560d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson int analysis_rate; 5660d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson float gain_change_limit; 5760d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson float rho; 5860d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson }; 5960d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson 6060d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson explicit IntelligibilityEnhancer(const Config& config); 6160d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson IntelligibilityEnhancer(); // Initialize with default config. 62030249dd247444687663c4969ff078dc0a4b24acekm 63db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Reads and processes chunk of noise stream in time domain. 6460d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson void AnalyzeCaptureAudio(float* const* audio, 6560d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson int sample_rate_hz, 666955870806624479723addfae6dcf5d13968796cPeter Kasting size_t num_channels); 67b7553dfdbb1ca7779eb0d80b5f509523c9b00086ekm 68db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Reads chunk of speech in time domain and updates with modified signal. 6960d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson void ProcessRenderAudio(float* const* audio, 7060d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson int sample_rate_hz, 716955870806624479723addfae6dcf5d13968796cPeter Kasting size_t num_channels); 7260d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson bool active() const; 73db4fecfb01ac51e936e4b7496a4929e713080f07ekm 74030249dd247444687663c4969ff078dc0a4b24acekm private: 75030249dd247444687663c4969ff078dc0a4b24acekm enum AudioSource { 76db4fecfb01ac51e936e4b7496a4929e713080f07ekm kRenderStream = 0, // Clear speech stream. 77db4fecfb01ac51e936e4b7496a4929e713080f07ekm kCaptureStream, // Noise stream. 78030249dd247444687663c4969ff078dc0a4b24acekm }; 79030249dd247444687663c4969ff078dc0a4b24acekm 80db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Provides access point to the frequency domain. 81030249dd247444687663c4969ff078dc0a4b24acekm class TransformCallback : public LappedTransform::Callback { 82030249dd247444687663c4969ff078dc0a4b24acekm public: 83030249dd247444687663c4969ff078dc0a4b24acekm TransformCallback(IntelligibilityEnhancer* parent, AudioSource source); 84db4fecfb01ac51e936e4b7496a4929e713080f07ekm 85db4fecfb01ac51e936e4b7496a4929e713080f07ekm // All in frequency domain, receives input |in_block|, applies 86db4fecfb01ac51e936e4b7496a4929e713080f07ekm // intelligibility enhancement, and writes result to |out_block|. 87b297c5a01f88219da26cffe433804963d1b70f0fpkasting void ProcessAudioBlock(const std::complex<float>* const* in_block, 886955870806624479723addfae6dcf5d13968796cPeter Kasting size_t in_channels, 89dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting size_t frames, 906955870806624479723addfae6dcf5d13968796cPeter Kasting size_t out_channels, 91b297c5a01f88219da26cffe433804963d1b70f0fpkasting std::complex<float>* const* out_block) override; 92030249dd247444687663c4969ff078dc0a4b24acekm 93030249dd247444687663c4969ff078dc0a4b24acekm private: 94030249dd247444687663c4969ff078dc0a4b24acekm IntelligibilityEnhancer* parent_; 95030249dd247444687663c4969ff078dc0a4b24acekm AudioSource source_; 96030249dd247444687663c4969ff078dc0a4b24acekm }; 97030249dd247444687663c4969ff078dc0a4b24acekm friend class TransformCallback; 9835b72fbceb09031cbd6039e0dbbd44ed24296509ekm FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation); 9935b72fbceb09031cbd6039e0dbbd44ed24296509ekm FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains); 100030249dd247444687663c4969ff078dc0a4b24acekm 101db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source. 102db4fecfb01ac51e936e4b7496a4929e713080f07ekm void DispatchAudio(AudioSource source, 103db4fecfb01ac51e936e4b7496a4929e713080f07ekm const std::complex<float>* in_block, 104030249dd247444687663c4969ff078dc0a4b24acekm std::complex<float>* out_block); 105db4fecfb01ac51e936e4b7496a4929e713080f07ekm 106db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Updates variance computation and analysis with |in_block_|, 107db4fecfb01ac51e936e4b7496a4929e713080f07ekm // and writes modified speech to |out_block|. 108030249dd247444687663c4969ff078dc0a4b24acekm void ProcessClearBlock(const std::complex<float>* in_block, 109030249dd247444687663c4969ff078dc0a4b24acekm std::complex<float>* out_block); 110db4fecfb01ac51e936e4b7496a4929e713080f07ekm 111db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Computes and sets modified gains. 112030249dd247444687663c4969ff078dc0a4b24acekm void AnalyzeClearBlock(float power_target); 113db4fecfb01ac51e936e4b7496a4929e713080f07ekm 11435b72fbceb09031cbd6039e0dbbd44ed24296509ekm // Bisection search for optimal |lambda|. 11535b72fbceb09031cbd6039e0dbbd44ed24296509ekm void SolveForLambda(float power_target, float power_bot, float power_top); 11635b72fbceb09031cbd6039e0dbbd44ed24296509ekm 11735b72fbceb09031cbd6039e0dbbd44ed24296509ekm // Transforms freq gains to ERB gains. 11835b72fbceb09031cbd6039e0dbbd44ed24296509ekm void UpdateErbGains(); 11935b72fbceb09031cbd6039e0dbbd44ed24296509ekm 120db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Updates variance calculation for noise input with |in_block|. 121030249dd247444687663c4969ff078dc0a4b24acekm void ProcessNoiseBlock(const std::complex<float>* in_block, 122030249dd247444687663c4969ff078dc0a4b24acekm std::complex<float>* out_block); 123030249dd247444687663c4969ff078dc0a4b24acekm 124db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Returns number of ERB filters. 125dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting static size_t GetBankSize(int sample_rate, size_t erb_resolution); 126db4fecfb01ac51e936e4b7496a4929e713080f07ekm 127db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Initializes ERB filterbank. 128030249dd247444687663c4969ff078dc0a4b24acekm void CreateErbBank(); 129db4fecfb01ac51e936e4b7496a4929e713080f07ekm 130db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Analytically solves quadratic for optimal gains given |lambda|. 131db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Negative gains are set to 0. Stores the results in |sols|. 132dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols); 133db4fecfb01ac51e936e4b7496a4929e713080f07ekm 134db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Computes variance across ERB filters from freq variance |var|. 135db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Stores in |result|. 136030249dd247444687663c4969ff078dc0a4b24acekm void FilterVariance(const float* var, float* result); 137db4fecfb01ac51e936e4b7496a4929e713080f07ekm 138db4fecfb01ac51e936e4b7496a4929e713080f07ekm // Returns dot product of vectors specified by size |length| arrays |a|,|b|. 139dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting static float DotProduct(const float* a, const float* b, size_t length); 140030249dd247444687663c4969ff078dc0a4b24acekm 141dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting const size_t freqs_; // Num frequencies in frequency domain. 142dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting const size_t window_size_; // Window size in samples; also the block size. 143dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting const size_t chunk_length_; // Chunk size in samples. 144dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting const size_t bank_size_; // Num ERB filters. 145030249dd247444687663c4969ff078dc0a4b24acekm const int sample_rate_hz_; 146030249dd247444687663c4969ff078dc0a4b24acekm const int erb_resolution_; 1476955870806624479723addfae6dcf5d13968796cPeter Kasting const size_t num_capture_channels_; 1486955870806624479723addfae6dcf5d13968796cPeter Kasting const size_t num_render_channels_; 149dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting const int analysis_rate_; // Num blocks before gains recalculated. 15060d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson 151dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting const bool active_; // Whether render gains are being updated. 152dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting // TODO(ekm): Add logic for updating |active_|. 153030249dd247444687663c4969ff078dc0a4b24acekm 154030249dd247444687663c4969ff078dc0a4b24acekm intelligibility::VarianceArray clear_variance_; 155030249dd247444687663c4969ff078dc0a4b24acekm intelligibility::VarianceArray noise_variance_; 156db4fecfb01ac51e936e4b7496a4929e713080f07ekm rtc::scoped_ptr<float[]> filtered_clear_var_; 157db4fecfb01ac51e936e4b7496a4929e713080f07ekm rtc::scoped_ptr<float[]> filtered_noise_var_; 15835b72fbceb09031cbd6039e0dbbd44ed24296509ekm std::vector<std::vector<float>> filter_bank_; 159db4fecfb01ac51e936e4b7496a4929e713080f07ekm rtc::scoped_ptr<float[]> center_freqs_; 160dce40cf804019a9898b6ab8d8262466b697c56e0Peter Kasting size_t start_freq_; 161db4fecfb01ac51e936e4b7496a4929e713080f07ekm rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR. 162db4fecfb01ac51e936e4b7496a4929e713080f07ekm // for each ERB band. 163db4fecfb01ac51e936e4b7496a4929e713080f07ekm rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains. 164030249dd247444687663c4969ff078dc0a4b24acekm intelligibility::GainApplier gain_applier_; 165030249dd247444687663c4969ff078dc0a4b24acekm 16660d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson // Destination buffers used to reassemble blocked chunks before overwriting 167030249dd247444687663c4969ff078dc0a4b24acekm // the original input array with modifications. 16860d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson ChannelBuffer<float> temp_render_out_buffer_; 16960d9b332a5391045439bfb6a3a5447973e3d5603ekmeyerson ChannelBuffer<float> temp_capture_out_buffer_; 170db4fecfb01ac51e936e4b7496a4929e713080f07ekm 171db4fecfb01ac51e936e4b7496a4929e713080f07ekm rtc::scoped_ptr<float[]> kbd_window_; 172030249dd247444687663c4969ff078dc0a4b24acekm TransformCallback render_callback_; 173030249dd247444687663c4969ff078dc0a4b24acekm TransformCallback capture_callback_; 174db4fecfb01ac51e936e4b7496a4929e713080f07ekm rtc::scoped_ptr<LappedTransform> render_mangler_; 175db4fecfb01ac51e936e4b7496a4929e713080f07ekm rtc::scoped_ptr<LappedTransform> capture_mangler_; 176030249dd247444687663c4969ff078dc0a4b24acekm int block_count_; 177030249dd247444687663c4969ff078dc0a4b24acekm int analysis_step_; 178030249dd247444687663c4969ff078dc0a4b24acekm}; 179030249dd247444687663c4969ff078dc0a4b24acekm 180030249dd247444687663c4969ff078dc0a4b24acekm} // namespace webrtc 181030249dd247444687663c4969ff078dc0a4b24acekm 182030249dd247444687663c4969ff078dc0a4b24acekm#endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 183