1/* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12#include <math.h> 13#include <stdio.h> 14#include <stdlib.h> 15 16#include <algorithm> 17 18#include "gflags/gflags.h" 19#include "testing/gtest/include/gtest/gtest.h" 20#include "webrtc/modules/audio_processing/agc/agc.h" 21#include "webrtc/modules/audio_processing/agc/histogram.h" 22#include "webrtc/modules/audio_processing/agc/utility.h" 23#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" 24#include "webrtc/modules/audio_processing/vad/common.h" 25#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" 26#include "webrtc/modules/audio_processing/vad/standalone_vad.h" 27#include "webrtc/modules/include/module_common_types.h" 28 29static const int kAgcAnalWindowSamples = 100; 30static const double kDefaultActivityThreshold = 0.3; 31 32DEFINE_bool(standalone_vad, true, "enable stand-alone VAD"); 33DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'" 34 " format"); 35DEFINE_string(video_vad, "", "name of a file containing video VAD (activity" 36 " probabilities) in double format. One activity per 10ms is" 37 " required. If no file is given the video information is not" 38 " incorporated. Negative activity is interpreted as video is" 39 " not adapted and the statistics are not computed during" 40 " the learning phase. Note that the negative video activities" 41 " are ONLY allowed at the beginning."); 42DEFINE_string(result, "", "name of a file to write the results. The results" 43 " will be appended to the end of the file. This is optional."); 44DEFINE_string(audio_content, "", "name of a file where audio content is written" 45 " to, in double format."); 46DEFINE_double(activity_threshold, kDefaultActivityThreshold, 47 "Activity threshold"); 48 49namespace webrtc { 50 51// TODO(turajs) A new CL will be committed soon where ExtractFeatures will 52// notify the caller of "silence" input, instead of bailing out. We would not 53// need the following function when such a change is made. 54 55// Add some dither to quiet frames. This avoids the ExtractFeatures skip a 56// silence frame. Otherwise true VAD would drift with respect to the audio. 57// We only consider mono inputs. 58static void DitherSilence(AudioFrame* frame) { 59 ASSERT_EQ(1u, frame->num_channels_); 60 const double kRmsSilence = 5; 61 const double sum_squared_silence = kRmsSilence * kRmsSilence * 62 frame->samples_per_channel_; 63 double sum_squared = 0; 64 for (size_t n = 0; n < frame->samples_per_channel_; n++) 65 sum_squared += frame->data_[n] * frame->data_[n]; 66 if (sum_squared <= sum_squared_silence) { 67 for (size_t n = 0; n < frame->samples_per_channel_; n++) 68 frame->data_[n] = (rand() & 0xF) - 8; // NOLINT: ignore non-threadsafe. 69 } 70} 71 72class AgcStat { 73 public: 74 AgcStat() 75 : video_index_(0), 76 activity_threshold_(kDefaultActivityThreshold), 77 audio_content_(Histogram::Create(kAgcAnalWindowSamples)), 78 audio_processing_(new VadAudioProc()), 79 vad_(new PitchBasedVad()), 80 standalone_vad_(StandaloneVad::Create()), 81 audio_content_fid_(NULL) { 82 for (size_t n = 0; n < kMaxNumFrames; n++) 83 video_vad_[n] = 0.5; 84 } 85 86 ~AgcStat() { 87 if (audio_content_fid_ != NULL) { 88 fclose(audio_content_fid_); 89 } 90 } 91 92 void set_audio_content_file(FILE* audio_content_fid) { 93 audio_content_fid_ = audio_content_fid; 94 } 95 96 int AddAudio(const AudioFrame& frame, double p_video, 97 int* combined_vad) { 98 if (frame.num_channels_ != 1 || 99 frame.samples_per_channel_ != 100 kSampleRateHz / 100 || 101 frame.sample_rate_hz_ != kSampleRateHz) 102 return -1; 103 video_vad_[video_index_++] = p_video; 104 AudioFeatures features; 105 audio_processing_->ExtractFeatures( 106 frame.data_, frame.samples_per_channel_, &features); 107 if (FLAGS_standalone_vad) { 108 standalone_vad_->AddAudio(frame.data_, 109 frame.samples_per_channel_); 110 } 111 if (features.num_frames > 0) { 112 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5}; 113 if (FLAGS_standalone_vad) { 114 standalone_vad_->GetActivity(p, kMaxNumFrames); 115 } 116 // TODO(turajs) combining and limiting are used in the source files as 117 // well they can be moved to utility. 118 // Combine Video and stand-alone VAD. 119 for (size_t n = 0; n < features.num_frames; n++) { 120 double p_active = p[n] * video_vad_[n]; 121 double p_passive = (1 - p[n]) * (1 - video_vad_[n]); 122 p[n] = p_active / (p_active + p_passive); 123 // Limit probabilities. 124 p[n] = std::min(std::max(p[n], 0.01), 0.99); 125 } 126 if (vad_->VoicingProbability(features, p) < 0) 127 return -1; 128 for (size_t n = 0; n < features.num_frames; n++) { 129 audio_content_->Update(features.rms[n], p[n]); 130 double ac = audio_content_->AudioContent(); 131 if (audio_content_fid_ != NULL) { 132 fwrite(&ac, sizeof(ac), 1, audio_content_fid_); 133 } 134 if (ac > kAgcAnalWindowSamples * activity_threshold_) { 135 combined_vad[n] = 1; 136 } else { 137 combined_vad[n] = 0; 138 } 139 } 140 video_index_ = 0; 141 } 142 return static_cast<int>(features.num_frames); 143 } 144 145 void Reset() { 146 audio_content_->Reset(); 147 } 148 149 void SetActivityThreshold(double activity_threshold) { 150 activity_threshold_ = activity_threshold; 151 } 152 153 private: 154 int video_index_; 155 double activity_threshold_; 156 double video_vad_[kMaxNumFrames]; 157 rtc::scoped_ptr<Histogram> audio_content_; 158 rtc::scoped_ptr<VadAudioProc> audio_processing_; 159 rtc::scoped_ptr<PitchBasedVad> vad_; 160 rtc::scoped_ptr<StandaloneVad> standalone_vad_; 161 162 FILE* audio_content_fid_; 163}; 164 165 166void void_main(int argc, char* argv[]) { 167 webrtc::AgcStat agc_stat; 168 169 FILE* pcm_fid = fopen(argv[1], "rb"); 170 ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1]; 171 172 if (argc < 2) { 173 fprintf(stderr, "\nNot Enough arguments\n"); 174 } 175 176 FILE* true_vad_fid = NULL; 177 ASSERT_GT(FLAGS_true_vad.size(), 0u) << "Specify the file containing true " 178 "VADs using --true_vad flag."; 179 true_vad_fid = fopen(FLAGS_true_vad.c_str(), "rb"); 180 ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " << 181 FLAGS_true_vad; 182 183 FILE* results_fid = NULL; 184 if (FLAGS_result.size() > 0) { 185 // True if this is the first time writing to this function and we add a 186 // header to the beginning of the file. 187 bool write_header; 188 // Open in the read mode. If it fails, the file doesn't exist and has to 189 // write a header for it. Otherwise no need to write a header. 190 results_fid = fopen(FLAGS_result.c_str(), "r"); 191 if (results_fid == NULL) { 192 write_header = true; 193 } else { 194 fclose(results_fid); 195 write_header = false; 196 } 197 // Open in append mode. 198 results_fid = fopen(FLAGS_result.c_str(), "a"); 199 ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " << 200 FLAGS_result << ", to write the results."; 201 // Write the header if required. 202 if (write_header) { 203 fprintf(results_fid, "%% Total Active, Misdetection, " 204 "Total inactive, False Positive, On-sets, Missed segments, " 205 "Average response\n"); 206 } 207 } 208 209 FILE* video_vad_fid = NULL; 210 if (FLAGS_video_vad.size() > 0) { 211 video_vad_fid = fopen(FLAGS_video_vad.c_str(), "rb"); 212 ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " << 213 FLAGS_video_vad << " to read video-based VAD decisions.\n"; 214 } 215 216 // AgsStat will be the owner of this file and will close it at its 217 // destructor. 218 FILE* audio_content_fid = NULL; 219 if (FLAGS_audio_content.size() > 0) { 220 audio_content_fid = fopen(FLAGS_audio_content.c_str(), "wb"); 221 ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " << 222 FLAGS_audio_content << " to write audio-content.\n"; 223 agc_stat.set_audio_content_file(audio_content_fid); 224 } 225 226 webrtc::AudioFrame frame; 227 frame.num_channels_ = 1; 228 frame.sample_rate_hz_ = 16000; 229 frame.samples_per_channel_ = frame.sample_rate_hz_ / 100; 230 const size_t kSamplesToRead = frame.num_channels_ * 231 frame.samples_per_channel_; 232 233 agc_stat.SetActivityThreshold(FLAGS_activity_threshold); 234 235 int ret_val = 0; 236 int num_frames = 0; 237 int agc_vad[kMaxNumFrames]; 238 uint8_t true_vad[kMaxNumFrames]; 239 double p_video = 0.5; 240 int total_active = 0; 241 int total_passive = 0; 242 int total_false_positive = 0; 243 int total_missed_detection = 0; 244 int onset_adaptation = 0; 245 int num_onsets = 0; 246 bool onset = false; 247 uint8_t previous_true_vad = 0; 248 int num_not_adapted = 0; 249 size_t true_vad_index = 0; 250 bool in_false_positive_region = false; 251 int total_false_positive_duration = 0; 252 bool video_adapted = false; 253 while (kSamplesToRead == fread(frame.data_, sizeof(int16_t), 254 kSamplesToRead, pcm_fid)) { 255 assert(true_vad_index < kMaxNumFrames); 256 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1, 257 true_vad_fid)) 258 << "Size mismatch between True-VAD and the PCM file.\n"; 259 if (video_vad_fid != NULL) { 260 ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) << 261 "Not enough video-based VAD probabilities."; 262 } 263 264 // Negative video activity indicates that the video-based VAD is not yet 265 // adapted. Disregards the learning phase in statistics. 266 if (p_video < 0) { 267 if (video_adapted) { 268 fprintf(stderr, "Negative video probabilities ONLY allowed at the " 269 "beginning of the sequence, not in the middle.\n"); 270 exit(1); 271 } 272 continue; 273 } else { 274 video_adapted = true; 275 } 276 277 num_frames++; 278 uint8_t last_true_vad; 279 if (true_vad_index == 0) { 280 last_true_vad = previous_true_vad; 281 } else { 282 last_true_vad = true_vad[true_vad_index - 1]; 283 } 284 if (last_true_vad == 1 && true_vad[true_vad_index] == 0) { 285 agc_stat.Reset(); 286 } 287 true_vad_index++; 288 289 DitherSilence(&frame); 290 291 ret_val = agc_stat.AddAudio(frame, p_video, agc_vad); 292 ASSERT_GE(ret_val, 0); 293 294 if (ret_val > 0) { 295 ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val)); 296 for (int n = 0; n < ret_val; n++) { 297 if (true_vad[n] == 1) { 298 total_active++; 299 if (previous_true_vad == 0) { 300 num_onsets++; 301 onset = true; 302 } 303 if (agc_vad[n] == 0) { 304 total_missed_detection++; 305 if (onset) 306 onset_adaptation++; 307 } else { 308 in_false_positive_region = false; 309 onset = false; 310 } 311 } else if (true_vad[n] == 0) { 312 // Check if |on_set| flag is still up. If so it means that we totally 313 // missed an active region 314 if (onset) 315 num_not_adapted++; 316 onset = false; 317 318 total_passive++; 319 if (agc_vad[n] == 1) { 320 total_false_positive++; 321 in_false_positive_region = true; 322 } 323 if (in_false_positive_region) { 324 total_false_positive_duration++; 325 } 326 } else { 327 ASSERT_TRUE(false) << "Invalid value for true-VAD.\n"; 328 } 329 previous_true_vad = true_vad[n]; 330 } 331 true_vad_index = 0; 332 } 333 } 334 335 if (results_fid != NULL) { 336 fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", 337 total_active, 338 total_missed_detection, 339 total_passive, 340 total_false_positive, 341 num_onsets, 342 num_not_adapted, 343 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), 344 static_cast<float>(total_false_positive_duration) / 345 (total_passive + 1e-12)); 346 } 347 fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n", 348 total_active, 349 total_missed_detection, 350 total_passive, 351 total_false_positive, 352 num_onsets, 353 num_not_adapted, 354 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12), 355 static_cast<float>(total_false_positive_duration) / 356 (total_passive + 1e-12)); 357 358 fclose(true_vad_fid); 359 fclose(pcm_fid); 360 if (video_vad_fid != NULL) { 361 fclose(video_vad_fid); 362 } 363 if (results_fid != NULL) { 364 fclose(results_fid); 365 } 366} 367 368} // namespace webrtc 369 370int main(int argc, char* argv[]) { 371 char kUsage[] = 372 "\nCompute the number of misdetected and false-positive frames. Not\n" 373 " that for each frame of audio (10 ms) there should be one true\n" 374 " activity. If any video-based activity is given, there should also be\n" 375 " one probability per frame.\n" 376 "\nUsage:\n\n" 377 "activity_metric input_pcm [options]\n" 378 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits " 379 "format.\n\n"; 380 google::SetUsageMessage(kUsage); 381 google::ParseCommandLineFlags(&argc, &argv, true); 382 webrtc::void_main(argc, argv); 383 return 0; 384} 385