1/*
2 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include <math.h>
13#include <stdio.h>
14#include <stdlib.h>
15
16#include <algorithm>
17
18#include "gflags/gflags.h"
19#include "testing/gtest/include/gtest/gtest.h"
20#include "webrtc/modules/audio_processing/agc/agc.h"
21#include "webrtc/modules/audio_processing/agc/histogram.h"
22#include "webrtc/modules/audio_processing/agc/utility.h"
23#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
24#include "webrtc/modules/audio_processing/vad/common.h"
25#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
26#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
27#include "webrtc/modules/include/module_common_types.h"
28
// Window size handed to Histogram::Create(). It also scales the activity
// threshold when AgcStat decides whether a frame is active.
static const int kAgcAnalWindowSamples = 100;
// Activity threshold used when --activity_threshold is not given.
static const double kDefaultActivityThreshold = 0.3;

// Command-line flags.

// When set, audio is also fed to the stand-alone VAD and its activity is
// combined with the video-based probability.
DEFINE_bool(standalone_vad, true, "enable stand-alone VAD");
// Mandatory: void_main() refuses to run without it.
// NOTE(review): the help text says 'int' format, but void_main() reads one
// uint8_t per audio frame from this file -- confirm which one is intended.
DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'"
              " format");
// Optional; one double per audio frame is read from this file.
DEFINE_string(video_vad, "", "name of a file containing video VAD (activity"
              " probabilities) in double format. One activity per 10ms is"
              " required. If no file is given the video information is not"
              " incorporated. Negative activity is interpreted as video is"
              " not adapted and the statistics are not computed during"
              " the learning phase. Note that the negative video activities"
              " are ONLY allowed at the beginning.");
// Optional; opened in append mode, a header line is written only if the file
// did not exist before.
DEFINE_string(result, "", "name of a file to write the results. The results"
              " will be appended to the end of the file. This is optional.");
// Optional; receives one raw double (the audio content) per processed frame.
DEFINE_string(audio_content, "", "name of a file where audio content is written"
              " to, in double format.");
// Forwarded to AgcStat::SetActivityThreshold().
DEFINE_double(activity_threshold, kDefaultActivityThreshold,
              "Activity threshold");
48
49namespace webrtc {
50
51// TODO(turajs) A new CL will be committed soon where ExtractFeatures will
52// notify the caller of "silence" input, instead of bailing out. We would not
53// need the following function when such a change is made.
54
55// Add some dither to quiet frames. This avoids the ExtractFeatures skip a
56// silence frame. Otherwise true VAD would drift with respect to the audio.
57// We only consider mono inputs.
58static void DitherSilence(AudioFrame* frame) {
59  ASSERT_EQ(1u, frame->num_channels_);
60  const double kRmsSilence = 5;
61  const double sum_squared_silence = kRmsSilence * kRmsSilence *
62      frame->samples_per_channel_;
63  double sum_squared = 0;
64  for (size_t n = 0; n < frame->samples_per_channel_; n++)
65    sum_squared += frame->data_[n] * frame->data_[n];
66  if (sum_squared <= sum_squared_silence) {
67    for (size_t n = 0; n < frame->samples_per_channel_; n++)
68      frame->data_[n] = (rand() & 0xF) - 8;  // NOLINT: ignore non-threadsafe.
69  }
70}
71
72class AgcStat {
73 public:
74  AgcStat()
75      : video_index_(0),
76        activity_threshold_(kDefaultActivityThreshold),
77        audio_content_(Histogram::Create(kAgcAnalWindowSamples)),
78        audio_processing_(new VadAudioProc()),
79        vad_(new PitchBasedVad()),
80        standalone_vad_(StandaloneVad::Create()),
81        audio_content_fid_(NULL) {
82    for (size_t n = 0; n < kMaxNumFrames; n++)
83      video_vad_[n] = 0.5;
84  }
85
86  ~AgcStat() {
87    if (audio_content_fid_ != NULL) {
88      fclose(audio_content_fid_);
89    }
90  }
91
92  void set_audio_content_file(FILE* audio_content_fid) {
93    audio_content_fid_ = audio_content_fid;
94  }
95
96  int AddAudio(const AudioFrame& frame, double p_video,
97               int* combined_vad) {
98    if (frame.num_channels_ != 1 ||
99        frame.samples_per_channel_ !=
100            kSampleRateHz / 100 ||
101            frame.sample_rate_hz_ != kSampleRateHz)
102      return -1;
103    video_vad_[video_index_++] = p_video;
104    AudioFeatures features;
105    audio_processing_->ExtractFeatures(
106        frame.data_, frame.samples_per_channel_, &features);
107    if (FLAGS_standalone_vad) {
108      standalone_vad_->AddAudio(frame.data_,
109                                frame.samples_per_channel_);
110    }
111    if (features.num_frames > 0) {
112      double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5};
113      if (FLAGS_standalone_vad) {
114        standalone_vad_->GetActivity(p, kMaxNumFrames);
115      }
116      // TODO(turajs) combining and limiting are used in the source files as
117      // well they can be moved to utility.
118      // Combine Video and stand-alone VAD.
119      for (size_t n = 0; n < features.num_frames; n++) {
120        double p_active = p[n] * video_vad_[n];
121        double p_passive = (1 - p[n]) * (1 - video_vad_[n]);
122        p[n]  = p_active / (p_active + p_passive);
123        // Limit probabilities.
124        p[n] = std::min(std::max(p[n], 0.01), 0.99);
125      }
126      if (vad_->VoicingProbability(features, p) < 0)
127        return -1;
128      for (size_t n = 0; n < features.num_frames; n++) {
129        audio_content_->Update(features.rms[n], p[n]);
130        double ac = audio_content_->AudioContent();
131        if (audio_content_fid_ != NULL) {
132          fwrite(&ac, sizeof(ac), 1, audio_content_fid_);
133        }
134        if (ac > kAgcAnalWindowSamples * activity_threshold_) {
135          combined_vad[n] = 1;
136        } else {
137          combined_vad[n] = 0;
138        }
139      }
140      video_index_ = 0;
141    }
142    return static_cast<int>(features.num_frames);
143  }
144
145  void Reset() {
146    audio_content_->Reset();
147  }
148
149  void SetActivityThreshold(double activity_threshold) {
150    activity_threshold_ = activity_threshold;
151  }
152
153 private:
154  int video_index_;
155  double activity_threshold_;
156  double video_vad_[kMaxNumFrames];
157  rtc::scoped_ptr<Histogram> audio_content_;
158  rtc::scoped_ptr<VadAudioProc> audio_processing_;
159  rtc::scoped_ptr<PitchBasedVad> vad_;
160  rtc::scoped_ptr<StandaloneVad> standalone_vad_;
161
162  FILE* audio_content_fid_;
163};
164
165
166void void_main(int argc, char* argv[]) {
167  webrtc::AgcStat agc_stat;
168
169  FILE* pcm_fid = fopen(argv[1], "rb");
170  ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1];
171
172  if (argc < 2) {
173    fprintf(stderr, "\nNot Enough arguments\n");
174  }
175
176  FILE* true_vad_fid = NULL;
177  ASSERT_GT(FLAGS_true_vad.size(), 0u) << "Specify the file containing true "
178      "VADs using --true_vad flag.";
179  true_vad_fid = fopen(FLAGS_true_vad.c_str(), "rb");
180  ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " <<
181      FLAGS_true_vad;
182
183  FILE* results_fid = NULL;
184  if (FLAGS_result.size() > 0) {
185    // True if this is the first time writing to this function and we add a
186    // header to the beginning of the file.
187    bool write_header;
188    // Open in the read mode. If it fails, the file doesn't exist and has to
189    // write a header for it. Otherwise no need to write a header.
190    results_fid = fopen(FLAGS_result.c_str(), "r");
191    if (results_fid == NULL) {
192      write_header = true;
193    } else {
194      fclose(results_fid);
195      write_header = false;
196    }
197    // Open in append mode.
198    results_fid = fopen(FLAGS_result.c_str(), "a");
199    ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " <<
200              FLAGS_result << ", to write the results.";
201    // Write the header if required.
202    if (write_header) {
203      fprintf(results_fid, "%% Total Active,  Misdetection,  "
204              "Total inactive,  False Positive,  On-sets,  Missed segments,  "
205              "Average response\n");
206    }
207  }
208
209  FILE* video_vad_fid = NULL;
210  if (FLAGS_video_vad.size() > 0) {
211    video_vad_fid = fopen(FLAGS_video_vad.c_str(), "rb");
212    ASSERT_TRUE(video_vad_fid != NULL) <<  "Cannot open the file, " <<
213              FLAGS_video_vad << " to read video-based VAD decisions.\n";
214  }
215
216  // AgsStat will be the owner of this file and will close it at its
217  // destructor.
218  FILE* audio_content_fid = NULL;
219  if (FLAGS_audio_content.size() > 0) {
220    audio_content_fid = fopen(FLAGS_audio_content.c_str(), "wb");
221    ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " <<
222              FLAGS_audio_content << " to write audio-content.\n";
223    agc_stat.set_audio_content_file(audio_content_fid);
224  }
225
226  webrtc::AudioFrame frame;
227  frame.num_channels_ = 1;
228  frame.sample_rate_hz_ = 16000;
229  frame.samples_per_channel_ = frame.sample_rate_hz_ / 100;
230  const size_t kSamplesToRead = frame.num_channels_ *
231      frame.samples_per_channel_;
232
233  agc_stat.SetActivityThreshold(FLAGS_activity_threshold);
234
235  int ret_val = 0;
236  int num_frames = 0;
237  int agc_vad[kMaxNumFrames];
238  uint8_t true_vad[kMaxNumFrames];
239  double p_video = 0.5;
240  int total_active = 0;
241  int total_passive = 0;
242  int total_false_positive = 0;
243  int total_missed_detection = 0;
244  int onset_adaptation = 0;
245  int num_onsets = 0;
246  bool onset = false;
247  uint8_t previous_true_vad = 0;
248  int num_not_adapted = 0;
249  size_t true_vad_index = 0;
250  bool in_false_positive_region = false;
251  int total_false_positive_duration = 0;
252  bool video_adapted = false;
253  while (kSamplesToRead == fread(frame.data_, sizeof(int16_t),
254                                 kSamplesToRead, pcm_fid)) {
255    assert(true_vad_index < kMaxNumFrames);
256    ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1,
257                        true_vad_fid))
258        << "Size mismatch between True-VAD and the PCM file.\n";
259    if (video_vad_fid != NULL) {
260      ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) <<
261          "Not enough video-based VAD probabilities.";
262    }
263
264    // Negative video activity indicates that the video-based VAD is not yet
265    // adapted. Disregards the learning phase in statistics.
266    if (p_video < 0) {
267      if (video_adapted) {
268        fprintf(stderr, "Negative video probabilities ONLY allowed at the "
269            "beginning of the sequence, not in the middle.\n");
270        exit(1);
271      }
272      continue;
273    } else {
274      video_adapted = true;
275    }
276
277    num_frames++;
278    uint8_t last_true_vad;
279    if (true_vad_index == 0) {
280      last_true_vad = previous_true_vad;
281    } else {
282      last_true_vad = true_vad[true_vad_index - 1];
283    }
284    if (last_true_vad == 1 && true_vad[true_vad_index] == 0) {
285      agc_stat.Reset();
286    }
287    true_vad_index++;
288
289    DitherSilence(&frame);
290
291    ret_val = agc_stat.AddAudio(frame, p_video, agc_vad);
292    ASSERT_GE(ret_val, 0);
293
294    if (ret_val > 0) {
295      ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val));
296      for (int n = 0; n < ret_val; n++) {
297        if (true_vad[n] == 1) {
298          total_active++;
299          if (previous_true_vad == 0) {
300            num_onsets++;
301            onset = true;
302          }
303          if (agc_vad[n] == 0) {
304            total_missed_detection++;
305            if (onset)
306              onset_adaptation++;
307          } else {
308            in_false_positive_region = false;
309            onset = false;
310          }
311        } else if (true_vad[n] == 0) {
312          // Check if |on_set| flag is still up. If so it means that we totally
313          // missed an active region
314          if (onset)
315            num_not_adapted++;
316          onset = false;
317
318          total_passive++;
319          if (agc_vad[n] == 1) {
320            total_false_positive++;
321            in_false_positive_region = true;
322          }
323          if (in_false_positive_region) {
324            total_false_positive_duration++;
325          }
326        } else {
327          ASSERT_TRUE(false) << "Invalid value for true-VAD.\n";
328        }
329        previous_true_vad = true_vad[n];
330      }
331      true_vad_index = 0;
332    }
333  }
334
335  if (results_fid != NULL) {
336    fprintf(results_fid, "%4d  %4d  %4d  %4d  %4d  %4d  %4.0f %4.0f\n",
337            total_active,
338            total_missed_detection,
339            total_passive,
340            total_false_positive,
341            num_onsets,
342            num_not_adapted,
343            static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
344            static_cast<float>(total_false_positive_duration) /
345            (total_passive + 1e-12));
346  }
347  fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
348          total_active,
349          total_missed_detection,
350          total_passive,
351          total_false_positive,
352          num_onsets,
353          num_not_adapted,
354          static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
355          static_cast<float>(total_false_positive_duration) /
356              (total_passive + 1e-12));
357
358  fclose(true_vad_fid);
359  fclose(pcm_fid);
360  if (video_vad_fid != NULL) {
361    fclose(video_vad_fid);
362  }
363  if (results_fid != NULL) {
364    fclose(results_fid);
365  }
366}
367
368}  // namespace webrtc
369
370int main(int argc, char* argv[]) {
371  char kUsage[] =
372      "\nCompute the number of misdetected and false-positive frames. Not\n"
373      " that for each frame of audio (10 ms) there should be one true\n"
374      " activity. If any video-based activity is given, there should also be\n"
375      " one probability per frame.\n"
376      "\nUsage:\n\n"
377      "activity_metric input_pcm [options]\n"
378      "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits "
379      "format.\n\n";
380  google::SetUsageMessage(kUsage);
381  google::ParseCommandLineFlags(&argc, &argv, true);
382  webrtc::void_main(argc, argv);
383  return 0;
384}
385