1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
// To learn more about the algorithm used and the original code on which this
// is based, see
7// https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
8
9#include "content/browser/speech/endpointer/energy_endpointer.h"
10
11#include <math.h>
12
13#include "base/logging.h"
14
15namespace {
16
// Returns the RMS of |samples| measured about their mean, i.e.
// sqrt(mean(x^2) - mean(x)^2), so a constant (DC) offset in the signal does
// not contribute to the result.
//
// Returns 0 when |num_samples| is not positive. Without that guard the
// 0 / 0 division produces NaN, and a NaN frame level would stick in the
// exponentially averaged noise and threshold estimates computed from it.
float RMS(const int16* samples, int num_samples) {
  if (num_samples <= 0)
    return 0.0f;

  int64 sum_int64 = 0;
  int64 ssq_int64 = 0;
  for (int i = 0; i < num_samples; ++i) {
    const int64 sample = samples[i];
    sum_int64 += sample;
    ssq_int64 += sample * sample;
  }

  // Switch to floating point only after the exact integer accumulation.
  const double mean = static_cast<double>(sum_int64) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64) / num_samples;
  const double variance = mean_square - (mean * mean);

  // Rounding may leave the variance a hair below zero; sqrt() of that would
  // be NaN, so treat it as silence instead.
  if (variance <= 0.0)
    return 0.0f;
  return static_cast<float>(sqrt(variance));
}
31
// Converts |seconds| to microseconds. The product is computed in double
// precision, 0.5 is added and the result is truncated, which rounds
// non-negative inputs to the nearest microsecond.
int64 Secs2Usecs(float seconds) {
  const double usecs = 1.0e6 * seconds;
  return static_cast<int64>(usecs + 0.5);
}
35
// Returns 20 * log10(|value|). Any |value| that is not greater than 1.0e-100
// (zero, negative numbers and NaN) is reported as a floor of -2000 instead of
// being passed to log10().
float GetDecibel(float value) {
  // Phrased as a negated ">" so that NaN also lands on the floor.
  if (!(value > 1.0e-100))
    return -2000.0;
  return 20 * log10(value);
}
41
42}  // namespace
43
44namespace content {
45
// Stores threshold-crossing histories for making decisions about the speech
// state. The history is a fixed-size circular buffer of (timestamp, decision)
// pairs; once it has been sized by SetRing(), each Insert() overwrites the
// oldest pair.
class EnergyEndpointer::HistoryRing {
 public:
  // Creates an empty ring. SetRing() must size it before entries can be
  // stored.
  HistoryRing() : insertion_index_(0) {}

  // Resets the ring to |size| elements, each with state |initial_state| and a
  // timestamp of -1, and rewinds the insertion point to the first element.
  void SetRing(int size, bool initial_state);

  // Inserts a new entry into the ring and drops the oldest entry.
  void Insert(int64 time_us, bool decision);

  // Returns the time in microseconds of the most recently added entry.
  int64 EndTime() const;

  // Walks backwards from the most recent entry and returns, in seconds, the
  // summed length of the intervals that end at an entry whose decision is
  // true. The walk stops once it reaches an entry that is not newer than
  // |duration_sec| seconds before the most recent entry, or once every entry
  // in the ring has been visited. Returns 0 for an empty ring.
  float RingSum(float duration_sec);

 private:
  // One history sample: the decision recorded at |time_us| (microseconds).
  struct DecisionPoint {
    int64 time_us;
    bool decision;
  };

  std::vector<DecisionPoint> decision_points_;
  int insertion_index_;  // Index at which the next item gets added/inserted.

  DISALLOW_COPY_AND_ASSIGN(HistoryRing);
};
77
78void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
79  insertion_index_ = 0;
80  decision_points_.clear();
81  DecisionPoint init = { -1, initial_state };
82  decision_points_.resize(size, init);
83}
84
85void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) {
86  decision_points_[insertion_index_].time_us = time_us;
87  decision_points_[insertion_index_].decision = decision;
88  insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
89}
90
91int64 EnergyEndpointer::HistoryRing::EndTime() const {
92  int ind = insertion_index_ - 1;
93  if (ind < 0)
94    ind = decision_points_.size() - 1;
95  return decision_points_[ind].time_us;
96}
97
98float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
99  if (!decision_points_.size())
100    return 0.0;
101
102  int64 sum_us = 0;
103  int ind = insertion_index_ - 1;
104  if (ind < 0)
105    ind = decision_points_.size() - 1;
106  int64 end_us = decision_points_[ind].time_us;
107  bool is_on = decision_points_[ind].decision;
108  int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec));
109  if (start_us < 0)
110    start_us = 0;
111  size_t n_summed = 1;  // n points ==> (n-1) intervals
112  while ((decision_points_[ind].time_us > start_us) &&
113         (n_summed < decision_points_.size())) {
114    --ind;
115    if (ind < 0)
116      ind = decision_points_.size() - 1;
117    if (is_on)
118      sum_us += end_us - decision_points_[ind].time_us;
119    is_on = decision_points_[ind].decision;
120    end_us = decision_points_[ind].time_us;
121    n_summed++;
122  }
123
124  return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.
125}
126
// Builds an endpointer in the EP_PRE_SPEECH state with its levels, counters
// and timestamps zeroed and a freshly constructed, still unsized HistoryRing.
// The ring only receives its slots when Restart() runs; Init(),
// StartSession() and SetEnvironmentEstimationMode() all call it.
EnergyEndpointer::EnergyEndpointer()
    : status_(EP_PRE_SPEECH),
      offset_confirm_dur_sec_(0),
      endpointer_time_us_(0),
      fast_update_frames_(0),
      frame_counter_(0),
      max_window_dur_(4.0),  // Placeholder; Init() recomputes it from params.
      sample_rate_(0),
      history_(new HistoryRing()),
      decision_threshold_(0),
      estimating_environment_(false),
      noise_level_(0),
      rms_adapt_(0),
      start_lag_(0),
      end_lag_(0),
      user_input_start_time_us_(0) {
}
144
// Nothing to release by hand: the body is intentionally empty.
EnergyEndpointer::~EnergyEndpointer() {
}
147
148int EnergyEndpointer::TimeToFrame(float time) const {
149  return static_cast<int32>(0.5 + (time / params_.frame_period()));
150}
151
152void EnergyEndpointer::Restart(bool reset_threshold) {
153  status_ = EP_PRE_SPEECH;
154  user_input_start_time_us_ = 0;
155
156  if (reset_threshold) {
157    decision_threshold_ = params_.decision_threshold();
158    rms_adapt_ = decision_threshold_;
159    noise_level_ = params_.decision_threshold() / 2.0f;
160    frame_counter_ = 0;  // Used for rapid initial update of levels.
161  }
162
163  // Set up the memories to hold the history windows.
164  history_->SetRing(TimeToFrame(max_window_dur_), false);
165
166  // Flag that indicates that current input should be used for
167  // estimating the environment. The user has not yet started input
168  // by e.g. pressed the push-to-talk button. By default, this is
169  // false for backward compatibility.
170  estimating_environment_ = false;
171}
172
173void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
174  params_ = params;
175
176  // Find the longest history interval to be used, and make the ring
177  // large enough to accommodate that number of frames.  NOTE: This
178  // depends upon ep_frame_period being set correctly in the factory
179  // that did this instantiation.
180  max_window_dur_ = params_.onset_window();
181  if (params_.speech_on_window() > max_window_dur_)
182    max_window_dur_ = params_.speech_on_window();
183  if (params_.offset_window() > max_window_dur_)
184    max_window_dur_ = params_.offset_window();
185  Restart(true);
186
187  offset_confirm_dur_sec_ = params_.offset_window() -
188                            params_.offset_confirm_dur();
189  if (offset_confirm_dur_sec_ < 0.0)
190    offset_confirm_dur_sec_ = 0.0;
191
192  user_input_start_time_us_ = 0;
193
194  // Flag that indicates that  current input should be used for
195  // estimating the environment. The user has not yet started input
196  // by e.g. pressed the push-to-talk button. By default, this is
197  // false for backward compatibility.
198  estimating_environment_ = false;
199  // The initial value of the noise and speech levels is inconsequential.
200  // The level of the first frame will overwrite these values.
201  noise_level_ = params_.decision_threshold() / 2.0f;
202  fast_update_frames_ =
203      static_cast<int64>(params_.fast_update_dur() / params_.frame_period());
204
205  frame_counter_ = 0;  // Used for rapid initial update of levels.
206
207  sample_rate_ = params_.sample_rate();
208  start_lag_ = static_cast<int>(sample_rate_ /
209                                params_.max_fundamental_frequency());
210  end_lag_ = static_cast<int>(sample_rate_ /
211                              params_.min_fundamental_frequency());
212}
213
// Starts a new session with a full restart: the state, decision history and
// adaptive thresholds are all reset (see Restart(true)).
void EnergyEndpointer::StartSession() {
  Restart(true);
}
217
// Marks the session as finished by moving to the EP_POST_SPEECH state.
// NOTE(review): ProcessAudioFrame() has no case for EP_POST_SPEECH, so frames
// processed in user-input mode after this call reach its default branch and
// log a warning.
void EnergyEndpointer::EndSession() {
  status_ = EP_POST_SPEECH;
}
221
// Switches to environment-estimation mode: incoming frames then only update
// the noise level and decision threshold, and no speech decisions are made.
void EnergyEndpointer::SetEnvironmentEstimationMode() {
  // Order matters: Restart() clears the flag, so it must be set afterwards.
  Restart(true);
  estimating_environment_ = true;
}
226
227void EnergyEndpointer::SetUserInputMode() {
228  estimating_environment_ = false;
229  user_input_start_time_us_ = endpointer_time_us_;
230}
231
232void EnergyEndpointer::ProcessAudioFrame(int64 time_us,
233                                         const int16* samples,
234                                         int num_samples,
235                                         float* rms_out) {
236  endpointer_time_us_ = time_us;
237  float rms = RMS(samples, num_samples);
238
239  // Check that this is user input audio vs. pre-input adaptation audio.
240  // Input audio starts when the user indicates start of input, by e.g.
241  // pressing push-to-talk. Audio received prior to that is used to update
242  // noise and speech level estimates.
243  if (!estimating_environment_) {
244    bool decision = false;
245    if ((endpointer_time_us_ - user_input_start_time_us_) <
246        Secs2Usecs(params_.contamination_rejection_period())) {
247      decision = false;
248      DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_;
249    } else {
250      decision = (rms > decision_threshold_);
251    }
252
253    history_->Insert(endpointer_time_us_, decision);
254
255    switch (status_) {
256      case EP_PRE_SPEECH:
257        if (history_->RingSum(params_.onset_window()) >
258            params_.onset_detect_dur()) {
259          status_ = EP_POSSIBLE_ONSET;
260        }
261        break;
262
263      case EP_POSSIBLE_ONSET: {
264        float tsum = history_->RingSum(params_.onset_window());
265        if (tsum > params_.onset_confirm_dur()) {
266          status_ = EP_SPEECH_PRESENT;
267        } else {  // If signal is not maintained, drop back to pre-speech.
268          if (tsum <= params_.onset_detect_dur())
269            status_ = EP_PRE_SPEECH;
270        }
271        break;
272      }
273
274      case EP_SPEECH_PRESENT: {
275        // To induce hysteresis in the state residency, we allow a
276        // smaller residency time in the on_ring, than was required to
277        // enter the SPEECH_PERSENT state.
278        float on_time = history_->RingSum(params_.speech_on_window());
279        if (on_time < params_.on_maintain_dur())
280          status_ = EP_POSSIBLE_OFFSET;
281        break;
282      }
283
284      case EP_POSSIBLE_OFFSET:
285        if (history_->RingSum(params_.offset_window()) <=
286            offset_confirm_dur_sec_) {
287          // Note that this offset time may be beyond the end
288          // of the input buffer in a real-time system.  It will be up
289          // to the RecognizerSession to decide what to do.
290          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
291        } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
292          if (history_->RingSum(params_.speech_on_window()) >=
293              params_.on_maintain_dur())
294            status_ = EP_SPEECH_PRESENT;
295        }
296        break;
297
298      default:
299        LOG(WARNING) << "Invalid case in switch: " << status_;
300        break;
301    }
302
303    // If this is a quiet, non-speech region, slowly adapt the detection
304    // threshold to be about 6dB above the average RMS.
305    if ((!decision) && (status_ == EP_PRE_SPEECH)) {
306      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
307      rms_adapt_ = decision_threshold_;
308    } else {
309      // If this is in a speech region, adapt the decision threshold to
310      // be about 10dB below the average RMS. If the noise level is high,
311      // the threshold is pushed up.
312      // Adaptation up to a higher level is 5 times faster than decay to
313      // a lower level.
314      if ((status_ == EP_SPEECH_PRESENT) && decision) {
315        if (rms_adapt_ > rms) {
316          rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
317        } else {
318          rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
319        }
320        float target_threshold = 0.3f * rms_adapt_ +  noise_level_;
321        decision_threshold_ = (.90f * decision_threshold_) +
322                              (0.10f * target_threshold);
323      }
324    }
325
326    // Set a floor
327    if (decision_threshold_ < params_.min_decision_threshold())
328      decision_threshold_ = params_.min_decision_threshold();
329  }
330
331  // Update speech and noise levels.
332  UpdateLevels(rms);
333  ++frame_counter_;
334
335  if (rms_out)
336    *rms_out = GetDecibel(rms);
337}
338
// Returns the current noise level estimate converted to decibels with
// GetDecibel().
float EnergyEndpointer::GetNoiseLevelDb() const {
  return GetDecibel(noise_level_);
}
342
343void EnergyEndpointer::UpdateLevels(float rms) {
344  // Update quickly initially. We assume this is noise and that
345  // speech is 6dB above the noise.
346  if (frame_counter_ < fast_update_frames_) {
347    // Alpha increases from 0 to (k-1)/k where k is the number of time
348    // steps in the initial adaptation period.
349    float alpha = static_cast<float>(frame_counter_) /
350        static_cast<float>(fast_update_frames_);
351    noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
352    DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_
353             << ", fast_update_frames_ " << fast_update_frames_;
354  } else {
355    // Update Noise level. The noise level adapts quickly downward, but
356    // slowly upward. The noise_level_ parameter is not currently used
357    // for threshold adaptation. It is used for UI feedback.
358    if (noise_level_ < rms)
359      noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
360    else
361      noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
362  }
363  if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
364    decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
365    // Set a floor
366    if (decision_threshold_ < params_.min_decision_threshold())
367      decision_threshold_ = params_.min_decision_threshold();
368  }
369}
370
// Returns the current endpointer state and stores the timestamp (in
// microseconds) of the most recent history entry in |status_time|.
// |status_time| must not be null: it is dereferenced unconditionally.
EpStatus EnergyEndpointer::Status(int64* status_time)  const {
  *status_time = history_->EndTime();
  return status_;
}
375
376}  // namespace content
377