1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
// The EnergyEndpointer class finds likely speech onset and offset points.
//
// The implementation described here is about the simplest possible.
// It is based on timings of threshold crossings for overall signal
// RMS. It is suitable for lightweight applications.
//
// As written, the basic idea is that one specifies intervals that
// must be occupied by super- and sub-threshold energy levels, and
// defers decisions regarding onset and offset times until these
// specifications have been met.  Three basic intervals are tested: an
// onset window, a speech-on window, and an offset window.  We require
// super-threshold to exceed some minimum total durations in the onset
// and speech-on windows before declaring the speech onset time, and
// we specify a required sub-threshold residency in the offset window
// before declaring speech offset. As the various residency requirements are
// met, the EnergyEndpointer instance assumes various states, and can return the
// ID of these states to the client (see EpStatus below).
//
// The levels of the speech and background noise are continuously updated. It is
// important that the background noise level be estimated initially for
// robustness in noisy conditions. The first frames are assumed to be background
// noise and a fast update rate is used for the noise level. The duration for
// fast update is controlled by the fast_update_dur_ parameter.
//
// If used in noisy conditions, the endpointer should be started and run in the
// EnvironmentEstimation mode, for at least 200ms, before switching to
// UserInputMode.
// Audio feedback contamination can appear in the input audio if it is not cut
// out or handled by echo cancellation. Audio feedback can trigger a false
// accept. The false accepts can be ignored by setting
// ep_contamination_rejection_period.
36
37#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
38#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
39
40#include <vector>
41
42#include "base/basictypes.h"
43#include "base/memory/scoped_ptr.h"
44#include "content/browser/speech/endpointer/energy_endpointer_params.h"
45#include "content/common/content_export.h"
46
47namespace content {
48
// Status codes identifying the states an EnergyEndpointer instance can be in.
// EnergyEndpointer::Status() hands one of these back to the client. The
// numbering starts at 10 and increases by one per enumerator.
enum EpStatus {
  EP_PRE_SPEECH = 10,
  EP_POSSIBLE_ONSET = 11,
  EP_SPEECH_PRESENT = 12,
  EP_POSSIBLE_OFFSET = 13,
  EP_POST_SPEECH = 14,
};
57
58class CONTENT_EXPORT EnergyEndpointer {
59 public:
60  // The default construction MUST be followed by Init(), before any
61  // other use can be made of the instance.
62  EnergyEndpointer();
63  virtual ~EnergyEndpointer();
64
65  void Init(const EnergyEndpointerParams& params);
66
67  // Start the endpointer. This should be called at the beginning of a session.
68  void StartSession();
69
70  // Stop the endpointer.
71  void EndSession();
72
73  // Start environment estimation. Audio will be used for environment estimation
74  // i.e. noise level estimation.
75  void SetEnvironmentEstimationMode();
76
77  // Start user input. This should be called when the user indicates start of
78  // input, e.g. by pressing a button.
79  void SetUserInputMode();
80
81  // Computes the next input frame and modifies EnergyEndpointer status as
82  // appropriate based on the computation.
83  void ProcessAudioFrame(int64 time_us,
84                         const int16* samples, int num_samples,
85                         float* rms_out);
86
87  // Returns the current state of the EnergyEndpointer and the time
88  // corresponding to the most recently computed frame.
89  EpStatus Status(int64* status_time_us) const;
90
91  bool estimating_environment() const {
92    return estimating_environment_;
93  }
94
95  // Returns estimated noise level in dB.
96  float GetNoiseLevelDb() const;
97
98 private:
99  class HistoryRing;
100
101  // Resets the endpointer internal state.  If reset_threshold is true, the
102  // state will be reset completely, including adaptive thresholds and the
103  // removal of all history information.
104  void Restart(bool reset_threshold);
105
106  // Update internal speech and noise levels.
107  void UpdateLevels(float rms);
108
109  // Returns the number of frames (or frame number) corresponding to
110  // the 'time' (in seconds).
111  int TimeToFrame(float time) const;
112
113  EpStatus status_;  // The current state of this instance.
114  float offset_confirm_dur_sec_;  // max on time allowed to confirm POST_SPEECH
115  int64 endpointer_time_us_;  // Time of the most recently received audio frame.
116  int64 fast_update_frames_; // Number of frames for initial level adaptation.
117  int64 frame_counter_;  // Number of frames seen. Used for initial adaptation.
118  float max_window_dur_;  // Largest search window size (seconds)
119  float sample_rate_;  // Sampling rate.
120
121  // Ring buffers to hold the speech activity history.
122  scoped_ptr<HistoryRing> history_;
123
124  // Configuration parameters.
125  EnergyEndpointerParams params_;
126
127  // RMS which must be exceeded to conclude frame is speech.
128  float decision_threshold_;
129
130  // Flag to indicate that audio should be used to estimate environment, prior
131  // to receiving user input.
132  bool estimating_environment_;
133
134  // Estimate of the background noise level. Used externally for UI feedback.
135  float noise_level_;
136
137  // An adaptive threshold used to update decision_threshold_ when appropriate.
138  float rms_adapt_;
139
140  // Start lag corresponds to the highest fundamental frequency.
141  int start_lag_;
142
143  // End lag corresponds to the lowest fundamental frequency.
144  int end_lag_;
145
146  // Time when mode switched from environment estimation to user input. This
147  // is used to time forced rejection of audio feedback contamination.
148  int64 user_input_start_time_us_;
149
150  DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer);
151};
152
153}  // namespace content
154
155#endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
156