1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
6#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
7
8#include "base/basictypes.h"
9#include "base/memory/scoped_ptr.h"
10#include "content/browser/speech/endpointer/endpointer.h"
11#include "content/browser/speech/speech_recognition_engine.h"
12#include "content/browser/speech/speech_recognizer.h"
13#include "content/public/common/speech_recognition_error.h"
14#include "content/public/common/speech_recognition_result.h"
15#include "media/audio/audio_input_controller.h"
16#include "media/audio/audio_logging.h"
17#include "net/url_request/url_request_context_getter.h"
18
// Forward declarations of the media types that this header only refers to
// through pointers (see OnData() and SetAudioManagerForTesting() below), so
// their full definitions are not needed here.
19namespace media {
20class AudioBus;
21class AudioManager;
22}  // namespace media
23
24namespace content {
25
// Forward declaration: in this header the listener is only referenced by
// pointer, as the first argument of the SpeechRecognizerImpl constructor.
26class SpeechRecognitionEventListener;
27
28// Handles speech recognition for a session (identified by |session_id|), taking
29// care of audio capture, silence detection/endpointer and interaction with the
30// SpeechRecognitionEngine.
//
// The recognizer is organized as a finite state machine (see FSMState and
// FSMEvent below): external inputs are wrapped in an FSMEventArgs and pushed
// through DispatchEvent(), and ExecuteTransitionAndGetNextState() selects the
// transition handler for the current (state, event) pair.
//
// Audio notifications reach this class through the
// media::AudioInputController::EventHandler interface, and recognition
// results/errors through SpeechRecognitionEngineDelegate.
//
// NOTE(review): the destructor is private, so instances cannot be deleted
// directly by clients; presumably lifetime is managed through ref-counting
// inherited from SpeechRecognizer -- confirm against that base class.
31class CONTENT_EXPORT SpeechRecognizerImpl
32    : public SpeechRecognizer,
33      public media::AudioInputController::EventHandler,
34      public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) {
35 public:
  // Class-wide constants. They are only declared here; their values are
  // defined out of line and are not visible in this header.
  // NOTE(review): the names suggest the audio capture format and the
  // endpointer/no-speech timing parameters -- confirm in the .cc.
36  static const int kAudioSampleRate;
37  static const media::ChannelLayout kChannelLayout;
38  static const int kNumBitsPerAudioSample;
39  static const int kNoSpeechTimeoutMs;
40  static const int kEndpointerEstimationTimeMs;
41
  // Test-only hook that takes a media::AudioManager.
  // NOTE(review): presumably stores it in |audio_manager_for_tests_| so that
  // tests can substitute the audio backend -- confirm in the .cc.
42  static void SetAudioManagerForTesting(media::AudioManager* audio_manager);
43
  // |listener|            receiver of recognition events.
  // |session_id|          identifier of the session handled by this object.
  // |continuous|          NOTE(review): presumably selects continuous vs.
  //                       one-shot recognition -- confirm.
  // |provisional_results| NOTE(review): presumably initializes
  //                       |provisional_results_| -- confirm.
  // |engine|              recognition engine to use. NOTE(review): presumably
  //                       ownership passes to |recognition_engine_| (a
  //                       scoped_ptr) -- confirm in the .cc.
44  SpeechRecognizerImpl(SpeechRecognitionEventListener* listener,
45                       int session_id,
46                       bool continuous,
47                       bool provisional_results,
48                       SpeechRecognitionEngine* engine);
49
  // Overrides of base-class virtuals. NOTE(review): presumably these come from
  // SpeechRecognizer, since the overrides of the other two bases are grouped
  // under their own headings further below.
50  virtual void StartRecognition(const std::string& device_id) OVERRIDE;
51  virtual void AbortRecognition() OVERRIDE;
52  virtual void StopAudioCapture() OVERRIDE;
53  virtual bool IsActive() const OVERRIDE;
54  virtual bool IsCapturingAudio() const OVERRIDE;
  // Read-only access to a recognition engine.
  // NOTE(review): presumably returns the one held in |recognition_engine_|.
55  const SpeechRecognitionEngine& recognition_engine() const;
56
57 private:
  // Lets the test fixture reach the private members of this class.
58  friend class SpeechRecognizerTest;
59
  // States of the recognizer FSM. STATE_MAX_VALUE aliases the last
  // enumerator (STATE_ENDED), so it must be updated if a state is appended.
60  enum FSMState {
61    STATE_IDLE = 0,
62    STATE_STARTING,
63    STATE_ESTIMATING_ENVIRONMENT,
64    STATE_WAITING_FOR_SPEECH,
65    STATE_RECOGNIZING,
66    STATE_WAITING_FINAL_RESULT,
67    STATE_ENDED,
68    STATE_MAX_VALUE = STATE_ENDED
69  };
70
  // Events that can be fed into the recognizer FSM. EVENT_MAX_VALUE aliases
  // the last enumerator (EVENT_AUDIO_ERROR), so it must be updated if an
  // event is appended.
71  enum FSMEvent {
72    EVENT_ABORT = 0,
73    EVENT_START,
74    EVENT_STOP_CAPTURE,
75    EVENT_AUDIO_DATA,
76    EVENT_ENGINE_RESULT,
77    EVENT_ENGINE_ERROR,
78    EVENT_AUDIO_ERROR,
79    EVENT_MAX_VALUE = EVENT_AUDIO_ERROR
80  };
81
  // Argument bundle passed to DispatchEvent() and to the transition handlers:
  // the event itself plus optional payload fields.
  // NOTE(review): presumably each payload is only meaningful for its matching
  // event (|audio_data| for EVENT_AUDIO_DATA, |engine_results| for
  // EVENT_ENGINE_RESULT, |engine_error| for the error events) -- confirm at
  // the call sites in the .cc.
82  struct FSMEventArgs {
    // Explicit, so a bare FSMEvent is never implicitly converted.
83    explicit FSMEventArgs(FSMEvent event_value);
84    ~FSMEventArgs();
85
86    FSMEvent event;
87    scoped_refptr<AudioChunk> audio_data;
88    SpeechRecognitionResults engine_results;
89    SpeechRecognitionError engine_error;
90  };
91
  // Private: clients cannot destroy the recognizer directly.
92  virtual ~SpeechRecognizerImpl();
93
94  // Entry point for pushing any new external event into the recognizer FSM.
95  void DispatchEvent(const FSMEventArgs& event_args);
96
97  // Defines the behavior of the recognizer FSM, selecting the appropriate
98  // transition according to the current state and event.
  // Returns the state the FSM should move to.
99  FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args);
100
101  // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc).
102  void ProcessAudioPipeline(const AudioChunk& raw_audio);
103
104  // The methods below handle transitions of the recognizer FSM.
  // Each one returns the FSMState resulting from the transition. All of them
  // take the dispatched FSMEventArgs except Abort(), which takes the error to
  // act on directly. DoNothing() is the only handler declared const.
105  FSMState StartRecording(const FSMEventArgs& event_args);
106  FSMState StartRecognitionEngine(const FSMEventArgs& event_args);
107  FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args);
108  FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args);
109  FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args);
110  FSMState ProcessIntermediateResult(const FSMEventArgs& event_args);
111  FSMState ProcessFinalResult(const FSMEventArgs& event_args);
112  FSMState AbortSilently(const FSMEventArgs& event_args);
113  FSMState AbortWithError(const FSMEventArgs& event_args);
114  FSMState Abort(const SpeechRecognitionError& error);
115  FSMState DetectEndOfSpeech(const FSMEventArgs& event_args);
116  FSMState DoNothing(const FSMEventArgs& event_args) const;
117  FSMState NotFeasible(const FSMEventArgs& event_args);
118
119  // Returns the time span of captured audio samples since the start of capture.
  // The result is expressed in milliseconds, as the name indicates.
120  int GetElapsedTimeMs() const;
121
122  // Calculates the input volume to be displayed in the UI, triggering the
123  // OnAudioLevelsChange event accordingly.
124  void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected);
125
  // NOTE(review): presumably starts closing |audio_controller_| and has
  // OnAudioClosed() invoked on completion -- confirm in the .cc.
126  void CloseAudioControllerAsynchronously();
127
128  // Callback called on IO thread by audio_controller->Close().
  // The controller argument is intentionally left unnamed in the declaration.
129  void OnAudioClosed(media::AudioInputController*);
130
131  // AudioInputController::EventHandler methods.
  // OnCreated(), OnRecording() and OnLog() are deliberately empty here; only
  // OnError() and OnData() have out-of-line implementations.
132  virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {}
133  virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {}
134  virtual void OnError(media::AudioInputController* controller,
135      media::AudioInputController::ErrorCode error_code) OVERRIDE;
136  virtual void OnData(media::AudioInputController* controller,
137                      const media::AudioBus* data) OVERRIDE;
138  virtual void OnLog(media::AudioInputController* controller,
139                     const std::string& message) OVERRIDE {}
140
141  // SpeechRecognitionEngineDelegate methods.
142  virtual void OnSpeechRecognitionEngineResults(
143      const SpeechRecognitionResults& results) OVERRIDE;
144  virtual void OnSpeechRecognitionEngineError(
145      const SpeechRecognitionError& error) OVERRIDE;
146
  // Raw, non-owning static pointer paired with SetAudioManagerForTesting().
147  static media::AudioManager* audio_manager_for_tests_;
148
  // Owned recognition engine (scoped_ptr).
149  scoped_ptr<SpeechRecognitionEngine> recognition_engine_;
  // Endpointer instance held by value.
150  Endpointer endpointer_;
  // Ref-counted handle to the audio input controller.
151  scoped_refptr<media::AudioInputController> audio_controller_;
  // Owned audio log object (scoped_ptr).
152  scoped_ptr<media::AudioLog> audio_log_;
  // NOTE(review): presumably the running count of captured samples that
  // GetElapsedTimeMs() is derived from -- confirm.
153  int num_samples_recorded_;
  // NOTE(review): presumably the level maintained by
  // UpdateSignalAndNoiseLevels() -- confirm.
154  float audio_level_;
  // NOTE(review): presumably a guard set while DispatchEvent() is running, to
  // detect re-entrant dispatch -- confirm.
155  bool is_dispatching_event_;
156  bool provisional_results_;
  // Current state of the recognizer FSM.
157  FSMState state_;
  // NOTE(review): presumably the |device_id| passed to StartRecognition() --
  // confirm.
158  std::string device_id_;
159
  // Nested helper class; only forward-declared here, defined elsewhere.
160  class OnDataConverter;
161
162  // Converts data between native input format and a WebSpeech specific
163  // output format.
164  scoped_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_;
165
166  DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl);
167};
168
169}  // namespace content
170
171#endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
172