16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Copyright (c) 2013 The Chromium Authors. All rights reserved. 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Use of this source code is governed by a BSD-style license that can be 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// found in the LICENSE file. 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "content/browser/speech/speech_recognizer_impl.h" 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "base/basictypes.h" 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "base/bind.h" 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "base/time/time.h" 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "content/browser/browser_main_loop.h" 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "content/browser/media/media_internals.h" 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "content/browser/speech/audio_buffer.h" 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "content/browser/speech/google_one_shot_remote_engine.h" 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "content/public/browser/speech_recognition_event_listener.h" 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "media/base/audio_converter.h" 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "net/url_request/url_request_context_getter.h" 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#if defined(OS_WIN) 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "media/audio/win/core_audio_util_win.h" 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgusing media::AudioBus; 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgusing media::AudioConverter; 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgusing media::AudioInputController; 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgusing media::AudioManager; 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgusing media::AudioParameters; 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgusing media::ChannelLayout; 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgnamespace content { 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Private class which encapsulates the audio converter and the 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// AudioConverter::InputCallback. It handles resampling, buffering and 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// channel mixing between input and output parameters. 
class SpeechRecognizerImpl::OnDataConverter
    : public media::AudioConverter::InputCallback {
 public:
  OnDataConverter(const AudioParameters& input_params,
                  const AudioParameters& output_params);
  virtual ~OnDataConverter();

  // Converts input audio |data| bus into an AudioChunk where the input format
  // is given by |input_parameters_| and the output format by
  // |output_parameters_|.
  scoped_refptr<AudioChunk> Convert(const AudioBus* data);

 private:
  // media::AudioConverter::InputCallback implementation.
  virtual double ProvideInput(AudioBus* dest,
                              base::TimeDelta buffer_delay) OVERRIDE;

  // Handles resampling, buffering, and channel mixing between input and output
  // parameters.
  AudioConverter audio_converter_;

  // Scratch buses: |input_bus_| holds a copy of the bus handed to Convert(),
  // |output_bus_| receives the converted audio from |audio_converter_|.
  scoped_ptr<AudioBus> input_bus_;
  scoped_ptr<AudioBus> output_bus_;
  const AudioParameters input_parameters_;
  const AudioParameters output_parameters_;
  // True only between Convert() filling |input_bus_| and ProvideInput()
  // consuming it; CHECKed so the converter never asks for more than one bus
  // per Convert() call.
  bool waiting_for_input_;
  // Interleaved converted samples; allocated once, sized to
  // output_parameters_.GetBytesPerBuffer().
  scoped_ptr<uint8[]> converted_data_;

  DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
};

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when new volume is greater than previous level.
const float kUpSmoothingFactor = 1.0f;
// Multiplier used when new volume is lesser than previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst float kAudioMeterMaxDb = 90.31f; 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0. 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Values lower than this will display as empty level-meter. 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst float kAudioMeterMinDb = 30.0f; 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Maximum level to draw to display unclipped meter. (1.0f displays clipping.) 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org// Returns true if more than 5% of the samples are at min or max value. 
846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgbool DetectClipping(const AudioChunk& chunk) { 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const int num_samples = chunk.NumSamples(); 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const int16* samples = chunk.SamplesData16(); 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const int kThreshold = num_samples / 20; 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int clipping_samples = 0; 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org for (int i = 0; i < num_samples; ++i) { 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (samples[i] <= -32767 || samples[i] >= 32767) { 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if (++clipping_samples > kThreshold) 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return true; 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return false; 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) { 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} // namespace 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst int SpeechRecognizerImpl::kAudioSampleRate = 16000; 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgconst ChannelLayout SpeechRecognizerImpl::kChannelLayout = 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org media::CHANNEL_LAYOUT_MONO; 
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
// Presumably substituted for the real AudioManager in tests; the setter and
// readers are outside this chunk — TODO confirm.
media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL;

// Byte-oriented buffer handling below assumes whole bytes per sample.
COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
               kNumBitsPerAudioSample_must_be_a_multiple_of_8);

// SpeechRecognizerImpl::OnDataConverter implementation

SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
    const AudioParameters& input_params, const AudioParameters& output_params)
    : audio_converter_(input_params, output_params, false),
      input_bus_(AudioBus::Create(input_params)),
      output_bus_(AudioBus::Create(output_params)),
      input_parameters_(input_params),
      output_parameters_(output_params),
      waiting_for_input_(false),
      converted_data_(new uint8[output_parameters_.GetBytesPerBuffer()]) {
  audio_converter_.AddInput(this);
}

SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
  // It should now be safe to unregister the converter since no more OnData()
  // callbacks are outstanding at this point.
  audio_converter_.RemoveInput(this);
}

scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
    const AudioBus* data) {
  CHECK_EQ(data->frames(), input_parameters_.frames_per_buffer());

  // Stash the incoming bus; ProvideInput() will hand it to the converter.
  data->CopyTo(input_bus_.get());

  waiting_for_input_ = true;
  audio_converter_.Convert(output_bus_.get());

  // Interleave the converted planar bus into |converted_data_|.
  output_bus_->ToInterleaved(
      output_bus_->frames(), output_parameters_.bits_per_sample() / 8,
      converted_data_.get());

  // TODO(primiano): Refactor AudioChunk to avoid the extra-copy here
  // (see http://crbug.com/249316 for details).
  return scoped_refptr<AudioChunk>(new AudioChunk(
      converted_data_.get(),
      output_parameters_.GetBytesPerBuffer(),
      output_parameters_.bits_per_sample() / 8));
}

double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
    AudioBus* dest, base::TimeDelta buffer_delay) {
  // The audio converter should never ask for more than one bus in each call
  // to Convert(). If so, we have a serious issue in our design since we might
  // miss recorded chunks of 100 ms audio data.
  CHECK(waiting_for_input_);

  // Read from the input bus to feed the converter.
  input_bus_->CopyTo(dest);

  // |input_bus_| should only be provided once.
  waiting_for_input_ = false;
  // 1.0 = unattenuated volume, per the InputCallback contract (presumably).
  return 1;
}

// SpeechRecognizerImpl implementation

SpeechRecognizerImpl::SpeechRecognizerImpl(
    SpeechRecognitionEventListener* listener,
    int session_id,
    bool continuous,
    bool provisional_results,
    SpeechRecognitionEngine* engine)
    : SpeechRecognizer(listener, session_id),
      recognition_engine_(engine),
      endpointer_(kAudioSampleRate),
      audio_log_(MediaInternals::GetInstance()->CreateAudioLog(
          media::AudioLogFactory::AUDIO_INPUT_CONTROLLER)),
      is_dispatching_event_(false),
      provisional_results_(provisional_results),
      state_(STATE_IDLE) {
  DCHECK(recognition_engine_ != NULL);
  if (!continuous) {
    // In single shot (non-continuous) recognition,
    // the session is automatically ended after:
    // - 0.5 seconds of silence if time < 3 seconds
    // - 1 seconds of silence if time >= 3 seconds
    endpointer_.set_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond / 2);
    endpointer_.set_long_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond);
    endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  } else {
    // In continuous recognition, the session is automatically ended after 15
    // seconds of silence.
    const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15;
    endpointer_.set_speech_input_complete_silence_length(cont_timeout_us);
    endpointer_.set_long_speech_length(0);  // Use only a single timeout.
  }
  endpointer_.StartSession();
  recognition_engine_->set_delegate(this);
}

// ------- Methods that trigger Finite State Machine (FSM) events ------------

// NOTE: all the external events and requests should be enqueued (PostTask),
// even if they come from the same (IO) thread, in order to preserve the
// relationship of causality between events and avoid interleaved event
// processing due to synchronous callbacks.
2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid SpeechRecognizerImpl::StartRecognition(const std::string& device_id) { 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org DCHECK(!device_id.empty()); 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org device_id_ = device_id; 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org base::Bind(&SpeechRecognizerImpl::DispatchEvent, 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org this, FSMEventArgs(EVENT_START))); 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid SpeechRecognizerImpl::AbortRecognition() { 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org base::Bind(&SpeechRecognizerImpl::DispatchEvent, 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org this, FSMEventArgs(EVENT_ABORT))); 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgvoid SpeechRecognizerImpl::StopAudioCapture() { 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org base::Bind(&SpeechRecognizerImpl::DispatchEvent, 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org this, FSMEventArgs(EVENT_STOP_CAPTURE))); 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org} 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 
// Returns true while a recognition session is in progress (anything other
// than the idle and terminal states).
bool SpeechRecognizerImpl::IsActive() const {
  // Checking the FSM state from another thread (thus, while the FSM is
  // potentially concurrently evolving) is meaningless.
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  return state_ != STATE_IDLE && state_ != STATE_ENDED;
}

// Returns true while audio is being captured, i.e. in any state between
// STATE_STARTING and STATE_RECOGNIZING inclusive.
bool SpeechRecognizerImpl::IsCapturingAudio() const {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));  // See IsActive().
  const bool is_capturing_audio = state_ >= STATE_STARTING &&
                                  state_ <= STATE_RECOGNIZING;
  // Invariant: the audio controller exists if and only if we are capturing.
  DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
         (!is_capturing_audio && audio_controller_.get() == NULL));
  return is_capturing_audio;
}

// Read-only accessor for the recognition engine, mainly useful to callers
// that need to inspect it without taking ownership.
const SpeechRecognitionEngine&
SpeechRecognizerImpl::recognition_engine() const {
  return *(recognition_engine_.get());
}
SpeechRecognizerImpl::~SpeechRecognizerImpl() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  endpointer_.EndSession();
  if (audio_controller_.get()) {
    // Close() is asynchronous; binding |audio_controller_| into the closure
    // keeps the controller refcounted until the close completes.
    audio_controller_->Close(
        base::Bind(&KeepAudioControllerRefcountedForDtor, audio_controller_));
    audio_log_->OnClosed(0);
  }
}

// Invoked in the audio thread.
void SpeechRecognizerImpl::OnError(
    AudioInputController* controller,
    media::AudioInputController::ErrorCode error_code) {
  // |error_code| is intentionally not forwarded: every audio failure maps to
  // the single EVENT_AUDIO_ERROR for the FSM.
  FSMEventArgs event_args(EVENT_AUDIO_ERROR);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnData(AudioInputController* controller,
                                  const AudioBus* data) {
  // Convert audio from native format to fixed format used by WebSpeech.
  FSMEventArgs event_args(EVENT_AUDIO_DATA);
  event_args.audio_data = audio_converter_->Convert(data);

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}

// Engine callback: forwards recognition results to the FSM on the IO thread.
void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults(
    const SpeechRecognitionResults& results) {
  FSMEventArgs event_args(EVENT_ENGINE_RESULT);
  event_args.engine_results = results;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

// Engine callback: forwards an engine error to the FSM on the IO thread.
void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
    const SpeechRecognitionError& error) {
  FSMEventArgs event_args(EVENT_ENGINE_ERROR);
  event_args.engine_error = error;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

// ----------------------- Core FSM implementation ---------------------------
// TODO(primiano): After the changes in the media package (r129173), this class
// slightly violates the SpeechRecognitionEventListener interface contract. In
// particular, it is not true anymore that this class can be freed after the
// OnRecognitionEnd event, since the audio_controller_.Close() asynchronous
// call can be still in progress after the end event. Currently, it does not
// represent a problem for the browser itself, since refcounting protects us
// against such race conditions. However, we should fix this in the next CLs.
// For instance, tests are currently working just because the
// TestAudioInputController is not closing asynchronously as the real controller
// does, but they will become flaky if TestAudioInputController will be fixed.

// Central FSM entry point: every event (posted by the public methods and the
// audio/engine callbacks) is processed here, sequentially, on the IO thread.
void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
  DCHECK_LE(state_, STATE_MAX_VALUE);

  // Event dispatching must be sequential, otherwise it will break all the rules
  // and the assumptions of the finite state automata model.
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;

  // Guard against the delegate freeing us until we finish processing the event.
  scoped_refptr<SpeechRecognizerImpl> me(this);

  if (event_args.event == EVENT_AUDIO_DATA) {
    DCHECK(event_args.audio_data.get() != NULL);
    ProcessAudioPipeline(*event_args.audio_data.get());
  }

  // The audio pipeline must be processed before the event dispatch, otherwise
  // it would take actions according to the future state instead of the current.
  state_ = ExecuteTransitionAndGetNextState(event_args);
  is_dispatching_event_ = false;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) {
  const FSMEvent event = event_args.event;
  switch (state_) {
    case STATE_IDLE:
      switch (event) {
        // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and
        // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
353 case EVENT_ABORT: 354 return AbortSilently(event_args); 355 case EVENT_START: 356 return StartRecording(event_args); 357 case EVENT_STOP_CAPTURE: 358 return AbortSilently(event_args); 359 case EVENT_AUDIO_DATA: // Corner cases related to queued messages 360 case EVENT_ENGINE_RESULT: // being lately dispatched. 361 case EVENT_ENGINE_ERROR: 362 case EVENT_AUDIO_ERROR: 363 return DoNothing(event_args); 364 } 365 break; 366 case STATE_STARTING: 367 switch (event) { 368 case EVENT_ABORT: 369 return AbortWithError(event_args); 370 case EVENT_START: 371 return NotFeasible(event_args); 372 case EVENT_STOP_CAPTURE: 373 return AbortSilently(event_args); 374 case EVENT_AUDIO_DATA: 375 return StartRecognitionEngine(event_args); 376 case EVENT_ENGINE_RESULT: 377 return NotFeasible(event_args); 378 case EVENT_ENGINE_ERROR: 379 case EVENT_AUDIO_ERROR: 380 return AbortWithError(event_args); 381 } 382 break; 383 case STATE_ESTIMATING_ENVIRONMENT: 384 switch (event) { 385 case EVENT_ABORT: 386 return AbortWithError(event_args); 387 case EVENT_START: 388 return NotFeasible(event_args); 389 case EVENT_STOP_CAPTURE: 390 return StopCaptureAndWaitForResult(event_args); 391 case EVENT_AUDIO_DATA: 392 return WaitEnvironmentEstimationCompletion(event_args); 393 case EVENT_ENGINE_RESULT: 394 return ProcessIntermediateResult(event_args); 395 case EVENT_ENGINE_ERROR: 396 case EVENT_AUDIO_ERROR: 397 return AbortWithError(event_args); 398 } 399 break; 400 case STATE_WAITING_FOR_SPEECH: 401 switch (event) { 402 case EVENT_ABORT: 403 return AbortWithError(event_args); 404 case EVENT_START: 405 return NotFeasible(event_args); 406 case EVENT_STOP_CAPTURE: 407 return StopCaptureAndWaitForResult(event_args); 408 case EVENT_AUDIO_DATA: 409 return DetectUserSpeechOrTimeout(event_args); 410 case EVENT_ENGINE_RESULT: 411 return ProcessIntermediateResult(event_args); 412 case EVENT_ENGINE_ERROR: 413 case EVENT_AUDIO_ERROR: 414 return AbortWithError(event_args); 415 } 416 break; 417 case 
STATE_RECOGNIZING: 418 switch (event) { 419 case EVENT_ABORT: 420 return AbortWithError(event_args); 421 case EVENT_START: 422 return NotFeasible(event_args); 423 case EVENT_STOP_CAPTURE: 424 return StopCaptureAndWaitForResult(event_args); 425 case EVENT_AUDIO_DATA: 426 return DetectEndOfSpeech(event_args); 427 case EVENT_ENGINE_RESULT: 428 return ProcessIntermediateResult(event_args); 429 case EVENT_ENGINE_ERROR: 430 case EVENT_AUDIO_ERROR: 431 return AbortWithError(event_args); 432 } 433 break; 434 case STATE_WAITING_FINAL_RESULT: 435 switch (event) { 436 case EVENT_ABORT: 437 return AbortWithError(event_args); 438 case EVENT_START: 439 return NotFeasible(event_args); 440 case EVENT_STOP_CAPTURE: 441 case EVENT_AUDIO_DATA: 442 return DoNothing(event_args); 443 case EVENT_ENGINE_RESULT: 444 return ProcessFinalResult(event_args); 445 case EVENT_ENGINE_ERROR: 446 case EVENT_AUDIO_ERROR: 447 return AbortWithError(event_args); 448 } 449 break; 450 451 // TODO(primiano): remove this state when speech input extensions support 452 // will be removed and STATE_IDLE.EVENT_ABORT,EVENT_STOP_CAPTURE will be 453 // reset to NotFeasible (see TODO above). 454 case STATE_ENDED: 455 return DoNothing(event_args); 456 } 457 return NotFeasible(event_args); 458} 459 460// ----------- Contract for all the FSM evolution functions below ------------- 461// - Are guaranteed to be executed in the IO thread; 462// - Are guaranteed to be not reentrant (themselves and each other); 463// - event_args members are guaranteed to be stable during the call; 464// - The class won't be freed in the meanwhile due to callbacks; 465// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. 466 467// TODO(primiano): the audio pipeline is currently serial. However, the 468// clipper->endpointer->vumeter chain and the sr_engine could be parallelized. 469// We should profile the execution to see if it would be worth or not. 
// Feeds one captured audio chunk through the processing chain. Depending on
// the current state, the chunk is routed to the endpointer (speech/silence
// detection), the vu-meter (signal/noise level UI updates, which needs the
// |rms| produced by the endpointer) and the recognition engine.
void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
  const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
                                   state_ <= STATE_RECOGNIZING;
  const bool route_to_sr_engine = route_to_endpointer;
  const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
                                state_ <= STATE_RECOGNIZING;
  const bool clip_detected = DetectClipping(raw_audio);
  float rms = 0.0f;

  num_samples_recorded_ += raw_audio.NumSamples();

  if (route_to_endpointer)
    endpointer_.ProcessAudio(raw_audio, &rms);

  if (route_to_vumeter) {
    DCHECK(route_to_endpointer);  // Depends on endpointer due to |rms|.
    UpdateSignalAndNoiseLevels(rms, clip_detected);
  }
  if (route_to_sr_engine) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->TakeAudioChunk(raw_audio);
  }
}

// FSM action for EVENT_START in STATE_IDLE: sets up the audio converter and
// the AudioInputController and begins capturing. Returns STATE_STARTING on
// success, or the STATE_ENDED produced by Abort() on any audio setup failure.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
  DCHECK(recognition_engine_.get() != NULL);
  DCHECK(!IsCapturingAudio());
  // Unit tests inject their own AudioManager via SetAudioManagerForTesting().
  const bool unit_test_is_active = (audio_manager_for_tests_ != NULL);
  AudioManager* audio_manager = unit_test_is_active ?
                                audio_manager_for_tests_ :
                                AudioManager::Get();
  DCHECK(audio_manager != NULL);

  DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
  num_samples_recorded_ = 0;
  audio_level_ = 0;
  listener()->OnRecognitionStart(session_id());

  // TODO(xians): Check if the OS has the device with |device_id_|, return
  // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist.
  if (!audio_manager->HasAudioInputDevices()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,
                                        SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
  }

  int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs();

  AudioParameters in_params = audio_manager->GetInputStreamParameters(
      device_id_);
  if (!in_params.IsValid() && !unit_test_is_active) {
    DLOG(ERROR) << "Invalid native audio input parameters";
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // Audio converter shall provide audio based on these parameters as output.
  // Hard coded, WebSpeech specific parameters are utilized here.
  int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
  AudioParameters output_parameters = AudioParameters(
      AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
      kNumBitsPerAudioSample, frames_per_buffer);

  // Audio converter will receive audio based on these parameters as input.
  // On Windows we start by verifying that Core Audio is supported. If not,
  // the WaveIn API is used and we might as well avoid all audio conversations
  // since WaveIn does the conversion for us.
  // TODO(henrika): this code should be moved to platform dependent audio
  // managers.
  bool use_native_audio_params = true;
#if defined(OS_WIN)
  use_native_audio_params = media::CoreAudioUtil::IsSupported();
  DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
#endif

  AudioParameters input_parameters = output_parameters;
  if (use_native_audio_params && !unit_test_is_active) {
    // Use native audio parameters but avoid opening up at the native buffer
    // size. Instead use same frame size (in milliseconds) as WebSpeech uses.
    // We rely on internal buffers in the audio back-end to fulfill this request
    // and the idea is to simplify the audio conversion since each Convert()
    // call will then render exactly one ProvideInput() call.
    // Due to implementation details in the audio converter, 2 milliseconds
    // are added to the default frame size (100 ms) to ensure there is enough
    // data to generate 100 ms of output when resampling.
    frames_per_buffer =
        ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;
    input_parameters.Reset(in_params.format(),
                           in_params.channel_layout(),
                           in_params.channels(),
                           in_params.sample_rate(),
                           in_params.bits_per_sample(),
                           frames_per_buffer);
  }

  // Create an audio converter which converts data between native input format
  // and WebSpeech specific output format.
  audio_converter_.reset(
      new OnDataConverter(input_parameters, output_parameters));

  audio_controller_ = AudioInputController::Create(
      audio_manager, this, input_parameters, device_id_, NULL);

  if (!audio_controller_.get()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  audio_log_->OnCreated(0, input_parameters, device_id_);

  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. We wait in the state
  // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching
  // to user input mode.
  endpointer_.SetEnvironmentEstimationMode();
  audio_controller_->Record();
  audio_log_->OnStarted(0);
  return STATE_STARTING;
}

// FSM action for the first EVENT_AUDIO_DATA in STATE_STARTING: starts the
// recognition engine, notifies the listener that audio has begun, and feeds
// the engine the chunk that triggered this event.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
  // This is the first audio packet captured, so the recognition engine is
  // started and the delegate notified about the event.
  DCHECK(recognition_engine_.get() != NULL);
  recognition_engine_->StartRecognition();
  listener()->OnAudioStart(session_id());

  // This is a little hack, since TakeAudioChunk() is already called by
  // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
  // the first audio chunk captured after opening the audio device.
  recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get()));
  return STATE_ESTIMATING_ENVIRONMENT;
}

// FSM action for EVENT_AUDIO_DATA in STATE_ESTIMATING_ENVIRONMENT: remains in
// that state until kEndpointerEstimationTimeMs of audio has been recorded,
// then switches the endpointer to user-input mode.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
  DCHECK(endpointer_.IsEstimatingEnvironment());
  if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
    return STATE_WAITING_FOR_SPEECH;
  } else {
    return STATE_ESTIMATING_ENVIRONMENT;
  }
}

// FSM action for EVENT_AUDIO_DATA in STATE_WAITING_FOR_SPEECH: advances to
// STATE_RECOGNIZING once the endpointer hears speech, or aborts with
// SPEECH_RECOGNITION_ERROR_NO_SPEECH after kNoSpeechTimeoutMs of silence.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
  if (endpointer_.DidStartReceivingSpeech()) {
    listener()->OnSoundStart(session_id());
    return STATE_RECOGNIZING;
  } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));
  }
  return STATE_WAITING_FOR_SPEECH;
}

// FSM action for EVENT_AUDIO_DATA in STATE_RECOGNIZING: stops capture when
// the endpointer reports the user finished speaking.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
  if (endpointer_.speech_input_complete())
    return StopCaptureAndWaitForResult(event_args);
  return STATE_RECOGNIZING;
}

// Concludes audio capture (asynchronously closing the controller), flushes
// the engine, and waits in STATE_WAITING_FINAL_RESULT for the engine's final
// result. OnSoundEnd is only emitted if sound had actually started
// (i.e. we got past STATE_WAITING_FOR_SPEECH).
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
  DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);

  DVLOG(1) << "Concluding recognition";
  CloseAudioControllerAsynchronously();
  recognition_engine_->AudioChunksEnded();

  if (state_ > STATE_WAITING_FOR_SPEECH)
    listener()->OnSoundEnd(session_id());

  listener()->OnAudioEnd(session_id());
  return STATE_WAITING_FINAL_RESULT;
}

// Aborts without reporting any error to the listener (error code NONE).
// Must not be reached via audio/engine error events.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) {
  DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);
  DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));
}

// Aborts with an error derived from the triggering event: audio error,
// engine-supplied error, or a generic ABORTED for everything else.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) {
  if (event_args.event == EVENT_AUDIO_ERROR) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  } else if (event_args.event == EVENT_ENGINE_ERROR) {
    return Abort(event_args.engine_error);
  }
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED));
}

// Common teardown path: stops capture and the engine (when they were
// running), emits the sound/audio end events still owed for the current
// state, reports |error| (unless it is ERROR_NONE) and ends the recognition.
SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
    const SpeechRecognitionError& error) {
  if (IsCapturingAudio())
    CloseAudioControllerAsynchronously();

  DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";

  // The recognition engine is initialized only after STATE_STARTING.
  if (state_ > STATE_STARTING) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->EndRecognition();
  }

  // OnSoundEnd/OnAudioEnd were already sent if we reached
  // STATE_WAITING_FINAL_RESULT (see StopCaptureAndWaitForResult).
  if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnSoundEnd(session_id());

  if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnAudioEnd(session_id());

  if (error.code != SPEECH_RECOGNITION_ERROR_NONE)
    listener()->OnRecognitionError(session_id(), error);

  listener()->OnRecognitionEnd(session_id());

  return STATE_ENDED;
}

// FSM action for EVENT_ENGINE_RESULT while still capturing: forwards a
// provisional result to the listener and fast-forwards the FSM to
// STATE_RECOGNIZING, emitting any state-entry events that were skipped.
SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(
    const FSMEventArgs& event_args) {
  // Provisional results can occur only if explicitly enabled in the JS API.
  DCHECK(provisional_results_);

  // In continuous recognition, intermediate results can occur even when we are
  // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the
  // recognition engine is "faster" than our endpointer). In these cases we
  // skip the endpointer and fast-forward to the RECOGNIZING state, with respect
  // of the events triggering order.
  if (state_ == STATE_ESTIMATING_ENVIRONMENT) {
    DCHECK(endpointer_.IsEstimatingEnvironment());
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
  } else if (state_ == STATE_WAITING_FOR_SPEECH) {
    listener()->OnSoundStart(session_id());
  } else {
    DCHECK_EQ(STATE_RECOGNIZING, state_);
  }

  listener()->OnRecognitionResults(session_id(), event_args.engine_results);
  return STATE_RECOGNIZING;
}

// FSM action for EVENT_ENGINE_RESULT in STATE_WAITING_FINAL_RESULT: forwards
// non-empty results and ends the recognition, unless the batch contains only
// provisional results, in which case a definitive result is still expected.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
  const SpeechRecognitionResults& results = event_args.engine_results;
  SpeechRecognitionResults::const_iterator i = results.begin();
  bool provisional_results_pending = false;
  bool results_are_empty = true;
  for (; i != results.end(); ++i) {
    const SpeechRecognitionResult& result = *i;
    if (result.is_provisional) {
      DCHECK(provisional_results_);
      provisional_results_pending = true;
    } else if (results_are_empty) {
      results_are_empty = result.hypotheses.empty();
    }
  }

  if (provisional_results_pending) {
    listener()->OnRecognitionResults(session_id(), results);
    // We don't end the recognition if a provisional result is received in
    // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will
    // end the recognition.
    return state_;
  }

  recognition_engine_->EndRecognition();

  if (!results_are_empty) {
    // We could receive an empty result (which we won't propagate further)
    // in the following (continuous) scenario:
    //  1. The caller start pushing audio and receives some results;
    //  2. A |StopAudioCapture| is issued later;
    //  3. The final audio frames captured in the interval ]1,2] do not lead to
    //     any result (nor any error);
    //  4. The speech recognition engine, therefore, emits an empty result to
    //     notify that the recognition is ended with no error, yet neither any
    //     further result.
    listener()->OnRecognitionResults(session_id(), results);
  }

  listener()->OnRecognitionEnd(session_id());
  return STATE_ENDED;
}

// No-op FSM action: the event is deliberately ignored in the current state.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
  return state_;  // Just keep the current state.
}

// FSM action for (state, event) pairs that should be impossible. Trips
// NOTREACHED() in debug builds; keeps the current state otherwise.
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
  NOTREACHED() << "Unfeasible event " << event_args.event
               << " in state " << state_;
  return state_;
}

// Relinquishes |audio_controller_| while keeping it alive (via the bound
// callback's reference) until its asynchronous Close() completes.
void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
  DCHECK(IsCapturingAudio());
  DVLOG(1) << "SpeechRecognizerImpl closing audio controller.";
  // Issues a Close on the audio controller, passing an empty callback. The only
  // purpose of such callback is to keep the audio controller refcounted until
  // Close has completed (in the audio thread) and automatically destroy it
  // afterwards (upon return from OnAudioClosed).
  audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
                                      this, audio_controller_));
  audio_controller_ = NULL;  // The controller is still refcounted by Bind.
  audio_log_->OnClosed(0);
}

// Milliseconds of audio recorded so far, derived from the sample count at
// the fixed kAudioSampleRate.
int SpeechRecognizerImpl::GetElapsedTimeMs() const {
  return (num_samples_recorded_ * 1000) / kAudioSampleRate;
}

// Converts |rms| (and the endpointer's noise estimate) into smoothed,
// clamped [0, kAudioMeterRangeMaxUnclipped] levels and reports them to the
// listener for UI display; a clipped chunk is reported as full scale (1.0).
void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
                                                      bool clip_detected) {
  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  // TODO(primiano): Do we really need all this floating point arith here?
  // Perhaps it might be quite expensive on mobile.
  float level = (rms - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
  // Rise quickly, decay slowly (distinct smoothing factors per direction).
  const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
                                                          kDownSmoothingFactor;
  audio_level_ += (level - audio_level_) * smoothing_factor;

  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  noise_level = std::min(std::max(0.0f, noise_level),
                         kAudioMeterRangeMaxUnclipped);

  listener()->OnAudioLevelsChange(
      session_id(), clip_detected ? 1.0f : audio_level_, noise_level);
}

// Test hook: injects the AudioManager used by StartRecording() in unit tests.
void SpeechRecognizerImpl::SetAudioManagerForTesting(
    AudioManager* audio_manager) {
  audio_manager_for_tests_ = audio_manager;
}

// Event payload for the FSM; defaults to no audio data and no engine error.
SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
    : event(event_value),
      audio_data(NULL),
      engine_error(SPEECH_RECOGNITION_ERROR_NONE) {
}

SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
}

}  // namespace content