// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognizer_impl.h"

#include "base/basictypes.h"
#include "base/bind.h"
#include "base/time/time.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_one_shot_remote_engine.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "media/base/audio_converter.h"
#include "net/url_request/url_request_context_getter.h"

#if defined(OS_WIN)
#include "media/audio/win/core_audio_util_win.h"
#endif

using media::AudioBus;
using media::AudioConverter;
using media::AudioInputController;
using media::AudioManager;
using media::AudioParameters;
using media::ChannelLayout;

namespace content {

// Private class which encapsulates the audio converter and the
// AudioConverter::InputCallback. It handles resampling, buffering and
// channel mixing between input and output parameters.
class SpeechRecognizerImpl::OnDataConverter
    : public media::AudioConverter::InputCallback {
 public:
  OnDataConverter(const AudioParameters& input_params,
                  const AudioParameters& output_params);
  virtual ~OnDataConverter();

  // Converts input |data| buffer into an AudioChunk where the input format
  // is given by |input_parameters_| and the output format by
  // |output_parameters_|.
  scoped_refptr<AudioChunk> Convert(const uint8* data, size_t size);

 private:
  // media::AudioConverter::InputCallback implementation.
  virtual double ProvideInput(AudioBus* dest,
                              base::TimeDelta buffer_delay) OVERRIDE;

  // Handles resampling, buffering, and channel mixing between input and output
  // parameters.
  AudioConverter audio_converter_;

  scoped_ptr<AudioBus> input_bus_;
  scoped_ptr<AudioBus> output_bus_;
  const AudioParameters input_parameters_;
  const AudioParameters output_parameters_;
  bool waiting_for_input_;
  scoped_ptr<uint8[]> converted_data_;

  DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
};

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when new volume is greater than previous level.
const float kUpSmoothingFactor = 1.0f;
// Multiplier used when new volume is lesser than previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
const float kAudioMeterMaxDb = 90.31f;
// This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.
// Values lower than this will display as empty level-meter.
const float kAudioMeterMinDb = 30.0f;
const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

// Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;

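// A dB value is mapped onto the meter as
// (value - kAudioMeterMinDb) / (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped),
// clamped to [0, kAudioMeterRangeMaxUnclipped]; see UpdateSignalAndNoiseLevels().
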
// Returns true if more than 5% of the samples are at min or max value.
bool DetectClipping(const AudioChunk& chunk) {
  const int num_samples = chunk.NumSamples();
  const int16* samples = chunk.SamplesData16();
  const int kThreshold = num_samples / 20;
  int clipping_samples = 0;

  for (int i = 0; i < num_samples; ++i) {
    if (samples[i] <= -32767 || samples[i] >= 32767) {
      if (++clipping_samples > kThreshold)
        return true;
    }
  }
  return false;
}

void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) {
}

}  // namespace

const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
const ChannelLayout SpeechRecognizerImpl::kChannelLayout =
    media::CHANNEL_LAYOUT_MONO;
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL;

COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
               kNumBitsPerAudioSample_must_be_a_multiple_of_8);

// SpeechRecognizerImpl::OnDataConverter implementation

SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
    const AudioParameters& input_params, const AudioParameters& output_params)
    : audio_converter_(input_params, output_params, false),
      input_bus_(AudioBus::Create(input_params)),
      output_bus_(AudioBus::Create(output_params)),
      input_parameters_(input_params),
      output_parameters_(output_params),
      waiting_for_input_(false),
      converted_data_(new uint8[output_parameters_.GetBytesPerBuffer()]) {
  audio_converter_.AddInput(this);
}

SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
  // It should now be safe to unregister the converter since no more OnData()
  // callbacks are outstanding at this point.
  audio_converter_.RemoveInput(this);
}

scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
    const uint8* data, size_t size) {
  CHECK_EQ(size, static_cast<size_t>(input_parameters_.GetBytesPerBuffer()));

  input_bus_->FromInterleaved(
      data, input_bus_->frames(), input_parameters_.bits_per_sample() / 8);

  waiting_for_input_ = true;
  audio_converter_.Convert(output_bus_.get());

  output_bus_->ToInterleaved(
      output_bus_->frames(), output_parameters_.bits_per_sample() / 8,
      converted_data_.get());

  // TODO(primiano): Refactor AudioChunk to avoid the extra copy here
  // (see http://crbug.com/249316 for details).
  return scoped_refptr<AudioChunk>(new AudioChunk(
      converted_data_.get(),
      output_parameters_.GetBytesPerBuffer(),
      output_parameters_.bits_per_sample() / 8));
}

double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
    AudioBus* dest, base::TimeDelta buffer_delay) {
  // The audio converter should never ask for more than one bus in each call
  // to Convert(). If so, we have a serious issue in our design since we might
  // miss recorded chunks of 100 ms audio data.
  CHECK(waiting_for_input_);

  // Read from the input bus to feed the converter.
  input_bus_->CopyTo(dest);

  // |input_bus_| should only be provided once.
  waiting_for_input_ = false;
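  // Returning 1.0 tells the converter to use this input at full volume
  // (no attenuation is applied to the copied data).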
  return 1;
}

// SpeechRecognizerImpl implementation

SpeechRecognizerImpl::SpeechRecognizerImpl(
    SpeechRecognitionEventListener* listener,
    int session_id,
    bool is_single_shot,
    SpeechRecognitionEngine* engine)
    : SpeechRecognizer(listener, session_id),
      recognition_engine_(engine),
      endpointer_(kAudioSampleRate),
      is_dispatching_event_(false),
      is_single_shot_(is_single_shot),
      state_(STATE_IDLE) {
  DCHECK(recognition_engine_ != NULL);
  if (is_single_shot) {
    // In single shot recognition, the session is automatically ended after:
    // - 0.5 seconds of silence if time < 3 seconds
    // - 1 second of silence if time >= 3 seconds
    endpointer_.set_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond / 2);
    endpointer_.set_long_speech_input_complete_silence_length(
        base::Time::kMicrosecondsPerSecond);
    endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  } else {
    // In continuous recognition, the session is automatically ended after 15
    // seconds of silence.
    const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15;
    endpointer_.set_speech_input_complete_silence_length(cont_timeout_us);
    endpointer_.set_long_speech_length(0);  // Use only a single timeout.
  }
  endpointer_.StartSession();
  recognition_engine_->set_delegate(this);
}

// ------- Methods that trigger Finite State Machine (FSM) events ------------

// NOTE: all the external events and requests should be enqueued (PostTask),
// even if they come from the same (IO) thread, in order to preserve the
// relationship of causality between events and avoid interleaved event
// processing due to synchronous callbacks.

void SpeechRecognizerImpl::StartRecognition(const std::string& device_id) {
  DCHECK(!device_id.empty());
  device_id_ = device_id;

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_START)));
}

void SpeechRecognizerImpl::AbortRecognition() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_ABORT)));
}

void SpeechRecognizerImpl::StopAudioCapture() {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, FSMEventArgs(EVENT_STOP_CAPTURE)));
}

bool SpeechRecognizerImpl::IsActive() const {
  // Checking the FSM state from another thread (thus, while the FSM is
  // potentially concurrently evolving) is meaningless.
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  return state_ != STATE_IDLE && state_ != STATE_ENDED;
}

bool SpeechRecognizerImpl::IsCapturingAudio() const {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));  // See IsActive().
  const bool is_capturing_audio = state_ >= STATE_STARTING &&
                                  state_ <= STATE_RECOGNIZING;
  DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
         (!is_capturing_audio && audio_controller_.get() == NULL));
  return is_capturing_audio;
}

const SpeechRecognitionEngine&
SpeechRecognizerImpl::recognition_engine() const {
  return *(recognition_engine_.get());
}

SpeechRecognizerImpl::~SpeechRecognizerImpl() {
  endpointer_.EndSession();
  if (audio_controller_.get()) {
    audio_controller_->Close(
        base::Bind(&KeepAudioControllerRefcountedForDtor, audio_controller_));
  }
}

// Invoked in the audio thread.
void SpeechRecognizerImpl::OnError(AudioInputController* controller) {
  FSMEventArgs event_args(EVENT_AUDIO_ERROR);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

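// Invoked with each captured audio buffer. The buffer is converted to the
// fixed WebSpeech format (16 kHz mono, 16 bits per sample) and posted to the
// IO thread as an EVENT_AUDIO_DATA event for the FSM.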
void SpeechRecognizerImpl::OnData(AudioInputController* controller,
                                  const uint8* data, uint32 size) {
  if (size == 0)  // This can happen when audio capture stops; it is normal.
    return;

  // Convert audio from native format to the fixed format used by WebSpeech.
  FSMEventArgs event_args(EVENT_AUDIO_DATA);
  event_args.audio_data = audio_converter_->Convert(data, size);

  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults(
    const SpeechRecognitionResults& results) {
  FSMEventArgs event_args(EVENT_ENGINE_RESULT);
  event_args.engine_results = results;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
    const SpeechRecognitionError& error) {
  FSMEventArgs event_args(EVENT_ENGINE_ERROR);
  event_args.engine_error = error;
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
                                     this, event_args));
}

// ----------------------- Core FSM implementation ---------------------------
// TODO(primiano): After the changes in the media package (r129173), this class
// slightly violates the SpeechRecognitionEventListener interface contract. In
// particular, it is not true anymore that this class can be freed after the
// OnRecognitionEnd event, since the audio_controller_.Close() asynchronous
// call can still be in progress after the end event. Currently, this is not a
// problem for the browser itself, since refcounting protects us against such
// race conditions. However, we should fix this in the next CLs. For instance,
// tests are currently working just because the TestAudioInputController does
// not close asynchronously as the real controller does, but they will become
// flaky once TestAudioInputController is fixed.

void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
  DCHECK_LE(state_, STATE_MAX_VALUE);

  // Event dispatching must be sequential; otherwise it would break all the
  // rules and the assumptions of the finite state automata model.
  DCHECK(!is_dispatching_event_);
  is_dispatching_event_ = true;

  // Guard against the delegate freeing us until we finish processing the event.
  scoped_refptr<SpeechRecognizerImpl> me(this);

  if (event_args.event == EVENT_AUDIO_DATA) {
    DCHECK(event_args.audio_data.get() != NULL);
    ProcessAudioPipeline(*event_args.audio_data.get());
  }

  // The audio pipeline must be processed before the event dispatch, otherwise
  // it would take actions according to the future state instead of the current.
  state_ = ExecuteTransitionAndGetNextState(event_args);
  is_dispatching_event_ = false;
}

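// State/event transition table of the FSM. Each handler invoked below carries
// out the side effects of the transition and returns the next state.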
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
    const FSMEventArgs& event_args) {
  const FSMEvent event = event_args.event;
  switch (state_) {
    case STATE_IDLE:
      switch (event) {
        // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and
        // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
        case EVENT_ABORT:
          return AbortSilently(event_args);
        case EVENT_START:
          return StartRecording(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:     // Corner cases related to queued messages
        case EVENT_ENGINE_RESULT:  // being dispatched late.
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return DoNothing(event_args);
      }
      break;
    case STATE_STARTING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return AbortSilently(event_args);
        case EVENT_AUDIO_DATA:
          return StartRecognitionEngine(event_args);
        case EVENT_ENGINE_RESULT:
          return NotFeasible(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_ESTIMATING_ENVIRONMENT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return WaitEnvironmentEstimationCompletion(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FOR_SPEECH:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectUserSpeechOrTimeout(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_RECOGNIZING:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
          return StopCaptureAndWaitForResult(event_args);
        case EVENT_AUDIO_DATA:
          return DetectEndOfSpeech(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessIntermediateResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;
    case STATE_WAITING_FINAL_RESULT:
      switch (event) {
        case EVENT_ABORT:
          return AbortWithError(event_args);
        case EVENT_START:
          return NotFeasible(event_args);
        case EVENT_STOP_CAPTURE:
        case EVENT_AUDIO_DATA:
          return DoNothing(event_args);
        case EVENT_ENGINE_RESULT:
          return ProcessFinalResult(event_args);
        case EVENT_ENGINE_ERROR:
        case EVENT_AUDIO_ERROR:
          return AbortWithError(event_args);
      }
      break;

    // TODO(primiano): remove this state when speech input extensions support
    // is removed and STATE_IDLE.EVENT_ABORT,EVENT_STOP_CAPTURE are reset to
    // NotFeasible (see the TODO above).
    case STATE_ENDED:
      return DoNothing(event_args);
  }
  return NotFeasible(event_args);
}

// ----------- Contract for all the FSM evolution functions below -------------
// - Are guaranteed to be executed in the IO thread;
// - Are guaranteed to be not reentrant (themselves and each other);
// - event_args members are guaranteed to be stable during the call;
// - The class won't be freed in the meanwhile due to callbacks;
// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.

// TODO(primiano): the audio pipeline is currently serial. However, the
// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
// We should profile the execution to see whether it would be worth it or not.
void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
  const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
                                   state_ <= STATE_RECOGNIZING;
  const bool route_to_sr_engine = route_to_endpointer;
  const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
                                state_ <= STATE_RECOGNIZING;
  const bool clip_detected = DetectClipping(raw_audio);
  float rms = 0.0f;

  num_samples_recorded_ += raw_audio.NumSamples();

  if (route_to_endpointer)
    endpointer_.ProcessAudio(raw_audio, &rms);

  if (route_to_vumeter) {
    DCHECK(route_to_endpointer);  // Depends on endpointer due to |rms|.
    UpdateSignalAndNoiseLevels(rms, clip_detected);
  }
  if (route_to_sr_engine) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->TakeAudioChunk(raw_audio);
  }
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
  DCHECK(recognition_engine_.get() != NULL);
  DCHECK(!IsCapturingAudio());
  const bool unit_test_is_active = (audio_manager_for_tests_ != NULL);
  AudioManager* audio_manager = unit_test_is_active ?
                                audio_manager_for_tests_ :
                                AudioManager::Get();
  DCHECK(audio_manager != NULL);

  DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
  num_samples_recorded_ = 0;
  audio_level_ = 0;
  listener()->OnRecognitionStart(session_id());

  // TODO(xians): Check if the OS has the device with |device_id_|, return
  // |SPEECH_AUDIO_ERROR_DETAILS_NO_MIC| if the target device does not exist.
  if (!audio_manager->HasAudioInputDevices()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO,
                                        SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
  }

  int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs();

  AudioParameters in_params = audio_manager->GetInputStreamParameters(
      device_id_);
  if (!in_params.IsValid() && !unit_test_is_active) {
    DLOG(ERROR) << "Invalid native audio input parameters";
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The audio converter shall provide audio based on these parameters as
  // output. Hard-coded, WebSpeech-specific parameters are used here.
  int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
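  // For example, with the fixed 16 kHz sample rate and a typical 100 ms chunk
  // duration this yields 1600 frames per buffer.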
  AudioParameters output_parameters = AudioParameters(
      AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
      kNumBitsPerAudioSample, frames_per_buffer);

  // The audio converter will receive audio based on these parameters as input.
  // On Windows we start by verifying that Core Audio is supported. If not,
  // the WaveIn API is used and we might as well avoid all audio conversions
  // since WaveIn does the conversion for us.
  // TODO(henrika): this code should be moved to platform dependent audio
  // managers.
  bool use_native_audio_params = true;
#if defined(OS_WIN)
  use_native_audio_params = media::CoreAudioUtil::IsSupported();
  DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
#endif

  AudioParameters input_parameters = output_parameters;
  if (use_native_audio_params && !unit_test_is_active) {
    // Use native audio parameters but avoid opening up at the native buffer
    // size. Instead, use the same frame size (in milliseconds) as WebSpeech.
    // We rely on internal buffers in the audio back-end to fulfill this request
    // and the idea is to simplify the audio conversion since each Convert()
    // call will then render exactly one ProvideInput() call.
    // Due to implementation details in the audio converter, 2 milliseconds
    // are added to the default frame size (100 ms) to ensure there is enough
    // data to generate 100 ms of output when resampling.
    frames_per_buffer =
        ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;
    input_parameters.Reset(in_params.format(),
                           in_params.channel_layout(),
                           in_params.channels(),
                           in_params.input_channels(),
                           in_params.sample_rate(),
                           in_params.bits_per_sample(),
                           frames_per_buffer);
  }

  // Create an audio converter which converts data between the native input
  // format and the WebSpeech-specific output format.
  audio_converter_.reset(
      new OnDataConverter(input_parameters, output_parameters));

  audio_controller_ = AudioInputController::Create(
      audio_manager, this, input_parameters, device_id_, NULL);

  if (!audio_controller_.get()) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  }

  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. We wait in the state
  // ESTIMATING_ENVIRONMENT until that interval has elapsed before switching
  // to user input mode.
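  // The estimation interval is kEndpointerEstimationTimeMs (300 ms of captured
  // audio); see WaitEnvironmentEstimationCompletion().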
  endpointer_.SetEnvironmentEstimationMode();
  audio_controller_->Record();
  return STATE_STARTING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
  // This is the first audio packet captured, so the recognition engine is
  // started and the delegate notified about the event.
  DCHECK(recognition_engine_.get() != NULL);
  recognition_engine_->StartRecognition();
  listener()->OnAudioStart(session_id());

  // This is a little hack, since TakeAudioChunk() is already called by
  // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
  // the first audio chunk captured after opening the audio device.
  recognition_engine_->TakeAudioChunk(*(event_args.audio_data.get()));
  return STATE_ESTIMATING_ENVIRONMENT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
  DCHECK(endpointer_.IsEstimatingEnvironment());
  if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
    return STATE_WAITING_FOR_SPEECH;
  } else {
    return STATE_ESTIMATING_ENVIRONMENT;
  }
}

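// Invoked for each audio chunk while waiting for speech to start; gives up
// with a SPEECH_RECOGNITION_ERROR_NO_SPEECH error once kNoSpeechTimeoutMs
// (8 seconds) have elapsed without any speech being detected.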
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
  if (endpointer_.DidStartReceivingSpeech()) {
    listener()->OnSoundStart(session_id());
    return STATE_RECOGNIZING;
  } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH));
  }
  return STATE_WAITING_FOR_SPEECH;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
  if (endpointer_.speech_input_complete())
    return StopCaptureAndWaitForResult(event_args);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
  DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);

  DVLOG(1) << "Concluding recognition";
  CloseAudioControllerAsynchronously();
  recognition_engine_->AudioChunksEnded();

  if (state_ > STATE_WAITING_FOR_SPEECH)
    listener()->OnSoundEnd(session_id());

  listener()->OnAudioEnd(session_id());
  return STATE_WAITING_FINAL_RESULT;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) {
  DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR);
  DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR);
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE));
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) {
  if (event_args.event == EVENT_AUDIO_ERROR) {
    return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
  } else if (event_args.event == EVENT_ENGINE_ERROR) {
    return Abort(event_args.engine_error);
  }
  return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED));
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
    const SpeechRecognitionError& error) {
  if (IsCapturingAudio())
    CloseAudioControllerAsynchronously();

  DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";

  // The recognition engine is initialized only after STATE_STARTING.
  if (state_ > STATE_STARTING) {
    DCHECK(recognition_engine_.get() != NULL);
    recognition_engine_->EndRecognition();
  }

  if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnSoundEnd(session_id());

  if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
    listener()->OnAudioEnd(session_id());

  if (error.code != SPEECH_RECOGNITION_ERROR_NONE)
    listener()->OnRecognitionError(session_id(), error);

  listener()->OnRecognitionEnd(session_id());

  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(
    const FSMEventArgs& event_args) {
  // Provisional results can occur only during continuous (non one-shot) mode.
  // If this check is reached, it means that a continuous speech recognition
  // engine is being used for a one-shot recognition.
  DCHECK_EQ(false, is_single_shot_);

  // In continuous recognition, intermediate results can occur even when we are
  // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the
  // recognition engine is "faster" than our endpointer). In these cases we
  // skip the endpointer and fast-forward to the RECOGNIZING state, while
  // preserving the order in which events are triggered.
  if (state_ == STATE_ESTIMATING_ENVIRONMENT) {
    DCHECK(endpointer_.IsEstimatingEnvironment());
    endpointer_.SetUserInputMode();
    listener()->OnEnvironmentEstimationComplete(session_id());
  } else if (state_ == STATE_WAITING_FOR_SPEECH) {
    listener()->OnSoundStart(session_id());
  } else {
    DCHECK_EQ(STATE_RECOGNIZING, state_);
  }

  listener()->OnRecognitionResults(session_id(), event_args.engine_results);
  return STATE_RECOGNIZING;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
  const SpeechRecognitionResults& results = event_args.engine_results;
  SpeechRecognitionResults::const_iterator i = results.begin();
  bool provisional_results_pending = false;
  bool results_are_empty = true;
  for (; i != results.end(); ++i) {
    const SpeechRecognitionResult& result = *i;
    if (result.is_provisional) {
      provisional_results_pending = true;
      DCHECK(!is_single_shot_);
    } else if (results_are_empty) {
      results_are_empty = result.hypotheses.empty();
    }
  }

  if (provisional_results_pending) {
    listener()->OnRecognitionResults(session_id(), results);
    // We don't end the recognition if a provisional result is received in
    // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will
    // end the recognition.
    return state_;
  }

  recognition_engine_->EndRecognition();

  if (!results_are_empty) {
    // We could receive an empty result (which we won't propagate further)
    // in the following (continuous) scenario:
    // 1. The caller starts pushing audio and receives some results;
    // 2. A |StopAudioCapture| is issued later;
    // 3. The final audio frames captured in the interval ]1,2] do not lead to
    //    any result (nor to any error);
    // 4. The speech recognition engine, therefore, emits an empty result to
    //    notify that the recognition has ended with no error and without any
    //    further result.
    listener()->OnRecognitionResults(session_id(), results);
  }

  listener()->OnRecognitionEnd(session_id());
  return STATE_ENDED;
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
  return state_;  // Just keep the current state.
}

SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
  NOTREACHED() << "Unfeasible event " << event_args.event
               << " in state " << state_;
  return state_;
}

void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
  DCHECK(IsCapturingAudio());
  DVLOG(1) << "SpeechRecognizerImpl closing audio controller.";
  // Issues a Close on the audio controller, passing an empty callback. The only
  // purpose of such a callback is to keep the audio controller refcounted until
  // Close has completed (in the audio thread) and to automatically destroy it
  // afterwards (upon return from OnAudioClosed).
  audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed,
                                      this, audio_controller_));
  audio_controller_ = NULL;  // The controller is still refcounted by Bind.
}

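// Derives the elapsed capture time from the number of recorded samples; at
// the fixed 16 kHz sample rate, e.g., 4000 samples correspond to 250 ms.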
int SpeechRecognizerImpl::GetElapsedTimeMs() const {
  return (num_samples_recorded_ * 1000) / kAudioSampleRate;
}

void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
                                                      bool clip_detected) {
  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  // TODO(primiano): Do we really need all this floating point arith here?
  // Perhaps it might be quite expensive on mobile.
  float level = (rms - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
  const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
                                                          kDownSmoothingFactor;
  audio_level_ += (level - audio_level_) * smoothing_factor;

  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  noise_level = std::min(std::max(0.0f, noise_level),
                         kAudioMeterRangeMaxUnclipped);

  listener()->OnAudioLevelsChange(
      session_id(), clip_detected ? 1.0f : audio_level_, noise_level);
}

void SpeechRecognizerImpl::SetAudioManagerForTesting(
    AudioManager* audio_manager) {
  audio_manager_for_tests_ = audio_manager;
}

SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
    : event(event_value),
      audio_data(NULL),
      engine_error(SPEECH_RECOGNITION_ERROR_NONE) {
}

SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
}

}  // namespace content