1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <vector>
6
7#include "content/browser/browser_thread_impl.h"
8#include "content/browser/speech/google_one_shot_remote_engine.h"
9#include "content/browser/speech/speech_recognizer_impl.h"
10#include "content/public/browser/speech_recognition_event_listener.h"
11#include "media/audio/audio_manager_base.h"
12#include "media/audio/fake_audio_input_stream.h"
13#include "media/audio/fake_audio_output_stream.h"
14#include "media/audio/mock_audio_manager.h"
15#include "media/audio/test_audio_input_controller_factory.h"
16#include "media/base/audio_bus.h"
17#include "net/base/net_errors.h"
18#include "net/url_request/test_url_fetcher_factory.h"
19#include "net/url_request/url_request_status.h"
20#include "testing/gtest/include/gtest/gtest.h"
21
22using base::MessageLoopProxy;
23using media::AudioInputController;
24using media::AudioInputStream;
25using media::AudioManager;
26using media::AudioOutputStream;
27using media::AudioParameters;
28using media::TestAudioInputController;
29using media::TestAudioInputControllerFactory;
30
31namespace content {
32
33class SpeechRecognizerImplTest : public SpeechRecognitionEventListener,
34                                 public testing::Test {
35 public:
36  SpeechRecognizerImplTest()
37      : io_thread_(BrowserThread::IO, &message_loop_),
38        recognition_started_(false),
39        recognition_ended_(false),
40        result_received_(false),
41        audio_started_(false),
42        audio_ended_(false),
43        sound_started_(false),
44        sound_ended_(false),
45        error_(SPEECH_RECOGNITION_ERROR_NONE),
46        volume_(-1.0f) {
47    // SpeechRecognizer takes ownership of sr_engine.
48    SpeechRecognitionEngine* sr_engine =
49        new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */);
50    SpeechRecognitionEngineConfig config;
51    config.audio_num_bits_per_sample =
52        SpeechRecognizerImpl::kNumBitsPerAudioSample;
53    config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate;
54    config.filter_profanities = false;
55    sr_engine->SetConfig(config);
56
57    const int kTestingSessionId = 1;
58    recognizer_ = new SpeechRecognizerImpl(
59        this, kTestingSessionId, false, false, sr_engine);
60    audio_manager_.reset(new media::MockAudioManager(
61        base::MessageLoop::current()->message_loop_proxy().get()));
62    recognizer_->SetAudioManagerForTesting(audio_manager_.get());
63
64    int audio_packet_length_bytes =
65        (SpeechRecognizerImpl::kAudioSampleRate *
66         GoogleOneShotRemoteEngine::kAudioPacketIntervalMs *
67         ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) *
68         SpeechRecognizerImpl::kNumBitsPerAudioSample) / (8 * 1000);
69    audio_packet_.resize(audio_packet_length_bytes);
70
71    const int channels =
72        ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout);
73    bytes_per_sample_ = SpeechRecognizerImpl::kNumBitsPerAudioSample / 8;
74    const int frames = audio_packet_length_bytes / channels / bytes_per_sample_;
75    audio_bus_ = media::AudioBus::Create(channels, frames);
76    audio_bus_->Zero();
77  }
78
79  void CheckEventsConsistency() {
80    // Note: "!x || y" == "x implies y".
81    EXPECT_TRUE(!recognition_ended_ || recognition_started_);
82    EXPECT_TRUE(!audio_ended_ || audio_started_);
83    EXPECT_TRUE(!sound_ended_ || sound_started_);
84    EXPECT_TRUE(!audio_started_ || recognition_started_);
85    EXPECT_TRUE(!sound_started_ || audio_started_);
86    EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_));
87    EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_));
88  }
89
90  void CheckFinalEventsConsistency() {
91    // Note: "!(x ^ y)" == "(x && y) || (!x && !x)".
92    EXPECT_FALSE(recognition_started_ ^ recognition_ended_);
93    EXPECT_FALSE(audio_started_ ^ audio_ended_);
94    EXPECT_FALSE(sound_started_ ^ sound_ended_);
95  }
96
97  // Overridden from SpeechRecognitionEventListener:
98  virtual void OnAudioStart(int session_id) OVERRIDE {
99    audio_started_ = true;
100    CheckEventsConsistency();
101  }
102
103  virtual void OnAudioEnd(int session_id) OVERRIDE {
104    audio_ended_ = true;
105    CheckEventsConsistency();
106  }
107
108  virtual void OnRecognitionResults(
109      int session_id, const SpeechRecognitionResults& results) OVERRIDE {
110    result_received_ = true;
111  }
112
113  virtual void OnRecognitionError(
114      int session_id, const SpeechRecognitionError& error) OVERRIDE {
115    EXPECT_TRUE(recognition_started_);
116    EXPECT_FALSE(recognition_ended_);
117    error_ = error.code;
118  }
119
120  virtual void OnAudioLevelsChange(int session_id, float volume,
121                                   float noise_volume) OVERRIDE {
122    volume_ = volume;
123    noise_volume_ = noise_volume;
124  }
125
126  virtual void OnRecognitionEnd(int session_id) OVERRIDE {
127    recognition_ended_ = true;
128    CheckEventsConsistency();
129  }
130
131  virtual void OnRecognitionStart(int session_id) OVERRIDE {
132    recognition_started_ = true;
133    CheckEventsConsistency();
134  }
135
136  virtual void OnEnvironmentEstimationComplete(int session_id) OVERRIDE {}
137
138  virtual void OnSoundStart(int session_id) OVERRIDE {
139    sound_started_ = true;
140    CheckEventsConsistency();
141  }
142
143  virtual void OnSoundEnd(int session_id) OVERRIDE {
144    sound_ended_ = true;
145    CheckEventsConsistency();
146  }
147
148  // testing::Test methods.
149  virtual void SetUp() OVERRIDE {
150    AudioInputController::set_factory_for_testing(
151        &audio_input_controller_factory_);
152  }
153
154  virtual void TearDown() OVERRIDE {
155    AudioInputController::set_factory_for_testing(NULL);
156  }
157
158  void CopyPacketToAudioBus() {
159    // Copy the created signal into an audio bus in a deinterleaved format.
160    audio_bus_->FromInterleaved(
161        &audio_packet_[0], audio_bus_->frames(), bytes_per_sample_);
162  }
163
164  void FillPacketWithTestWaveform() {
165    // Fill the input with a simple pattern, a 125Hz sawtooth waveform.
166    for (size_t i = 0; i < audio_packet_.size(); ++i)
167      audio_packet_[i] = static_cast<uint8>(i);
168    CopyPacketToAudioBus();
169  }
170
171  void FillPacketWithNoise() {
172    int value = 0;
173    int factor = 175;
174    for (size_t i = 0; i < audio_packet_.size(); ++i) {
175      value += factor;
176      audio_packet_[i] = value % 100;
177    }
178    CopyPacketToAudioBus();
179  }
180
181 protected:
182  base::MessageLoopForIO message_loop_;
183  BrowserThreadImpl io_thread_;
184  scoped_refptr<SpeechRecognizerImpl> recognizer_;
185  scoped_ptr<AudioManager> audio_manager_;
186  bool recognition_started_;
187  bool recognition_ended_;
188  bool result_received_;
189  bool audio_started_;
190  bool audio_ended_;
191  bool sound_started_;
192  bool sound_ended_;
193  SpeechRecognitionErrorCode error_;
194  net::TestURLFetcherFactory url_fetcher_factory_;
195  TestAudioInputControllerFactory audio_input_controller_factory_;
196  std::vector<uint8> audio_packet_;
197  scoped_ptr<media::AudioBus> audio_bus_;
198  int bytes_per_sample_;
199  float volume_;
200  float noise_volume_;
201};
202
203TEST_F(SpeechRecognizerImplTest, StopNoData) {
204  // Check for callbacks when stopping record before any audio gets recorded.
205  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
206  recognizer_->StopAudioCapture();
207  base::MessageLoop::current()->RunUntilIdle();
208  EXPECT_TRUE(recognition_started_);
209  EXPECT_FALSE(audio_started_);
210  EXPECT_FALSE(result_received_);
211  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
212  CheckFinalEventsConsistency();
213}
214
215TEST_F(SpeechRecognizerImplTest, CancelNoData) {
216  // Check for callbacks when canceling recognition before any audio gets
217  // recorded.
218  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
219  recognizer_->AbortRecognition();
220  base::MessageLoop::current()->RunUntilIdle();
221  EXPECT_TRUE(recognition_started_);
222  EXPECT_FALSE(audio_started_);
223  EXPECT_FALSE(result_received_);
224  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_);
225  CheckFinalEventsConsistency();
226}
227
228TEST_F(SpeechRecognizerImplTest, StopWithData) {
229  // Start recording, give some data and then stop. This should wait for the
230  // network callback to arrive before completion.
231  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
232  base::MessageLoop::current()->RunUntilIdle();
233  TestAudioInputController* controller =
234      audio_input_controller_factory_.controller();
235  ASSERT_TRUE(controller);
236
237  // Try sending 5 chunks of mock audio data and verify that each of them
238  // resulted immediately in a packet sent out via the network. This verifies
239  // that we are streaming out encoded data as chunks without waiting for the
240  // full recording to complete.
241  const size_t kNumChunks = 5;
242  for (size_t i = 0; i < kNumChunks; ++i) {
243    controller->event_handler()->OnData(controller, audio_bus_.get());
244    base::MessageLoop::current()->RunUntilIdle();
245    net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
246    ASSERT_TRUE(fetcher);
247    EXPECT_EQ(i + 1, fetcher->upload_chunks().size());
248  }
249
250  recognizer_->StopAudioCapture();
251  base::MessageLoop::current()->RunUntilIdle();
252  EXPECT_TRUE(audio_started_);
253  EXPECT_TRUE(audio_ended_);
254  EXPECT_FALSE(recognition_ended_);
255  EXPECT_FALSE(result_received_);
256  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
257
258  // Issue the network callback to complete the process.
259  net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
260  ASSERT_TRUE(fetcher);
261
262  fetcher->set_url(fetcher->GetOriginalURL());
263  net::URLRequestStatus status;
264  status.set_status(net::URLRequestStatus::SUCCESS);
265  fetcher->set_status(status);
266  fetcher->set_response_code(200);
267  fetcher->SetResponseString(
268      "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}");
269  fetcher->delegate()->OnURLFetchComplete(fetcher);
270  base::MessageLoop::current()->RunUntilIdle();
271  EXPECT_TRUE(recognition_ended_);
272  EXPECT_TRUE(result_received_);
273  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
274  CheckFinalEventsConsistency();
275}
276
277TEST_F(SpeechRecognizerImplTest, CancelWithData) {
278  // Start recording, give some data and then cancel.
279  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
280  base::MessageLoop::current()->RunUntilIdle();
281  TestAudioInputController* controller =
282      audio_input_controller_factory_.controller();
283  ASSERT_TRUE(controller);
284  controller->event_handler()->OnData(controller, audio_bus_.get());
285  base::MessageLoop::current()->RunUntilIdle();
286  recognizer_->AbortRecognition();
287  base::MessageLoop::current()->RunUntilIdle();
288  ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0));
289  EXPECT_TRUE(recognition_started_);
290  EXPECT_TRUE(audio_started_);
291  EXPECT_FALSE(result_received_);
292  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_);
293  CheckFinalEventsConsistency();
294}
295
296TEST_F(SpeechRecognizerImplTest, ConnectionError) {
297  // Start recording, give some data and then stop. Issue the network callback
298  // with a connection error and verify that the recognizer bubbles the error up
299  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
300  base::MessageLoop::current()->RunUntilIdle();
301  TestAudioInputController* controller =
302      audio_input_controller_factory_.controller();
303  ASSERT_TRUE(controller);
304  controller->event_handler()->OnData(controller, audio_bus_.get());
305  base::MessageLoop::current()->RunUntilIdle();
306  net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
307  ASSERT_TRUE(fetcher);
308
309  recognizer_->StopAudioCapture();
310  base::MessageLoop::current()->RunUntilIdle();
311  EXPECT_TRUE(audio_started_);
312  EXPECT_TRUE(audio_ended_);
313  EXPECT_FALSE(recognition_ended_);
314  EXPECT_FALSE(result_received_);
315  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
316
317  // Issue the network callback to complete the process.
318  fetcher->set_url(fetcher->GetOriginalURL());
319  net::URLRequestStatus status;
320  status.set_status(net::URLRequestStatus::FAILED);
321  status.set_error(net::ERR_CONNECTION_REFUSED);
322  fetcher->set_status(status);
323  fetcher->set_response_code(0);
324  fetcher->SetResponseString(std::string());
325  fetcher->delegate()->OnURLFetchComplete(fetcher);
326  base::MessageLoop::current()->RunUntilIdle();
327  EXPECT_TRUE(recognition_ended_);
328  EXPECT_FALSE(result_received_);
329  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
330  CheckFinalEventsConsistency();
331}
332
333TEST_F(SpeechRecognizerImplTest, ServerError) {
334  // Start recording, give some data and then stop. Issue the network callback
335  // with a 500 error and verify that the recognizer bubbles the error up
336  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
337  base::MessageLoop::current()->RunUntilIdle();
338  TestAudioInputController* controller =
339      audio_input_controller_factory_.controller();
340  ASSERT_TRUE(controller);
341  controller->event_handler()->OnData(controller, audio_bus_.get());
342  base::MessageLoop::current()->RunUntilIdle();
343  net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
344  ASSERT_TRUE(fetcher);
345
346  recognizer_->StopAudioCapture();
347  base::MessageLoop::current()->RunUntilIdle();
348  EXPECT_TRUE(audio_started_);
349  EXPECT_TRUE(audio_ended_);
350  EXPECT_FALSE(recognition_ended_);
351  EXPECT_FALSE(result_received_);
352  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
353
354  // Issue the network callback to complete the process.
355  fetcher->set_url(fetcher->GetOriginalURL());
356  net::URLRequestStatus status;
357  status.set_status(net::URLRequestStatus::SUCCESS);
358  fetcher->set_status(status);
359  fetcher->set_response_code(500);
360  fetcher->SetResponseString("Internal Server Error");
361  fetcher->delegate()->OnURLFetchComplete(fetcher);
362  base::MessageLoop::current()->RunUntilIdle();
363  EXPECT_TRUE(recognition_ended_);
364  EXPECT_FALSE(result_received_);
365  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
366  CheckFinalEventsConsistency();
367}
368
369TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) {
370  // Check if things tear down properly if AudioInputController threw an error.
371  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
372  base::MessageLoop::current()->RunUntilIdle();
373  TestAudioInputController* controller =
374      audio_input_controller_factory_.controller();
375  ASSERT_TRUE(controller);
376  controller->event_handler()->OnError(controller,
377      AudioInputController::UNKNOWN_ERROR);
378  base::MessageLoop::current()->RunUntilIdle();
379  EXPECT_TRUE(recognition_started_);
380  EXPECT_FALSE(audio_started_);
381  EXPECT_FALSE(result_received_);
382  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_);
383  CheckFinalEventsConsistency();
384}
385
386TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) {
387  // Check if things tear down properly if AudioInputController threw an error
388  // after giving some audio data.
389  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
390  base::MessageLoop::current()->RunUntilIdle();
391  TestAudioInputController* controller =
392      audio_input_controller_factory_.controller();
393  ASSERT_TRUE(controller);
394  controller->event_handler()->OnData(controller, audio_bus_.get());
395  controller->event_handler()->OnError(controller,
396      AudioInputController::UNKNOWN_ERROR);
397  base::MessageLoop::current()->RunUntilIdle();
398  ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0));
399  EXPECT_TRUE(recognition_started_);
400  EXPECT_TRUE(audio_started_);
401  EXPECT_FALSE(result_received_);
402  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_);
403  CheckFinalEventsConsistency();
404}
405
406TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) {
407  // Start recording and give a lot of packets with audio samples set to zero.
408  // This should trigger the no-speech detector and issue a callback.
409  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
410  base::MessageLoop::current()->RunUntilIdle();
411  TestAudioInputController* controller =
412      audio_input_controller_factory_.controller();
413  ASSERT_TRUE(controller);
414
415  int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
416                     GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1;
417  // The vector is already filled with zero value samples on create.
418  for (int i = 0; i < num_packets; ++i) {
419    controller->event_handler()->OnData(controller, audio_bus_.get());
420  }
421  base::MessageLoop::current()->RunUntilIdle();
422  EXPECT_TRUE(recognition_started_);
423  EXPECT_TRUE(audio_started_);
424  EXPECT_FALSE(result_received_);
425  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_);
426  CheckFinalEventsConsistency();
427}
428
429TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
430  // Start recording and give a lot of packets with audio samples set to zero
431  // and then some more with reasonably loud audio samples. This should be
432  // treated as normal speech input and the no-speech detector should not get
433  // triggered.
434  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
435  base::MessageLoop::current()->RunUntilIdle();
436  TestAudioInputController* controller =
437      audio_input_controller_factory_.controller();
438  ASSERT_TRUE(controller);
439  controller = audio_input_controller_factory_.controller();
440  ASSERT_TRUE(controller);
441
442  int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
443                     GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
444
445  // The vector is already filled with zero value samples on create.
446  for (int i = 0; i < num_packets / 2; ++i) {
447    controller->event_handler()->OnData(controller, audio_bus_.get());
448  }
449
450  FillPacketWithTestWaveform();
451  for (int i = 0; i < num_packets / 2; ++i) {
452    controller->event_handler()->OnData(controller, audio_bus_.get());
453  }
454
455  base::MessageLoop::current()->RunUntilIdle();
456  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
457  EXPECT_TRUE(audio_started_);
458  EXPECT_FALSE(audio_ended_);
459  EXPECT_FALSE(recognition_ended_);
460  recognizer_->AbortRecognition();
461  base::MessageLoop::current()->RunUntilIdle();
462  CheckFinalEventsConsistency();
463}
464
465TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
466  // Start recording and give a lot of packets with audio samples set to zero
467  // and then some more with reasonably loud audio samples. Check that we don't
468  // get the callback during estimation phase, then get zero for the silence
469  // samples and proper volume for the loud audio.
470  recognizer_->StartRecognition(media::AudioManagerBase::kDefaultDeviceId);
471  base::MessageLoop::current()->RunUntilIdle();
472  TestAudioInputController* controller =
473      audio_input_controller_factory_.controller();
474  ASSERT_TRUE(controller);
475  controller = audio_input_controller_factory_.controller();
476  ASSERT_TRUE(controller);
477
478  // Feed some samples to begin with for the endpointer to do noise estimation.
479  int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs /
480                    GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
481  FillPacketWithNoise();
482  for (int i = 0; i < num_packets; ++i) {
483    controller->event_handler()->OnData(controller, audio_bus_.get());
484  }
485  base::MessageLoop::current()->RunUntilIdle();
486  EXPECT_EQ(-1.0f, volume_);  // No audio volume set yet.
487
488  // The vector is already filled with zero value samples on create.
489  controller->event_handler()->OnData(controller, audio_bus_.get());
490  base::MessageLoop::current()->RunUntilIdle();
491  EXPECT_FLOAT_EQ(0.74939233f, volume_);
492
493  FillPacketWithTestWaveform();
494  controller->event_handler()->OnData(controller, audio_bus_.get());
495  base::MessageLoop::current()->RunUntilIdle();
496  EXPECT_NEAR(0.89926866f, volume_, 0.00001f);
497  EXPECT_FLOAT_EQ(0.75071919f, noise_volume_);
498
499  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
500  EXPECT_FALSE(audio_ended_);
501  EXPECT_FALSE(recognition_ended_);
502  recognizer_->AbortRecognition();
503  base::MessageLoop::current()->RunUntilIdle();
504  CheckFinalEventsConsistency();
505}
506
507}  // namespace content
508