google_streaming_remote_engine_unittest.cc revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <queue>
6
7#include "base/memory/scoped_ptr.h"
8#include "base/message_loop.h"
9#include "base/utf_string_conversions.h"
10#include "content/browser/speech/audio_buffer.h"
11#include "content/browser/speech/google_streaming_remote_engine.h"
12#include "content/browser/speech/proto/google_streaming_api.pb.h"
13#include "content/public/common/speech_recognition_error.h"
14#include "content/public/common/speech_recognition_result.h"
15#include "net/url_request/test_url_fetcher_factory.h"
16#include "net/url_request/url_request_context_getter.h"
17#include "net/url_request/url_request_status.h"
18#include "testing/gtest/include/gtest/gtest.h"
19
20using net::URLRequestStatus;
21using net::TestURLFetcher;
22using net::TestURLFetcherFactory;
23
24namespace content {
25
26// Note: the terms upstream and downstream are from the point-of-view of the
27// client (engine_under_test_).
28
29class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate,
30                                        public testing::Test {
31 public:
32  GoogleStreamingRemoteEngineTest()
33      : last_number_of_upstream_chunks_seen_(0U),
34        error_(SPEECH_RECOGNITION_ERROR_NONE) { }
35
36  // Creates a speech recognition request and invokes its URL fetcher delegate
37  // with the given test data.
38  void CreateAndTestRequest(bool success, const std::string& http_response);
39
40  // SpeechRecognitionRequestDelegate methods.
41  virtual void OnSpeechRecognitionEngineResults(
42      const SpeechRecognitionResults& results) OVERRIDE {
43    results_.push(results);
44  }
45  virtual void OnSpeechRecognitionEngineError(
46      const SpeechRecognitionError& error) OVERRIDE {
47    error_ = error.code;
48  }
49
50  // testing::Test methods.
51  virtual void SetUp() OVERRIDE;
52  virtual void TearDown() OVERRIDE;
53
54 protected:
55  enum DownstreamError {
56    DOWNSTREAM_ERROR_NONE,
57    DOWNSTREAM_ERROR_HTTP500,
58    DOWNSTREAM_ERROR_NETWORK,
59    DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH
60  };
61  static bool ResultsAreEqual(const SpeechRecognitionResults& a,
62                              const SpeechRecognitionResults& b);
63  static std::string SerializeProtobufResponse(
64      const proto::SpeechRecognitionEvent& msg);
65  static std::string ToBigEndian32(uint32 value);
66
67  TestURLFetcher* GetUpstreamFetcher();
68  TestURLFetcher* GetDownstreamFetcher();
69  void StartMockRecognition();
70  void EndMockRecognition();
71  void InjectDummyAudioChunk();
72  size_t UpstreamChunksUploadedFromLastCall();
73  void ProvideMockProtoResultDownstream(
74      const proto::SpeechRecognitionEvent& result);
75  void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
76  void ExpectResultsReceived(const SpeechRecognitionResults& result);
77  void CloseMockDownstream(DownstreamError error);
78
79  scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
80  TestURLFetcherFactory url_fetcher_factory_;
81  size_t last_number_of_upstream_chunks_seen_;
82  MessageLoop message_loop_;
83  std::string response_buffer_;
84  SpeechRecognitionErrorCode error_;
85  std::queue<SpeechRecognitionResults> results_;
86};
87
88TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) {
89  StartMockRecognition();
90  ASSERT_TRUE(GetUpstreamFetcher());
91  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
92
93  // Inject some dummy audio chunks and check a corresponding chunked upload
94  // is performed every time on the server.
95  for (int i = 0; i < 3; ++i) {
96    InjectDummyAudioChunk();
97    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
98  }
99
100  // Ensure that a final (empty) audio chunk is uploaded on chunks end.
101  engine_under_test_->AudioChunksEnded();
102  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
103  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
104
105  // Simulate a protobuf message streamed from the server containing a single
106  // result with two hypotheses.
107  SpeechRecognitionResults results;
108  results.push_back(SpeechRecognitionResult());
109  SpeechRecognitionResult& result = results.back();
110  result.is_provisional = false;
111  result.hypotheses.push_back(
112      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 1"), 0.1F));
113  result.hypotheses.push_back(
114      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 2"), 0.2F));
115
116  ProvideMockResultDownstream(result);
117  ExpectResultsReceived(results);
118  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
119
120  // Ensure everything is closed cleanly after the downstream is closed.
121  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
122  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
123  EndMockRecognition();
124  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
125  ASSERT_EQ(0U, results_.size());
126}
127
128TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) {
129  StartMockRecognition();
130  ASSERT_TRUE(GetUpstreamFetcher());
131  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
132
133  for (int i = 0; i < 4; ++i) {
134    InjectDummyAudioChunk();
135    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
136
137    SpeechRecognitionResults results;
138    results.push_back(SpeechRecognitionResult());
139    SpeechRecognitionResult& result = results.back();
140    result.is_provisional = (i % 2 == 0);  // Alternate result types.
141    float confidence = result.is_provisional ? 0.0F : (i * 0.1F);
142    result.hypotheses.push_back(
143        SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), confidence));
144
145    ProvideMockResultDownstream(result);
146    ExpectResultsReceived(results);
147    ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
148  }
149
150  // Ensure that a final (empty) audio chunk is uploaded on chunks end.
151  engine_under_test_->AudioChunksEnded();
152  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
153  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
154
155  // Simulate a final definitive result.
156  SpeechRecognitionResults results;
157  results.push_back(SpeechRecognitionResult());
158  SpeechRecognitionResult& result = results.back();
159  result.is_provisional = false;
160  result.hypotheses.push_back(
161      SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 1.0F));
162  ProvideMockResultDownstream(result);
163  ExpectResultsReceived(results);
164  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
165
166  // Ensure everything is closed cleanly after the downstream is closed.
167  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
168  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
169  EndMockRecognition();
170  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
171  ASSERT_EQ(0U, results_.size());
172}
173
174TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) {
175  StartMockRecognition();
176  ASSERT_TRUE(GetUpstreamFetcher());
177  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
178
179  // Simulate one pushed audio chunk.
180  InjectDummyAudioChunk();
181  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
182
183  // Simulate the corresponding definitive result.
184  SpeechRecognitionResults results;
185  results.push_back(SpeechRecognitionResult());
186  SpeechRecognitionResult& result = results.back();
187  result.hypotheses.push_back(
188      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), 1.0F));
189  ProvideMockResultDownstream(result);
190  ExpectResultsReceived(results);
191  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
192
193  // Simulate a silent downstream closure after |AudioChunksEnded|.
194  engine_under_test_->AudioChunksEnded();
195  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
196  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
197  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
198
199  // Expect an empty result, aimed at notifying recognition ended with no
200  // actual results nor errors.
201  SpeechRecognitionResults empty_results;
202  ExpectResultsReceived(empty_results);
203
204  // Ensure everything is closed cleanly after the downstream is closed.
205  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
206  EndMockRecognition();
207  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
208  ASSERT_EQ(0U, results_.size());
209}
210
211TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) {
212  StartMockRecognition();
213  ASSERT_TRUE(GetUpstreamFetcher());
214  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
215
216  for (int i = 0; i < 3; ++i)
217    InjectDummyAudioChunk();
218  engine_under_test_->AudioChunksEnded();
219  ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall());
220  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
221
222  // Simulate only a provisional result.
223  SpeechRecognitionResults results;
224  results.push_back(SpeechRecognitionResult());
225  SpeechRecognitionResult& result = results.back();
226  result.is_provisional = true;
227  result.hypotheses.push_back(
228      SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 0.0F));
229  ProvideMockResultDownstream(result);
230  ExpectResultsReceived(results);
231  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
232
233  CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH);
234
235  // Expect an empty result.
236  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
237  EndMockRecognition();
238  SpeechRecognitionResults empty_result;
239  ExpectResultsReceived(empty_result);
240}
241
242TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) {
243  StartMockRecognition();
244  ASSERT_TRUE(GetUpstreamFetcher());
245  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
246
247  InjectDummyAudioChunk();
248  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
249
250  // Close the downstream with a HTTP 500 error.
251  CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500);
252
253  // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
254  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
255  EndMockRecognition();
256  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
257  ASSERT_EQ(0U, results_.size());
258}
259
260TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) {
261  StartMockRecognition();
262  ASSERT_TRUE(GetUpstreamFetcher());
263  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
264
265  InjectDummyAudioChunk();
266  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
267
268  // Close the downstream fetcher simulating a network failure.
269  CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK);
270
271  // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
272  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
273  EndMockRecognition();
274  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
275  ASSERT_EQ(0U, results_.size());
276}
277
278TEST_F(GoogleStreamingRemoteEngineTest, Stability) {
279  StartMockRecognition();
280  ASSERT_TRUE(GetUpstreamFetcher());
281  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
282
283  // Upload a dummy audio chunk.
284  InjectDummyAudioChunk();
285  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
286  engine_under_test_->AudioChunksEnded();
287
288  // Simulate a protobuf message with an intermediate result without confidence,
289  // but with stability.
290  proto::SpeechRecognitionEvent proto_event;
291  proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
292  proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
293  proto_result->set_stability(0.5);
294  proto::SpeechRecognitionAlternative *proto_alternative =
295      proto_result->add_alternative();
296  proto_alternative->set_transcript("foo");
297  ProvideMockProtoResultDownstream(proto_event);
298
299  // Set up expectations.
300  SpeechRecognitionResults results;
301  results.push_back(SpeechRecognitionResult());
302  SpeechRecognitionResult& result = results.back();
303  result.is_provisional = true;
304  result.hypotheses.push_back(
305      SpeechRecognitionHypothesis(UTF8ToUTF16("foo"), 0.5));
306
307  // Check that the protobuf generated the expected result.
308  ExpectResultsReceived(results);
309
310  // Since it was a provisional result, recognition is still pending.
311  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
312
313  // Shut down.
314  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
315  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
316  EndMockRecognition();
317
318  // Since there was no final result, we get an empty "no match" result.
319  SpeechRecognitionResults empty_result;
320  ExpectResultsReceived(empty_result);
321  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
322  ASSERT_EQ(0U, results_.size());
323}
324
325void GoogleStreamingRemoteEngineTest::SetUp() {
326  engine_under_test_.reset(
327      new  GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
328  engine_under_test_->set_delegate(this);
329}
330
331void GoogleStreamingRemoteEngineTest::TearDown() {
332  engine_under_test_.reset();
333}
334
335TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() {
336  return url_fetcher_factory_.GetFetcherByID(
337        GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTests);
338}
339
340TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() {
341  return url_fetcher_factory_.GetFetcherByID(
342        GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTests);
343}
344
345// Starts recognition on the engine, ensuring that both stream fetchers are
346// created.
347void GoogleStreamingRemoteEngineTest::StartMockRecognition() {
348  DCHECK(engine_under_test_.get());
349
350  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
351
352  engine_under_test_->StartRecognition();
353  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
354
355  TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
356  ASSERT_TRUE(upstream_fetcher);
357  upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL());
358
359  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
360  ASSERT_TRUE(downstream_fetcher);
361  downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL());
362}
363
364void GoogleStreamingRemoteEngineTest::EndMockRecognition() {
365  DCHECK(engine_under_test_.get());
366  engine_under_test_->EndRecognition();
367  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
368
369  // TODO(primiano): In order to be very pedantic we should check that both the
370  // upstream and downstream URL fetchers have been disposed at this time.
371  // Unfortunately it seems that there is no direct way to detect (in tests)
372  // if a url_fetcher has been freed or not, since they are not automatically
373  // de-registered from the TestURLFetcherFactory on destruction.
374}
375
376void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() {
377  unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'};
378  scoped_refptr<AudioChunk> dummy_audio_chunk(
379      new AudioChunk(&dummy_audio_buffer_data[0],
380                     sizeof(dummy_audio_buffer_data),
381                     2 /* bytes per sample */));
382  DCHECK(engine_under_test_.get());
383  engine_under_test_->TakeAudioChunk(*dummy_audio_chunk);
384}
385
386size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
387  TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
388  DCHECK(upstream_fetcher);
389  const size_t number_of_chunks = upstream_fetcher->upload_chunks().size();
390  DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_);
391  const size_t new_chunks = number_of_chunks -
392                            last_number_of_upstream_chunks_seen_;
393  last_number_of_upstream_chunks_seen_ = number_of_chunks;
394  return new_chunks;
395}
396
397void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream(
398    const proto::SpeechRecognitionEvent& result) {
399  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
400
401  ASSERT_TRUE(downstream_fetcher);
402  downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */));
403  downstream_fetcher->set_response_code(200);
404
405  std::string response_string = SerializeProtobufResponse(result);
406  response_buffer_.append(response_string);
407  downstream_fetcher->SetResponseString(response_buffer_);
408  downstream_fetcher->delegate()->OnURLFetchDownloadProgress(
409      downstream_fetcher,
410      response_buffer_.size(),
411      -1 /* total response length not used */);
412}
413
414void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream(
415    const SpeechRecognitionResult& result) {
416  proto::SpeechRecognitionEvent proto_event;
417  proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
418  proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
419  proto_result->set_final(!result.is_provisional);
420  for (size_t i = 0; i < result.hypotheses.size(); ++i) {
421    proto::SpeechRecognitionAlternative* proto_alternative =
422        proto_result->add_alternative();
423    const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i];
424    proto_alternative->set_confidence(hypothesis.confidence);
425    proto_alternative->set_transcript(UTF16ToUTF8(hypothesis.utterance));
426  }
427  ProvideMockProtoResultDownstream(proto_event);
428}
429
430void GoogleStreamingRemoteEngineTest::CloseMockDownstream(
431    DownstreamError error) {
432  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
433  ASSERT_TRUE(downstream_fetcher);
434
435  const URLRequestStatus::Status fetcher_status =
436      (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED :
437                                            URLRequestStatus::SUCCESS;
438  downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0));
439  downstream_fetcher->set_response_code(
440      (error == DOWNSTREAM_ERROR_HTTP500) ? 500 : 200);
441
442  if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) {
443    // Send empty response.
444    proto::SpeechRecognitionEvent response;
445    response_buffer_.append(SerializeProtobufResponse(response));
446  }
447  downstream_fetcher->SetResponseString(response_buffer_);
448  downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher);
449}
450
451void GoogleStreamingRemoteEngineTest::ExpectResultsReceived(
452    const SpeechRecognitionResults& results) {
453  ASSERT_GE(1U, results_.size());
454  ASSERT_TRUE(ResultsAreEqual(results, results_.front()));
455  results_.pop();
456}
457
458bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
459    const SpeechRecognitionResults& a, const SpeechRecognitionResults& b) {
460  if (a.size() != b.size())
461    return false;
462
463  SpeechRecognitionResults::const_iterator it_a = a.begin();
464  SpeechRecognitionResults::const_iterator it_b = b.begin();
465  for (; it_a != a.end() && it_b != b.end(); ++it_a, ++it_b) {
466    if (it_a->is_provisional != it_b->is_provisional ||
467        it_a->hypotheses.size() != it_b->hypotheses.size()) {
468      return false;
469    }
470    for (size_t i = 0; i < it_a->hypotheses.size(); ++i) {
471      const SpeechRecognitionHypothesis& hyp_a = it_a->hypotheses[i];
472      const SpeechRecognitionHypothesis& hyp_b = it_b->hypotheses[i];
473      if (hyp_a.utterance != hyp_b.utterance ||
474          hyp_a.confidence != hyp_b.confidence) {
475        return false;
476      }
477    }
478  }
479
480  return true;
481}
482
483std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
484    const proto::SpeechRecognitionEvent& msg) {
485  std::string msg_string;
486  msg.SerializeToString(&msg_string);
487
488  // Prepend 4 byte prefix length indication to the protobuf message as
489  // envisaged by the google streaming recognition webservice protocol.
490  msg_string.insert(0, ToBigEndian32(msg_string.size()));
491  return msg_string;
492}
493
494std::string GoogleStreamingRemoteEngineTest::ToBigEndian32(uint32 value) {
495  char raw_data[4];
496  raw_data[0] = static_cast<uint8>((value >> 24) & 0xFF);
497  raw_data[1] = static_cast<uint8>((value >> 16) & 0xFF);
498  raw_data[2] = static_cast<uint8>((value >> 8) & 0xFF);
499  raw_data[3] = static_cast<uint8>(value & 0xFF);
500  return std::string(raw_data, sizeof(raw_data));
501}
502
503}  // namespace content
504