google_one_shot_remote_engine.cc revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/google_one_shot_remote_engine.h"

#include <vector>

#include "base/json/json_reader.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/values.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "google_apis/google_api_keys.h"
#include "net/base/escape.h"
#include "net/base/load_flags.h"
#include "net/url_request/http_user_agent_settings.h"
#include "net/url_request/url_fetcher.h"
#include "net/url_request/url_request_context.h"
#include "net/url_request/url_request_context_getter.h"
#include "net/url_request/url_request_status.h"

namespace content {
namespace {

const char* const kDefaultSpeechRecognitionUrl =
    "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";
const char* const kStatusString = "status";
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
const char* const kConfidenceString = "confidence";
const int kWebServiceStatusNoError = 0;
const int kWebServiceStatusNoSpeech = 4;
const int kWebServiceStatusNoMatch = 5;
const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;

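// Illustrative shape of a response body that ParseServerResponse() below can
// handle (an assumed example, not captured from the live service):
//
//   {
//     "status": 0,
//     "hypotheses": [
//       { "utterance": "hello world", "confidence": 0.9 }
//     ]
//   }
//
// A "status" of 0 indicates success; 4 and 5 map to the no-speech and
// no-match errors handled below.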
bool ParseServerResponse(const std::string& response_body,
                         SpeechRecognitionResult* result,
                         SpeechRecognitionError* error) {
  if (response_body.empty()) {
    LOG(WARNING) << "ParseServerResponse: Response was empty.";
    return false;
  }
  DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;

  // Parse the response, ignoring comments.
  std::string error_msg;
  scoped_ptr<base::Value> response_value(base::JSONReader::ReadAndReturnError(
      response_body, base::JSON_PARSE_RFC, NULL, &error_msg));
  if (response_value == NULL) {
    LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg;
    return false;
  }

  if (!response_value->IsType(base::Value::TYPE_DICTIONARY)) {
    VLOG(1) << "ParseServerResponse: Unexpected response type "
            << response_value->GetType();
    return false;
  }
  const base::DictionaryValue* response_object =
      static_cast<const base::DictionaryValue*>(response_value.get());

  // Get the status.
  int status;
  if (!response_object->GetInteger(kStatusString, &status)) {
    VLOG(1) << "ParseServerResponse: " << kStatusString
            << " is not a valid integer value.";
    return false;
  }

  // Process the status.
  switch (status) {
    case kWebServiceStatusNoError:
      break;
    case kWebServiceStatusNoSpeech:
      error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH;
      return false;
    case kWebServiceStatusNoMatch:
      error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH;
      return false;
    default:
      error->code = SPEECH_RECOGNITION_ERROR_NETWORK;
      // Other status codes should not be returned by the server.
      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
      return false;
  }

  // Get the hypotheses.
  const base::Value* hypotheses_value = NULL;
  if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
    VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
    return false;
  }

  DCHECK(hypotheses_value);
  if (!hypotheses_value->IsType(base::Value::TYPE_LIST)) {
    VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
            << hypotheses_value->GetType();
    return false;
  }

  const base::ListValue* hypotheses_list =
      static_cast<const base::ListValue*>(hypotheses_value);

  // For now only single-shot recognition is supported, so we return a single
  // final result consisting of one fragment (with one or more hypotheses).
  size_t index = 0;
  for (; index < hypotheses_list->GetSize(); ++index) {
    const base::Value* hypothesis = NULL;
    if (!hypotheses_list->Get(index, &hypothesis)) {
      LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";
      break;
    }
    DCHECK(hypothesis);
    if (!hypothesis->IsType(base::Value::TYPE_DICTIONARY)) {
      LOG(WARNING) << "ParseServerResponse: Unexpected value type "
                   << hypothesis->GetType();
      break;
    }

    const base::DictionaryValue* hypothesis_value =
        static_cast<const base::DictionaryValue*>(hypothesis);
    base::string16 utterance;

    if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
      LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
      break;
    }

    // It is not an error if the 'confidence' field is missing.
    double confidence = 0.0;
    hypothesis_value->GetDouble(kConfidenceString, &confidence);
    result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
                                                             confidence));
  }

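  // If the loop above exited early because of a malformed hypothesis entry,
  // discard any partially collected hypotheses and report a parse failure.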
  if (index < hypotheses_list->GetSize()) {
    result->hypotheses.clear();
    return false;
  }
  return true;
}

}  // namespace

const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;

GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
    net::URLRequestContextGetter* context)
    : url_context_(context) {
}

GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}

void GoogleOneShotRemoteEngine::SetConfig(
    const SpeechRecognitionEngineConfig& config) {
  config_ = config;
}

void GoogleOneShotRemoteEngine::StartRecognition() {
  DCHECK(delegate());
  DCHECK(!url_fetcher_.get());
  std::string lang_param = config_.language;

  if (lang_param.empty() && url_context_.get()) {
    // If no language is provided, use the first entry of the accept-language
    // list; if that list is empty too, we fall back to "en-US" below.
    // Example contents of this list: "es,en-GB;q=0.8", "".
    net::URLRequestContext* request_context =
        url_context_->GetURLRequestContext();
    DCHECK(request_context);
    // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with
    // a reference to the HttpUserAgentSettings rather than accessing the
    // accept language through the URLRequestContext.
    if (request_context->http_user_agent_settings()) {
      std::string accepted_language_list =
          request_context->http_user_agent_settings()->GetAcceptLanguage();
      size_t separator = accepted_language_list.find_first_of(",;");
      lang_param = accepted_language_list.substr(0, separator);
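      // e.g. an accept-language list of "es,en-GB;q=0.8" yields "es" here.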
    }
  }

  if (lang_param.empty())
    lang_param = "en-US";

  std::vector<std::string> parts;
  parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

  if (!config_.grammars.empty()) {
    DCHECK_EQ(config_.grammars.size(), 1U);
    parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammars[0].url,
                                                       true));
  }

  if (!config_.hardware_info.empty())
    parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
                                                        true));
  parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses));
  parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

  std::string api_key = google_apis::GetAPIKey();
  parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true));

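  // Illustrative example of a resulting request URL (parameter values depend
  // on the configuration; the key below is a placeholder, not a real value):
  //   https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&
  //   lang=en-US&maxresults=1&pfilter=2&key=<api-key>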
  GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

  encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
                                      config_.audio_sample_rate,
                                      config_.audio_num_bits_per_sample));
  DCHECK(encoder_.get());
  url_fetcher_.reset(net::URLFetcher::Create(url_fetcher_id_for_tests,
                                             url,
                                             net::URLFetcher::POST,
                                             this));
  url_fetcher_->SetChunkedUpload(encoder_->mime_type());
  url_fetcher_->SetRequestContext(url_context_.get());
  url_fetcher_->SetReferrer(config_.origin_url);

  // The speech recognition API does not require user identification as part
  // of its requests, so we don't send cookies or auth data with them. This
  // avoids accidentally associating users who are signed in to the domain for
  // other services (e.g. bookmark sync) with their speech requests.
  url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
                             net::LOAD_DO_NOT_SEND_COOKIES |
                             net::LOAD_DO_NOT_SEND_AUTH_DATA);
  url_fetcher_->Start();
}

void GoogleOneShotRemoteEngine::EndRecognition() {
  url_fetcher_.reset();
}

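// Encodes each captured audio chunk and streams the encoded bytes to the
// server as a non-final chunk of the chunked upload started in
// StartRecognition().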
void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
  DCHECK(url_fetcher_.get());
  DCHECK(encoder_.get());
  DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
  encoder_->Encode(data);
  scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
  url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
}

void GoogleOneShotRemoteEngine::AudioChunksEnded() {
  DCHECK(url_fetcher_.get());
  DCHECK(encoder_.get());

  // AppendChunkToUpload() requires a non-empty final chunk, so we encode a
  // packet of silence in case the encoder has no pending data.
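  // For example, at a 16000 Hz sample rate this allocates
  // 16000 * 100 / 1000 = 1600 samples of 16-bit silence.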
  std::vector<int16> samples(
      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
  scoped_refptr<AudioChunk> dummy_chunk(
      new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
                     samples.size() * sizeof(int16),
                     encoder_->bits_per_sample() / 8));
  encoder_->Encode(*dummy_chunk.get());
  encoder_->Flush();
  scoped_refptr<AudioChunk> encoded_dummy_data(
      encoder_->GetEncodedDataAndClear());
  DCHECK(!encoded_dummy_data->IsEmpty());
  encoder_.reset();

  url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
}

void GoogleOneShotRemoteEngine::OnURLFetchComplete(
    const net::URLFetcher* source) {
  DCHECK_EQ(url_fetcher_.get(), source);
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK);
  std::string data;

  // The default error in case of failure is SPEECH_RECOGNITION_ERROR_NETWORK;
  // ParseServerResponse() can replace it with a more specific error.
  bool error_occurred = (!source->GetStatus().is_success() ||
                        source->GetResponseCode() != 200 ||
                        !source->GetResponseAsString(&data) ||
                        !ParseServerResponse(data, &result, &error));
  url_fetcher_.reset();
  if (error_occurred) {
    DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
    delegate()->OnSpeechRecognitionEngineError(error);
  } else {
    DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
    delegate()->OnSpeechRecognitionEngineResults(results);
  }
}

bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
  return url_fetcher_ != NULL;
}

int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}

}  // namespace content