google_one_shot_remote_engine.cc revision 868fa2fe829687343ffae624259930155e16dbd8
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "content/browser/speech/google_one_shot_remote_engine.h"
6
7#include <vector>
8
9#include "base/json/json_reader.h"
10#include "base/strings/string_number_conversions.h"
11#include "base/strings/string_util.h"
12#include "base/values.h"
13#include "content/browser/speech/audio_buffer.h"
14#include "content/public/common/speech_recognition_error.h"
15#include "content/public/common/speech_recognition_result.h"
16#include "google_apis/google_api_keys.h"
17#include "net/base/escape.h"
18#include "net/base/load_flags.h"
19#include "net/url_request/url_fetcher.h"
20#include "net/url_request/url_request_context.h"
21#include "net/url_request/url_request_context_getter.h"
22#include "net/url_request/url_request_status.h"
23
24namespace content {
25namespace {
26
// Endpoint of the legacy Google one-shot speech webservice. Query parameters
// are appended after the trailing '&' in StartRecognition().
const char* const kDefaultSpeechRecognitionUrl =
    "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";
// JSON attribute names expected in the webservice response.
const char* const kStatusString = "status";
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
const char* const kConfidenceString = "confidence";
// Values of the "status" attribute that this client understands; any other
// value is mapped to a generic network error by ParseServerResponse().
const int kWebServiceStatusNoError = 0;
const int kWebServiceStatusNoSpeech = 4;
const int kWebServiceStatusNoMatch = 5;
// Audio is compressed to FLAC before being uploaded.
const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;
37
38bool ParseServerResponse(const std::string& response_body,
39                         SpeechRecognitionResult* result,
40                         SpeechRecognitionError* error) {
41  if (response_body.empty()) {
42    LOG(WARNING) << "ParseServerResponse: Response was empty.";
43    return false;
44  }
45  DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;
46
47  // Parse the response, ignoring comments.
48  std::string error_msg;
49  scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError(
50      response_body, base::JSON_PARSE_RFC, NULL, &error_msg));
51  if (response_value == NULL) {
52    LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg;
53    return false;
54  }
55
56  if (!response_value->IsType(Value::TYPE_DICTIONARY)) {
57    VLOG(1) << "ParseServerResponse: Unexpected response type "
58            << response_value->GetType();
59    return false;
60  }
61  const DictionaryValue* response_object =
62      static_cast<const DictionaryValue*>(response_value.get());
63
64  // Get the status.
65  int status;
66  if (!response_object->GetInteger(kStatusString, &status)) {
67    VLOG(1) << "ParseServerResponse: " << kStatusString
68            << " is not a valid integer value.";
69    return false;
70  }
71
72  // Process the status.
73  switch (status) {
74    case kWebServiceStatusNoError:
75      break;
76    case kWebServiceStatusNoSpeech:
77      error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH;
78      return false;
79    case kWebServiceStatusNoMatch:
80      error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH;
81      return false;
82    default:
83      error->code = SPEECH_RECOGNITION_ERROR_NETWORK;
84      // Other status codes should not be returned by the server.
85      VLOG(1) << "ParseServerResponse: unexpected status code " << status;
86      return false;
87  }
88
89  // Get the hypotheses.
90  const Value* hypotheses_value = NULL;
91  if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
92    VLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
93    return false;
94  }
95
96  DCHECK(hypotheses_value);
97  if (!hypotheses_value->IsType(Value::TYPE_LIST)) {
98    VLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
99            << hypotheses_value->GetType();
100    return false;
101  }
102
103  const ListValue* hypotheses_list =
104      static_cast<const ListValue*>(hypotheses_value);
105
106  // For now we support only single shot recognition, so we are giving only a
107  // final result, consisting of one fragment (with one or more hypotheses).
108  size_t index = 0;
109  for (; index < hypotheses_list->GetSize(); ++index) {
110    const Value* hypothesis = NULL;
111    if (!hypotheses_list->Get(index, &hypothesis)) {
112      LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";
113      break;
114    }
115    DCHECK(hypothesis);
116    if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) {
117      LOG(WARNING) << "ParseServerResponse: Unexpected value type "
118                   << hypothesis->GetType();
119      break;
120    }
121
122    const DictionaryValue* hypothesis_value =
123        static_cast<const DictionaryValue*>(hypothesis);
124    string16 utterance;
125
126    if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
127      LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
128      break;
129    }
130
131    // It is not an error if the 'confidence' field is missing.
132    double confidence = 0.0;
133    hypothesis_value->GetDouble(kConfidenceString, &confidence);
134    result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
135                                                             confidence));
136  }
137
138  if (index < hypotheses_list->GetSize()) {
139    result->hypotheses.clear();
140    return false;
141  }
142  return true;
143}
144
145}  // namespace
146
// Duration of each audio packet uploaded to the server; also the chunk
// duration requested from callers via GetDesiredAudioChunkDurationMs().
const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
// Lets tests identify the URLFetcher created by this engine.
int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;
149
// |context| supplies the URLRequestContext used for the upload and, when no
// language is configured, the accept-language list used as a fallback.
GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
    net::URLRequestContextGetter* context)
    : url_context_(context) {
}
154
155GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}
156
// Stores the recognition parameters (language, grammars, sample rate, etc.)
// that the next StartRecognition() call will use.
void GoogleOneShotRemoteEngine::SetConfig(
    const SpeechRecognitionEngineConfig& config) {
  config_ = config;
}
161
// Builds the webservice URL from |config_|, creates the FLAC encoder and
// starts a chunked POST upload. Audio is then streamed through
// TakeAudioChunk() and finalized by AudioChunksEnded(); the response is
// handled in OnURLFetchComplete().
void GoogleOneShotRemoteEngine::StartRecognition() {
  DCHECK(delegate());
  DCHECK(!url_fetcher_.get());
  std::string lang_param = config_.language;

  if (lang_param.empty() && url_context_.get()) {
    // If no language is provided then we use the first from the accepted
    // language list. If this list is empty then it defaults to "en-US".
    // Example of the contents of this list: "es,en-GB;q=0.8", ""
    net::URLRequestContext* request_context =
        url_context_->GetURLRequestContext();
    DCHECK(request_context);
    // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with
    // a reference to the HttpUserAgentSettings rather than accessing the
    // accept language through the URLRequestContext.
    std::string accepted_language_list = request_context->GetAcceptLanguage();
    size_t separator = accepted_language_list.find_first_of(",;");
    lang_param = accepted_language_list.substr(0, separator);
  }

  if (lang_param.empty())
    lang_param = "en-US";

  // Assemble the query-string parameters; each part is escaped individually.
  std::vector<std::string> parts;
  parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

  if (!config_.grammars.empty()) {
    // Only a single grammar is supported by this engine.
    DCHECK_EQ(config_.grammars.size(), 1U);
    parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammars[0].url,
                                                       true));
  }

  if (!config_.hardware_info.empty())
    parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
                                                        true));
  parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses));
  parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

  std::string api_key = google_apis::GetAPIKey();
  parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true));

  GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&'));

  // The encoder compresses raw audio chunks before they are uploaded.
  encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
                                      config_.audio_sample_rate,
                                      config_.audio_num_bits_per_sample));
  DCHECK(encoder_.get());
  url_fetcher_.reset(net::URLFetcher::Create(url_fetcher_id_for_tests,
                                             url,
                                             net::URLFetcher::POST,
                                             this));
  url_fetcher_->SetChunkedUpload(encoder_->mime_type());
  url_fetcher_->SetRequestContext(url_context_.get());
  url_fetcher_->SetReferrer(config_.origin_url);

  // The speech recognition API does not require user identification as part
  // of requests, so we don't send cookies or auth data for these requests to
  // prevent any accidental connection between users who are logged into the
  // domain for other services (e.g. bookmark sync) with the speech requests.
  url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
                             net::LOAD_DO_NOT_SEND_COOKIES |
                             net::LOAD_DO_NOT_SEND_AUTH_DATA);
  url_fetcher_->Start();
}
226
// Aborts any in-flight request: destroying the URLFetcher cancels it.
void GoogleOneShotRemoteEngine::EndRecognition() {
  url_fetcher_.reset();
}
230
231void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
232  DCHECK(url_fetcher_.get());
233  DCHECK(encoder_.get());
234  DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
235  encoder_->Encode(data);
236  scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
237  url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
238}
239
240void GoogleOneShotRemoteEngine::AudioChunksEnded() {
241  DCHECK(url_fetcher_.get());
242  DCHECK(encoder_.get());
243
244  // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
245  // of silence in case encoder had no data already.
246  std::vector<int16> samples(
247      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
248  scoped_refptr<AudioChunk> dummy_chunk(
249      new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
250                     samples.size() * sizeof(int16),
251                     encoder_->bits_per_sample() / 8));
252  encoder_->Encode(*dummy_chunk.get());
253  encoder_->Flush();
254  scoped_refptr<AudioChunk> encoded_dummy_data(
255      encoder_->GetEncodedDataAndClear());
256  DCHECK(!encoded_dummy_data->IsEmpty());
257  encoder_.reset();
258
259  url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
260}
261
262void GoogleOneShotRemoteEngine::OnURLFetchComplete(
263    const net::URLFetcher* source) {
264  DCHECK_EQ(url_fetcher_.get(), source);
265  SpeechRecognitionResults results;
266  results.push_back(SpeechRecognitionResult());
267  SpeechRecognitionResult& result = results.back();
268  SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK);
269  std::string data;
270
271  // The default error code in case of parse errors is NETWORK_FAILURE, however
272  // ParseServerResponse can change the error to a more appropriate one.
273  bool error_occurred = (!source->GetStatus().is_success() ||
274                        source->GetResponseCode() != 200 ||
275                        !source->GetResponseAsString(&data) ||
276                        !ParseServerResponse(data, &result, &error));
277  url_fetcher_.reset();
278  if (error_occurred) {
279    DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
280    delegate()->OnSpeechRecognitionEngineError(error);
281  } else {
282    DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
283    delegate()->OnSpeechRecognitionEngineResults(results);
284  }
285}
286
287bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
288  return url_fetcher_ != NULL;
289}
290
// Callers should deliver audio to TakeAudioChunk() in chunks of this
// duration (milliseconds).
int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}
294
295}  // namespace content
296