// google_one_shot_remote_engine.cc, revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "content/browser/speech/google_one_shot_remote_engine.h" 6 7#include <vector> 8 9#include "base/json/json_reader.h" 10#include "base/string_number_conversions.h" 11#include "base/string_util.h" 12#include "base/values.h" 13#include "content/browser/speech/audio_buffer.h" 14#include "content/public/common/speech_recognition_error.h" 15#include "content/public/common/speech_recognition_result.h" 16#include "google_apis/google_api_keys.h" 17#include "net/base/escape.h" 18#include "net/base/load_flags.h" 19#include "net/url_request/url_fetcher.h" 20#include "net/url_request/url_request_context.h" 21#include "net/url_request/url_request_context_getter.h" 22#include "net/url_request/url_request_status.h" 23 24namespace content { 25namespace { 26 27const char* const kDefaultSpeechRecognitionUrl = 28 "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&"; 29const char* const kStatusString = "status"; 30const char* const kHypothesesString = "hypotheses"; 31const char* const kUtteranceString = "utterance"; 32const char* const kConfidenceString = "confidence"; 33const int kWebServiceStatusNoError = 0; 34const int kWebServiceStatusNoSpeech = 4; 35const int kWebServiceStatusNoMatch = 5; 36const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC; 37 38bool ParseServerResponse(const std::string& response_body, 39 SpeechRecognitionResult* result, 40 SpeechRecognitionError* error) { 41 if (response_body.empty()) { 42 LOG(WARNING) << "ParseServerResponse: Response was empty."; 43 return false; 44 } 45 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; 46 47 // Parse the response, ignoring comments. 
48 std::string error_msg; 49 scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( 50 response_body, base::JSON_PARSE_RFC, NULL, &error_msg)); 51 if (response_value == NULL) { 52 LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg; 53 return false; 54 } 55 56 if (!response_value->IsType(Value::TYPE_DICTIONARY)) { 57 VLOG(1) << "ParseServerResponse: Unexpected response type " 58 << response_value->GetType(); 59 return false; 60 } 61 const DictionaryValue* response_object = 62 static_cast<const DictionaryValue*>(response_value.get()); 63 64 // Get the status. 65 int status; 66 if (!response_object->GetInteger(kStatusString, &status)) { 67 VLOG(1) << "ParseServerResponse: " << kStatusString 68 << " is not a valid integer value."; 69 return false; 70 } 71 72 // Process the status. 73 switch (status) { 74 case kWebServiceStatusNoError: 75 break; 76 case kWebServiceStatusNoSpeech: 77 error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH; 78 return false; 79 case kWebServiceStatusNoMatch: 80 error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH; 81 return false; 82 default: 83 error->code = SPEECH_RECOGNITION_ERROR_NETWORK; 84 // Other status codes should not be returned by the server. 85 VLOG(1) << "ParseServerResponse: unexpected status code " << status; 86 return false; 87 } 88 89 // Get the hypotheses. 
90 const Value* hypotheses_value = NULL; 91 if (!response_object->Get(kHypothesesString, &hypotheses_value)) { 92 VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; 93 return false; 94 } 95 96 DCHECK(hypotheses_value); 97 if (!hypotheses_value->IsType(Value::TYPE_LIST)) { 98 VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " 99 << hypotheses_value->GetType(); 100 return false; 101 } 102 103 const ListValue* hypotheses_list = 104 static_cast<const ListValue*>(hypotheses_value); 105 106 // For now we support only single shot recognition, so we are giving only a 107 // final result, consisting of one fragment (with one or more hypotheses). 108 size_t index = 0; 109 for (; index < hypotheses_list->GetSize(); ++index) { 110 const Value* hypothesis = NULL; 111 if (!hypotheses_list->Get(index, &hypothesis)) { 112 LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; 113 break; 114 } 115 DCHECK(hypothesis); 116 if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { 117 LOG(WARNING) << "ParseServerResponse: Unexpected value type " 118 << hypothesis->GetType(); 119 break; 120 } 121 122 const DictionaryValue* hypothesis_value = 123 static_cast<const DictionaryValue*>(hypothesis); 124 string16 utterance; 125 126 if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { 127 LOG(WARNING) << "ParseServerResponse: Missing utterance value."; 128 break; 129 } 130 131 // It is not an error if the 'confidence' field is missing. 
132 double confidence = 0.0; 133 hypothesis_value->GetDouble(kConfidenceString, &confidence); 134 result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance, 135 confidence)); 136 } 137 138 if (index < hypotheses_list->GetSize()) { 139 result->hypotheses.clear(); 140 return false; 141 } 142 return true; 143} 144 145} // namespace 146 147const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100; 148int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0; 149 150GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine( 151 net::URLRequestContextGetter* context) 152 : url_context_(context) { 153} 154 155GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {} 156 157void GoogleOneShotRemoteEngine::SetConfig( 158 const SpeechRecognitionEngineConfig& config) { 159 config_ = config; 160} 161 162void GoogleOneShotRemoteEngine::StartRecognition() { 163 DCHECK(delegate()); 164 DCHECK(!url_fetcher_.get()); 165 std::string lang_param = config_.language; 166 167 if (lang_param.empty() && url_context_) { 168 // If no language is provided then we use the first from the accepted 169 // language list. If this list is empty then it defaults to "en-US". 170 // Example of the contents of this list: "es,en-GB;q=0.8", "" 171 net::URLRequestContext* request_context = 172 url_context_->GetURLRequestContext(); 173 DCHECK(request_context); 174 // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with 175 // a reference to the HttpUserAgentSettings rather than accessing the 176 // accept language through the URLRequestContext. 
177 std::string accepted_language_list = request_context->GetAcceptLanguage(); 178 size_t separator = accepted_language_list.find_first_of(",;"); 179 lang_param = accepted_language_list.substr(0, separator); 180 } 181 182 if (lang_param.empty()) 183 lang_param = "en-US"; 184 185 std::vector<std::string> parts; 186 parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true)); 187 188 if (!config_.grammars.empty()) { 189 DCHECK_EQ(config_.grammars.size(), 1U); 190 parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammars[0].url, 191 true)); 192 } 193 194 if (!config_.hardware_info.empty()) 195 parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info, 196 true)); 197 parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses)); 198 parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0"); 199 200 std::string api_key = google_apis::GetAPIKey(); 201 parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true)); 202 203 GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); 204 205 encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec, 206 config_.audio_sample_rate, 207 config_.audio_num_bits_per_sample)); 208 DCHECK(encoder_.get()); 209 url_fetcher_.reset(net::URLFetcher::Create(url_fetcher_id_for_tests, 210 url, 211 net::URLFetcher::POST, 212 this)); 213 url_fetcher_->SetChunkedUpload(encoder_->mime_type()); 214 url_fetcher_->SetRequestContext(url_context_); 215 url_fetcher_->SetReferrer(config_.origin_url); 216 217 // The speech recognition API does not require user identification as part 218 // of requests, so we don't send cookies or auth data for these requests to 219 // prevent any accidental connection between users who are logged into the 220 // domain for other services (e.g. bookmark sync) with the speech requests. 
221 url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | 222 net::LOAD_DO_NOT_SEND_COOKIES | 223 net::LOAD_DO_NOT_SEND_AUTH_DATA); 224 url_fetcher_->Start(); 225} 226 227void GoogleOneShotRemoteEngine::EndRecognition() { 228 url_fetcher_.reset(); 229} 230 231void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) { 232 DCHECK(url_fetcher_.get()); 233 DCHECK(encoder_.get()); 234 DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); 235 encoder_->Encode(data); 236 scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); 237 url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false); 238} 239 240void GoogleOneShotRemoteEngine::AudioChunksEnded() { 241 DCHECK(url_fetcher_.get()); 242 DCHECK(encoder_.get()); 243 244 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet 245 // of silence in case encoder had no data already. 246 std::vector<int16> samples( 247 config_.audio_sample_rate * kAudioPacketIntervalMs / 1000); 248 scoped_refptr<AudioChunk> dummy_chunk( 249 new AudioChunk(reinterpret_cast<uint8*>(&samples[0]), 250 samples.size() * sizeof(int16), 251 encoder_->bits_per_sample() / 8)); 252 encoder_->Encode(*dummy_chunk); 253 encoder_->Flush(); 254 scoped_refptr<AudioChunk> encoded_dummy_data( 255 encoder_->GetEncodedDataAndClear()); 256 DCHECK(!encoded_dummy_data->IsEmpty()); 257 encoder_.reset(); 258 259 url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true); 260} 261 262void GoogleOneShotRemoteEngine::OnURLFetchComplete( 263 const net::URLFetcher* source) { 264 DCHECK_EQ(url_fetcher_.get(), source); 265 SpeechRecognitionResult result; 266 SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK); 267 std::string data; 268 269 // The default error code in case of parse errors is NETWORK_FAILURE, however 270 // ParseServerResponse can change the error to a more appropriate one. 
271 bool error_occurred = (!source->GetStatus().is_success() || 272 source->GetResponseCode() != 200 || 273 !source->GetResponseAsString(&data) || 274 !ParseServerResponse(data, &result, &error)); 275 url_fetcher_.reset(); 276 if (error_occurred) { 277 DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code; 278 delegate()->OnSpeechRecognitionEngineError(error); 279 } else { 280 DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result."; 281 delegate()->OnSpeechRecognitionEngineResult(result); 282 } 283} 284 285bool GoogleOneShotRemoteEngine::IsRecognitionPending() const { 286 return url_fetcher_ != NULL; 287} 288 289int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const { 290 return kAudioPacketIntervalMs; 291} 292 293} // namespace content 294