15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/ref_counted.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/threading/non_thread_safe.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/browser/speech/audio_encoder.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/browser/speech/chunked_byte_buffer.h"
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/browser/speech/speech_recognition_engine.h"
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/common/content_export.h"
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/public/common/speech_recognition_error.h"
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/url_request/url_fetcher_delegate.h"
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace net {
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class URLRequestContextGetter;
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace content {
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class AudioChunk;
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct SpeechRecognitionError;
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct SpeechRecognitionResult;
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Implements a SpeechRecognitionEngine supporting continuous recognition by
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// means of interaction with Google streaming speech recognition webservice.
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// More in details, this class establishes two HTTP(S) connections with the
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// webservice, for each session, herein called "upstream" and "downstream".
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Audio chunks are sent on the upstream by means of a chunked HTTP POST upload.
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Recognition results are retrieved in a full-duplex fashion (i.e. while
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// pushing audio on the upstream) on the downstream by means of a chunked
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// HTTP GET request. Pairing between the two stream is handled through a
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// randomly generated key, unique for each request, which is passed in the
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// &pair= arg to both stream request URLs.
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In the case of a regular session, the upstream is closed when the audio
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// capture ends (notified through a |AudioChunksEnded| call) and the downstream
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// waits for a corresponding server closure (eventually some late results can
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// come after closing the upstream).
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Both stream are guaranteed to be closed when |EndRecognition| call is issued.
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class CONTENT_EXPORT GoogleStreamingRemoteEngine
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : public NON_EXPORTED_BASE(SpeechRecognitionEngine),
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      public net::URLFetcherDelegate,
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      public NON_EXPORTED_BASE(base::NonThreadSafe) {
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
524e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)  // Duration of each audio packet.
534e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)  static const int kAudioPacketIntervalMs;
544e180b6a0b4720a9b8e9e959a882386f690f08ffTorne (Richard Coles)
558bcbed890bc3ce4d7a057a8f32cab53fa534672eTorne (Richard Coles)  // IDs passed to URLFetcher::Create(). Used for testing.
568bcbed890bc3ce4d7a057a8f32cab53fa534672eTorne (Richard Coles)  static const int kUpstreamUrlFetcherIdForTesting;
578bcbed890bc3ce4d7a057a8f32cab53fa534672eTorne (Richard Coles)  static const int kDownstreamUrlFetcherIdForTesting;
588bcbed890bc3ce4d7a057a8f32cab53fa534672eTorne (Richard Coles)
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter* context);
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~GoogleStreamingRemoteEngine();
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // SpeechRecognitionEngine methods.
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void SetConfig(const SpeechRecognitionEngineConfig& config) OVERRIDE;
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void StartRecognition() OVERRIDE;
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void EndRecognition() OVERRIDE;
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void TakeAudioChunk(const AudioChunk& data) OVERRIDE;
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void AudioChunksEnded() OVERRIDE;
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual bool IsRecognitionPending() const OVERRIDE;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual int GetDesiredAudioChunkDurationMs() const OVERRIDE;
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // net::URLFetcherDelegate methods.
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void OnURLFetchComplete(const net::URLFetcher* source) OVERRIDE;
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void OnURLFetchDownloadProgress(const net::URLFetcher* source,
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                          int64 current, int64 total) OVERRIDE;
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Response status codes from the speech recognition webservice.
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kWebserviceStatusNoError;
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kWebserviceStatusErrorNoMatch;
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Data types for the internal Finite State Machine (FSM).
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  enum FSMState {
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    STATE_IDLE = 0,
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    STATE_BOTH_STREAMS_CONNECTED,
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    STATE_WAITING_DOWNSTREAM_RESULTS,
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  enum FSMEvent {
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_END_RECOGNITION = 0,
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_START_RECOGNITION,
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_AUDIO_CHUNK,
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_AUDIO_CHUNKS_ENDED,
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_UPSTREAM_ERROR,
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_DOWNSTREAM_ERROR,
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_DOWNSTREAM_RESPONSE,
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_DOWNSTREAM_CLOSED,
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  struct FSMEventArgs {
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    explicit FSMEventArgs(FSMEvent event_value);
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ~FSMEventArgs();
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    FSMEvent event;
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    scoped_refptr<const AudioChunk> audio_data;
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    scoped_ptr<std::vector<uint8> > response;
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   private:
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Invoked by both upstream and downstream URLFetcher callbacks to handle
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // new chunk data, connection closed or errors notifications.
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void DispatchHTTPResponse(const net::URLFetcher* source,
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            bool end_of_response);
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Entry point for pushing any new external event into the recognizer FSM.
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void DispatchEvent(const FSMEventArgs& event_args);
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Defines the behavior of the recognizer FSM, selecting the appropriate
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // transition according to the current state and event.
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The methods below handle transitions of the recognizer FSM.
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState ConnectBothStreams(const FSMEventArgs& event_args);
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState CloseDownstream(const FSMEventArgs& event_args);
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState AbortSilently(const FSMEventArgs& event_args);
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState AbortWithError(const FSMEventArgs& event_args);
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState Abort(SpeechRecognitionErrorCode error);
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState DoNothing(const FSMEventArgs& event_args);
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState NotFeasible(const FSMEventArgs& event_args);
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string GetAcceptedLanguages() const;
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string GenerateRequestKey() const;
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SpeechRecognitionEngineConfig config_;
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<net::URLFetcher> upstream_fetcher_;
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<net::URLFetcher> downstream_fetcher_;
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_refptr<net::URLRequestContextGetter> url_context_;
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<AudioEncoder> encoder_;
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ChunkedByteBuffer chunked_byte_buffer_;
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t previous_response_length_;
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool got_last_definitive_result_;
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool is_dispatching_event_;
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FSMState state_;
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine);
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace content
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
162