chromium/content/browser/speech/google_streaming_remote_engine.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
#define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_

#include <string>
#include <vector>

#include "base/basictypes.h"
#include "base/memory/ref_counted.h"
#include "base/memory/scoped_ptr.h"
#include "base/threading/non_thread_safe.h"
#include "content/browser/speech/audio_encoder.h"
#include "content/browser/speech/chunked_byte_buffer.h"
#include "content/browser/speech/speech_recognition_engine.h"
#include "content/common/content_export.h"
#include "content/public/common/speech_recognition_error.h"
#include "net/url_request/url_fetcher_delegate.h"

namespace net {
class URLRequestContextGetter;
}

namespace content {

class AudioChunk;
struct SpeechRecognitionError;
struct SpeechRecognitionResult;

// Implements a SpeechRecognitionEngine supporting continuous recognition by
// means of interaction with Google streaming speech recognition webservice.
// More in details, this class establishes two HTTP(S) connections with the
// webservice, for each session, herein called "upstream" and "downstream".
// Audio chunks are sent on the upstream by means of a chunked HTTP POST upload.
// Recognition results are retrieved in a full-duplex fashion (i.e. while
// pushing audio on the upstream) on the downstream by means of a chunked
// HTTP GET request. Pairing between the two stream is handled through a
// randomly generated key, unique for each request, which is passed in the
// &pair= arg to both stream request URLs.
// In the case of a regular session, the upstream is closed when the audio
// capture ends (notified through a |AudioChunksEnded| call) and the downstream
// waits for a corresponding server closure (eventually some late results can
// come after closing the upstream).
// Both stream are guaranteed to be closed when |EndRecognition| call is issued.
class CONTENT_EXPORT GoogleStreamingRemoteEngine
    : public NON_EXPORTED_BASE(SpeechRecognitionEngine),
      public net::URLFetcherDelegate,
      public NON_EXPORTED_BASE(base::NonThreadSafe) {
 public:
  // Duration of each audio packet.
  static const int kAudioPacketIntervalMs;

  // IDs passed to URLFetcher::Create(). Used for testing.
  static const int kUpstreamUrlFetcherIdForTesting;
  static const int kDownstreamUrlFetcherIdForTesting;

  explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter* context);
  ~GoogleStreamingRemoteEngine() override;

  // SpeechRecognitionEngine methods.
  void SetConfig(const SpeechRecognitionEngineConfig& config) override;
  void StartRecognition() override;
  void EndRecognition() override;
  void TakeAudioChunk(const AudioChunk& data) override;
  void AudioChunksEnded() override;
  bool IsRecognitionPending() const override;
  int GetDesiredAudioChunkDurationMs() const override;

  // net::URLFetcherDelegate methods.
  void OnURLFetchComplete(const net::URLFetcher* source) override;
  void OnURLFetchDownloadProgress(const net::URLFetcher* source,
                                  int64 current,
                                  int64 total) override;

 private:
  // Response status codes from the speech recognition webservice.
  static const int kWebserviceStatusNoError;
  static const int kWebserviceStatusErrorNoMatch;

  // Frame type for framed POST data. Do NOT change these. They must match
  // values the server expects.
  enum FrameType {
    FRAME_PREAMBLE_AUDIO = 0,
    FRAME_RECOGNITION_AUDIO = 1
  };

  // Data types for the internal Finite State Machine (FSM).
  enum FSMState {
    STATE_IDLE = 0,
    STATE_BOTH_STREAMS_CONNECTED,
    STATE_WAITING_DOWNSTREAM_RESULTS,
    STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
  };

  enum FSMEvent {
    EVENT_END_RECOGNITION = 0,
    EVENT_START_RECOGNITION,
    EVENT_AUDIO_CHUNK,
    EVENT_AUDIO_CHUNKS_ENDED,
    EVENT_UPSTREAM_ERROR,
    EVENT_DOWNSTREAM_ERROR,
    EVENT_DOWNSTREAM_RESPONSE,
    EVENT_DOWNSTREAM_CLOSED,
    EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
  };

  struct FSMEventArgs {
    explicit FSMEventArgs(FSMEvent event_value);
    ~FSMEventArgs();

    FSMEvent event;

    // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
    scoped_refptr<const AudioChunk> audio_data;

    // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
    scoped_ptr<std::vector<uint8> > response;

   private:
    DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);
  };

  // Invoked by both upstream and downstream URLFetcher callbacks to handle
  // new chunk data, connection closed or errors notifications.
  void DispatchHTTPResponse(const net::URLFetcher* source,
                            bool end_of_response);

  // Entry point for pushing any new external event into the recognizer FSM.
  void DispatchEvent(const FSMEventArgs& event_args);

  // Defines the behavior of the recognizer FSM, selecting the appropriate
  // transition according to the current state and event.
  FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);

  // The methods below handle transitions of the recognizer FSM.
  FSMState ConnectBothStreams(const FSMEventArgs& event_args);
  FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
  FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
  FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
  FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
  FSMState CloseDownstream(const FSMEventArgs& event_args);
  FSMState AbortSilently(const FSMEventArgs& event_args);
  FSMState AbortWithError(const FSMEventArgs& event_args);
  FSMState Abort(SpeechRecognitionErrorCode error);
  FSMState DoNothing(const FSMEventArgs& event_args);
  FSMState NotFeasible(const FSMEventArgs& event_args);

  std::string GetAcceptedLanguages() const;
  std::string GenerateRequestKey() const;

  // Upload a single chunk of audio data. Handles both unframed and framed
  // upload formats, and uses the appropriate one.
  void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);

  SpeechRecognitionEngineConfig config_;
  scoped_ptr<net::URLFetcher> upstream_fetcher_;
  scoped_ptr<net::URLFetcher> downstream_fetcher_;
  scoped_refptr<net::URLRequestContextGetter> url_context_;
  scoped_ptr<AudioEncoder> encoder_;
  scoped_ptr<AudioEncoder> preamble_encoder_;
  ChunkedByteBuffer chunked_byte_buffer_;
  size_t previous_response_length_;
  bool got_last_definitive_result_;
  bool is_dispatching_event_;
  bool use_framed_post_data_;
  FSMState state_;

  DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine);
};

}  // namespace content

#endif  // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_