chrome/browser/speech/tts_controller.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROME_BROWSER_SPEECH_TTS_CONTROLLER_H_
#define CHROME_BROWSER_SPEECH_TTS_CONTROLLER_H_

#include <queue>
#include <set>
#include <string>
#include <vector>

#include "base/memory/scoped_ptr.h"
#include "base/memory/singleton.h"
#include "base/memory/weak_ptr.h"
#include "url/gurl.h"

class Utterance;
class TtsPlatformImpl;

namespace base {
class Value;
}

namespace content {
class BrowserContext;
}

// Events sent back from the TTS engine indicating the progress.
enum TtsEventType {
  TTS_EVENT_START,
  TTS_EVENT_END,
  TTS_EVENT_WORD,
  TTS_EVENT_SENTENCE,
  TTS_EVENT_MARKER,
  TTS_EVENT_INTERRUPTED,
  TTS_EVENT_CANCELLED,
  TTS_EVENT_ERROR,
  TTS_EVENT_PAUSE,
  TTS_EVENT_RESUME
};

enum TtsGenderType {
  TTS_GENDER_NONE,
  TTS_GENDER_MALE,
  TTS_GENDER_FEMALE
};

// Returns true if this event type is one that indicates an utterance
// is finished and can be destroyed.
bool IsFinalTtsEventType(TtsEventType event_type);

// The continuous parameters that apply to a given utterance.
struct UtteranceContinuousParameters {
  UtteranceContinuousParameters();

  double rate;
  double pitch;
  double volume;
};

// Information about one voice.
struct VoiceData {
  VoiceData();
  ~VoiceData();

  std::string name;
  std::string lang;
  TtsGenderType gender;
  std::string extension_id;
  std::set<TtsEventType> events;

  // If true, the synthesis engine is a remote network resource.
  // It may be higher latency and may incur bandwidth costs.
  bool remote;

  // If true, this is implemented by this platform's subclass of
  // TtsPlatformImpl. If false, this is implemented by an extension.
  bool native;
  std::string native_voice_identifier;
};

// Interface that delegates TTS requests to user-installed extensions.
class TtsEngineDelegate {
 public:
  virtual ~TtsEngineDelegate() {}

  // Return a list of all available voices registered.
  virtual void GetVoices(content::BrowserContext* browser_context,
                         std::vector<VoiceData>* out_voices) = 0;

  // Speak the given utterance by sending an event to the given TTS engine.
  virtual void Speak(Utterance* utterance, const VoiceData& voice) = 0;

  // Stop speaking the given utterance by sending an event to the target
  // associated with this utterance.
  virtual void Stop(Utterance* utterance) = 0;

  // Pause in the middle of speaking this utterance.
  virtual void Pause(Utterance* utterance) = 0;

  // Resume speaking this utterance.
  virtual void Resume(Utterance* utterance) = 0;

  // Load the built-in component extension for ChromeOS.
  virtual bool LoadBuiltInTtsExtension(
      content::BrowserContext* browser_context) = 0;
};

// Class that wants to receive events on utterances.
class UtteranceEventDelegate {
 public:
  virtual ~UtteranceEventDelegate() {}
  virtual void OnTtsEvent(Utterance* utterance,
                          TtsEventType event_type,
                          int char_index,
                          const std::string& error_message) = 0;
};

// Class that wants to be notified when the set of
// voices has changed.
class VoicesChangedDelegate {
 public:
  virtual ~VoicesChangedDelegate() {}
  virtual void OnVoicesChanged() = 0;
};

// One speech utterance.
class Utterance {
 public:
  // Construct an utterance given a profile and a completion task to call
  // when the utterance is done speaking. Before speaking this utterance,
  // its other parameters like text, rate, pitch, etc. should all be set.
  explicit Utterance(content::BrowserContext* browser_context);
  ~Utterance();

  // Sends an event to the delegate. If the event type is TTS_EVENT_END
  // or TTS_EVENT_ERROR, deletes the utterance. If |char_index| is -1,
  // uses the last good value.
  void OnTtsEvent(TtsEventType event_type,
                  int char_index,
                  const std::string& error_message);

  // Finish an utterance without sending an event to the delegate.
  void Finish();

  // Getters and setters for the text to speak and other speech options.
  void set_text(const std::string& text) { text_ = text; }
  const std::string& text() const { return text_; }

  void set_options(const base::Value* options);
  const base::Value* options() const { return options_.get(); }

  void set_src_id(int src_id) { src_id_ = src_id; }
  int src_id() { return src_id_; }

  void set_src_url(const GURL& src_url) { src_url_ = src_url; }
  const GURL& src_url() { return src_url_; }

  void set_voice_name(const std::string& voice_name) {
    voice_name_ = voice_name;
  }
  const std::string& voice_name() const { return voice_name_; }

  void set_lang(const std::string& lang) {
    lang_ = lang;
  }
  const std::string& lang() const { return lang_; }

  void set_gender(TtsGenderType gender) {
    gender_ = gender;
  }
  TtsGenderType gender() const { return gender_; }

  void set_continuous_parameters(const double rate,
                                 const double pitch,
                                 const double volume) {
    continuous_parameters_.rate = rate;
    continuous_parameters_.pitch = pitch;
    continuous_parameters_.volume = volume;
  }
  const UtteranceContinuousParameters& continuous_parameters() {
    return continuous_parameters_;
  }

  void set_can_enqueue(bool can_enqueue) { can_enqueue_ = can_enqueue; }
  bool can_enqueue() const { return can_enqueue_; }

  void set_required_event_types(const std::set<TtsEventType>& types) {
    required_event_types_ = types;
  }
  const std::set<TtsEventType>& required_event_types() const {
    return required_event_types_;
  }

  void set_desired_event_types(const std::set<TtsEventType>& types) {
    desired_event_types_ = types;
  }
  const std::set<TtsEventType>& desired_event_types() const {
    return desired_event_types_;
  }

  const std::string& extension_id() const { return extension_id_; }
  void set_extension_id(const std::string& extension_id) {
    extension_id_ = extension_id;
  }

  UtteranceEventDelegate* event_delegate() const {
    return event_delegate_;
  }
  void set_event_delegate(UtteranceEventDelegate* event_delegate) {
    event_delegate_ = event_delegate;
  }

  // Getters and setters for internal state.
  content::BrowserContext* browser_context() const { return browser_context_; }
  int id() const { return id_; }
  bool finished() const { return finished_; }

 private:
  // The BrowserContext that initiated this utterance.
  content::BrowserContext* browser_context_;

  // The extension ID of the extension providing TTS for this utterance, or
  // empty if native TTS is being used.
  std::string extension_id_;

  // The unique ID of this utterance, used to associate callback functions
  // with utterances.
  int id_;

  // The id of the next utterance, so we can associate requests with
  // responses.
  static int next_utterance_id_;

  // The text to speak.
  std::string text_;

  // The full options arg passed to tts.speak, which may include fields
  // other than the ones we explicitly parse, below.
  scoped_ptr<base::Value> options_;

  // The source extension's ID of this utterance, so that it can associate
  // events with the appropriate callback.
  int src_id_;

  // The URL of the page where the source extension called speak.
  GURL src_url_;

  // The delegate to be called when an utterance event is fired.
  UtteranceEventDelegate* event_delegate_;

  // The parsed options.
  std::string voice_name_;
  std::string lang_;
  TtsGenderType gender_;
  UtteranceContinuousParameters continuous_parameters_;
  bool can_enqueue_;
  std::set<TtsEventType> required_event_types_;
  std::set<TtsEventType> desired_event_types_;

  // The index of the current char being spoken.
  int char_index_;

  // True if this utterance received an event indicating it's done.
  bool finished_;
};

// Singleton class that manages text-to-speech for the TTS and TTS engine
// extension APIs, maintaining a queue of pending utterances and keeping
// track of all state.
class TtsController {
 public:
  // Get the single instance of this class.
  static TtsController* GetInstance();

  // Returns true if we're currently speaking an utterance.
  virtual bool IsSpeaking() = 0;

  // Speak the given utterance. If the utterance's can_enqueue flag is true
  // and another utterance is in progress, adds it to the end of the queue.
  // Otherwise, interrupts any current utterance and speaks this one
  // immediately.
  virtual void SpeakOrEnqueue(Utterance* utterance) = 0;

  // Stop all utterances and flush the queue. Implies leaving pause mode
  // as well.
  virtual void Stop() = 0;

  // Pause the speech queue. Some engines may support pausing in the middle
  // of an utterance.
  virtual void Pause() = 0;

  // Resume speaking.
  virtual void Resume() = 0;

  // Handle events received from the speech engine. Events are forwarded to
  // the callback function, and in addition, completion and error events
  // trigger finishing the current utterance and starting the next one, if
  // any.
  virtual void OnTtsEvent(int utterance_id,
                          TtsEventType event_type,
                          int char_index,
                          const std::string& error_message) = 0;

  // Return a list of all available voices, including the native voice,
  // if supported, and all voices registered by extensions.
  virtual void GetVoices(content::BrowserContext* browser_context,
                         std::vector<VoiceData>* out_voices) = 0;

  // Called by the extension system or platform implementation when the
  // list of voices may have changed and should be re-queried.
  virtual void VoicesChanged() = 0;

  // Add a delegate that wants to be notified when the set of voices changes.
  virtual void AddVoicesChangedDelegate(VoicesChangedDelegate* delegate) = 0;

  // Remove delegate that wants to be notified when the set of voices changes.
  virtual void RemoveVoicesChangedDelegate(VoicesChangedDelegate* delegate) = 0;

  // Remove delegate that wants to be notified when an utterance fires an event.
  // Note: this cancels speech from any utterance with this delegate, and
  // removes any utterances with this delegate from the queue.
  virtual void RemoveUtteranceEventDelegate(UtteranceEventDelegate* delegate)
      = 0;

  // Set the delegate that processes TTS requests with user-installed
  // extensions.
  virtual void SetTtsEngineDelegate(TtsEngineDelegate* delegate) = 0;

  // Get the delegate that processes TTS requests with user-installed
  // extensions.
  virtual TtsEngineDelegate* GetTtsEngineDelegate() = 0;

  // For unit testing.
  virtual void SetPlatformImpl(TtsPlatformImpl* platform_impl) = 0;
  virtual int QueueSize() = 0;

 protected:
  virtual ~TtsController() {}
};

#endif  // CHROME_BROWSER_SPEECH_TTS_CONTROLLER_H_