summaryrefslogtreecommitdiffstats
path: root/content/browser/speech/speech_recognizer.h
blob: f60a9de4964bb678285bc3d864b93e60f256159e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_
#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_

#include <list>
#include <string>
#include <utility>

#include "base/memory/ref_counted.h"
#include "base/memory/scoped_ptr.h"
#include "content/browser/speech/audio_encoder.h"
#include "content/browser/speech/endpointer/endpointer.h"
#include "content/browser/speech/speech_recognition_request.h"
#include "media/audio/audio_input_controller.h"

namespace speech_input {

// Records audio, sends recorded audio to server and translates server response
// to recognition result.
//
// The class is reference counted (base::RefCountedThreadSafe), receives audio
// capture events by implementing media::AudioInputController::EventHandler and
// receives recognition results by implementing
// SpeechRecognitionRequestDelegate. Progress and results are reported to the
// caller through the nested |Delegate| interface.
class SpeechRecognizer
    : public base::RefCountedThreadSafe<SpeechRecognizer>,
      public media::AudioInputController::EventHandler,
      public SpeechRecognitionRequestDelegate {
 public:
  // Error categories reported to the caller via Delegate::OnRecognizerError().
  // NOTE(review): the exact condition that raises each value is decided in the
  // implementation file, which is not visible from this header.
  enum ErrorCode {
    RECOGNIZER_NO_ERROR,
    RECOGNIZER_ERROR_CAPTURE,
    RECOGNIZER_ERROR_NO_SPEECH,
    RECOGNIZER_ERROR_NO_RESULTS,
    RECOGNIZER_ERROR_NETWORK,
  };

  // Implemented by the caller to receive recognition events.
  //
  // Every callback carries a |caller_id|; presumably this is the value that
  // was passed to the SpeechRecognizer constructor, allowing one delegate to
  // serve several recognizers -- confirm against the implementation.
  class Delegate {
   public:
    // Delivers the outcome of a recognition request: |error| flags failure and
    // |result| carries the recognition results.
    // NOTE(review): presumably |result| is only meaningful when |error| is
    // false -- confirm against the implementation.
    virtual void SetRecognitionResult(
        int caller_id,
        bool error,
        const SpeechInputResultArray& result) = 0;

    // Invoked when the first audio packet was received from the audio capture
    // device.
    virtual void DidStartReceivingAudio(int caller_id) = 0;

    // Invoked when audio recording stops, either due to the end pointer
    // detecting silence in user input or if |StopRecording| was called. The
    // delegate has to wait until |DidCompleteRecognition| is invoked before
    // destroying the |SpeechRecognizer| object.
    virtual void DidCompleteRecording(int caller_id) = 0;

    // This is guaranteed to be the last method invoked in the recognition
    // sequence and the |SpeechRecognizer| object can be freed up if necessary.
    virtual void DidCompleteRecognition(int caller_id) = 0;

    // Invoked if there was an error while recording or recognizing audio. The
    // session has already been cancelled when this call is made and the DidXxxx
    // callbacks will not be issued. It is safe to destroy/release the
    // |SpeechRecognizer| object while processing this call.
    virtual void OnRecognizerError(int caller_id,
                                   SpeechRecognizer::ErrorCode error) = 0;

    // At the start of recognition, a short amount of audio is recorded to
    // estimate the environment/background noise and this callback is issued
    // after that is complete. Typically the delegate brings up any speech
    // recognition UI once this callback is received.
    virtual void DidCompleteEnvironmentEstimation(int caller_id) = 0;

    // Informs of a change in the captured audio level, useful if displaying
    // a microphone volume indicator while recording.
    // The value of |volume| and |noise_volume| is in the [0.0, 1.0] range.
    virtual void SetInputVolume(int caller_id, float volume,
                                float noise_volume) = 0;

   protected:
    // Protected so that code outside the delegate's own class hierarchy cannot
    // delete an implementation through a Delegate pointer.
    virtual ~Delegate() {}
  };

  // Creates a recognizer that reports to |delegate|.
  // NOTE(review): |delegate| is a raw pointer and this header does not state
  // its ownership; presumably it is not owned and must outlive this object --
  // confirm. The remaining arguments presumably initialise the like-named
  // members below and are forwarded with the recognition request -- confirm.
  SpeechRecognizer(Delegate* delegate,
                   int caller_id,
                   const std::string& language,
                   const std::string& grammar,
                   const std::string& hardware_info,
                   const std::string& origin_url);
  // NOTE(review): the destructor is public even though the class is reference
  // counted; see the lifetime notes on Delegate::DidCompleteRecording and
  // Delegate::DidCompleteRecognition before destroying an instance.
  ~SpeechRecognizer();

  // Starts audio recording and does recognition after recording ends. The same
  // SpeechRecognizer instance can be used multiple times for speech recognition
  // though each recognition request can be made only after the previous one
  // completes (i.e. after receiving Delegate::DidCompleteRecognition).
  // NOTE(review): the meaning of the bool return value is not documented here;
  // presumably true means recording was started -- confirm.
  bool StartRecording();

  // Stops recording audio and starts recognition.
  void StopRecording();

  // Stops recording audio and cancels recognition. Any audio recorded so far
  // gets discarded.
  void CancelRecognition();

  // AudioInputController::EventHandler methods.
  // OnCreated and OnRecording are intentionally no-ops (empty bodies); only
  // errors and captured data are acted upon.
  virtual void OnCreated(media::AudioInputController* controller) { }
  virtual void OnRecording(media::AudioInputController* controller) { }
  virtual void OnError(media::AudioInputController* controller, int error_code);
  virtual void OnData(media::AudioInputController* controller,
                      const uint8* data,
                      uint32 size);

  // SpeechRecognitionRequestDelegate methods.
  virtual void SetRecognitionResult(bool error,
                                    const SpeechInputResultArray& result);

  // Audio capture and endpointing parameters. These are only declared here;
  // their values are defined outside this header.
  static const int kAudioSampleRate;
  static const int kAudioPacketIntervalMs;  // Duration of each audio packet.
  static const ChannelLayout kChannelLayout;
  static const int kNumBitsPerAudioSample;
  static const int kNoSpeechTimeoutSec;
  static const int kEndpointerEstimationTimeMs;

 private:
  // NOTE(review): by name, reports |error| to the delegate and then cancels
  // the current recognition -- confirm against the implementation.
  void InformErrorAndCancelRecognition(ErrorCode error);
  // NOTE(review): by name, submits the audio recorded so far for recognition
  // -- confirm against the implementation.
  void SendRecordedAudioToServer();

  void HandleOnError(int error_code);  // Handles OnError in the IO thread.

  // Handles OnData in the IO thread. Takes ownership of |data|.
  void HandleOnData(std::string* data);

  // Receiver of recognition events and the constructor arguments.
  // NOTE(review): presumably set once from the constructor -- confirm.
  Delegate* delegate_;
  int caller_id_;
  std::string language_;
  std::string grammar_;
  std::string hardware_info_;
  std::string origin_url_;

  // Per-session working state. Declaration order is significant in C++
  // (members are constructed in this order), so do not reorder casually.
  scoped_ptr<SpeechRecognitionRequest> request_;
  scoped_refptr<media::AudioInputController> audio_controller_;
  AudioEncoder::Codec codec_;
  scoped_ptr<AudioEncoder> encoder_;
  Endpointer endpointer_;
  int num_samples_recorded_;
  float audio_level_;

  DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer);
};

// Namespace-scope alias for SpeechRecognizer::Delegate.
//
// This typedef works around an issue with certain versions of Visual Studio,
// which get confused between multiple nested classes named Delegate and
// report a C2500 error. The error was observed on the try bots; it did not
// reproduce on the original author's machine.
typedef SpeechRecognizer::Delegate SpeechRecognizerDelegate;

}  // namespace speech_input

#endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_