Diffstat (limited to 'chrome/browser/speech/speech_recognizer.cc')
-rw-r--r--  chrome/browser/speech/speech_recognizer.cc  45
1 file changed, 36 insertions(+), 9 deletions(-)
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index 2e8f353..0df671c 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -6,6 +6,7 @@
#include "base/ref_counted.h"
#include "base/scoped_ptr.h"
+#include "base/time.h"
#include "chrome/browser/chrome_thread.h"
#include "chrome/browser/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
@@ -43,10 +44,11 @@ class SpeexEncoder {
int samples_per_frame() const { return samples_per_frame_; }
- // Encodes each frame of raw audio in |raw_samples| and adds the
+ // Encodes each frame of raw audio in |samples| and adds the
// encoded frames as a set of strings to the |encoded_frames| list.
// Ownership of the newly added strings is transferred to the caller.
- void Encode(const string& raw_samples,
+ void Encode(const short* samples,
+ int num_samples,
std::list<std::string*>* encoded_frames);
private:
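Since ownership of the encoded frames transfers out, the caller is responsible for deleting the strings. A minimal usage sketch of the new signature, given a SpeexEncoder* encoder (the call site below is hypothetical, not part of this change):

  // Hypothetical call site for the new Encode() signature.
  short samples[320] = {0};  // one packet of 16-bit PCM (silence here)
  std::list<std::string*> encoded_frames;
  encoder->Encode(samples, 320, &encoded_frames);
  // Ownership of the strings moves to the caller; free them after use.
  for (std::list<std::string*>::iterator it = encoded_frames.begin();
       it != encoded_frames.end(); ++it)
    delete *it;
  encoded_frames.clear();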
@@ -73,12 +75,9 @@ SpeexEncoder::~SpeexEncoder() {
speex_encoder_destroy(encoder_state_);
}
-void SpeexEncoder::Encode(const string& raw_samples,
+void SpeexEncoder::Encode(const short* samples,
+ int num_samples,
std::list<std::string*>* encoded_frames) {
- const short* samples = reinterpret_cast<const short*>(raw_samples.data());
- DCHECK((raw_samples.length() % sizeof(short)) == 0);
- int num_samples = raw_samples.length() / sizeof(short);
-
// Drop incomplete frames, typically those which come in when recording stops.
num_samples -= (num_samples % samples_per_frame_);
for (int i = 0; i < num_samples; i += samples_per_frame_) {
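The modulo truncation drops at most one partial frame per call. A worked example, assuming samples_per_frame_ is 160 (typical for Speex narrowband at 8 kHz; the figure is not stated in this diff):

  // Assume samples_per_frame_ == 160 (20 ms frames at 8 kHz, hypothetical).
  int num_samples = 1000;              // final packet when recording stops
  num_samples -= (num_samples % 160);  // 1000 - 40 = 960
  // Six complete frames are encoded; the 40 trailing samples are dropped.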
@@ -100,7 +99,14 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
const SpeechInputCallerId& caller_id)
: delegate_(delegate),
caller_id_(caller_id),
- encoder_(new SpeexEncoder()) {
+ encoder_(new SpeexEncoder()),
+ endpointer_(kAudioSampleRate) {
+ endpointer_.set_speech_input_complete_silence_length(
+ base::Time::kMicrosecondsPerSecond / 2);
+ endpointer_.set_long_speech_input_complete_silence_length(
+ base::Time::kMicrosecondsPerSecond);
+ endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
+ endpointer_.StartSession();
}
SpeechRecognizer::~SpeechRecognizer() {
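base::Time::kMicrosecondsPerSecond is 1,000,000, so the constructor configures the endpointer to end a short utterance after 0.5 s of silence, end a long utterance after 1 s of silence, and classify input as "long speech" once it passes 3 s. The same values restated explicitly (constant names below are illustrative, not from the source):

  // Illustrative restatement of the configured thresholds.
  const int64_t kSpeechCompleteSilenceUs     = 1000000 / 2;  // 0.5 s of silence
  const int64_t kLongSpeechCompleteSilenceUs = 1000000;      // 1.0 s of silence
  const int64_t kLongSpeechLengthUs          = 3 * 1000000;  // "long" after 3 s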
@@ -109,6 +115,7 @@ SpeechRecognizer::~SpeechRecognizer() {
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
DCHECK(audio_buffers_.empty());
+ endpointer_.EndSession();
}
bool SpeechRecognizer::StartRecording() {
@@ -116,6 +123,16 @@ bool SpeechRecognizer::StartRecording() {
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
+ // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to
+ // estimate the environment/background noise before starting to treat the
+ // audio as user input. Once we have implemented a popup UI to notify the user
+ // that recording has started, we should perhaps have a short interval where
+ // we record background audio and then show the popup UI so that the user can
+ // start speaking after that. For now we just do these together, so there is
+ // no dedicated background-noise period for the endpointer (it still works OK).
+ endpointer_.SetEnvironmentEstimationMode();
+ endpointer_.SetUserInputMode();
+
int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
audio_controller_ = AudioInputController::Create(this,
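The samples_per_packet DCHECK only holds when a packet spans a whole number of Speex frames. For example (both constants below are assumptions, not visible in this diff):

  // Assume kAudioSampleRate == 8000 and kAudioPacketIntervalMs == 100.
  int samples_per_packet = (8000 * 100) / 1000;  // = 800 samples per packet
  // 800 % 160 == 0, so each packet divides into exactly five Speex frames.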
@@ -156,6 +173,7 @@ void SpeechRecognizer::StopRecording() {
LOG(INFO) << "SpeechRecognizer stopping record.";
audio_controller_->Close();
audio_controller_ = NULL; // Releases the ref ptr.
+
delegate_->DidCompleteRecording(caller_id_);
// If we haven't got any audio yet end the recognition sequence here.
@@ -240,9 +258,18 @@ void SpeechRecognizer::HandleOnData(string* data) {
return;
}
- encoder_->Encode(*data, &audio_buffers_);
+ const short* samples = reinterpret_cast<const short*>(data->data());
+ DCHECK((data->length() % sizeof(short)) == 0);
+ int num_samples = data->length() / sizeof(short);
+
+ encoder_->Encode(samples, num_samples, &audio_buffers_);
+ endpointer_.ProcessAudio(samples, num_samples);
delete data;
+ if (endpointer_.speech_input_complete()) {
+ StopRecording();
+ }
+
// TODO(satish): Once we have streaming POST, start sending the data received
// here as POST chunks.
}
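Taken together, the HandleOnData change makes every incoming packet drive both encoding and endpoint detection, and recording now stops itself once end-of-speech is detected. The per-packet flow, condensed from the diff above:

  // Per packet: view the raw bytes as 16-bit samples, encode them, feed
  // the endpointer, and stop recording when end-of-speech is detected.
  const short* samples = reinterpret_cast<const short*>(data->data());
  int num_samples = data->length() / sizeof(short);
  encoder_->Encode(samples, num_samples, &audio_buffers_);
  endpointer_.ProcessAudio(samples, num_samples);
  delete data;
  if (endpointer_.speech_input_complete())
    StopRecording();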