diff options
Diffstat (limited to 'chrome/browser/speech/speech_recognizer.cc')
-rw-r--r-- | chrome/browser/speech/speech_recognizer.cc | 45 |
1 files changed, 36 insertions, 9 deletions
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc index 2e8f353..0df671c 100644 --- a/chrome/browser/speech/speech_recognizer.cc +++ b/chrome/browser/speech/speech_recognizer.cc @@ -6,6 +6,7 @@ #include "base/ref_counted.h" #include "base/scoped_ptr.h" +#include "base/time.h" #include "chrome/browser/chrome_thread.h" #include "chrome/browser/profile.h" #include "chrome/common/net/url_request_context_getter.h" @@ -43,10 +44,11 @@ class SpeexEncoder { int samples_per_frame() const { return samples_per_frame_; } - // Encodes each frame of raw audio in |raw_samples| and adds the + // Encodes each frame of raw audio in |samples| and adds the // encoded frames as a set of strings to the |encoded_frames| list. // Ownership of the newly added strings is transferred to the caller. - void Encode(const string& raw_samples, + void Encode(const short* samples, + int num_samples, std::list<std::string*>* encoded_frames); private: @@ -73,12 +75,9 @@ SpeexEncoder::~SpeexEncoder() { speex_encoder_destroy(encoder_state_); } -void SpeexEncoder::Encode(const string& raw_samples, +void SpeexEncoder::Encode(const short* samples, + int num_samples, std::list<std::string*>* encoded_frames) { - const short* samples = reinterpret_cast<const short*>(raw_samples.data()); - DCHECK((raw_samples.length() % sizeof(short)) == 0); - int num_samples = raw_samples.length() / sizeof(short); - // Drop incomplete frames, typically those which come in when recording stops. num_samples -= (num_samples % samples_per_frame_); for (int i = 0; i < num_samples; i += samples_per_frame_) { @@ -100,7 +99,14 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate, const SpeechInputCallerId& caller_id) : delegate_(delegate), caller_id_(caller_id), - encoder_(new SpeexEncoder()) { + encoder_(new SpeexEncoder()), + endpointer_(kAudioSampleRate) { + endpointer_.set_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond / 2); + endpointer_.set_long_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond); + endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); + endpointer_.StartSession(); } SpeechRecognizer::~SpeechRecognizer() { @@ -109,6 +115,7 @@ SpeechRecognizer::~SpeechRecognizer() { DCHECK(!audio_controller_.get()); DCHECK(!request_.get() || !request_->HasPendingRequest()); DCHECK(audio_buffers_.empty()); + endpointer_.EndSession(); } bool SpeechRecognizer::StartRecording() { @@ -116,6 +123,16 @@ bool SpeechRecognizer::StartRecording() { DCHECK(!audio_controller_.get()); DCHECK(!request_.get() || !request_->HasPendingRequest()); + // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to + // estimate the environment/background noise before starting to treat the + // audio as user input. Once we have implemented a popup UI to notify the user + // that recording has started, we should perhaps have a short interval where + // we record background audio and then show the popup UI so that the user can + // start speaking after that. For now we just do these together so there isn't + // any background noise for the end pointer (still works ok). + endpointer_.SetEnvironmentEstimationMode(); + endpointer_.SetUserInputMode(); + int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); audio_controller_ = AudioInputController::Create(this, @@ -156,6 +173,7 @@ void SpeechRecognizer::StopRecording() { LOG(INFO) << "SpeechRecognizer stopping record."; audio_controller_->Close(); audio_controller_ = NULL; // Releases the ref ptr. + delegate_->DidCompleteRecording(caller_id_); // If we haven't got any audio yet end the recognition sequence here. @@ -240,9 +258,18 @@ void SpeechRecognizer::HandleOnData(string* data) { return; } - encoder_->Encode(*data, &audio_buffers_); + const short* samples = reinterpret_cast<const short*>(data->data()); + DCHECK((data->length() % sizeof(short)) == 0); + int num_samples = data->length() / sizeof(short); + + encoder_->Encode(samples, num_samples, &audio_buffers_); + endpointer_.ProcessAudio(samples, num_samples); delete data; + if (endpointer_.speech_input_complete()) { + StopRecording(); + } + // TODO(satish): Once we have streaming POST, start sending the data received // here as POST chunks. } |