summaryrefslogtreecommitdiffstats
path: root/content/browser/speech/speech_recognizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'content/browser/speech/speech_recognizer.cc')
-rw-r--r--content/browser/speech/speech_recognizer.cc262
1 files changed, 262 insertions, 0 deletions
diff --git a/content/browser/speech/speech_recognizer.cc b/content/browser/speech/speech_recognizer.cc
new file mode 100644
index 0000000..fdc1a4c
--- /dev/null
+++ b/content/browser/speech/speech_recognizer.cc
@@ -0,0 +1,262 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/browser/speech/speech_recognizer.h"
+
+#include "base/time.h"
+#include "chrome/browser/profiles/profile.h"
+#include "chrome/common/net/url_request_context_getter.h"
+#include "content/browser/browser_thread.h"
+
+using media::AudioInputController;
+using std::string;
+
+namespace {
+
+// Tuning constants for the input volume level that |HandleOnData| reports to
+// the delegate while audio is being recorded.
+// Smoothing multiplier applied when the new level is above the previous one.
+const float kUpSmoothingFactor = 0.9f;
+// Smoothing multiplier applied when the new level is not above the previous.
+const float kDownSmoothingFactor = 0.4f;
+const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter.
+const float kAudioMeterDbRange = 25.0f; // Span above the lower bar -> [0, 1].
+} // namespace
+
+namespace speech_input {
+
+const int SpeechRecognizer::kAudioSampleRate = 16000;  // Samples per second.
+const int SpeechRecognizer::kAudioPacketIntervalMs = 100;  // Per audio packet.
+const int SpeechRecognizer::kNumAudioChannels = 1;  // Single channel.
+const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
+const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;  // No-speech cutoff.
+const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;  // Noise est.
+
+// Sets up a recognizer for one request: stores the delegate and the request
+// parameters, selects the Speex codec, and opens an endpointer session with
+// the thresholds configured below. The audio controller and encoder are not
+// created here; that happens in |StartRecording|.
+SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
+                                   int caller_id,
+                                   const std::string& language,
+                                   const std::string& grammar,
+                                   const std::string& hardware_info,
+                                   const std::string& origin_url)
+    : delegate_(delegate),
+      caller_id_(caller_id),
+      language_(language),
+      grammar_(grammar),
+      hardware_info_(hardware_info),
+      origin_url_(origin_url),
+      codec_(AudioEncoder::CODEC_SPEEX),
+      encoder_(NULL),
+      endpointer_(kAudioSampleRate),
+      num_samples_recorded_(0),
+      audio_level_(0.0f) {
+  // Endpointer thresholds, all given as multiples of
+  // base::Time::kMicrosecondsPerSecond: half a second for the input-complete
+  // silence length, one second for its long-speech variant, and three seconds
+  // for the long-speech length itself.
+  endpointer_.set_speech_input_complete_silence_length(
+      base::Time::kMicrosecondsPerSecond / 2);
+  endpointer_.set_long_speech_input_complete_silence_length(
+      base::Time::kMicrosecondsPerSecond);
+  endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
+
+  // The session opened here is closed again by the destructor.
+  endpointer_.StartSession();
+}
+
+// Closes the endpointer session. Recording is expected to have been shut down
+// beforehand (by the endpointer or through |StopRecording|), so no audio
+// controller, encoder or in-flight request may remain at this point.
+SpeechRecognizer::~SpeechRecognizer() {
+  DCHECK(!audio_controller_.get());
+  DCHECK(!(request_.get() && request_->HasPendingRequest()));
+  DCHECK(!encoder_.get());
+  endpointer_.EndSession();
+}
+
+// Starts capturing audio for recognition. Must be called on the IO thread
+// while no recording, encoder or pending request is active. Creates the
+// encoder and the audio input controller, then begins recording.
+// As written, this always returns true.
+bool SpeechRecognizer::StartRecording() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  DCHECK(!audio_controller_.get());
+  DCHECK(!(request_.get() && request_->HasPendingRequest()));
+  DCHECK(!encoder_.get());
+
+  // Incoming audio is first used to estimate the environment/background
+  // noise. |HandleOnData| switches the endpointer to user input mode once
+  // enough audio has been gathered for that estimate.
+  endpointer_.SetEnvironmentEstimationMode();
+
+  encoder_.reset(
+      AudioEncoder::Create(codec_, kAudioSampleRate, kNumBitsPerAudioSample));
+
+  // Number of samples covering one |kAudioPacketIntervalMs| interval.
+  int packet_samples = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
+  AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR,
+                         kNumAudioChannels,
+                         kAudioSampleRate,
+                         kNumBitsPerAudioSample,
+                         packet_samples);
+  audio_controller_ = AudioInputController::Create(this, params);
+  DCHECK(audio_controller_.get());
+
+  VLOG(1) << "SpeechRecognizer starting record.";
+  num_samples_recorded_ = 0;
+  audio_controller_->Record();
+  return true;
+}
+
+// Aborts the current session (IO thread only). Closes the audio controller if
+// recording is still in progress, then discards the encoder and any request.
+// This function itself makes no delegate calls.
+void SpeechRecognizer::CancelRecognition() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  DCHECK(audio_controller_.get() || request_.get());
+
+  const bool still_recording = audio_controller_.get() != NULL;
+  if (still_recording) {
+    VLOG(1) << "SpeechRecognizer stopping record.";
+    audio_controller_->Close();
+    audio_controller_ = NULL;  // Drops our reference to the controller.
+  }
+
+  VLOG(1) << "SpeechRecognizer canceling recognition.";
+  encoder_.reset();
+  request_.reset();
+}
+
+// Ends audio capture (IO thread only) and moves on to recognition. Closes the
+// audio controller, flushes the encoder and tells the delegate that recording
+// has completed. If any encoded audio exists it is sent in a recognition
+// request; otherwise the delegate is told that recognition has completed.
+// Calls made after recording has already stopped are ignored.
+void SpeechRecognizer::StopRecording() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+
+  // Recording already stopped, so we are in the recognition phase; there is
+  // nothing further to stop.
+  if (audio_controller_.get() == NULL)
+    return;
+
+  VLOG(1) << "SpeechRecognizer stopping record.";
+  audio_controller_->Close();
+  audio_controller_ = NULL;  // Drops our reference to the controller.
+  encoder_->Flush();
+
+  delegate_->DidCompleteRecording(caller_id_);
+
+  // The http request takes its POST data as a single string, so gather
+  // everything the encoder produced into one buffer before the encoder is
+  // discarded.
+  string encoded_mime_type = encoder_->mime_type();
+  string encoded_audio;
+  encoder_->GetEncodedData(&encoded_audio);
+  encoder_.reset();
+
+  if (!encoded_audio.empty()) {
+    DCHECK(!request_.get());
+    request_.reset(new SpeechRecognitionRequest(
+        Profile::GetDefaultRequestContext(), this));
+    request_->Send(language_, grammar_, hardware_info_, origin_url_,
+                   encoded_mime_type, encoded_audio);
+    return;
+  }
+
+  // No audio was obtained, so the recognition sequence ends here. Keep
+  // ourselves alive in case the delegate frees us during the callback.
+  scoped_refptr<SpeechRecognizer> self_ref(this);
+  delegate_->DidCompleteRecognition(caller_id_);
+}
+
+// Performs no work in this implementation; the body is empty.
+void SpeechRecognizer::ReleaseAudioBuffers() {}
+
+// Invoked in the audio thread.
+void SpeechRecognizer::OnError(AudioInputController* controller,
+ int error_code) {
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+ NewRunnableMethod(this,
+ &SpeechRecognizer::HandleOnError,
+ error_code));
+}
+
+void SpeechRecognizer::HandleOnError(int error_code) {
+ LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
+
+ // Check if we are still recording before canceling recognition, as
+ // recording might have been stopped after this error was posted to the queue
+ // by |OnError|.
+ if (!audio_controller_.get())
+ return;
+
+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
+}
+
+// Data callback from the audio input controller (presumably on the audio
+// thread, like |OnError| - confirm). Copies the |size| bytes at |data| into a
+// heap-allocated string and posts it to |HandleOnData| on the IO thread, which
+// deletes the copy. |controller| is not used.
+void SpeechRecognizer::OnData(AudioInputController* controller,
+                              const uint8* data, uint32 size) {
+  // Empty buffers can arrive when recording stops; that is normal.
+  if (!size)
+    return;
+
+  const char* bytes = reinterpret_cast<const char*>(data);
+  string* owned_copy = new string(bytes, size);
+  BrowserThread::PostTask(
+      BrowserThread::IO,
+      FROM_HERE,
+      NewRunnableMethod(this, &SpeechRecognizer::HandleOnData, owned_copy));
+}
+
+// Counterpart of |OnData|, posted to the IO thread. Takes ownership of |data|
+// and deletes it. While recording is active, the buffer is fed to the encoder
+// and the endpointer, after which this function:
+//   - finishes environment estimation once enough samples have arrived,
+//   - cancels with RECOGNIZER_ERROR_NO_SPEECH if no speech started in time,
+//   - reports a smoothed input volume to the delegate, and
+//   - stops recording when the endpointer reports that input is complete.
+void SpeechRecognizer::HandleOnData(string* data) {
+  // Recording may have been stopped after |OnData| queued this buffer; if so
+  // the buffer is simply discarded.
+  if (!audio_controller_.get()) {
+    delete data;
+    return;
+  }
+
+  // View the raw bytes as an array of shorts.
+  DCHECK((data->length() % sizeof(short)) == 0);
+  const short* pcm = reinterpret_cast<const short*>(data->data());
+  int sample_count = data->length() / sizeof(short);
+
+  encoder_->Encode(pcm, sample_count);
+  float rms;
+  endpointer_.ProcessAudio(pcm, sample_count, &rms);
+  delete data;  // |pcm| points into |data| and is dead from here on.
+  num_samples_recorded_ += sample_count;
+
+  if (endpointer_.IsEstimatingEnvironment()) {
+    // Once enough audio has been gathered for the environment estimate, move
+    // on to detecting speech / end of speech.
+    const int estimation_samples =
+        (kEndpointerEstimationTimeMs * kAudioSampleRate) / 1000;
+    if (num_samples_recorded_ >= estimation_samples) {
+      endpointer_.SetUserInputMode();
+      delegate_->DidCompleteEnvironmentEstimation(caller_id_);
+    }
+    return;  // Nothing more to do for a buffer used for estimation.
+  }
+
+  // Give up if the timeout has elapsed without any speech being heard.
+  const int no_speech_limit = kNoSpeechTimeoutSec * kAudioSampleRate;
+  if (!endpointer_.DidStartReceivingSpeech() &&
+      num_samples_recorded_ >= no_speech_limit) {
+    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
+    return;
+  }
+
+  // Map |rms| onto [0, 1] for the UI volume display, then move the displayed
+  // level towards it, using a different factor for rising and falling levels.
+  float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange;
+  level = std::min(std::max(0.0f, level), 1.0f);
+  const float smoothing =
+      (level > audio_level_) ? kUpSmoothingFactor : kDownSmoothingFactor;
+  audio_level_ += (level - audio_level_) * smoothing;
+  delegate_->SetInputVolume(caller_id_, audio_level_);
+
+  if (endpointer_.speech_input_complete())
+    StopRecording();
+
+  // TODO(satish): Once we have streaming POST, start sending the data received
+  // here as POST chunks.
+}
+
+// Receives the outcome of the recognition request. An empty |result| is
+// reported to the delegate as RECOGNIZER_ERROR_NO_RESULTS and recognition is
+// cancelled. Otherwise the delegate is handed |result| together with |error|,
+// and is then told that recognition has completed.
+// NOTE(review): |error| on its own does not select the error path here; it is
+// only forwarded. Confirm that passing an errored, non-empty result through to
+// the delegate is intended.
+void SpeechRecognizer::SetRecognitionResult(
+    bool error, const SpeechInputResultArray& result) {
+  if (result.empty()) {
+    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);
+    return;
+  }
+
+  // Guard against the delegate freeing us until we finish our job. The
+  // reference is taken before the first callback so that |delegate_| and
+  // |caller_id_| are still valid when the second callback is made, even if
+  // the delegate lets go of us while handling the first one.
+  scoped_refptr<SpeechRecognizer> me(this);
+  delegate_->SetRecognitionResult(caller_id_, error, result);
+  delegate_->DidCompleteRecognition(caller_id_);
+}
+
+void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
+ CancelRecognition();
+
+ // Guard against the delegate freeing us until we finish our job.
+ scoped_refptr<SpeechRecognizer> me(this);
+ delegate_->OnRecognizerError(caller_id_, error);
+}
+
+} // namespace speech_input