author    | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000
committer | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000
commit    | 50fab53bddb2c3cb24d5682c913a03226ccf49ef (patch)
tree      | bb04af83ca5f2be010e32c2e10cfd245117a4847 /content/browser/speech/speech_recognizer.cc
parent    | 5c557f37629dc12dfd99e8fb55c235c8c46a8098 (diff)
Move core pieces of speech from chrome to content.
TBR=satish
Review URL: http://codereview.chromium.org/6591024
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@76165 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content/browser/speech/speech_recognizer.cc')
-rw-r--r-- | content/browser/speech/speech_recognizer.cc | 262
1 file changed, 262 insertions, 0 deletions
diff --git a/content/browser/speech/speech_recognizer.cc b/content/browser/speech/speech_recognizer.cc
new file mode 100644
index 0000000..fdc1a4c
--- /dev/null
+++ b/content/browser/speech/speech_recognizer.cc
@@ -0,0 +1,262 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/browser/speech/speech_recognizer.h"
+
+#include "base/time.h"
+#include "chrome/browser/profiles/profile.h"
+#include "chrome/common/net/url_request_context_getter.h"
+#include "content/browser/browser_thread.h"
+
+using media::AudioInputController;
+using std::string;
+
+namespace {
+
+// The following constants are related to the volume level indicator shown in
+// the UI for recorded audio.
+// Multiplier used when new volume is greater than previous level.
+const float kUpSmoothingFactor = 0.9f;
+// Multiplier used when new volume is lesser than previous level.
+const float kDownSmoothingFactor = 0.4f;
+const float kAudioMeterMinDb = 10.0f;  // Lower bar for volume meter.
+const float kAudioMeterDbRange = 25.0f;
+}  // namespace
+
+namespace speech_input {
+
+const int SpeechRecognizer::kAudioSampleRate = 16000;
+const int SpeechRecognizer::kAudioPacketIntervalMs = 100;
+const int SpeechRecognizer::kNumAudioChannels = 1;
+const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
+const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
+const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;
+
+SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
+                                   int caller_id,
+                                   const std::string& language,
+                                   const std::string& grammar,
+                                   const std::string& hardware_info,
+                                   const std::string& origin_url)
+    : delegate_(delegate),
+      caller_id_(caller_id),
+      language_(language),
+      grammar_(grammar),
+      hardware_info_(hardware_info),
+      origin_url_(origin_url),
+      codec_(AudioEncoder::CODEC_SPEEX),
+      encoder_(NULL),
+      endpointer_(kAudioSampleRate),
+      num_samples_recorded_(0),
+      audio_level_(0.0f) {
+  endpointer_.set_speech_input_complete_silence_length(
+      base::Time::kMicrosecondsPerSecond / 2);
+  endpointer_.set_long_speech_input_complete_silence_length(
+      base::Time::kMicrosecondsPerSecond);
+  endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
+  endpointer_.StartSession();
+}
+
+SpeechRecognizer::~SpeechRecognizer() {
+  // Recording should have stopped earlier due to the endpointer or
+  // |StopRecording| being called.
+  DCHECK(!audio_controller_.get());
+  DCHECK(!request_.get() || !request_->HasPendingRequest());
+  DCHECK(!encoder_.get());
+  endpointer_.EndSession();
+}
+
+bool SpeechRecognizer::StartRecording() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  DCHECK(!audio_controller_.get());
+  DCHECK(!request_.get() || !request_->HasPendingRequest());
+  DCHECK(!encoder_.get());
+
+  // The endpointer needs to estimate the environment/background noise before
+  // starting to treat the audio as user input. In |HandleOnData| we wait until
+  // such time has passed before switching to user input mode.
+  endpointer_.SetEnvironmentEstimationMode();
+
+  encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,
+                                      kNumBitsPerAudioSample));
+  int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
+  AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
+                         kAudioSampleRate, kNumBitsPerAudioSample,
+                         samples_per_packet);
+  audio_controller_ = AudioInputController::Create(this, params);
+  DCHECK(audio_controller_.get());
+  VLOG(1) << "SpeechRecognizer starting record.";
+  num_samples_recorded_ = 0;
+  audio_controller_->Record();
+
+  return true;
+}
+
+void SpeechRecognizer::CancelRecognition() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  DCHECK(audio_controller_.get() || request_.get());
+
+  // Stop recording if required.
+  if (audio_controller_.get()) {
+    VLOG(1) << "SpeechRecognizer stopping record.";
+    audio_controller_->Close();
+    audio_controller_ = NULL;  // Releases the ref ptr.
+  }
+
+  VLOG(1) << "SpeechRecognizer canceling recognition.";
+  encoder_.reset();
+  request_.reset();
+}
+
+void SpeechRecognizer::StopRecording() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+
+  // If audio recording has already stopped and we are in recognition phase,
+  // silently ignore any more calls to stop recording.
+  if (!audio_controller_.get())
+    return;
+
+  VLOG(1) << "SpeechRecognizer stopping record.";
+  audio_controller_->Close();
+  audio_controller_ = NULL;  // Releases the ref ptr.
+  encoder_->Flush();
+
+  delegate_->DidCompleteRecording(caller_id_);
+
+  // Since the http request takes a single string as POST data, allocate
+  // one and copy over bytes from the audio buffers to the string.
+  // And if we haven't got any audio yet, end the recognition sequence here.
+  string mime_type = encoder_->mime_type();
+  string data;
+  encoder_->GetEncodedData(&data);
+  encoder_.reset();
+
+  if (data.empty()) {
+    // Guard against the delegate freeing us until we finish our job.
+    scoped_refptr<SpeechRecognizer> me(this);
+    delegate_->DidCompleteRecognition(caller_id_);
+  } else {
+    DCHECK(!request_.get());
+    request_.reset(new SpeechRecognitionRequest(
+        Profile::GetDefaultRequestContext(), this));
+    request_->Send(language_, grammar_, hardware_info_, origin_url_,
+                   mime_type, data);
+  }
+}
+
+void SpeechRecognizer::ReleaseAudioBuffers() {
+}
+
+// Invoked in the audio thread.
+void SpeechRecognizer::OnError(AudioInputController* controller,
+                               int error_code) {
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          NewRunnableMethod(this,
+                                            &SpeechRecognizer::HandleOnError,
+                                            error_code));
+}
+
+void SpeechRecognizer::HandleOnError(int error_code) {
+  LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
+
+  // Check if we are still recording before canceling recognition, as
+  // recording might have been stopped after this error was posted to the
+  // queue by |OnError|.
+  if (!audio_controller_.get())
+    return;
+
+  InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
+}
+
+void SpeechRecognizer::OnData(AudioInputController* controller,
+                              const uint8* data, uint32 size) {
+  if (size == 0)  // This could happen when recording stops and is normal.
+    return;
+
+  string* str_data = new string(reinterpret_cast<const char*>(data), size);
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          NewRunnableMethod(this,
+                                            &SpeechRecognizer::HandleOnData,
+                                            str_data));
+}
+
+void SpeechRecognizer::HandleOnData(string* data) {
+  // Check if we are still recording and if not discard this buffer, as
+  // recording might have been stopped after this buffer was posted to the
+  // queue by |OnData|.
+  if (!audio_controller_.get()) {
+    delete data;
+    return;
+  }
+
+  const short* samples = reinterpret_cast<const short*>(data->data());
+  DCHECK((data->length() % sizeof(short)) == 0);
+  int num_samples = data->length() / sizeof(short);
+
+  encoder_->Encode(samples, num_samples);
+  float rms;
+  endpointer_.ProcessAudio(samples, num_samples, &rms);
+  delete data;
+  num_samples_recorded_ += num_samples;
+
+  if (endpointer_.IsEstimatingEnvironment()) {
+    // Check if we have gathered enough audio for the endpointer to do
+    // environment estimation and should move on to detect speech/end of
+    // speech.
+    if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
+                                  kAudioSampleRate) / 1000) {
+      endpointer_.SetUserInputMode();
+      delegate_->DidCompleteEnvironmentEstimation(caller_id_);
+    }
+    return;  // No more processing since we are still estimating environment.
+  }
+
+  // Check if we have waited too long without hearing any speech.
+  if (!endpointer_.DidStartReceivingSpeech() &&
+      num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
+    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
+    return;
+  }
+
+  // Calculate the input volume to display in the UI, smoothing towards the
+  // new level.
+  float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange;
+  level = std::min(std::max(0.0f, level), 1.0f);
+  if (level > audio_level_) {
+    audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
+  } else {
+    audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
+  }
+  delegate_->SetInputVolume(caller_id_, audio_level_);
+
+  if (endpointer_.speech_input_complete()) {
+    StopRecording();
+  }
+
+  // TODO(satish): Once we have streaming POST, start sending the data received
+  // here as POST chunks.
+}
+
+void SpeechRecognizer::SetRecognitionResult(
+    bool error, const SpeechInputResultArray& result) {
+  if (result.empty()) {
+    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);
+    return;
+  }
+
+  delegate_->SetRecognitionResult(caller_id_, error, result);
+
+  // Guard against the delegate freeing us until we finish our job.
+  scoped_refptr<SpeechRecognizer> me(this);
+  delegate_->DidCompleteRecognition(caller_id_);
+}
+
+void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
+  CancelRecognition();
+
+  // Guard against the delegate freeing us until we finish our job.
+  scoped_refptr<SpeechRecognizer> me(this);
+  delegate_->OnRecognizerError(caller_id_, error);
+}
+
+}  // namespace speech_input
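
As an aside, the volume-meter math in |HandleOnData| reads well in isolation: the RMS value reported by the endpointer is mapped into [0, 1] using kAudioMeterMinDb and kAudioMeterDbRange, then the displayed level is smoothed asymmetrically so the meter rises quickly (kUpSmoothingFactor) and falls more slowly (kDownSmoothingFactor). The following is a minimal standalone sketch of just that calculation; SmoothAudioLevel, the driver loop, and the RMS readings are illustrative inventions, not part of the Chromium source.

// Standalone sketch (illustrative only, not Chromium code) of the UI
// volume-meter smoothing performed in SpeechRecognizer::HandleOnData().
// The constants mirror the ones defined in the diff above; the RMS
// readings in main() are made up.
#include <algorithm>
#include <cstddef>
#include <cstdio>

namespace {

const float kUpSmoothingFactor = 0.9f;    // New volume above current level.
const float kDownSmoothingFactor = 0.4f;  // New volume below current level.
const float kAudioMeterMinDb = 10.0f;     // Lower bar for the volume meter.
const float kAudioMeterDbRange = 25.0f;

// Normalizes an RMS value to [0, 1] and smooths the meter towards it,
// rising faster than it falls, as HandleOnData() does before calling
// delegate_->SetInputVolume().
float SmoothAudioLevel(float audio_level, float rms) {
  float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange;
  level = std::min(std::max(0.0f, level), 1.0f);
  const float factor =
      (level > audio_level) ? kUpSmoothingFactor : kDownSmoothingFactor;
  return audio_level + (level - audio_level) * factor;
}

}  // namespace

int main() {
  // Hypothetical per-packet RMS readings, one per 100 ms audio packet.
  const float rms_values[] = { 12.0f, 30.0f, 35.0f, 20.0f, 11.0f };
  float audio_level = 0.0f;
  for (size_t i = 0; i < sizeof(rms_values) / sizeof(rms_values[0]); ++i) {
    audio_level = SmoothAudioLevel(audio_level, rms_values[i]);
    std::printf("rms=%.1f dB -> meter level %.3f\n",
                rms_values[i], audio_level);
  }
  return 0;
}

For the other constants at the top of the file: each audio packet carries (16000 * 100) / 1000 = 1600 samples, environment estimation completes after 300 ms (4800 samples), and the no-speech timeout fires once 8 * 16000 = 128000 samples have been recorded without detected speech.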