// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognizer_impl.h"

#include "base/bind.h"
#include "base/time.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_one_shot_remote_engine.h"
#include "content/public/browser/browser_thread.h"
#include "content/public/browser/speech_recognition_event_listener.h"
#include "content/public/browser/speech_recognizer.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "net/url_request/url_request_context_getter.h"

using content::BrowserMainLoop;
using content::BrowserThread;
using content::SpeechRecognitionError;
using content::SpeechRecognitionEventListener;
using content::SpeechRecognitionResult;
using content::SpeechRecognizer;
using media::AudioInputController;

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when new volume is greater than previous level.
const float kUpSmoothingFactor = 1.0f;
// Multiplier used when new volume is lesser than previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
const float kAudioMeterMaxDb = 90.31f;
// This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.
// Values lower than this will display as empty level-meter.
const float kAudioMeterMinDb = 30.0f;
const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;
// Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;
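
// Illustrative note (not part of the original file): 90.31 dB is
// approximately 20 * log10(2^15), i.e. the dB value of the full 16-bit
// sample range. As a worked example of the meter mapping used in
// |HandleOnData|, an RMS of 60 dB maps to a level of
//   (60 - 30) / ((90.31 - 30) / (47/48)) ~ 0.49,
// roughly half of the unclipped meter range.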
// Returns true if more than 5% of the samples are at min or max value.
bool DetectClipping(const speech::AudioChunk& chunk) {
  const int num_samples = chunk.NumSamples();
  const int16* samples = chunk.SamplesData16();
  const int kThreshold = num_samples / 20;
  int clipping_samples = 0;
  for (int i = 0; i < num_samples; ++i) {
    if (samples[i] <= -32767 || samples[i] >= 32767) {
      if (++clipping_samples > kThreshold)
        return true;
    }
  }
  return false;
}

void OnAudioClosed(AudioInputController*) {}

}  // namespace

SpeechRecognizer* SpeechRecognizer::Create(
    SpeechRecognitionEventListener* listener,
    int caller_id,
    const std::string& language,
    const std::string& grammar,
    net::URLRequestContextGetter* context_getter,
    bool filter_profanities,
    const std::string& hardware_info,
    const std::string& origin_url) {
  return new speech::SpeechRecognizerImpl(listener,
                                          caller_id,
                                          language,
                                          grammar,
                                          context_getter,
                                          filter_profanities,
                                          hardware_info,
                                          origin_url);
}

namespace speech {

const int SpeechRecognizerImpl::kAudioSampleRate = 16000;
const ChannelLayout SpeechRecognizerImpl::kChannelLayout = CHANNEL_LAYOUT_MONO;
const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;

SpeechRecognizerImpl::SpeechRecognizerImpl(
    SpeechRecognitionEventListener* listener,
    int caller_id,
    const std::string& language,
    const std::string& grammar,
    net::URLRequestContextGetter* context_getter,
    bool filter_profanities,
    const std::string& hardware_info,
    const std::string& origin_url)
    : listener_(listener),
      testing_audio_manager_(NULL),
      endpointer_(kAudioSampleRate),
      context_getter_(context_getter),
      caller_id_(caller_id),
      language_(language),
      grammar_(grammar),
      filter_profanities_(filter_profanities),
      hardware_info_(hardware_info),
      origin_url_(origin_url),
      num_samples_recorded_(0),
      audio_level_(0.0f) {
  DCHECK(listener_ != NULL);
  // The endpointer silence lengths below are expressed in microseconds:
  // speech input is considered complete after 0.5 s of silence, or 1 s once
  // the utterance qualifies as "long speech" (after 3 s of input).
  endpointer_.set_speech_input_complete_silence_length(
      base::Time::kMicrosecondsPerSecond / 2);
  endpointer_.set_long_speech_input_complete_silence_length(
      base::Time::kMicrosecondsPerSecond);
  endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  endpointer_.StartSession();
}

SpeechRecognizerImpl::~SpeechRecognizerImpl() {
  // Recording should have stopped earlier due to the endpointer or
  // |StopRecording| being called.
  DCHECK(!audio_controller_.get());
  DCHECK(!recognition_engine_.get() ||
         !recognition_engine_->IsRecognitionPending());
  endpointer_.EndSession();
}
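
// Illustrative note (the packet interval constant is defined in
// GoogleOneShotRemoteEngine, not in this file): |StartRecognition| sizes
// audio packets as
//   samples_per_packet = 16000 Hz * kAudioPacketIntervalMs / 1000,
// so a hypothetical 100 ms interval would yield 1600 samples per packet.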
void SpeechRecognizerImpl::StartRecognition() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK(!audio_controller_.get());
  DCHECK(!recognition_engine_.get() ||
         !recognition_engine_->IsRecognitionPending());

  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. In |HandleOnData| we wait until
  // such time has passed before switching to user input mode.
  endpointer_.SetEnvironmentEstimationMode();

  AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
                                testing_audio_manager_ :
                                BrowserMainLoop::GetAudioManager();
  const int samples_per_packet = kAudioSampleRate *
      GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000;
  AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
                         kAudioSampleRate, kNumBitsPerAudioSample,
                         samples_per_packet);
  audio_controller_ = AudioInputController::Create(audio_manager, this, params);
  DCHECK(audio_controller_.get());
  VLOG(1) << "SpeechRecognizer starting record.";
  num_samples_recorded_ = 0;
  audio_controller_->Record();
}

void SpeechRecognizerImpl::AbortRecognition() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK(audio_controller_.get() || recognition_engine_.get());

  // Stop recording if required.
  if (audio_controller_.get()) {
    CloseAudioControllerAsynchronously();
  }

  VLOG(1) << "SpeechRecognizer canceling recognition.";
  recognition_engine_.reset();
}

void SpeechRecognizerImpl::StopAudioCapture() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

  // If audio recording has already stopped and we are in recognition phase,
  // silently ignore any more calls to stop recording.
  if (!audio_controller_.get())
    return;

  CloseAudioControllerAsynchronously();
  listener_->OnSoundEnd(caller_id_);
  listener_->OnAudioEnd(caller_id_);

  // If we haven't got any audio yet end the recognition sequence here.
  if (recognition_engine_ == NULL) {
    // Guard against the listener freeing us until we finish our job.
    scoped_refptr<SpeechRecognizerImpl> me(this);
    listener_->OnRecognitionEnd(caller_id_);
  } else {
    recognition_engine_->AudioChunksEnded();
  }
}

// Invoked in the audio thread.
void SpeechRecognizerImpl::OnError(AudioInputController* controller,
                                   int error_code) {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::HandleOnError,
                                     this, error_code));
}

void SpeechRecognizerImpl::HandleOnError(int error_code) {
  LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;

  // Check if we are still recording before canceling recognition, as
  // recording might have been stopped after this error was posted to the
  // queue by |OnError|.
  if (!audio_controller_.get())
    return;

  InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);
}

void SpeechRecognizerImpl::OnData(AudioInputController* controller,
                                  const uint8* data, uint32 size) {
  if (size == 0)  // This could happen when recording stops and is normal.
    return;
  scoped_refptr<AudioChunk> raw_audio(
      new AudioChunk(data, static_cast<size_t>(size),
                     kNumBitsPerAudioSample / 8));
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          base::Bind(&SpeechRecognizerImpl::HandleOnData,
                                     this, raw_audio));
}
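
// Note (added for clarity): |OnError| and |OnData| above run on the audio
// thread and only bounce their work to the IO thread. Binding |this| into
// the posted task keeps the recognizer, which is reference-counted, alive
// until the IO-thread handler below has run.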
void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) {
  // Check if we are still recording and if not discard this buffer, as
  // recording might have been stopped after this buffer was posted to the
  // queue by |OnData|.
  if (!audio_controller_.get())
    return;

  bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();

  float rms;
  endpointer_.ProcessAudio(*raw_audio, &rms);
  bool did_clip = DetectClipping(*raw_audio);
  num_samples_recorded_ += raw_audio->NumSamples();

  if (recognition_engine_ == NULL) {
    // This was the first audio packet recorded, so start a request to the
    // server to send the data and inform the listener.
    listener_->OnAudioStart(caller_id_);
    GoogleOneShotRemoteEngineConfig google_sr_config;
    google_sr_config.language = language_;
    google_sr_config.grammar = grammar_;
    google_sr_config.audio_sample_rate = kAudioSampleRate;
    google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample;
    google_sr_config.filter_profanities = filter_profanities_;
    google_sr_config.hardware_info = hardware_info_;
    google_sr_config.origin_url = origin_url_;
    GoogleOneShotRemoteEngine* google_sr_engine =
        new GoogleOneShotRemoteEngine(context_getter_.get());
    google_sr_engine->SetConfig(google_sr_config);
    recognition_engine_.reset(google_sr_engine);
    recognition_engine_->set_delegate(this);
    recognition_engine_->StartRecognition();
  }

  recognition_engine_->TakeAudioChunk(*raw_audio);

  if (endpointer_.IsEstimatingEnvironment()) {
    // Check if we have gathered enough audio for the endpointer to do
    // environment estimation and should move on to detect speech/end of
    // speech.
    if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
                                  kAudioSampleRate) / 1000) {
      endpointer_.SetUserInputMode();
      listener_->OnEnvironmentEstimationComplete(caller_id_);
    }
    return;  // No more processing since we are still estimating environment.
  }

  // Check if we have waited too long without hearing any speech.
  bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();
  if (!speech_was_heard_after_packet &&
      num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) {
    InformErrorAndAbortRecognition(
        content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);
    return;
  }

  if (!speech_was_heard_before_packet && speech_was_heard_after_packet)
    listener_->OnSoundStart(caller_id_);

  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  float level = (rms - kAudioMeterMinDb) /
                (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
  if (level > audio_level_) {
    audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
  } else {
    audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
  }

  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
                      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  noise_level = std::min(std::max(0.0f, noise_level),
                         kAudioMeterRangeMaxUnclipped);

  listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,
                                 noise_level);

  if (endpointer_.speech_input_complete())
    StopAudioCapture();
}
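
// For concreteness (numbers derived from the constants above, not stated in
// the original): at 16 kHz mono, environment estimation consumes
// 300 ms * 16000 / 1000 = 4800 samples, and the no-speech timeout fires
// after (8000 ms / 1000) * 16000 = 128,000 samples without detected speech.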
void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
    const content::SpeechRecognitionResult& result) {
  // Guard against the listener freeing us until we finish our job.
  scoped_refptr<SpeechRecognizerImpl> me(this);
  listener_->OnRecognitionResult(caller_id_, result);
  listener_->OnRecognitionEnd(caller_id_);
}

void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
    const content::SpeechRecognitionError& error) {
  InformErrorAndAbortRecognition(error.code);
}

void SpeechRecognizerImpl::InformErrorAndAbortRecognition(
    content::SpeechRecognitionErrorCode error) {
  DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE);
  AbortRecognition();

  // Guard against the listener freeing us until we finish our job.
  scoped_refptr<SpeechRecognizerImpl> me(this);
  listener_->OnRecognitionError(caller_id_, error);
}

void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
  VLOG(1) << "SpeechRecognizer stopping record.";
  // Issues a Close on the audio controller, passing an empty callback. The
  // only purpose of such callback is to keep the audio controller refcounted
  // until Close has completed (in the audio thread) and automatically destroy
  // it afterwards (upon return from OnAudioClosed).
  audio_controller_->Close(base::Bind(&OnAudioClosed, audio_controller_));
  audio_controller_ = NULL;  // The controller is still refcounted by Bind.
}

bool SpeechRecognizerImpl::IsActive() const {
  return (recognition_engine_.get() != NULL);
}

bool SpeechRecognizerImpl::IsCapturingAudio() const {
  return (audio_controller_.get() != NULL);
}

const SpeechRecognitionEngine&
    SpeechRecognizerImpl::recognition_engine() const {
  return *(recognition_engine_.get());
}

void SpeechRecognizerImpl::SetAudioManagerForTesting(
    AudioManager* audio_manager) {
  testing_audio_manager_ = audio_manager;
}

}  // namespace speech