diff options
author | primiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-04-13 13:06:39 +0000 |
---|---|---|
committer | primiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-04-13 13:06:39 +0000 |
commit | 2ba0644d32705803938d2022562d2e42e5ac7615 (patch) | |
tree | ab1a3973ce11d8fcb5855b89c6287dfd9eb66230 /content | |
parent | 0d2dafb39d52717a30631e7104a9c60fa6b0e57b (diff) | |
download | chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.zip chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.tar.gz chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.tar.bz2 |
Speech refactoring: Reimplemented speech_recognizer as a FSM (CL1.5)
BUG=116954
TEST=none
Review URL: http://codereview.chromium.org/9835049
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@132179 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content')
-rw-r--r-- | content/browser/speech/google_one_shot_remote_engine.h | 2 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl.cc | 642 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl.h | 125 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl_unittest.cc | 137 |
4 files changed, 653 insertions, 253 deletions
diff --git a/content/browser/speech/google_one_shot_remote_engine.h b/content/browser/speech/google_one_shot_remote_engine.h index 236ac94..7e47c67 100644 --- a/content/browser/speech/google_one_shot_remote_engine.h +++ b/content/browser/speech/google_one_shot_remote_engine.h @@ -31,7 +31,7 @@ namespace speech { class AudioChunk; -struct GoogleOneShotRemoteEngineConfig { +struct CONTENT_EXPORT GoogleOneShotRemoteEngineConfig { std::string language; std::string grammar; bool filter_profanities; diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc index 07bd75e..d9d9bab 100644 --- a/content/browser/speech/speech_recognizer_impl.cc +++ b/content/browser/speech/speech_recognizer_impl.cc @@ -4,6 +4,7 @@ #include "content/browser/speech/speech_recognizer_impl.h" +#include "base/basictypes.h" #include "base/bind.h" #include "base/time.h" #include "content/browser/browser_main_loop.h" @@ -24,6 +25,7 @@ using content::SpeechRecognitionResult; using content::SpeechRecognizer; using media::AudioInputController; using media::AudioManager; +using media::AudioParameters; namespace { @@ -49,6 +51,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) { const int16* samples = chunk.SamplesData16(); const int kThreshold = num_samples / 20; int clipping_samples = 0; + for (int i = 0; i < num_samples; ++i) { if (samples[i] <= -32767 || samples[i] >= 32767) { if (++clipping_samples > kThreshold) @@ -69,14 +72,25 @@ SpeechRecognizer* SpeechRecognizer::Create( bool filter_profanities, const std::string& hardware_info, const std::string& origin_url) { + speech::GoogleOneShotRemoteEngineConfig remote_engine_config; + remote_engine_config.language = language; + remote_engine_config.grammar = grammar; + remote_engine_config.audio_sample_rate = + speech::SpeechRecognizerImpl::kAudioSampleRate; + remote_engine_config.audio_num_bits_per_sample = + speech::SpeechRecognizerImpl::kNumBitsPerAudioSample; + 
remote_engine_config.filter_profanities = filter_profanities; + remote_engine_config.hardware_info = hardware_info; + remote_engine_config.origin_url = origin_url; + + // SpeechRecognizerImpl takes ownership of google_remote_engine. + speech::GoogleOneShotRemoteEngine* google_remote_engine = + new speech::GoogleOneShotRemoteEngine(context_getter); + google_remote_engine->SetConfig(remote_engine_config); + return new speech::SpeechRecognizerImpl(listener, caller_id, - language, - grammar, - context_getter, - filter_profanities, - hardware_info, - origin_url); + google_remote_engine); } namespace speech { @@ -87,247 +101,488 @@ const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; +COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, + kNumBitsPerAudioSample_must_be_a_multiple_of_8); + SpeechRecognizerImpl::SpeechRecognizerImpl( SpeechRecognitionEventListener* listener, int caller_id, - const std::string& language, - const std::string& grammar, - net::URLRequestContextGetter* context_getter, - bool filter_profanities, - const std::string& hardware_info, - const std::string& origin_url) + SpeechRecognitionEngine* engine) : listener_(listener), testing_audio_manager_(NULL), + recognition_engine_(engine), endpointer_(kAudioSampleRate), - context_getter_(context_getter), caller_id_(caller_id), - language_(language), - grammar_(grammar), - filter_profanities_(filter_profanities), - hardware_info_(hardware_info), - origin_url_(origin_url), - num_samples_recorded_(0), - audio_level_(0.0f) { + is_dispatching_event_(false), + state_(STATE_IDLE) { DCHECK(listener_ != NULL); + DCHECK(recognition_engine_ != NULL); endpointer_.set_speech_input_complete_silence_length( base::Time::kMicrosecondsPerSecond / 2); endpointer_.set_long_speech_input_complete_silence_length( base::Time::kMicrosecondsPerSecond); 
endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); endpointer_.StartSession(); + recognition_engine_->set_delegate(this); } SpeechRecognizerImpl::~SpeechRecognizerImpl() { - // Recording should have stopped earlier due to the endpointer or - // |StopRecording| being called. - DCHECK(!audio_controller_.get()); - DCHECK(!recognition_engine_.get() || - !recognition_engine_->IsRecognitionPending()); endpointer_.EndSession(); } -void SpeechRecognizerImpl::StartRecognition() { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - DCHECK(!audio_controller_.get()); - DCHECK(!recognition_engine_.get() || - !recognition_engine_->IsRecognitionPending()); +// ------- Methods that trigger Finite State Machine (FSM) events ------------ - // The endpointer needs to estimate the environment/background noise before - // starting to treat the audio as user input. In |HandleOnData| we wait until - // such time has passed before switching to user input mode. - endpointer_.SetEnvironmentEstimationMode(); +// NOTE:all the external events and requests should be enqueued (PostTask), even +// if they come from the same (IO) thread, in order to preserve the relationship +// of causality between events and avoid interleaved event processing due to +// synchronous callbacks. - AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? 
- testing_audio_manager_ : BrowserMainLoop::GetAudioManager(); - const int samples_per_packet = kAudioSampleRate * - GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000; - media::AudioParameters params( - media::AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, - kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet); - audio_controller_ = AudioInputController::Create(audio_manager, this, params); - DCHECK(audio_controller_.get()); - VLOG(1) << "SpeechRecognizer starting record."; - num_samples_recorded_ = 0; - audio_controller_->Record(); +void SpeechRecognizerImpl::StartRecognition() { + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, FSMEventArgs(EVENT_START))); } void SpeechRecognizerImpl::AbortRecognition() { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - DCHECK(audio_controller_.get() || recognition_engine_.get()); - - // Stop recording if required. - if (audio_controller_.get()) { - CloseAudioControllerAsynchronously(); - } - - VLOG(1) << "SpeechRecognizer canceling recognition."; - recognition_engine_.reset(); + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, FSMEventArgs(EVENT_ABORT))); } void SpeechRecognizerImpl::StopAudioCapture() { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - - // If audio recording has already stopped and we are in recognition phase, - // silently ignore any more calls to stop recording. - if (!audio_controller_.get()) - return; + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, FSMEventArgs(EVENT_STOP_CAPTURE))); +} - CloseAudioControllerAsynchronously(); - listener_->OnSoundEnd(caller_id_); - listener_->OnAudioEnd(caller_id_); +bool SpeechRecognizerImpl::IsActive() const { + // Checking the FSM state from another thread (thus, while the FSM is + // potentially concurrently evolving) is meaningless. 
+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + return state_ != STATE_IDLE; +} - // If we haven't got any audio yet end the recognition sequence here. - if (recognition_engine_ == NULL) { - // Guard against the listener freeing us until we finish our job. - scoped_refptr<SpeechRecognizerImpl> me(this); - listener_->OnRecognitionEnd(caller_id_); - } else { - recognition_engine_->AudioChunksEnded(); - } +bool SpeechRecognizerImpl::IsCapturingAudio() const { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). + const bool is_capturing_audio = state_ >= STATE_STARTING && + state_ <= STATE_RECOGNIZING; + DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || + (!is_capturing_audio && audio_controller_.get() == NULL)); + return is_capturing_audio; } // Invoked in the audio thread. void SpeechRecognizerImpl::OnError(AudioInputController* controller, int error_code) { + FSMEventArgs event_args(EVENT_AUDIO_ERROR); + event_args.audio_error_code = error_code; BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizerImpl::HandleOnError, - this, error_code)); + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, event_args)); } -void SpeechRecognizerImpl::HandleOnError(int error_code) { - LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; - - // Check if we are still recording before canceling recognition, as - // recording might have been stopped after this error was posted to the queue - // by |OnError|. - if (!audio_controller_.get()) +void SpeechRecognizerImpl::OnData(AudioInputController* controller, + const uint8* data, uint32 size) { + if (size == 0) // This could happen when audio capture stops and is normal. 
return; - InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO); + FSMEventArgs event_args(EVENT_AUDIO_DATA); + event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size), + kNumBitsPerAudioSample / 8); + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, event_args)); } -void SpeechRecognizerImpl::OnData(AudioInputController* controller, - const uint8* data, uint32 size) { - if (size == 0) // This could happen when recording stops and is normal. - return; - scoped_refptr<AudioChunk> raw_audio( - new AudioChunk(data, - static_cast<size_t>(size), - kNumBitsPerAudioSample / 8)); +void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} + +void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( + const content::SpeechRecognitionResult& result) { + FSMEventArgs event_args(EVENT_ENGINE_RESULT); + event_args.engine_result = result; BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizerImpl::HandleOnData, - this, raw_audio)); + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, event_args)); } -void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) { - // Check if we are still recording and if not discard this buffer, as - // recording might have been stopped after this buffer was posted to the queue - // by |OnData|. - if (!audio_controller_.get()) - return; +void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( + const content::SpeechRecognitionError& error) { + FSMEventArgs event_args(EVENT_ENGINE_ERROR); + event_args.engine_error = error; + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, event_args)); +} + +// ----------------------- Core FSM implementation --------------------------- +// TODO(primiano) After the changes in the media package (r129173), this class +// slightly violates the SpeechRecognitionEventListener interface contract. 
In +// particular, it is not true anymore that this class can be freed after the +// OnRecognitionEnd event, since the audio_controller_.Close() asynchronous +// call can be still in progress after the end event. Currently, it does not +// represent a problem for the browser itself, since refcounting protects us +// against such race conditions. However, we should fix this in the next CLs. +// For instance, tests are currently working just because the +// TestAudioInputController is not closing asynchronously as the real controller +// does, but they will become flaky if TestAudioInputController will be fixed. + +void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + DCHECK_LE(event_args.event, EVENT_MAX_VALUE); + DCHECK_LE(state_, STATE_MAX_VALUE); + + // Event dispatching must be sequential, otherwise it will break all the rules + // and the assumptions of the finite state automata model. + DCHECK(!is_dispatching_event_); + is_dispatching_event_ = true; - bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); - - float rms; - endpointer_.ProcessAudio(*raw_audio, &rms); - bool did_clip = DetectClipping(*raw_audio); - num_samples_recorded_ += raw_audio->NumSamples(); - - if (recognition_engine_ == NULL) { - // This was the first audio packet recorded, so start a request to the - // server to send the data and inform the listener. 
- listener_->OnAudioStart(caller_id_); - GoogleOneShotRemoteEngineConfig google_sr_config; - google_sr_config.language = language_; - google_sr_config.grammar = grammar_; - google_sr_config.audio_sample_rate = kAudioSampleRate; - google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample; - google_sr_config.filter_profanities = filter_profanities_; - google_sr_config.hardware_info = hardware_info_; - google_sr_config.origin_url = origin_url_; - GoogleOneShotRemoteEngine* google_sr_engine = - new GoogleOneShotRemoteEngine(context_getter_.get()); - google_sr_engine->SetConfig(google_sr_config); - recognition_engine_.reset(google_sr_engine); - recognition_engine_->set_delegate(this); - recognition_engine_->StartRecognition(); + // Guard against the delegate freeing us until we finish processing the event. + scoped_refptr<SpeechRecognizerImpl> me(this); + + if (event_args.event == EVENT_AUDIO_DATA) { + DCHECK(event_args.audio_data.get() != NULL); + ProcessAudioPipeline(*event_args.audio_data); } - recognition_engine_->TakeAudioChunk(*raw_audio); + // The audio pipeline must be processed before the event dispatch, otherwise + // it would take actions according to the future state instead of the current. + state_ = ExecuteTransitionAndGetNextState(event_args); - if (endpointer_.IsEstimatingEnvironment()) { - // Check if we have gathered enough audio for the endpointer to do - // environment estimation and should move on to detect speech/end of speech. - if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * - kAudioSampleRate) / 1000) { - endpointer_.SetUserInputMode(); - listener_->OnEnvironmentEstimationComplete(caller_id_); - } - return; // No more processing since we are still estimating environment. 
+ is_dispatching_event_ = false; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( + const FSMEventArgs& event_args) { + const FSMEvent event = event_args.event; + switch (state_) { + case STATE_IDLE: + switch (event) { + // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and + // EVENT_STOP_CAPTURE below once speech input extensions are fixed. + case EVENT_ABORT: + return DoNothing(event_args); + case EVENT_START: + return StartRecording(event_args); + case EVENT_STOP_CAPTURE: // Corner cases related to queued messages + case EVENT_AUDIO_DATA: // being lately dispatched. + case EVENT_ENGINE_RESULT: + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return DoNothing(event_args); + } + break; + case STATE_STARTING: + switch (event) { + case EVENT_ABORT: + return Abort(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + return Abort(event_args); + case EVENT_AUDIO_DATA: + return StartRecognitionEngine(event_args); + case EVENT_ENGINE_RESULT: + return NotFeasible(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return Abort(event_args); + } + break; + case STATE_ESTIMATING_ENVIRONMENT: + switch (event) { + case EVENT_ABORT: + return Abort(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + return StopCaptureAndWaitForResult(event_args); + case EVENT_AUDIO_DATA: + return WaitEnvironmentEstimationCompletion(event_args); + case EVENT_ENGINE_RESULT: + return ProcessIntermediateResult(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return Abort(event_args); + } + break; + case STATE_WAITING_FOR_SPEECH: + switch (event) { + case EVENT_ABORT: + return Abort(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + return StopCaptureAndWaitForResult(event_args); + case EVENT_AUDIO_DATA: + return DetectUserSpeechOrTimeout(event_args); + case 
EVENT_ENGINE_RESULT: + return ProcessIntermediateResult(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return Abort(event_args); + } + break; + case STATE_RECOGNIZING: + switch (event) { + case EVENT_ABORT: + return Abort(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + return StopCaptureAndWaitForResult(event_args); + case EVENT_AUDIO_DATA: + return DetectEndOfSpeech(event_args); + case EVENT_ENGINE_RESULT: + return ProcessIntermediateResult(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return Abort(event_args); + } + break; + case STATE_WAITING_FINAL_RESULT: + switch (event) { + case EVENT_ABORT: + return Abort(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + case EVENT_AUDIO_DATA: + return DoNothing(event_args); + case EVENT_ENGINE_RESULT: + return ProcessFinalResult(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return Abort(event_args); + } + break; } + return NotFeasible(event_args); +} - // Check if we have waited too long without hearing any speech. - bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech(); - if (!speech_was_heard_after_packet && - num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) { - InformErrorAndAbortRecognition( - content::SPEECH_RECOGNITION_ERROR_NO_SPEECH); - return; +// ----------- Contract for all the FSM evolution functions below ------------- +// - Are guaranteed to be executed in the IO thread; +// - Are guaranteed to be not reentrant (themselves and each other); +// - event_args members are guaranteed to be stable during the call; +// - The class won't be freed in the meanwhile due to callbacks; +// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. + +// TODO(primiano) the audio pipeline is currently serial. However, the +// clipper->endpointer->vumeter chain and the sr_engine could be parallelized. 
+// We should profile the execution to see if it would be worth or not. +void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { + const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && + state_ <= STATE_RECOGNIZING; + const bool route_to_sr_engine = route_to_endpointer; + const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && + state_ <= STATE_RECOGNIZING; + const bool clip_detected = DetectClipping(raw_audio); + float rms = 0.0f; + + num_samples_recorded_ += raw_audio.NumSamples(); + + if (route_to_endpointer) + endpointer_.ProcessAudio(raw_audio, &rms); + + if (route_to_vumeter) { + DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. + UpdateSignalAndNoiseLevels(rms, clip_detected); } + if (route_to_sr_engine) { + DCHECK(recognition_engine_.get() != NULL); + recognition_engine_->TakeAudioChunk(raw_audio); + } +} - if (!speech_was_heard_before_packet && speech_was_heard_after_packet) - listener_->OnSoundStart(caller_id_); +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { + DCHECK(recognition_engine_.get() != NULL); + DCHECK(!IsCapturingAudio()); + AudioManager* audio_manager = (testing_audio_manager_ != NULL) ? + testing_audio_manager_ : + BrowserMainLoop::GetAudioManager(); + DCHECK(audio_manager != NULL); - // Calculate the input volume to display in the UI, smoothing towards the - // new level. 
- float level = (rms - kAudioMeterMinDb) / - (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); - level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); - if (level > audio_level_) { - audio_level_ += (level - audio_level_) * kUpSmoothingFactor; + DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; + num_samples_recorded_ = 0; + audio_level_ = 0; + listener_->OnRecognitionStart(caller_id_); + + if (!audio_manager->HasAudioInputDevices()) { + return AbortWithError(SpeechRecognitionError( + content::SPEECH_RECOGNITION_ERROR_AUDIO, + content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); + } + + if (audio_manager->IsRecordingInProcess()) { + return AbortWithError(SpeechRecognitionError( + content::SPEECH_RECOGNITION_ERROR_AUDIO, + content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); + } + + const int samples_per_packet = (kAudioSampleRate * + recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000; + AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, + kAudioSampleRate, kNumBitsPerAudioSample, + samples_per_packet); + audio_controller_ = AudioInputController::Create(audio_manager, this, params); + + if (audio_controller_.get() == NULL) { + return AbortWithError( + SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); + } + + // The endpointer needs to estimate the environment/background noise before + // starting to treat the audio as user input. We wait in the state + // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching + // to user input mode. + endpointer_.SetEnvironmentEstimationMode(); + audio_controller_->Record(); + return STATE_STARTING; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { + // This is the first audio packet captured, so the recognition engine is + // started and the delegate notified about the event. 
+ DCHECK(recognition_engine_.get() != NULL); + recognition_engine_->StartRecognition(); + listener_->OnAudioStart(caller_id_); + + // This is a little hack, since TakeAudioChunk() is already called by + // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping + // the first audio chunk captured after opening the audio device. + recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); + return STATE_ESTIMATING_ENVIRONMENT; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { + DCHECK(endpointer_.IsEstimatingEnvironment()); + if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { + endpointer_.SetUserInputMode(); + listener_->OnEnvironmentEstimationComplete(caller_id_); + return STATE_WAITING_FOR_SPEECH; } else { - audio_level_ += (level - audio_level_) * kDownSmoothingFactor; + return STATE_ESTIMATING_ENVIRONMENT; + } +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { + if (endpointer_.DidStartReceivingSpeech()) { + listener_->OnSoundStart(caller_id_); + return STATE_RECOGNIZING; + } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { + return AbortWithError( + SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH)); } + return STATE_WAITING_FOR_SPEECH; +} - float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / - (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); - noise_level = std::min(std::max(0.0f, noise_level), - kAudioMeterRangeMaxUnclipped); +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { + if (endpointer_.speech_input_complete()) { + return StopCaptureAndWaitForResult(event_args); + } + return STATE_RECOGNIZING; +} - listener_->OnAudioLevelsChange(caller_id_, did_clip ? 
1.0f : audio_level_, - noise_level); +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { + DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); - if (endpointer_.speech_input_complete()) - StopAudioCapture(); + DVLOG(1) << "Concluding recognition"; + CloseAudioControllerAsynchronously(); + recognition_engine_->AudioChunksEnded(); + + if (state_ > STATE_WAITING_FOR_SPEECH) + listener_->OnSoundEnd(caller_id_); + + listener_->OnAudioEnd(caller_id_); + return STATE_WAITING_FINAL_RESULT; } -void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) { + // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED in lack of + // other specific error sources (so that it was an explicit abort request). + // However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught by + // ChromeSpeechRecognitionManagerDelegate and would cause an exception. + // JS support will probably need it in future. + if (event_args.event == EVENT_AUDIO_ERROR) { + return AbortWithError( + SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO)); + } else if (event_args.event == EVENT_ENGINE_ERROR) { + return AbortWithError(event_args.engine_error); + } + return AbortWithError(NULL); +} + +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( + const SpeechRecognitionError& error) { + return AbortWithError(&error); +} + +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError( + const SpeechRecognitionError* error) { + if (IsCapturingAudio()) + CloseAudioControllerAsynchronously(); + + DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; + + // The recognition engine is initialized only after STATE_STARTING. 
+ if (state_ > STATE_STARTING) { + DCHECK(recognition_engine_.get() != NULL); + recognition_engine_->EndRecognition(); + } + + if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) + listener_->OnSoundEnd(caller_id_); + + if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) + listener_->OnAudioEnd(caller_id_); + + if (error != NULL) + listener_->OnRecognitionError(caller_id_, *error); -void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult( - const content::SpeechRecognitionResult& result) { - // Guard against the listener freeing us until we finish our job. - scoped_refptr<SpeechRecognizerImpl> me(this); - listener_->OnRecognitionResult(caller_id_, result); listener_->OnRecognitionEnd(caller_id_); + + return STATE_IDLE; } -void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( - const content::SpeechRecognitionError& error) { - InformErrorAndAbortRecognition(error.code); +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) { + // This is in preparation for future speech recognition functions. + NOTREACHED(); + return state_; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { + const SpeechRecognitionResult& result = event_args.engine_result; + DVLOG(1) << "Got valid result"; + recognition_engine_->EndRecognition(); + listener_->OnRecognitionResult(caller_id_, result); + listener_->OnRecognitionEnd(caller_id_); + return STATE_IDLE; } -void SpeechRecognizerImpl::InformErrorAndAbortRecognition( - content::SpeechRecognitionErrorCode error) { - DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE); - AbortRecognition(); +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { + return state_; // Just keep the current state. +} - // Guard against the listener freeing us until we finish our job. 
- scoped_refptr<SpeechRecognizerImpl> me(this); - listener_->OnRecognitionError(caller_id_, error); +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { + NOTREACHED() << "Unfeasible event " << event_args.event + << " in state " << state_; + return state_; } void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { - VLOG(1) << "SpeechRecognizer stopping record."; + DCHECK(IsCapturingAudio()); + DVLOG(1) << "SpeechRecognizerImpl stopping audio capture."; // Issues a Close on the audio controller, passing an empty callback. The only // purpose of such callback is to keep the audio controller refcounted until // Close has completed (in the audio thread) and automatically destroy it @@ -337,12 +592,30 @@ void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { audio_controller_ = NULL; // The controller is still refcounted by Bind. } -bool SpeechRecognizerImpl::IsActive() const { - return (recognition_engine_.get() != NULL); +int SpeechRecognizerImpl::GetElapsedTimeMs() const { + return (num_samples_recorded_ * 1000) / kAudioSampleRate; } -bool SpeechRecognizerImpl::IsCapturingAudio() const { - return (audio_controller_.get() != NULL); +void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, + bool clip_detected) { + // Calculate the input volume to display in the UI, smoothing towards the + // new level. + // TODO(primiano) Do we really need all this floating point arith here? + // Perhaps it might be quite expensive on mobile. + float level = (rms - kAudioMeterMinDb) / + (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); + level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); + const float smoothing_factor = (level > audio_level_) ? 
kUpSmoothingFactor : + kDownSmoothingFactor; + audio_level_ += (level - audio_level_) * smoothing_factor; + + float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / + (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); + noise_level = std::min(std::max(0.0f, noise_level), + kAudioMeterRangeMaxUnclipped); + + listener_->OnAudioLevelsChange( + caller_id_, clip_detected ? 1.0f : audio_level_, noise_level); } const SpeechRecognitionEngine& @@ -355,5 +628,14 @@ void SpeechRecognizerImpl::SetAudioManagerForTesting( testing_audio_manager_ = audio_manager; } +SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) + : event(event_value), + audio_error_code(0), + audio_data(NULL), + engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) { +} + +SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { +} } // namespace speech diff --git a/content/browser/speech/speech_recognizer_impl.h b/content/browser/speech/speech_recognizer_impl.h index 516dfea..a2cce74 100644 --- a/content/browser/speech/speech_recognizer_impl.h +++ b/content/browser/speech/speech_recognizer_impl.h @@ -12,6 +12,7 @@ #include "content/browser/speech/speech_recognition_engine.h" #include "content/public/browser/speech_recognizer.h" #include "content/public/common/speech_recognition_error.h" +#include "content/public/common/speech_recognition_result.h" #include "media/audio/audio_input_controller.h" #include "net/url_request/url_request_context_getter.h" @@ -27,8 +28,13 @@ class AudioManager; namespace speech { -// Records audio, sends recorded audio to server and translates server response -// to recognition result. +// TODO(primiano) Next CL: Remove the Impl suffix and the exported +// /content/public/browser/speech_recognizer.h interface since this class should +// not be visible outside (currently we need it for speech input extension API). 
+ +// Handles speech recognition for a session (identified by |caller_id|), taking +// care of audio capture, silence detection/endpointer and interaction with the +// SpeechRecognitionEngine. class CONTENT_EXPORT SpeechRecognizerImpl : public NON_EXPORTED_BASE(content::SpeechRecognizer), public media::AudioInputController::EventHandler, @@ -41,14 +47,9 @@ class CONTENT_EXPORT SpeechRecognizerImpl static const int kEndpointerEstimationTimeMs; SpeechRecognizerImpl( - content::SpeechRecognitionEventListener* listener, - int caller_id, - const std::string& language, - const std::string& grammar, - net::URLRequestContextGetter* context_getter, - bool filter_profanities, - const std::string& hardware_info, - const std::string& origin_url); + content::SpeechRecognitionEventListener* listener, + int caller_id, + SpeechRecognitionEngine* engine); virtual ~SpeechRecognizerImpl(); // content::SpeechRecognizer methods. @@ -59,14 +60,86 @@ class CONTENT_EXPORT SpeechRecognizerImpl virtual bool IsCapturingAudio() const OVERRIDE; const SpeechRecognitionEngine& recognition_engine() const; + private: + friend class SpeechRecognizerImplTest; + + enum FSMState { + STATE_IDLE = 0, + STATE_STARTING, + STATE_ESTIMATING_ENVIRONMENT, + STATE_WAITING_FOR_SPEECH, + STATE_RECOGNIZING, + STATE_WAITING_FINAL_RESULT, + STATE_MAX_VALUE = STATE_WAITING_FINAL_RESULT + }; + + enum FSMEvent { + EVENT_ABORT = 0, + EVENT_START, + EVENT_STOP_CAPTURE, + EVENT_AUDIO_DATA, + EVENT_ENGINE_RESULT, + EVENT_ENGINE_ERROR, + EVENT_AUDIO_ERROR, + EVENT_MAX_VALUE = EVENT_AUDIO_ERROR + }; + + struct FSMEventArgs { + explicit FSMEventArgs(FSMEvent event_value); + ~FSMEventArgs(); + + FSMEvent event; + int audio_error_code; + scoped_refptr<AudioChunk> audio_data; + content::SpeechRecognitionResult engine_result; + content::SpeechRecognitionError engine_error; + }; + + // Entry point for pushing any new external event into the recognizer FSM. 
+ void DispatchEvent(const FSMEventArgs& event_args); + + // Defines the behavior of the recognizer FSM, selecting the appropriate + // transition according to the current state and event. + FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args); + + // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc). + void ProcessAudioPipeline(const AudioChunk& raw_audio); + + // The methods below handle transitions of the recognizer FSM. + FSMState StartRecording(const FSMEventArgs& event_args); + FSMState StartRecognitionEngine(const FSMEventArgs& event_args); + FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args); + FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args); + FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args); + FSMState ProcessIntermediateResult(const FSMEventArgs& event_args); + FSMState ProcessFinalResult(const FSMEventArgs& event_args); + FSMState Abort(const FSMEventArgs& event_args); + FSMState AbortWithError(const content::SpeechRecognitionError* error); + FSMState AbortWithError(const content::SpeechRecognitionError& error); + FSMState DetectEndOfSpeech(const FSMEventArgs& event_args); + FSMState DoNothing(const FSMEventArgs& event_args) const; + FSMState NotFeasible(const FSMEventArgs& event_args); + + // Returns the time span of captured audio samples since the start of capture. + int GetElapsedTimeMs() const; + + // Calculates the input volume to be displayed in the UI, triggering the + // OnAudioLevelsChange event accordingly. + void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected); + + void CloseAudioControllerAsynchronously(); + void SetAudioManagerForTesting(media::AudioManager* audio_manager); + + // Callback called on IO thread by audio_controller->Close(). + void OnAudioClosed(media::AudioInputController*); + // AudioInputController::EventHandler methods. 
virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {} virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {} virtual void OnError(media::AudioInputController* controller, int error_code) OVERRIDE; virtual void OnData(media::AudioInputController* controller, - const uint8* data, - uint32 size) OVERRIDE; + const uint8* data, uint32 size) OVERRIDE; // SpeechRecognitionEngineDelegate methods. virtual void OnSpeechRecognitionEngineResult( @@ -74,40 +147,16 @@ class CONTENT_EXPORT SpeechRecognizerImpl virtual void OnSpeechRecognitionEngineError( const content::SpeechRecognitionError& error) OVERRIDE; - private: - friend class SpeechRecognizerImplTest; - - void InformErrorAndAbortRecognition( - content::SpeechRecognitionErrorCode error); - void SendRecordedAudioToServer(); - - void HandleOnError(int error_code); // Handles OnError in the IO thread. - - // Handles OnData in the IO thread. - void HandleOnData(scoped_refptr<AudioChunk> raw_audio); - - void OnAudioClosed(media::AudioInputController*); - - // Helper method which closes the audio controller and frees it asynchronously - // without blocking the IO thread. 
- void CloseAudioControllerAsynchronously(); - - void SetAudioManagerForTesting(media::AudioManager* audio_manager); - content::SpeechRecognitionEventListener* listener_; media::AudioManager* testing_audio_manager_; scoped_ptr<SpeechRecognitionEngine> recognition_engine_; Endpointer endpointer_; scoped_refptr<media::AudioInputController> audio_controller_; - scoped_refptr<net::URLRequestContextGetter> context_getter_; int caller_id_; - std::string language_; - std::string grammar_; - bool filter_profanities_; - std::string hardware_info_; - std::string origin_url_; int num_samples_recorded_; float audio_level_; + bool is_dispatching_event_; + FSMState state_; DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl); }; diff --git a/content/browser/speech/speech_recognizer_impl_unittest.cc b/content/browser/speech/speech_recognizer_impl_unittest.cc index 01b7e4c..5dbe6cc 100644 --- a/content/browser/speech/speech_recognizer_impl_unittest.cc +++ b/content/browser/speech/speech_recognizer_impl_unittest.cc @@ -17,6 +17,7 @@ #include "net/url_request/url_request_status.h" #include "testing/gtest/include/gtest/gtest.h" +using base::MessageLoopProxy; using content::BrowserThread; using content::BrowserThreadImpl; using media::AudioInputController; @@ -97,16 +98,28 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener, SpeechRecognizerImplTest() : io_thread_(BrowserThread::IO, &message_loop_), audio_manager_(new MockAudioManager()), - audio_ended_(false), + recognition_started_(false), recognition_ended_(false), result_received_(false), audio_started_(false), + audio_ended_(false), + sound_started_(false), + sound_ended_(false), error_(content::SPEECH_RECOGNITION_ERROR_NONE), volume_(-1.0f) { - recognizer_ = new SpeechRecognizerImpl( - this, 1, std::string(), std::string(), NULL, false, std::string(), - std::string()); + // SpeechRecognizerImpl takes ownership of sr_engine. 
+ GoogleOneShotRemoteEngine* sr_engine = + new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */); + GoogleOneShotRemoteEngineConfig config; + config.audio_num_bits_per_sample = + SpeechRecognizerImpl::kNumBitsPerAudioSample; + config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate; + config.filter_profanities = false; + sr_engine->SetConfig(config); + + recognizer_ = new SpeechRecognizerImpl(this, 1, sr_engine); recognizer_->SetAudioManagerForTesting(audio_manager_.get()); + int audio_packet_length_bytes = (SpeechRecognizerImpl::kAudioSampleRate * GoogleOneShotRemoteEngine::kAudioPacketIntervalMs * @@ -115,13 +128,33 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener, audio_packet_.resize(audio_packet_length_bytes); } + void CheckEventsConsistency() { + // Note: "!x || y" == "x implies y". + EXPECT_TRUE(!recognition_ended_ || recognition_started_); + EXPECT_TRUE(!audio_ended_ || audio_started_); + EXPECT_TRUE(!sound_ended_ || sound_started_); + EXPECT_TRUE(!audio_started_ || recognition_started_); + EXPECT_TRUE(!sound_started_ || audio_started_); + EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_)); + EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_)); + } + + void CheckFinalEventsConsistency() { + // Note: "!(x ^ y)" == "(x && y) || (!x && !y)".
+ EXPECT_FALSE(recognition_started_ ^ recognition_ended_); + EXPECT_FALSE(audio_started_ ^ audio_ended_); + EXPECT_FALSE(sound_started_ ^ sound_ended_); + } + // Overridden from content::SpeechRecognitionEventListener: virtual void OnAudioStart(int caller_id) OVERRIDE { audio_started_ = true; + CheckEventsConsistency(); } virtual void OnAudioEnd(int caller_id) OVERRIDE { audio_ended_ = true; + CheckEventsConsistency(); } virtual void OnRecognitionResult( @@ -130,8 +163,9 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener, } virtual void OnRecognitionError( - int caller_id, - const content::SpeechRecognitionError& error) OVERRIDE { + int caller_id, const content::SpeechRecognitionError& error) OVERRIDE { + EXPECT_TRUE(recognition_started_); + EXPECT_FALSE(recognition_ended_); error_ = error.code; } @@ -143,12 +177,25 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener, virtual void OnRecognitionEnd(int caller_id) OVERRIDE { recognition_ended_ = true; + CheckEventsConsistency(); + } + + virtual void OnRecognitionStart(int caller_id) OVERRIDE { + recognition_started_ = true; + CheckEventsConsistency(); } - virtual void OnRecognitionStart(int caller_id) OVERRIDE {} virtual void OnEnvironmentEstimationComplete(int caller_id) OVERRIDE {} - virtual void OnSoundStart(int caller_id) OVERRIDE {} - virtual void OnSoundEnd(int caller_id) OVERRIDE {} + + virtual void OnSoundStart(int caller_id) OVERRIDE { + sound_started_ = true; + CheckEventsConsistency(); + } + + virtual void OnSoundEnd(int caller_id) OVERRIDE { + sound_ended_ = true; + CheckEventsConsistency(); + } // testing::Test methods. 
virtual void SetUp() OVERRIDE { @@ -180,10 +227,13 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener, BrowserThreadImpl io_thread_; scoped_refptr<SpeechRecognizerImpl> recognizer_; scoped_ptr<AudioManager> audio_manager_; - bool audio_ended_; + bool recognition_started_; bool recognition_ended_; bool result_received_; bool audio_started_; + bool audio_ended_; + bool sound_started_; + bool sound_ended_; content::SpeechRecognitionErrorCode error_; TestURLFetcherFactory url_fetcher_factory_; TestAudioInputControllerFactory audio_input_controller_factory_; @@ -196,11 +246,12 @@ TEST_F(SpeechRecognizerImplTest, StopNoData) { // Check for callbacks when stopping record before any audio gets recorded. recognizer_->StartRecognition(); recognizer_->AbortRecognition(); - EXPECT_FALSE(audio_ended_); - EXPECT_FALSE(recognition_ended_); - EXPECT_FALSE(result_received_); + MessageLoop::current()->RunAllPending(); + EXPECT_TRUE(recognition_started_); EXPECT_FALSE(audio_started_); + EXPECT_FALSE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, CancelNoData) { @@ -208,17 +259,19 @@ TEST_F(SpeechRecognizerImplTest, CancelNoData) { // recorded. recognizer_->StartRecognition(); recognizer_->StopAudioCapture(); - EXPECT_TRUE(audio_ended_); - EXPECT_TRUE(recognition_ended_); - EXPECT_FALSE(result_received_); + MessageLoop::current()->RunAllPending(); + EXPECT_TRUE(recognition_started_); EXPECT_FALSE(audio_started_); + EXPECT_FALSE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, StopWithData) { // Start recording, give some data and then stop. This should wait for the // network callback to arrive before completion. 
recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); @@ -238,6 +291,7 @@ TEST_F(SpeechRecognizerImplTest, StopWithData) { } recognizer_->StopAudioCapture(); + MessageLoop::current()->RunAllPending(); EXPECT_TRUE(audio_started_); EXPECT_TRUE(audio_ended_); EXPECT_FALSE(recognition_ended_); @@ -256,16 +310,17 @@ TEST_F(SpeechRecognizerImplTest, StopWithData) { fetcher->SetResponseString( "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}"); fetcher->delegate()->OnURLFetchComplete(fetcher); - + MessageLoop::current()->RunAllPending(); EXPECT_TRUE(recognition_ended_); EXPECT_TRUE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, CancelWithData) { - // Start recording, give some data and then cancel. This should create - // a network request but give no callbacks. + // Start recording, give some data and then cancel. recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); @@ -273,18 +328,20 @@ TEST_F(SpeechRecognizerImplTest, CancelWithData) { audio_packet_.size()); MessageLoop::current()->RunAllPending(); recognizer_->AbortRecognition(); + MessageLoop::current()->RunAllPending(); ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); + EXPECT_TRUE(recognition_started_); EXPECT_TRUE(audio_started_); - EXPECT_FALSE(audio_ended_); - EXPECT_FALSE(recognition_ended_); EXPECT_FALSE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, ConnectionError) { // Start recording, give some data and then stop. 
Issue the network callback // with a connection error and verify that the recognizer bubbles the error up recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); @@ -295,6 +352,7 @@ TEST_F(SpeechRecognizerImplTest, ConnectionError) { ASSERT_TRUE(fetcher); recognizer_->StopAudioCapture(); + MessageLoop::current()->RunAllPending(); EXPECT_TRUE(audio_started_); EXPECT_TRUE(audio_ended_); EXPECT_FALSE(recognition_ended_); @@ -310,16 +368,18 @@ TEST_F(SpeechRecognizerImplTest, ConnectionError) { fetcher->set_response_code(0); fetcher->SetResponseString(""); fetcher->delegate()->OnURLFetchComplete(fetcher); - - EXPECT_FALSE(recognition_ended_); + MessageLoop::current()->RunAllPending(); + EXPECT_TRUE(recognition_ended_); EXPECT_FALSE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, ServerError) { // Start recording, give some data and then stop. 
Issue the network callback // with a 500 error and verify that the recognizer bubbles the error up recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); @@ -330,6 +390,7 @@ TEST_F(SpeechRecognizerImplTest, ServerError) { ASSERT_TRUE(fetcher); recognizer_->StopAudioCapture(); + MessageLoop::current()->RunAllPending(); EXPECT_TRUE(audio_started_); EXPECT_TRUE(audio_ended_); EXPECT_FALSE(recognition_ended_); @@ -344,31 +405,34 @@ TEST_F(SpeechRecognizerImplTest, ServerError) { fetcher->set_response_code(500); fetcher->SetResponseString("Internal Server Error"); fetcher->delegate()->OnURLFetchComplete(fetcher); - - EXPECT_FALSE(recognition_ended_); + MessageLoop::current()->RunAllPending(); + EXPECT_TRUE(recognition_ended_); EXPECT_FALSE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) { // Check if things tear down properly if AudioInputController threw an error. recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); controller->event_handler()->OnError(controller, 0); MessageLoop::current()->RunAllPending(); + EXPECT_TRUE(recognition_started_); EXPECT_FALSE(audio_started_); - EXPECT_FALSE(audio_ended_); - EXPECT_FALSE(recognition_ended_); EXPECT_FALSE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_AUDIO, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) { // Check if things tear down properly if AudioInputController threw an error // after giving some audio data. 
recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); @@ -377,36 +441,35 @@ TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) { controller->event_handler()->OnError(controller, 0); MessageLoop::current()->RunAllPending(); ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); + EXPECT_TRUE(recognition_started_); EXPECT_TRUE(audio_started_); - EXPECT_FALSE(audio_ended_); - EXPECT_FALSE(recognition_ended_); EXPECT_FALSE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_AUDIO, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) { // Start recording and give a lot of packets with audio samples set to zero. // This should trigger the no-speech detector and issue a callback. recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); - controller = audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) / - GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; + GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1; // The vector is already filled with zero value samples on create. 
for (int i = 0; i < num_packets; ++i) { controller->event_handler()->OnData(controller, &audio_packet_[0], audio_packet_.size()); } MessageLoop::current()->RunAllPending(); + EXPECT_TRUE(recognition_started_); EXPECT_TRUE(audio_started_); - EXPECT_FALSE(audio_ended_); - EXPECT_FALSE(recognition_ended_); EXPECT_FALSE(result_received_); EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) { @@ -415,6 +478,7 @@ TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) { // treated as normal speech input and the no-speech detector should not get // triggered. recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); @@ -442,6 +506,8 @@ TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) { EXPECT_FALSE(audio_ended_); EXPECT_FALSE(recognition_ended_); recognizer_->AbortRecognition(); + MessageLoop::current()->RunAllPending(); + CheckFinalEventsConsistency(); } TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) { @@ -450,6 +516,7 @@ TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) { // get the callback during estimation phase, then get zero for the silence // samples and proper volume for the loud audio. recognizer_->StartRecognition(); + MessageLoop::current()->RunAllPending(); TestAudioInputController* controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); @@ -484,6 +551,8 @@ TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) { EXPECT_FALSE(audio_ended_); EXPECT_FALSE(recognition_ended_); recognizer_->AbortRecognition(); + MessageLoop::current()->RunAllPending(); + CheckFinalEventsConsistency(); } } // namespace speech |