From ce1adc3482a91262df06fbe9a824d29817a8771d Mon Sep 17 00:00:00 2001 From: "janx@chromium.org" Date: Mon, 20 May 2013 13:35:43 +0000 Subject: Extract interface from content::SpeechRecognizer SpeechRecognizer's current design assumes that the audio capture and the endpointer are always performed inside the browser. This is not going to be true for some platforms, for instance Android, where we plan to delegate not only the recognition activity, but also the audio capture and the endpointer, to the OS. TBR=avi@chromium.org (gypi) BUG=222352 Review URL: https://chromiumcodereview.appspot.com/15230003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@201082 0039d316-1c4b-4281-b951-d872f2087c98 --- .../speech/speech_recognition_manager_impl.cc | 11 +- content/browser/speech/speech_recognizer.cc | 666 --------------------- content/browser/speech/speech_recognizer.h | 142 +---- content/browser/speech/speech_recognizer_impl.cc | 662 ++++++++++++++++++++ content/browser/speech/speech_recognizer_impl.h | 156 +++++ .../speech/speech_recognizer_impl_unittest.cc | 498 +++++++++++++++ .../browser/speech/speech_recognizer_unittest.cc | 497 --------------- 7 files changed, 1334 insertions(+), 1298 deletions(-) delete mode 100644 content/browser/speech/speech_recognizer.cc create mode 100644 content/browser/speech/speech_recognizer_impl.cc create mode 100644 content/browser/speech/speech_recognizer_impl.h create mode 100644 content/browser/speech/speech_recognizer_impl_unittest.cc delete mode 100644 content/browser/speech/speech_recognizer_unittest.cc diff --git a/content/browser/speech/speech_recognition_manager_impl.cc b/content/browser/speech/speech_recognition_manager_impl.cc index ea49579..f1dca14 100644 --- a/content/browser/speech/speech_recognition_manager_impl.cc +++ b/content/browser/speech/speech_recognition_manager_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -10,7 +10,7 @@ #include "content/browser/speech/google_one_shot_remote_engine.h" #include "content/browser/speech/google_streaming_remote_engine.h" #include "content/browser/speech/speech_recognition_engine.h" -#include "content/browser/speech/speech_recognizer.h" +#include "content/browser/speech/speech_recognizer_impl.h" #include "content/public/browser/browser_thread.h" #include "content/public/browser/content_browser_client.h" #include "content/public/browser/resource_context.h" @@ -92,9 +92,10 @@ int SpeechRecognitionManagerImpl::CreateSession( SpeechRecognitionEngineConfig remote_engine_config; remote_engine_config.language = config.language; remote_engine_config.grammars = config.grammars; - remote_engine_config.audio_sample_rate = SpeechRecognizer::kAudioSampleRate; + remote_engine_config.audio_sample_rate = + SpeechRecognizerImpl::kAudioSampleRate; remote_engine_config.audio_num_bits_per_sample = - SpeechRecognizer::kNumBitsPerAudioSample; + SpeechRecognizerImpl::kNumBitsPerAudioSample; remote_engine_config.filter_profanities = config.filter_profanities; remote_engine_config.continuous = config.continuous; remote_engine_config.interim_results = config.interim_results; @@ -117,7 +118,7 @@ int SpeechRecognitionManagerImpl::CreateSession( // The legacy api cannot use continuous mode.
DCHECK(!config.is_legacy_api || !config.continuous); - session.recognizer = new SpeechRecognizer( + session.recognizer = new SpeechRecognizerImpl( this, session_id, !config.continuous, diff --git a/content/browser/speech/speech_recognizer.cc b/content/browser/speech/speech_recognizer.cc deleted file mode 100644 index 62c1b35..0000000 --- a/content/browser/speech/speech_recognizer.cc +++ /dev/null @@ -1,666 +0,0 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "content/browser/speech/speech_recognizer.h" - -#include "base/basictypes.h" -#include "base/bind.h" -#include "base/time.h" -#include "content/browser/browser_main_loop.h" -#include "content/browser/speech/audio_buffer.h" -#include "content/browser/speech/google_one_shot_remote_engine.h" -#include "content/public/browser/browser_thread.h" -#include "content/public/browser/speech_recognition_event_listener.h" -#include "content/public/common/speech_recognition_error.h" -#include "content/public/common/speech_recognition_grammar.h" -#include "content/public/common/speech_recognition_result.h" -#include "net/url_request/url_request_context_getter.h" - -using media::AudioInputController; -using media::AudioManager; -using media::AudioParameters; -using media::ChannelLayout; - -namespace content { -namespace { - -// The following constants are related to the volume level indicator shown in -// the UI for recorded audio. -// Multiplier used when new volume is greater than previous level. -const float kUpSmoothingFactor = 1.0f; -// Multiplier used when new volume is lesser than previous level. -const float kDownSmoothingFactor = 0.7f; -// RMS dB value of a maximum (unclipped) sine wave for int16 samples. -const float kAudioMeterMaxDb = 90.31f; -// This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0. -// Values lower than this will display as empty level-meter. -const float kAudioMeterMinDb = 30.0f; -const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; - -// Maximum level to draw to display unclipped meter. (1.0f displays clipping.) -const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; - -// Returns true if more than 5% of the samples are at min or max value. 
-bool DetectClipping(const AudioChunk& chunk) { - const int num_samples = chunk.NumSamples(); - const int16* samples = chunk.SamplesData16(); - const int kThreshold = num_samples / 20; - int clipping_samples = 0; - - for (int i = 0; i < num_samples; ++i) { - if (samples[i] <= -32767 || samples[i] >= 32767) { - if (++clipping_samples > kThreshold) - return true; - } - } - return false; -} - -void KeepAudioControllerRefcountedForDtor(scoped_refptr<media::AudioInputController>) { -} - -} // namespace - -const int SpeechRecognizer::kAudioSampleRate = 16000; -const ChannelLayout SpeechRecognizer::kChannelLayout = - media::CHANNEL_LAYOUT_MONO; -const int SpeechRecognizer::kNumBitsPerAudioSample = 16; -const int SpeechRecognizer::kNoSpeechTimeoutMs = 8000; -const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; -media::AudioManager* SpeechRecognizer::audio_manager_for_tests_ = NULL; - -COMPILE_ASSERT(SpeechRecognizer::kNumBitsPerAudioSample % 8 == 0, - kNumBitsPerAudioSample_must_be_a_multiple_of_8); - -SpeechRecognizer::SpeechRecognizer( - SpeechRecognitionEventListener* listener, - int session_id, - bool is_single_shot, - SpeechRecognitionEngine* engine) - : listener_(listener), - recognition_engine_(engine), - endpointer_(kAudioSampleRate), - session_id_(session_id), - is_dispatching_event_(false), - is_single_shot_(is_single_shot), - state_(STATE_IDLE) { - DCHECK(listener_ != NULL); - DCHECK(recognition_engine_ != NULL); - if (is_single_shot) { - // In single shot recognition, the session is automatically ended after: - // - 0.5 seconds of silence if time < 3 seconds - // - 1 second of silence if time >= 3 seconds - endpointer_.set_speech_input_complete_silence_length( - base::Time::kMicrosecondsPerSecond / 2); - endpointer_.set_long_speech_input_complete_silence_length( - base::Time::kMicrosecondsPerSecond); - endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); - } else { - // In continuous recognition, the session is automatically ended after 15 - // seconds of silence. - const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15; - endpointer_.set_speech_input_complete_silence_length(cont_timeout_us); - endpointer_.set_long_speech_length(0); // Use only a single timeout. - } - endpointer_.StartSession(); - recognition_engine_->set_delegate(this); -} - -// ------- Methods that trigger Finite State Machine (FSM) events ------------ - -// NOTE: all the external events and requests should be enqueued (PostTask), even -// if they come from the same (IO) thread, in order to preserve the relationship -// of causality between events and avoid interleaved event processing due to -// synchronous callbacks. - -void SpeechRecognizer::StartRecognition() { - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, - this, FSMEventArgs(EVENT_START))); -} - -void SpeechRecognizer::AbortRecognition() { - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, - this, FSMEventArgs(EVENT_ABORT))); -} - -void SpeechRecognizer::StopAudioCapture() { - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, - this, FSMEventArgs(EVENT_STOP_CAPTURE))); -} - -bool SpeechRecognizer::IsActive() const { - // Checking the FSM state from another thread (thus, while the FSM is - // potentially concurrently evolving) is meaningless.
- DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - return state_ != STATE_IDLE; -} - -bool SpeechRecognizer::IsCapturingAudio() const { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). - const bool is_capturing_audio = state_ >= STATE_STARTING && - state_ <= STATE_RECOGNIZING; - DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || - (!is_capturing_audio && audio_controller_.get() == NULL)); - return is_capturing_audio; -} - -const SpeechRecognitionEngine& -SpeechRecognizer::recognition_engine() const { - return *(recognition_engine_.get()); -} - -SpeechRecognizer::~SpeechRecognizer() { - endpointer_.EndSession(); - if (audio_controller_) { - audio_controller_->Close(base::Bind(&KeepAudioControllerRefcountedForDtor, - audio_controller_)); - } -} - -// Invoked in the audio thread. -void SpeechRecognizer::OnError(AudioInputController* controller) { - FSMEventArgs event_args(EVENT_AUDIO_ERROR); - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, - this, event_args)); -} - -void SpeechRecognizer::OnData(AudioInputController* controller, - const uint8* data, uint32 size) { - if (size == 0) // This could happen when audio capture stops and is normal. - return; - - FSMEventArgs event_args(EVENT_AUDIO_DATA); - event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size), - kNumBitsPerAudioSample / 8); - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, - this, event_args)); -} - -void SpeechRecognizer::OnAudioClosed(AudioInputController*) {} - -void SpeechRecognizer::OnSpeechRecognitionEngineResults( - const SpeechRecognitionResults& results) { - FSMEventArgs event_args(EVENT_ENGINE_RESULT); - event_args.engine_results = results; - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, - this, event_args)); -} - -void SpeechRecognizer::OnSpeechRecognitionEngineError( - const SpeechRecognitionError& error) { - FSMEventArgs event_args(EVENT_ENGINE_ERROR); - event_args.engine_error = error; - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, - this, event_args)); -} - -// ----------------------- Core FSM implementation --------------------------- -// TODO(primiano): After the changes in the media package (r129173), this class -// slightly violates the SpeechRecognitionEventListener interface contract. In -// particular, it is not true anymore that this class can be freed after the -// OnRecognitionEnd event, since the audio_controller_.Close() asynchronous -// call can still be in progress after the end event. Currently, it does not -// represent a problem for the browser itself, since refcounting protects us -// against such race conditions. However, we should fix this in the next CLs. -// For instance, tests are currently working just because the -// TestAudioInputController is not closing asynchronously as the real controller -// does, but they will become flaky if TestAudioInputController is fixed. - -void SpeechRecognizer::DispatchEvent(const FSMEventArgs& event_args) { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - DCHECK_LE(event_args.event, EVENT_MAX_VALUE); - DCHECK_LE(state_, STATE_MAX_VALUE); - - // Event dispatching must be sequential, otherwise it will break all the rules - // and the assumptions of the finite state automata model.
- DCHECK(!is_dispatching_event_); - is_dispatching_event_ = true; - - // Guard against the delegate freeing us until we finish processing the event. - scoped_refptr<SpeechRecognizer> me(this); - - if (event_args.event == EVENT_AUDIO_DATA) { - DCHECK(event_args.audio_data.get() != NULL); - ProcessAudioPipeline(*event_args.audio_data); - } - - // The audio pipeline must be processed before the event dispatch, otherwise - // it would take actions according to the future state instead of the current. - state_ = ExecuteTransitionAndGetNextState(event_args); - is_dispatching_event_ = false; -} - -SpeechRecognizer::FSMState -SpeechRecognizer::ExecuteTransitionAndGetNextState( - const FSMEventArgs& event_args) { - const FSMEvent event = event_args.event; - switch (state_) { - case STATE_IDLE: - switch (event) { - // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and - // EVENT_STOP_CAPTURE below once speech input extensions are fixed. - case EVENT_ABORT: - return AbortSilently(event_args); - case EVENT_START: - return StartRecording(event_args); - case EVENT_STOP_CAPTURE: - return AbortSilently(event_args); - case EVENT_AUDIO_DATA: // Corner cases related to queued messages - case EVENT_ENGINE_RESULT: // being dispatched late. - case EVENT_ENGINE_ERROR: - case EVENT_AUDIO_ERROR: - return DoNothing(event_args); - } - break; - case STATE_STARTING: - switch (event) { - case EVENT_ABORT: - return AbortWithError(event_args); - case EVENT_START: - return NotFeasible(event_args); - case EVENT_STOP_CAPTURE: - return AbortSilently(event_args); - case EVENT_AUDIO_DATA: - return StartRecognitionEngine(event_args); - case EVENT_ENGINE_RESULT: - return NotFeasible(event_args); - case EVENT_ENGINE_ERROR: - case EVENT_AUDIO_ERROR: - return AbortWithError(event_args); - } - break; - case STATE_ESTIMATING_ENVIRONMENT: - switch (event) { - case EVENT_ABORT: - return AbortWithError(event_args); - case EVENT_START: - return NotFeasible(event_args); - case EVENT_STOP_CAPTURE: - return StopCaptureAndWaitForResult(event_args); - case EVENT_AUDIO_DATA: - return WaitEnvironmentEstimationCompletion(event_args); - case EVENT_ENGINE_RESULT: - return ProcessIntermediateResult(event_args); - case EVENT_ENGINE_ERROR: - case EVENT_AUDIO_ERROR: - return AbortWithError(event_args); - } - break; - case STATE_WAITING_FOR_SPEECH: - switch (event) { - case EVENT_ABORT: - return AbortWithError(event_args); - case EVENT_START: - return NotFeasible(event_args); - case EVENT_STOP_CAPTURE: - return StopCaptureAndWaitForResult(event_args); - case EVENT_AUDIO_DATA: - return DetectUserSpeechOrTimeout(event_args); - case EVENT_ENGINE_RESULT: - return ProcessIntermediateResult(event_args); - case EVENT_ENGINE_ERROR: - case EVENT_AUDIO_ERROR: - return AbortWithError(event_args); - } - break; - case STATE_RECOGNIZING: - switch (event) { - case EVENT_ABORT: - return AbortWithError(event_args); - case EVENT_START: - return NotFeasible(event_args); - case EVENT_STOP_CAPTURE: - return StopCaptureAndWaitForResult(event_args); - case EVENT_AUDIO_DATA: - return DetectEndOfSpeech(event_args); - case EVENT_ENGINE_RESULT: - return ProcessIntermediateResult(event_args); - case EVENT_ENGINE_ERROR: - case EVENT_AUDIO_ERROR: - return AbortWithError(event_args); - } - break; - case STATE_WAITING_FINAL_RESULT: - switch (event) { - case EVENT_ABORT: - return AbortWithError(event_args); - case EVENT_START: - return NotFeasible(event_args); - case EVENT_STOP_CAPTURE: - case EVENT_AUDIO_DATA: - return DoNothing(event_args); - case EVENT_ENGINE_RESULT: - return
ProcessFinalResult(event_args); - case EVENT_ENGINE_ERROR: - case EVENT_AUDIO_ERROR: - return AbortWithError(event_args); - } - break; - } - return NotFeasible(event_args); -} - -// ----------- Contract for all the FSM evolution functions below ------------- -// - Are guaranteed to be executed in the IO thread; -// - Are guaranteed to be not reentrant (themselves and each other); -// - event_args members are guaranteed to be stable during the call; -// - The class won't be freed in the meanwhile due to callbacks; -// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. - -// TODO(primiano): the audio pipeline is currently serial. However, the -// clipper->endpointer->vumeter chain and the sr_engine could be parallelized. -// We should profile the execution to see if it would be worth or not. -void SpeechRecognizer::ProcessAudioPipeline(const AudioChunk& raw_audio) { - const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && - state_ <= STATE_RECOGNIZING; - const bool route_to_sr_engine = route_to_endpointer; - const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && - state_ <= STATE_RECOGNIZING; - const bool clip_detected = DetectClipping(raw_audio); - float rms = 0.0f; - - num_samples_recorded_ += raw_audio.NumSamples(); - - if (route_to_endpointer) - endpointer_.ProcessAudio(raw_audio, &rms); - - if (route_to_vumeter) { - DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. - UpdateSignalAndNoiseLevels(rms, clip_detected); - } - if (route_to_sr_engine) { - DCHECK(recognition_engine_.get() != NULL); - recognition_engine_->TakeAudioChunk(raw_audio); - } -} - -SpeechRecognizer::FSMState -SpeechRecognizer::StartRecording(const FSMEventArgs&) { - DCHECK(recognition_engine_.get() != NULL); - DCHECK(!IsCapturingAudio()); - AudioManager* audio_manager = (audio_manager_for_tests_ != NULL) ? - audio_manager_for_tests_ : - BrowserMainLoop::GetAudioManager(); - DCHECK(audio_manager != NULL); - - DVLOG(1) << "SpeechRecognizer starting audio capture."; - num_samples_recorded_ = 0; - audio_level_ = 0; - listener_->OnRecognitionStart(session_id_); - - if (!audio_manager->HasAudioInputDevices()) { - return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO, - SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); - } - - if (audio_manager->IsRecordingInProcess()) { - return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO, - SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); - } - - const int samples_per_packet = (kAudioSampleRate * - recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000; - AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, - kAudioSampleRate, kNumBitsPerAudioSample, - samples_per_packet); - audio_controller_ = AudioInputController::Create(audio_manager, this, params); - - if (audio_controller_.get() == NULL) { - return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO)); - } - - // The endpointer needs to estimate the environment/background noise before - // starting to treat the audio as user input. We wait in the state - // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching - // to user input mode. - endpointer_.SetEnvironmentEstimationMode(); - audio_controller_->Record(); - return STATE_STARTING; -} - -SpeechRecognizer::FSMState -SpeechRecognizer::StartRecognitionEngine(const FSMEventArgs& event_args) { - // This is the first audio packet captured, so the recognition engine is - // started and the delegate notified about the event. 
- DCHECK(recognition_engine_.get() != NULL); - recognition_engine_->StartRecognition(); - listener_->OnAudioStart(session_id_); - - // This is a little hack, since TakeAudioChunk() is already called by - // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping - // the first audio chunk captured after opening the audio device. - recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); - return STATE_ESTIMATING_ENVIRONMENT; -} - -SpeechRecognizer::FSMState -SpeechRecognizer::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { - DCHECK(endpointer_.IsEstimatingEnvironment()); - if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { - endpointer_.SetUserInputMode(); - listener_->OnEnvironmentEstimationComplete(session_id_); - return STATE_WAITING_FOR_SPEECH; - } else { - return STATE_ESTIMATING_ENVIRONMENT; - } -} - -SpeechRecognizer::FSMState -SpeechRecognizer::DetectUserSpeechOrTimeout(const FSMEventArgs&) { - if (endpointer_.DidStartReceivingSpeech()) { - listener_->OnSoundStart(session_id_); - return STATE_RECOGNIZING; - } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { - return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH)); - } - return STATE_WAITING_FOR_SPEECH; -} - -SpeechRecognizer::FSMState -SpeechRecognizer::DetectEndOfSpeech(const FSMEventArgs& event_args) { - if (endpointer_.speech_input_complete()) - return StopCaptureAndWaitForResult(event_args); - return STATE_RECOGNIZING; -} - -SpeechRecognizer::FSMState -SpeechRecognizer::StopCaptureAndWaitForResult(const FSMEventArgs&) { - DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); - - DVLOG(1) << "Concluding recognition"; - CloseAudioControllerAsynchronously(); - recognition_engine_->AudioChunksEnded(); - - if (state_ > STATE_WAITING_FOR_SPEECH) - listener_->OnSoundEnd(session_id_); - - listener_->OnAudioEnd(session_id_); - return STATE_WAITING_FINAL_RESULT; -} - -SpeechRecognizer::FSMState -SpeechRecognizer::AbortSilently(const FSMEventArgs& event_args) { - DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR); - DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR); - return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE)); -} - -SpeechRecognizer::FSMState -SpeechRecognizer::AbortWithError(const FSMEventArgs& event_args) { - if (event_args.event == EVENT_AUDIO_ERROR) { - return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO)); - } else if (event_args.event == EVENT_ENGINE_ERROR) { - return Abort(event_args.engine_error); - } - return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); -} - -SpeechRecognizer::FSMState SpeechRecognizer::Abort( - const SpeechRecognitionError& error) { - if (IsCapturingAudio()) - CloseAudioControllerAsynchronously(); - - DVLOG(1) << "SpeechRecognizer canceling recognition. "; - - // The recognition engine is initialized only after STATE_STARTING. 
- if (state_ > STATE_STARTING) { - DCHECK(recognition_engine_.get() != NULL); - recognition_engine_->EndRecognition(); - } - - if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) - listener_->OnSoundEnd(session_id_); - - if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) - listener_->OnAudioEnd(session_id_); - - if (error.code != SPEECH_RECOGNITION_ERROR_NONE) - listener_->OnRecognitionError(session_id_, error); - - listener_->OnRecognitionEnd(session_id_); - - return STATE_IDLE; -} - -SpeechRecognizer::FSMState SpeechRecognizer::ProcessIntermediateResult( - const FSMEventArgs& event_args) { - // Provisional results can occur only during continuous (non one-shot) mode. - // If this check is reached it means that a continuous speech recognition - // engine is being used for a one shot recognition. - DCHECK_EQ(false, is_single_shot_); - - // In continuous recognition, intermediate results can occur even when we are - // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the - // recognition engine is "faster" than our endpointer). In these cases we - // skip the endpointer and fast-forward to the RECOGNIZING state, with respect - // of the events triggering order. - if (state_ == STATE_ESTIMATING_ENVIRONMENT) { - DCHECK(endpointer_.IsEstimatingEnvironment()); - endpointer_.SetUserInputMode(); - listener_->OnEnvironmentEstimationComplete(session_id_); - } else if (state_ == STATE_WAITING_FOR_SPEECH) { - listener_->OnSoundStart(session_id_); - } else { - DCHECK_EQ(STATE_RECOGNIZING, state_); - } - - listener_->OnRecognitionResults(session_id_, event_args.engine_results); - return STATE_RECOGNIZING; -} - -SpeechRecognizer::FSMState -SpeechRecognizer::ProcessFinalResult(const FSMEventArgs& event_args) { - const SpeechRecognitionResults& results = event_args.engine_results; - SpeechRecognitionResults::const_iterator i = results.begin(); - bool provisional_results_pending = false; - bool results_are_empty = true; - for (; i != results.end(); ++i) { - const SpeechRecognitionResult& result = *i; - if (result.is_provisional) { - provisional_results_pending = true; - DCHECK(!is_single_shot_); - } else if (results_are_empty) { - results_are_empty = result.hypotheses.empty(); - } - } - - if (provisional_results_pending) { - listener_->OnRecognitionResults(session_id_, results); - // We don't end the recognition if a provisional result is received in - // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will - // end the recognition. - return state_; - } - - recognition_engine_->EndRecognition(); - - if (!results_are_empty) { - // We could receive an empty result (which we won't propagate further) - // in the following (continuous) scenario: - // 1. The caller start pushing audio and receives some results; - // 2. A |StopAudioCapture| is issued later; - // 3. The final audio frames captured in the interval ]1,2] do not lead to - // any result (nor any error); - // 4. The speech recognition engine, therefore, emits an empty result to - // notify that the recognition is ended with no error, yet neither any - // further result. - listener_->OnRecognitionResults(session_id_, results); - } - - listener_->OnRecognitionEnd(session_id_); - return STATE_IDLE; -} - -SpeechRecognizer::FSMState -SpeechRecognizer::DoNothing(const FSMEventArgs&) const { - return state_; // Just keep the current state. 
-} - -SpeechRecognizer::FSMState -SpeechRecognizer::NotFeasible(const FSMEventArgs& event_args) { - NOTREACHED() << "Unfeasible event " << event_args.event - << " in state " << state_; - return state_; -} - -void SpeechRecognizer::CloseAudioControllerAsynchronously() { - DCHECK(IsCapturingAudio()); - DVLOG(1) << "SpeechRecognizer closing audio controller."; - // Issues a Close on the audio controller, passing an empty callback. The only - // purpose of such callback is to keep the audio controller refcounted until - // Close has completed (in the audio thread) and automatically destroy it - // afterwards (upon return from OnAudioClosed). - audio_controller_->Close(base::Bind(&SpeechRecognizer::OnAudioClosed, - this, audio_controller_)); - audio_controller_ = NULL; // The controller is still refcounted by Bind. -} - -int SpeechRecognizer::GetElapsedTimeMs() const { - return (num_samples_recorded_ * 1000) / kAudioSampleRate; -} - -void SpeechRecognizer::UpdateSignalAndNoiseLevels(const float& rms, - bool clip_detected) { - // Calculate the input volume to display in the UI, smoothing towards the - // new level. - // TODO(primiano): Do we really need all this floating point arith here? - // Perhaps it might be quite expensive on mobile. - float level = (rms - kAudioMeterMinDb) / - (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); - level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); - const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor : - kDownSmoothingFactor; - audio_level_ += (level - audio_level_) * smoothing_factor; - - float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / - (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); - noise_level = std::min(std::max(0.0f, noise_level), - kAudioMeterRangeMaxUnclipped); - - listener_->OnAudioLevelsChange( - session_id_, clip_detected ? 1.0f : audio_level_, noise_level); -} - -void SpeechRecognizer::SetAudioManagerForTests( - AudioManager* audio_manager) { - audio_manager_for_tests_ = audio_manager; -} - -SpeechRecognizer::FSMEventArgs::FSMEventArgs(FSMEvent event_value) - : event(event_value), - audio_data(NULL), - engine_error(SPEECH_RECOGNITION_ERROR_NONE) { -} - -SpeechRecognizer::FSMEventArgs::~FSMEventArgs() { -} - -} // namespace content diff --git a/content/browser/speech/speech_recognizer.h b/content/browser/speech/speech_recognizer.h index 12da905..bb8fd97 100644 --- a/content/browser/speech/speech_recognizer.h +++ b/content/browser/speech/speech_recognizer.h @@ -1,155 +1,37 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
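The rewritten speech_recognizer.h below reduces SpeechRecognizer to a pure-virtual, ref-counted interface, which is the extension point that lets a platform port swap in its own recognizer. As a hedged sketch of the intended use (the class name, members and method bodies here are illustrative assumptions, not part of this change), an OS-delegating implementation of the kind the commit message describes for Android could look like:

// Illustrative sketch only: a hypothetical recognizer that delegates audio
// capture, endpointing and recognition to the OS speech service, built on
// the extracted SpeechRecognizer interface declared below.
class AndroidSpeechRecognizer : public SpeechRecognizer {
 public:
  AndroidSpeechRecognizer(SpeechRecognitionEventListener* listener,
                          int session_id)
      : SpeechRecognizer(listener, session_id), is_active_(false) {}

  // No AudioInputController or Endpointer in the browser: each request is
  // simply forwarded to the (hypothetical) platform recognition service.
  virtual void StartRecognition() OVERRIDE { is_active_ = true; }
  virtual void AbortRecognition() OVERRIDE { is_active_ = false; }
  virtual void StopAudioCapture() OVERRIDE {}
  virtual bool IsActive() const OVERRIDE { return is_active_; }
  virtual bool IsCapturingAudio() const OVERRIDE { return is_active_; }

 private:
  // Deletion happens through the base class, whose destructor is virtual.
  virtual ~AndroidSpeechRecognizer() {}

  bool is_active_;

  DISALLOW_COPY_AND_ASSIGN(AndroidSpeechRecognizer);
};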
#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ -#include "base/basictypes.h" #include "base/memory/ref_counted.h" -#include "base/memory/scoped_ptr.h" -#include "content/browser/speech/endpointer/endpointer.h" -#include "content/browser/speech/speech_recognition_engine.h" -#include "content/public/common/speech_recognition_error.h" -#include "content/public/common/speech_recognition_result.h" -#include "media/audio/audio_input_controller.h" -#include "net/url_request/url_request_context_getter.h" - -namespace media { -class AudioManager; -} namespace content { class SpeechRecognitionEventListener; -// Handles speech recognition for a session (identified by |session_id|), taking -// care of audio capture, silence detection/endpointer and interaction with the -// SpeechRecognitionEngine. +// Handles speech recognition for a session (identified by |session_id|). class CONTENT_EXPORT SpeechRecognizer - : public base::RefCountedThreadSafe<SpeechRecognizer>, - public media::AudioInputController::EventHandler, - public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) { + : public base::RefCountedThreadSafe<SpeechRecognizer> { public: - static const int kAudioSampleRate; - static const media::ChannelLayout kChannelLayout; - static const int kNumBitsPerAudioSample; - static const int kNoSpeechTimeoutMs; - static const int kEndpointerEstimationTimeMs; - - static void SetAudioManagerForTests(media::AudioManager* audio_manager); - SpeechRecognizer(SpeechRecognitionEventListener* listener, - int session_id, - bool is_single_shot, - SpeechRecognitionEngine* engine); + SpeechRecognizer(SpeechRecognitionEventListener* listener, int session_id) + : listener_(listener), session_id_(session_id) {} - void StartRecognition(); - void AbortRecognition(); - void StopAudioCapture(); - bool IsActive() const; - bool IsCapturingAudio() const; - const SpeechRecognitionEngine& recognition_engine() const; + virtual void StartRecognition() = 0; + virtual void AbortRecognition() = 0; + virtual void StopAudioCapture() = 0; + virtual bool IsActive() const = 0; + virtual bool IsCapturingAudio() const = 0; - private: + protected: friend class base::RefCountedThreadSafe<SpeechRecognizer>; - friend class SpeechRecognizerTest; - - enum FSMState { - STATE_IDLE = 0, - STATE_STARTING, - STATE_ESTIMATING_ENVIRONMENT, - STATE_WAITING_FOR_SPEECH, - STATE_RECOGNIZING, - STATE_WAITING_FINAL_RESULT, - STATE_MAX_VALUE = STATE_WAITING_FINAL_RESULT - }; - - enum FSMEvent { - EVENT_ABORT = 0, - EVENT_START, - EVENT_STOP_CAPTURE, - EVENT_AUDIO_DATA, - EVENT_ENGINE_RESULT, - EVENT_ENGINE_ERROR, - EVENT_AUDIO_ERROR, - EVENT_MAX_VALUE = EVENT_AUDIO_ERROR - }; - - struct FSMEventArgs { - explicit FSMEventArgs(FSMEvent event_value); - ~FSMEventArgs(); - - FSMEvent event; - scoped_refptr<AudioChunk> audio_data; - SpeechRecognitionResults engine_results; - SpeechRecognitionError engine_error; - }; - - virtual ~SpeechRecognizer(); - - // Entry point for pushing any new external event into the recognizer FSM. - void DispatchEvent(const FSMEventArgs& event_args); - - // Defines the behavior of the recognizer FSM, selecting the appropriate - // transition according to the current state and event. - FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args); - - // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc). - void ProcessAudioPipeline(const AudioChunk& raw_audio); - - // The methods below handle transitions of the recognizer FSM.
- FSMState StartRecording(const FSMEventArgs& event_args); - FSMState StartRecognitionEngine(const FSMEventArgs& event_args); - FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args); - FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args); - FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args); - FSMState ProcessIntermediateResult(const FSMEventArgs& event_args); - FSMState ProcessFinalResult(const FSMEventArgs& event_args); - FSMState AbortSilently(const FSMEventArgs& event_args); - FSMState AbortWithError(const FSMEventArgs& event_args); - FSMState Abort(const SpeechRecognitionError& error); - FSMState DetectEndOfSpeech(const FSMEventArgs& event_args); - FSMState DoNothing(const FSMEventArgs& event_args) const; - FSMState NotFeasible(const FSMEventArgs& event_args); - - // Returns the time span of captured audio samples since the start of capture. - int GetElapsedTimeMs() const; - - // Calculates the input volume to be displayed in the UI, triggering the - // OnAudioLevelsChange event accordingly. - void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected); - - void CloseAudioControllerAsynchronously(); - - // Callback called on IO thread by audio_controller->Close(). - void OnAudioClosed(media::AudioInputController*); - - // AudioInputController::EventHandler methods. - virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {} - virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {} - virtual void OnError(media::AudioInputController* controller) OVERRIDE; - virtual void OnData(media::AudioInputController* controller, - const uint8* data, uint32 size) OVERRIDE; - - // SpeechRecognitionEngineDelegate methods. - virtual void OnSpeechRecognitionEngineResults( - const SpeechRecognitionResults& results) OVERRIDE; - virtual void OnSpeechRecognitionEngineError( - const SpeechRecognitionError& error) OVERRIDE; - static media::AudioManager* audio_manager_for_tests_; + virtual ~SpeechRecognizer() {} SpeechRecognitionEventListener* listener_; - scoped_ptr<SpeechRecognitionEngine> recognition_engine_; - Endpointer endpointer_; - scoped_refptr<media::AudioInputController> audio_controller_; int session_id_; - int num_samples_recorded_; - float audio_level_; - bool is_dispatching_event_; - bool is_single_shot_; - FSMState state_; DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer); }; diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc new file mode 100644 index 0000000..d207ba4 --- /dev/null +++ b/content/browser/speech/speech_recognizer_impl.cc @@ -0,0 +1,662 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/speech_recognizer_impl.h" + +#include "base/basictypes.h" +#include "base/bind.h" +#include "base/time.h" +#include "content/browser/browser_main_loop.h" +#include "content/browser/speech/audio_buffer.h" +#include "content/browser/speech/google_one_shot_remote_engine.h" +#include "content/public/browser/browser_thread.h" +#include "content/public/browser/speech_recognition_event_listener.h" +#include "net/url_request/url_request_context_getter.h" + +using media::AudioInputController; +using media::AudioManager; +using media::AudioParameters; +using media::ChannelLayout; + +namespace content { +namespace { + +// The following constants are related to the volume level indicator shown in +// the UI for recorded audio.
+// Multiplier used when new volume is greater than previous level. +const float kUpSmoothingFactor = 1.0f; +// Multiplier used when new volume is less than previous level. +const float kDownSmoothingFactor = 0.7f; +// RMS dB value of a maximum (unclipped) sine wave for int16 samples. +const float kAudioMeterMaxDb = 90.31f; +// This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0. +// Values lower than this will display as empty level-meter. +const float kAudioMeterMinDb = 30.0f; +const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; + +// Maximum level to draw to display unclipped meter. (1.0f displays clipping.) +const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; + +// Returns true if more than 5% of the samples are at min or max value. +bool DetectClipping(const AudioChunk& chunk) { + const int num_samples = chunk.NumSamples(); + const int16* samples = chunk.SamplesData16(); + const int kThreshold = num_samples / 20; + int clipping_samples = 0; + + for (int i = 0; i < num_samples; ++i) { + if (samples[i] <= -32767 || samples[i] >= 32767) { + if (++clipping_samples > kThreshold) + return true; + } + } + return false; +} + +void KeepAudioControllerRefcountedForDtor(scoped_refptr<media::AudioInputController>) { +} + +} // namespace + +const int SpeechRecognizerImpl::kAudioSampleRate = 16000; +const ChannelLayout SpeechRecognizerImpl::kChannelLayout = + media::CHANNEL_LAYOUT_MONO; +const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; +const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; +const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; +media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL; + +COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, + kNumBitsPerAudioSample_must_be_a_multiple_of_8); + +SpeechRecognizerImpl::SpeechRecognizerImpl( + SpeechRecognitionEventListener* listener, + int session_id, + bool is_single_shot, + SpeechRecognitionEngine* engine) + : SpeechRecognizer(listener, session_id), + recognition_engine_(engine), + endpointer_(kAudioSampleRate), + is_dispatching_event_(false), + is_single_shot_(is_single_shot), + state_(STATE_IDLE) { + DCHECK(listener_ != NULL); + DCHECK(recognition_engine_ != NULL); + if (is_single_shot) { + // In single shot recognition, the session is automatically ended after: + // - 0.5 seconds of silence if time < 3 seconds + // - 1 second of silence if time >= 3 seconds + endpointer_.set_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond / 2); + endpointer_.set_long_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond); + endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); + } else { + // In continuous recognition, the session is automatically ended after 15 + // seconds of silence. + const int64 cont_timeout_us = base::Time::kMicrosecondsPerSecond * 15; + endpointer_.set_speech_input_complete_silence_length(cont_timeout_us); + endpointer_.set_long_speech_length(0); // Use only a single timeout. + } + endpointer_.StartSession(); + recognition_engine_->set_delegate(this); +} + +// ------- Methods that trigger Finite State Machine (FSM) events ------------ + +// NOTE: all the external events and requests should be enqueued (PostTask), even +// if they come from the same (IO) thread, in order to preserve the relationship +// of causality between events and avoid interleaved event processing due to +// synchronous callbacks.
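A short usage sketch makes the enqueue rule above concrete (illustrative only: |listener|, |session_id| and |engine| stand for any valid listener, session id and SpeechRecognitionEngine; none of this is part of the CL). Even a caller that is already on the IO thread goes through the public methods, which post an FSM event instead of dispatching it synchronously, so events are processed strictly in the order they were requested:

// Illustrative usage; not part of this change.
scoped_refptr<SpeechRecognizerImpl> recognizer = new SpeechRecognizerImpl(
    listener, session_id, true /* is_single_shot */, engine);
recognizer->StartRecognition();  // Enqueues EVENT_START.
recognizer->StopAudioCapture();  // Enqueues EVENT_STOP_CAPTURE; dispatched
                                 // only after EVENT_START is processed.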
+ +void SpeechRecognizerImpl::StartRecognition() { + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, FSMEventArgs(EVENT_START))); +} + +void SpeechRecognizerImpl::AbortRecognition() { + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, FSMEventArgs(EVENT_ABORT))); +} + +void SpeechRecognizerImpl::StopAudioCapture() { + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, FSMEventArgs(EVENT_STOP_CAPTURE))); +} + +bool SpeechRecognizerImpl::IsActive() const { + // Checking the FSM state from another thread (thus, while the FSM is + // potentially concurrently evolving) is meaningless. + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + return state_ != STATE_IDLE; +} + +bool SpeechRecognizerImpl::IsCapturingAudio() const { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). + const bool is_capturing_audio = state_ >= STATE_STARTING && + state_ <= STATE_RECOGNIZING; + DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) || + (!is_capturing_audio && audio_controller_.get() == NULL)); + return is_capturing_audio; +} + +const SpeechRecognitionEngine& +SpeechRecognizerImpl::recognition_engine() const { + return *(recognition_engine_.get()); +} + +SpeechRecognizerImpl::~SpeechRecognizerImpl() { + endpointer_.EndSession(); + if (audio_controller_) { + audio_controller_->Close(base::Bind(&KeepAudioControllerRefcountedForDtor, + audio_controller_)); + } +} + +// Invoked in the audio thread. +void SpeechRecognizerImpl::OnError(AudioInputController* controller) { + FSMEventArgs event_args(EVENT_AUDIO_ERROR); + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, event_args)); +} + +void SpeechRecognizerImpl::OnData(AudioInputController* controller, + const uint8* data, uint32 size) { + if (size == 0) // This could happen when audio capture stops and is normal. + return; + + FSMEventArgs event_args(EVENT_AUDIO_DATA); + event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size), + kNumBitsPerAudioSample / 8); + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, event_args)); +} + +void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} + +void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( + const SpeechRecognitionResults& results) { + FSMEventArgs event_args(EVENT_ENGINE_RESULT); + event_args.engine_results = results; + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, event_args)); +} + +void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( + const SpeechRecognitionError& error) { + FSMEventArgs event_args(EVENT_ENGINE_ERROR); + event_args.engine_error = error; + BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, + this, event_args)); +} + +// ----------------------- Core FSM implementation --------------------------- +// TODO(primiano): After the changes in the media package (r129173), this class +// slightly violates the SpeechRecognitionEventListener interface contract. In +// particular, it is not true anymore that this class can be freed after the +// OnRecognitionEnd event, since the audio_controller_.Close() asynchronous +// call can still be in progress after the end event.
Currently, it does not +// represent a problem for the browser itself, since refcounting protects us +// against such race conditions. However, we should fix this in the next CLs. +// For instance, tests are currently working just because the +// TestAudioInputController is not closing asynchronously as the real controller +// does, but they will become flaky if TestAudioInputController is fixed. + +void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + DCHECK_LE(event_args.event, EVENT_MAX_VALUE); + DCHECK_LE(state_, STATE_MAX_VALUE); + + // Event dispatching must be sequential, otherwise it will break all the rules + // and the assumptions of the finite state automata model. + DCHECK(!is_dispatching_event_); + is_dispatching_event_ = true; + + // Guard against the delegate freeing us until we finish processing the event. + scoped_refptr<SpeechRecognizerImpl> me(this); + + if (event_args.event == EVENT_AUDIO_DATA) { + DCHECK(event_args.audio_data.get() != NULL); + ProcessAudioPipeline(*event_args.audio_data); + } + + // The audio pipeline must be processed before the event dispatch, otherwise + // it would take actions according to the future state instead of the current. + state_ = ExecuteTransitionAndGetNextState(event_args); + is_dispatching_event_ = false; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( + const FSMEventArgs& event_args) { + const FSMEvent event = event_args.event; + switch (state_) { + case STATE_IDLE: + switch (event) { + // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and + // EVENT_STOP_CAPTURE below once speech input extensions are fixed. + case EVENT_ABORT: + return AbortSilently(event_args); + case EVENT_START: + return StartRecording(event_args); + case EVENT_STOP_CAPTURE: + return AbortSilently(event_args); + case EVENT_AUDIO_DATA: // Corner cases related to queued messages + case EVENT_ENGINE_RESULT: // being dispatched late.
+ case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return DoNothing(event_args); + } + break; + case STATE_STARTING: + switch (event) { + case EVENT_ABORT: + return AbortWithError(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + return AbortSilently(event_args); + case EVENT_AUDIO_DATA: + return StartRecognitionEngine(event_args); + case EVENT_ENGINE_RESULT: + return NotFeasible(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return AbortWithError(event_args); + } + break; + case STATE_ESTIMATING_ENVIRONMENT: + switch (event) { + case EVENT_ABORT: + return AbortWithError(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + return StopCaptureAndWaitForResult(event_args); + case EVENT_AUDIO_DATA: + return WaitEnvironmentEstimationCompletion(event_args); + case EVENT_ENGINE_RESULT: + return ProcessIntermediateResult(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return AbortWithError(event_args); + } + break; + case STATE_WAITING_FOR_SPEECH: + switch (event) { + case EVENT_ABORT: + return AbortWithError(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + return StopCaptureAndWaitForResult(event_args); + case EVENT_AUDIO_DATA: + return DetectUserSpeechOrTimeout(event_args); + case EVENT_ENGINE_RESULT: + return ProcessIntermediateResult(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return AbortWithError(event_args); + } + break; + case STATE_RECOGNIZING: + switch (event) { + case EVENT_ABORT: + return AbortWithError(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + return StopCaptureAndWaitForResult(event_args); + case EVENT_AUDIO_DATA: + return DetectEndOfSpeech(event_args); + case EVENT_ENGINE_RESULT: + return ProcessIntermediateResult(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return AbortWithError(event_args); + } + break; + case STATE_WAITING_FINAL_RESULT: + switch (event) { + case EVENT_ABORT: + return AbortWithError(event_args); + case EVENT_START: + return NotFeasible(event_args); + case EVENT_STOP_CAPTURE: + case EVENT_AUDIO_DATA: + return DoNothing(event_args); + case EVENT_ENGINE_RESULT: + return ProcessFinalResult(event_args); + case EVENT_ENGINE_ERROR: + case EVENT_AUDIO_ERROR: + return AbortWithError(event_args); + } + break; + } + return NotFeasible(event_args); +} + +// ----------- Contract for all the FSM evolution functions below ------------- +// - Are guaranteed to be executed in the IO thread; +// - Are guaranteed to be not reentrant (themselves and each other); +// - event_args members are guaranteed to be stable during the call; +// - The class won't be freed in the meanwhile due to callbacks; +// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. + +// TODO(primiano): the audio pipeline is currently serial. However, the +// clipper->endpointer->vumeter chain and the sr_engine could be parallelized. +// We should profile the execution to see if it would be worth or not. 
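As an at-a-glance restatement of the route_to_* flags computed in the function below (no new behavior, derived directly from the code):

// STATE_STARTING:               nothing is routed; the first captured chunk
//                               is fed to the engine by StartRecognitionEngine().
// STATE_ESTIMATING_ENVIRONMENT: endpointer + recognition engine.
// STATE_WAITING_FOR_SPEECH:     endpointer + recognition engine + vu-meter.
// STATE_RECOGNIZING:            endpointer + recognition engine + vu-meter.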
+void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { + const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && + state_ <= STATE_RECOGNIZING; + const bool route_to_sr_engine = route_to_endpointer; + const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH && + state_ <= STATE_RECOGNIZING; + const bool clip_detected = DetectClipping(raw_audio); + float rms = 0.0f; + + num_samples_recorded_ += raw_audio.NumSamples(); + + if (route_to_endpointer) + endpointer_.ProcessAudio(raw_audio, &rms); + + if (route_to_vumeter) { + DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|. + UpdateSignalAndNoiseLevels(rms, clip_detected); + } + if (route_to_sr_engine) { + DCHECK(recognition_engine_.get() != NULL); + recognition_engine_->TakeAudioChunk(raw_audio); + } +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { + DCHECK(recognition_engine_.get() != NULL); + DCHECK(!IsCapturingAudio()); + AudioManager* audio_manager = (audio_manager_for_tests_ != NULL) ? + audio_manager_for_tests_ : + BrowserMainLoop::GetAudioManager(); + DCHECK(audio_manager != NULL); + + DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; + num_samples_recorded_ = 0; + audio_level_ = 0; + listener_->OnRecognitionStart(session_id_); + + if (!audio_manager->HasAudioInputDevices()) { + return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO, + SPEECH_AUDIO_ERROR_DETAILS_NO_MIC)); + } + + if (audio_manager->IsRecordingInProcess()) { + return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO, + SPEECH_AUDIO_ERROR_DETAILS_IN_USE)); + } + + const int samples_per_packet = (kAudioSampleRate * + recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000; + AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout, + kAudioSampleRate, kNumBitsPerAudioSample, + samples_per_packet); + audio_controller_ = AudioInputController::Create(audio_manager, this, params); + + if (audio_controller_.get() == NULL) { + return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO)); + } + + // The endpointer needs to estimate the environment/background noise before + // starting to treat the audio as user input. We wait in the state + // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching + // to user input mode. + endpointer_.SetEnvironmentEstimationMode(); + audio_controller_->Record(); + return STATE_STARTING; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { + // This is the first audio packet captured, so the recognition engine is + // started and the delegate notified about the event. + DCHECK(recognition_engine_.get() != NULL); + recognition_engine_->StartRecognition(); + listener_->OnAudioStart(session_id_); + + // This is a little hack, since TakeAudioChunk() is already called by + // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping + // the first audio chunk captured after opening the audio device. 
+ recognition_engine_->TakeAudioChunk(*(event_args.audio_data)); + return STATE_ESTIMATING_ENVIRONMENT; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { + DCHECK(endpointer_.IsEstimatingEnvironment()); + if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { + endpointer_.SetUserInputMode(); + listener_->OnEnvironmentEstimationComplete(session_id_); + return STATE_WAITING_FOR_SPEECH; + } else { + return STATE_ESTIMATING_ENVIRONMENT; + } +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { + if (endpointer_.DidStartReceivingSpeech()) { + listener_->OnSoundStart(session_id_); + return STATE_RECOGNIZING; + } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) { + return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NO_SPEECH)); + } + return STATE_WAITING_FOR_SPEECH; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { + if (endpointer_.speech_input_complete()) + return StopCaptureAndWaitForResult(event_args); + return STATE_RECOGNIZING; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { + DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); + + DVLOG(1) << "Concluding recognition"; + CloseAudioControllerAsynchronously(); + recognition_engine_->AudioChunksEnded(); + + if (state_ > STATE_WAITING_FOR_SPEECH) + listener_->OnSoundEnd(session_id_); + + listener_->OnAudioEnd(session_id_); + return STATE_WAITING_FINAL_RESULT; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) { + DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR); + DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR); + return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE)); +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) { + if (event_args.event == EVENT_AUDIO_ERROR) { + return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO)); + } else if (event_args.event == EVENT_ENGINE_ERROR) { + return Abort(event_args.engine_error); + } + return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); +} + +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( + const SpeechRecognitionError& error) { + if (IsCapturingAudio()) + CloseAudioControllerAsynchronously(); + + DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; + + // The recognition engine is initialized only after STATE_STARTING. + if (state_ > STATE_STARTING) { + DCHECK(recognition_engine_.get() != NULL); + recognition_engine_->EndRecognition(); + } + + if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT) + listener_->OnSoundEnd(session_id_); + + if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT) + listener_->OnAudioEnd(session_id_); + + if (error.code != SPEECH_RECOGNITION_ERROR_NONE) + listener_->OnRecognitionError(session_id_, error); + + listener_->OnRecognitionEnd(session_id_); + + return STATE_IDLE; +} + +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult( + const FSMEventArgs& event_args) { + // Provisional results can occur only during continuous (non one-shot) mode. + // If this check is reached it means that a continuous speech recognition + // engine is being used for a one shot recognition. 
+ DCHECK_EQ(false, is_single_shot_); + + // In continuous recognition, intermediate results can occur even when we are + // in the ESTIMATING_ENVIRONMENT or WAITING_FOR_SPEECH states (if the + // recognition engine is "faster" than our endpointer). In these cases we + // skip the endpointer and fast-forward to the RECOGNIZING state, while + // respecting the order in which events are triggered. + if (state_ == STATE_ESTIMATING_ENVIRONMENT) { + DCHECK(endpointer_.IsEstimatingEnvironment()); + endpointer_.SetUserInputMode(); + listener_->OnEnvironmentEstimationComplete(session_id_); + } else if (state_ == STATE_WAITING_FOR_SPEECH) { + listener_->OnSoundStart(session_id_); + } else { + DCHECK_EQ(STATE_RECOGNIZING, state_); + } + + listener_->OnRecognitionResults(session_id_, event_args.engine_results); + return STATE_RECOGNIZING; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { + const SpeechRecognitionResults& results = event_args.engine_results; + SpeechRecognitionResults::const_iterator i = results.begin(); + bool provisional_results_pending = false; + bool results_are_empty = true; + for (; i != results.end(); ++i) { + const SpeechRecognitionResult& result = *i; + if (result.is_provisional) { + provisional_results_pending = true; + DCHECK(!is_single_shot_); + } else if (results_are_empty) { + results_are_empty = result.hypotheses.empty(); + } + } + + if (provisional_results_pending) { + listener_->OnRecognitionResults(session_id_, results); + // We don't end the recognition if a provisional result is received in + // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will + // end the recognition. + return state_; + } + + recognition_engine_->EndRecognition(); + + if (!results_are_empty) { + // We could receive an empty result (which we won't propagate further) + // in the following (continuous) scenario: + // 1. The caller starts pushing audio and receives some results; + // 2. A |StopAudioCapture| is issued later; + // 3. The final audio frames captured in the interval ]1,2] do not lead to + // any result (nor any error); + // 4. The speech recognition engine, therefore, emits an empty result to + // notify that the recognition ended with no error and without any + // further result. + listener_->OnRecognitionResults(session_id_, results); + } + + listener_->OnRecognitionEnd(session_id_); + return STATE_IDLE; +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { + return state_; // Just keep the current state. +} + +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { + NOTREACHED() << "Unfeasible event " << event_args.event + << " in state " << state_; + return state_; +} + +void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { + DCHECK(IsCapturingAudio()); + DVLOG(1) << "SpeechRecognizerImpl closing audio controller."; + // Issues a Close on the audio controller, passing an empty callback. The only + // purpose of such callback is to keep the audio controller refcounted until + // Close has completed (in the audio thread) and automatically destroy it + // afterwards (upon return from OnAudioClosed). + audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, + this, audio_controller_)); + audio_controller_ = NULL; // The controller is still refcounted by Bind.
+} + +int SpeechRecognizerImpl::GetElapsedTimeMs() const { + return (num_samples_recorded_ * 1000) / kAudioSampleRate; +} + +void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, + bool clip_detected) { + // Calculate the input volume to display in the UI, smoothing towards the + // new level. + // TODO(primiano): Do we really need all this floating point arith here? + // Perhaps it might be quite expensive on mobile. + float level = (rms - kAudioMeterMinDb) / + (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); + level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped); + const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor : + kDownSmoothingFactor; + audio_level_ += (level - audio_level_) * smoothing_factor; + + float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) / + (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); + noise_level = std::min(std::max(0.0f, noise_level), + kAudioMeterRangeMaxUnclipped); + + listener_->OnAudioLevelsChange( + session_id_, clip_detected ? 1.0f : audio_level_, noise_level); +} + +void SpeechRecognizerImpl::SetAudioManagerForTests( + AudioManager* audio_manager) { + audio_manager_for_tests_ = audio_manager; +} + +SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) + : event(event_value), + audio_data(NULL), + engine_error(SPEECH_RECOGNITION_ERROR_NONE) { +} + +SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { +} + +} // namespace content diff --git a/content/browser/speech/speech_recognizer_impl.h b/content/browser/speech/speech_recognizer_impl.h new file mode 100644 index 0000000..2397716 --- /dev/null +++ b/content/browser/speech/speech_recognizer_impl.h @@ -0,0 +1,156 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ +#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ + +#include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "content/browser/speech/endpointer/endpointer.h" +#include "content/browser/speech/speech_recognition_engine.h" +#include "content/browser/speech/speech_recognizer.h" +#include "content/public/common/speech_recognition_error.h" +#include "content/public/common/speech_recognition_result.h" +#include "media/audio/audio_input_controller.h" +#include "net/url_request/url_request_context_getter.h" + +namespace media { +class AudioManager; +} + +namespace content { + +class SpeechRecognitionEventListener; + +// Handles speech recognition for a session (identified by |session_id|), taking +// care of audio capture, silence detection/endpointer and interaction with the +// SpeechRecognitionEngine. 
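Before moving to the header, it may help to see the two bits of arithmetic above with numbers plugged in. A standalone sketch; the concrete constant values (sample rate, meter range) are assumptions for illustration, not quotations of the patch's constants:

#include <algorithm>

namespace {

// Assumed values for illustration; the real constants are defined in
// speech_recognizer_impl.cc (kAudioSampleRate, kAudioMeterMinDb, ...).
const int kSampleRateForExample = 16000;       // Hz (assumed)
const float kMeterMinDb = 30.0f;
const float kMeterDbRange = 90.31f - 30.0f;    // = 60.31 dB
const float kMeterMaxUnclipped = 47.0f / 48.0f;

// Mirrors GetElapsedTimeMs(): 32000 mono samples at 16 kHz
// -> (32000 * 1000) / 16000 = 2000 ms of captured audio.
int ElapsedMs(int num_samples) {
  return (num_samples * 1000) / kSampleRateForExample;
}

// Mirrors the level mapping in UpdateSignalAndNoiseLevels(): an RMS of
// 60 dB maps to (60 - 30) / (60.31 / 0.979) ~= 0.49 on the meter.
float MeterLevel(float rms_db) {
  float level = (rms_db - kMeterMinDb) / (kMeterDbRange / kMeterMaxUnclipped);
  return std::min(std::max(0.0f, level), kMeterMaxUnclipped);
}

}  // namespace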
+class CONTENT_EXPORT SpeechRecognizerImpl
+    : public SpeechRecognizer,
+      public media::AudioInputController::EventHandler,
+      public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) {
+ public:
+  static const int kAudioSampleRate;
+  static const media::ChannelLayout kChannelLayout;
+  static const int kNumBitsPerAudioSample;
+  static const int kNoSpeechTimeoutMs;
+  static const int kEndpointerEstimationTimeMs;
+
+  static void SetAudioManagerForTests(media::AudioManager* audio_manager);
+
+  SpeechRecognizerImpl(SpeechRecognitionEventListener* listener,
+                       int session_id,
+                       bool is_single_shot,
+                       SpeechRecognitionEngine* engine);
+
+  virtual void StartRecognition() OVERRIDE;
+  virtual void AbortRecognition() OVERRIDE;
+  virtual void StopAudioCapture() OVERRIDE;
+  virtual bool IsActive() const OVERRIDE;
+  virtual bool IsCapturingAudio() const OVERRIDE;
+  const SpeechRecognitionEngine& recognition_engine() const;
+
+ private:
+  friend class SpeechRecognizerTest;
+
+  enum FSMState {
+    STATE_IDLE = 0,
+    STATE_STARTING,
+    STATE_ESTIMATING_ENVIRONMENT,
+    STATE_WAITING_FOR_SPEECH,
+    STATE_RECOGNIZING,
+    STATE_WAITING_FINAL_RESULT,
+    STATE_MAX_VALUE = STATE_WAITING_FINAL_RESULT
+  };
+
+  enum FSMEvent {
+    EVENT_ABORT = 0,
+    EVENT_START,
+    EVENT_STOP_CAPTURE,
+    EVENT_AUDIO_DATA,
+    EVENT_ENGINE_RESULT,
+    EVENT_ENGINE_ERROR,
+    EVENT_AUDIO_ERROR,
+    EVENT_MAX_VALUE = EVENT_AUDIO_ERROR
+  };
+
+  struct FSMEventArgs {
+    explicit FSMEventArgs(FSMEvent event_value);
+    ~FSMEventArgs();
+
+    FSMEvent event;
+    scoped_refptr<AudioChunk> audio_data;
+    SpeechRecognitionResults engine_results;
+    SpeechRecognitionError engine_error;
+  };
+
+  virtual ~SpeechRecognizerImpl();
+
+  // Entry point for pushing any new external event into the recognizer FSM.
+  void DispatchEvent(const FSMEventArgs& event_args);
+
+  // Defines the behavior of the recognizer FSM, selecting the appropriate
+  // transition according to the current state and event.
+  FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args);
+
+  // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc).
+  void ProcessAudioPipeline(const AudioChunk& raw_audio);
+
+  // The methods below handle transitions of the recognizer FSM.
+  FSMState StartRecording(const FSMEventArgs& event_args);
+  FSMState StartRecognitionEngine(const FSMEventArgs& event_args);
+  FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args);
+  FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args);
+  FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args);
+  FSMState ProcessIntermediateResult(const FSMEventArgs& event_args);
+  FSMState ProcessFinalResult(const FSMEventArgs& event_args);
+  FSMState AbortSilently(const FSMEventArgs& event_args);
+  FSMState AbortWithError(const FSMEventArgs& event_args);
+  FSMState Abort(const SpeechRecognitionError& error);
+  FSMState DetectEndOfSpeech(const FSMEventArgs& event_args);
+  FSMState DoNothing(const FSMEventArgs& event_args) const;
+  FSMState NotFeasible(const FSMEventArgs& event_args);
+
+  // Returns the time span of captured audio samples since the start of capture.
+  int GetElapsedTimeMs() const;
+
+  // Calculates the input volume to be displayed in the UI, triggering the
+  // OnAudioLevelsChange event accordingly.
+  void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected);
+
+  void CloseAudioControllerAsynchronously();
+
+  // Callback called on IO thread by audio_controller->Close().
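The comment above marks the tail end of the close-keepalive idiom implemented by CloseAudioControllerAsynchronously: binding the scoped_refptr into the Close callback keeps the controller alive until Close() completes on the audio thread. In miniature, with hypothetical Owner/Controller names (not patch code):

// Sketch of the idiom; base::Bind() copies |controller_|, so the object
// outlives the reset of the member below. Owner is assumed ref-counted.
void Owner::CloseControllerAsync() {
  controller_->Close(base::Bind(&Owner::OnClosed, this, controller_));
  controller_ = NULL;  // Bind() still owns a reference.
}

void Owner::OnClosed(scoped_refptr<Controller> controller) {
  // The last reference is released when |controller| goes out of scope here.
}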
+  void OnAudioClosed(media::AudioInputController*);
+
+  // AudioInputController::EventHandler methods.
+  virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {}
+  virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {}
+  virtual void OnError(media::AudioInputController* controller) OVERRIDE;
+  virtual void OnData(media::AudioInputController* controller,
+                      const uint8* data, uint32 size) OVERRIDE;
+
+  // SpeechRecognitionEngineDelegate methods.
+  virtual void OnSpeechRecognitionEngineResults(
+      const SpeechRecognitionResults& results) OVERRIDE;
+  virtual void OnSpeechRecognitionEngineError(
+      const SpeechRecognitionError& error) OVERRIDE;
+
+  static media::AudioManager* audio_manager_for_tests_;
+
+  scoped_ptr<SpeechRecognitionEngine> recognition_engine_;
+  Endpointer endpointer_;
+  scoped_refptr<media::AudioInputController> audio_controller_;
+  int num_samples_recorded_;
+  float audio_level_;
+  bool is_dispatching_event_;
+  bool is_single_shot_;
+  FSMState state_;
+
+  DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl);
+};
+
+}  // namespace content
+
+#endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
diff --git a/content/browser/speech/speech_recognizer_impl_unittest.cc b/content/browser/speech/speech_recognizer_impl_unittest.cc
new file mode 100644
index 0000000..8c7c2d7
--- /dev/null
+++ b/content/browser/speech/speech_recognizer_impl_unittest.cc
@@ -0,0 +1,498 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <vector>
+
+#include "content/browser/browser_thread_impl.h"
+#include "content/browser/speech/google_one_shot_remote_engine.h"
+#include "content/browser/speech/speech_recognizer_impl.h"
+#include "content/public/browser/speech_recognition_event_listener.h"
+#include "media/audio/fake_audio_input_stream.h"
+#include "media/audio/fake_audio_output_stream.h"
+#include "media/audio/mock_audio_manager.h"
+#include "media/audio/test_audio_input_controller_factory.h"
+#include "net/base/net_errors.h"
+#include "net/url_request/test_url_fetcher_factory.h"
+#include "net/url_request/url_request_status.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using base::MessageLoopProxy;
+using media::AudioInputController;
+using media::AudioInputStream;
+using media::AudioManager;
+using media::AudioOutputStream;
+using media::AudioParameters;
+using media::TestAudioInputController;
+using media::TestAudioInputControllerFactory;
+
+namespace content {
+
+class SpeechRecognizerImplTest : public SpeechRecognitionEventListener,
+                                 public testing::Test {
+ public:
+  SpeechRecognizerImplTest()
+      : io_thread_(BrowserThread::IO, &message_loop_),
+        recognition_started_(false),
+        recognition_ended_(false),
+        result_received_(false),
+        audio_started_(false),
+        audio_ended_(false),
+        sound_started_(false),
+        sound_ended_(false),
+        error_(SPEECH_RECOGNITION_ERROR_NONE),
+        volume_(-1.0f) {
+    // SpeechRecognizerImpl takes ownership of sr_engine.
+    SpeechRecognitionEngine* sr_engine =
+        new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */);
+    SpeechRecognitionEngineConfig config;
+    config.audio_num_bits_per_sample =
+        SpeechRecognizerImpl::kNumBitsPerAudioSample;
+    config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate;
+    config.filter_profanities = false;
+    sr_engine->SetConfig(config);
+
+    const int kTestingSessionId = 1;
+    const bool kOneShotMode = true;
+    recognizer_ = new SpeechRecognizerImpl(
+        this, kTestingSessionId, kOneShotMode, sr_engine);
+    audio_manager_.reset(new media::MockAudioManager(
+        base::MessageLoop::current()->message_loop_proxy()));
+    recognizer_->SetAudioManagerForTests(audio_manager_.get());
+
+    int audio_packet_length_bytes =
+        (SpeechRecognizerImpl::kAudioSampleRate *
+         GoogleOneShotRemoteEngine::kAudioPacketIntervalMs *
+         ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) *
+         SpeechRecognizerImpl::kNumBitsPerAudioSample) / (8 * 1000);
+    audio_packet_.resize(audio_packet_length_bytes);
+  }
+
+  void CheckEventsConsistency() {
+    // Note: "!x || y" == "x implies y".
+    EXPECT_TRUE(!recognition_ended_ || recognition_started_);
+    EXPECT_TRUE(!audio_ended_ || audio_started_);
+    EXPECT_TRUE(!sound_ended_ || sound_started_);
+    EXPECT_TRUE(!audio_started_ || recognition_started_);
+    EXPECT_TRUE(!sound_started_ || audio_started_);
+    EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_));
+    EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_));
+  }
+
+  void CheckFinalEventsConsistency() {
+    // Note: "!(x ^ y)" == "(x && y) || (!x && !y)".
+    EXPECT_FALSE(recognition_started_ ^ recognition_ended_);
+    EXPECT_FALSE(audio_started_ ^ audio_ended_);
+    EXPECT_FALSE(sound_started_ ^ sound_ended_);
+  }
+
+  // Overridden from SpeechRecognitionEventListener:
+  virtual void OnAudioStart(int session_id) OVERRIDE {
+    audio_started_ = true;
+    CheckEventsConsistency();
+  }
+
+  virtual void OnAudioEnd(int session_id) OVERRIDE {
+    audio_ended_ = true;
+    CheckEventsConsistency();
+  }
+
+  virtual void OnRecognitionResults(
+      int session_id, const SpeechRecognitionResults& results) OVERRIDE {
+    result_received_ = true;
+  }
+
+  virtual void OnRecognitionError(
+      int session_id, const SpeechRecognitionError& error) OVERRIDE {
+    EXPECT_TRUE(recognition_started_);
+    EXPECT_FALSE(recognition_ended_);
+    error_ = error.code;
+  }
+
+  virtual void OnAudioLevelsChange(int session_id, float volume,
+                                   float noise_volume) OVERRIDE {
+    volume_ = volume;
+    noise_volume_ = noise_volume;
+  }
+
+  virtual void OnRecognitionEnd(int session_id) OVERRIDE {
+    recognition_ended_ = true;
+    CheckEventsConsistency();
+  }
+
+  virtual void OnRecognitionStart(int session_id) OVERRIDE {
+    recognition_started_ = true;
+    CheckEventsConsistency();
+  }
+
+  virtual void OnEnvironmentEstimationComplete(int session_id) OVERRIDE {}
+
+  virtual void OnSoundStart(int session_id) OVERRIDE {
+    sound_started_ = true;
+    CheckEventsConsistency();
+  }
+
+  virtual void OnSoundEnd(int session_id) OVERRIDE {
+    sound_ended_ = true;
+    CheckEventsConsistency();
+  }
+
+  // testing::Test methods.
+  virtual void SetUp() OVERRIDE {
+    AudioInputController::set_factory_for_testing(
+        &audio_input_controller_factory_);
+  }
+
+  virtual void TearDown() OVERRIDE {
+    AudioInputController::set_factory_for_testing(NULL);
+  }
+
+  void FillPacketWithTestWaveform() {
+    // Fill the input with a simple pattern, a 125Hz sawtooth waveform.
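Two details of the fixture above and the fill helper below are easy to miss, so here they are with assumed numbers (16 kHz mono, 16-bit samples, and a hypothetical 100 ms packet interval; the real values come from the constants referenced in the code):

// Bytes of PCM per packet, as computed in the fixture's constructor:
//   (rate * interval_ms * channels * bits) / (8 * 1000)
//   = (16000 * 100 * 1 * 16) / 8000 = 3200 bytes per 100 ms (assumed values).
int PacketLengthBytes(int rate, int interval_ms, int channels, int bits) {
  return (rate * interval_ms * channels * bits) / (8 * 1000);
}

// Why the fill below is "125Hz": writing i % 256 into consecutive bytes
// gives a 256-byte period; at 2 bytes per 16-bit sample that is 128
// samples, and 16000 Hz / 128 samples = 125 Hz (with the assumed rate).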
+    for (size_t i = 0; i < audio_packet_.size(); ++i)
+      audio_packet_[i] = static_cast<uint8>(i);
+  }
+
+  void FillPacketWithNoise() {
+    int value = 0;
+    int factor = 175;
+    for (size_t i = 0; i < audio_packet_.size(); ++i) {
+      value += factor;
+      audio_packet_[i] = value % 100;
+    }
+  }
+
+ protected:
+  base::MessageLoopForIO message_loop_;
+  BrowserThreadImpl io_thread_;
+  scoped_refptr<SpeechRecognizerImpl> recognizer_;
+  scoped_ptr<media::MockAudioManager> audio_manager_;
+  bool recognition_started_;
+  bool recognition_ended_;
+  bool result_received_;
+  bool audio_started_;
+  bool audio_ended_;
+  bool sound_started_;
+  bool sound_ended_;
+  SpeechRecognitionErrorCode error_;
+  net::TestURLFetcherFactory url_fetcher_factory_;
+  TestAudioInputControllerFactory audio_input_controller_factory_;
+  std::vector<uint8> audio_packet_;
+  float volume_;
+  float noise_volume_;
+};
+
+TEST_F(SpeechRecognizerImplTest, StopNoData) {
+  // Check for callbacks when stopping record before any audio gets recorded.
+  recognizer_->StartRecognition();
+  recognizer_->StopAudioCapture();
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_TRUE(recognition_started_);
+  EXPECT_FALSE(audio_started_);
+  EXPECT_FALSE(result_received_);
+  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
+  CheckFinalEventsConsistency();
+}
+
+TEST_F(SpeechRecognizerImplTest, CancelNoData) {
+  // Check for callbacks when canceling recognition before any audio gets
+  // recorded.
+  recognizer_->StartRecognition();
+  recognizer_->AbortRecognition();
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_TRUE(recognition_started_);
+  EXPECT_FALSE(audio_started_);
+  EXPECT_FALSE(result_received_);
+  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_);
+  CheckFinalEventsConsistency();
+}
+
+TEST_F(SpeechRecognizerImplTest, StopWithData) {
+  // Start recording, give some data and then stop. This should wait for the
+  // network callback to arrive before completion.
+  recognizer_->StartRecognition();
+  base::MessageLoop::current()->RunUntilIdle();
+  TestAudioInputController* controller =
+      audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+
+  // Try sending 5 chunks of mock audio data and verify that each of them
+  // resulted immediately in a packet sent out via the network. This verifies
+  // that we are streaming out encoded data as chunks without waiting for the
+  // full recording to complete.
+  const size_t kNumChunks = 5;
+  for (size_t i = 0; i < kNumChunks; ++i) {
+    controller->event_handler()->OnData(controller, &audio_packet_[0],
+                                        audio_packet_.size());
+    base::MessageLoop::current()->RunUntilIdle();
+    net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
+    ASSERT_TRUE(fetcher);
+    EXPECT_EQ(i + 1, fetcher->upload_chunks().size());
+  }
+
+  recognizer_->StopAudioCapture();
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_TRUE(audio_started_);
+  EXPECT_TRUE(audio_ended_);
+  EXPECT_FALSE(recognition_ended_);
+  EXPECT_FALSE(result_received_);
+  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
+
+  // Issue the network callback to complete the process.
+ net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); + ASSERT_TRUE(fetcher); + + fetcher->set_url(fetcher->GetOriginalURL()); + net::URLRequestStatus status; + status.set_status(net::URLRequestStatus::SUCCESS); + fetcher->set_status(status); + fetcher->set_response_code(200); + fetcher->SetResponseString( + "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}"); + fetcher->delegate()->OnURLFetchComplete(fetcher); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_TRUE(recognition_ended_); + EXPECT_TRUE(result_received_); + EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); + CheckFinalEventsConsistency(); +} + +TEST_F(SpeechRecognizerImplTest, CancelWithData) { + // Start recording, give some data and then cancel. + recognizer_->StartRecognition(); + base::MessageLoop::current()->RunUntilIdle(); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + base::MessageLoop::current()->RunUntilIdle(); + recognizer_->AbortRecognition(); + base::MessageLoop::current()->RunUntilIdle(); + ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); + EXPECT_TRUE(recognition_started_); + EXPECT_TRUE(audio_started_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_); + CheckFinalEventsConsistency(); +} + +TEST_F(SpeechRecognizerImplTest, ConnectionError) { + // Start recording, give some data and then stop. Issue the network callback + // with a connection error and verify that the recognizer bubbles the error up + recognizer_->StartRecognition(); + base::MessageLoop::current()->RunUntilIdle(); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + base::MessageLoop::current()->RunUntilIdle(); + net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); + ASSERT_TRUE(fetcher); + + recognizer_->StopAudioCapture(); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_TRUE(audio_started_); + EXPECT_TRUE(audio_ended_); + EXPECT_FALSE(recognition_ended_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); + + // Issue the network callback to complete the process. + fetcher->set_url(fetcher->GetOriginalURL()); + net::URLRequestStatus status; + status.set_status(net::URLRequestStatus::FAILED); + status.set_error(net::ERR_CONNECTION_REFUSED); + fetcher->set_status(status); + fetcher->set_response_code(0); + fetcher->SetResponseString(std::string()); + fetcher->delegate()->OnURLFetchComplete(fetcher); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_TRUE(recognition_ended_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); + CheckFinalEventsConsistency(); +} + +TEST_F(SpeechRecognizerImplTest, ServerError) { + // Start recording, give some data and then stop. 
Issue the network callback + // with a 500 error and verify that the recognizer bubbles the error up + recognizer_->StartRecognition(); + base::MessageLoop::current()->RunUntilIdle(); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + base::MessageLoop::current()->RunUntilIdle(); + net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); + ASSERT_TRUE(fetcher); + + recognizer_->StopAudioCapture(); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_TRUE(audio_started_); + EXPECT_TRUE(audio_ended_); + EXPECT_FALSE(recognition_ended_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); + + // Issue the network callback to complete the process. + fetcher->set_url(fetcher->GetOriginalURL()); + net::URLRequestStatus status; + status.set_status(net::URLRequestStatus::SUCCESS); + fetcher->set_status(status); + fetcher->set_response_code(500); + fetcher->SetResponseString("Internal Server Error"); + fetcher->delegate()->OnURLFetchComplete(fetcher); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_TRUE(recognition_ended_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); + CheckFinalEventsConsistency(); +} + +TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) { + // Check if things tear down properly if AudioInputController threw an error. + recognizer_->StartRecognition(); + base::MessageLoop::current()->RunUntilIdle(); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller->event_handler()->OnError(controller); + base::MessageLoop::current()->RunUntilIdle(); + EXPECT_TRUE(recognition_started_); + EXPECT_FALSE(audio_started_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_); + CheckFinalEventsConsistency(); +} + +TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) { + // Check if things tear down properly if AudioInputController threw an error + // after giving some audio data. + recognizer_->StartRecognition(); + base::MessageLoop::current()->RunUntilIdle(); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + controller->event_handler()->OnError(controller); + base::MessageLoop::current()->RunUntilIdle(); + ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); + EXPECT_TRUE(recognition_started_); + EXPECT_TRUE(audio_started_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_); + CheckFinalEventsConsistency(); +} + +TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) { + // Start recording and give a lot of packets with audio samples set to zero. + // This should trigger the no-speech detector and issue a callback. + recognizer_->StartRecognition(); + base::MessageLoop::current()->RunUntilIdle(); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + + int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) / + GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1; + // The vector is already filled with zero value samples on create. 
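The num_packets expression above is sized to just overshoot the no-speech window. With assumed constants (an 8000 ms no-speech timeout and a 100 ms packet interval; the real values are defined by the constants named in the code):

// timeout / interval + 1 silent packets pushes the elapsed time past the
// window: 8000 / 100 + 1 = 81 packets -> 8100 ms >= 8000 ms (assumed
// values), so DetectUserSpeechOrTimeout() aborts with
// SPEECH_RECOGNITION_ERROR_NO_SPEECH.
int SilencePacketsToTriggerTimeout(int timeout_ms, int interval_ms) {
  return timeout_ms / interval_ms + 1;
}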
+  for (int i = 0; i < num_packets; ++i) {
+    controller->event_handler()->OnData(controller, &audio_packet_[0],
+                                        audio_packet_.size());
+  }
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_TRUE(recognition_started_);
+  EXPECT_TRUE(audio_started_);
+  EXPECT_FALSE(result_received_);
+  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_);
+  CheckFinalEventsConsistency();
+}
+
+TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
+  // Start recording and give a lot of packets with audio samples set to zero
+  // and then some more with reasonably loud audio samples. This should be
+  // treated as normal speech input and the no-speech detector should not get
+  // triggered.
+  recognizer_->StartRecognition();
+  base::MessageLoop::current()->RunUntilIdle();
+  TestAudioInputController* controller =
+      audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+  controller = audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+
+  int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
+                     GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
+
+  // The vector is already filled with zero value samples on create.
+  for (int i = 0; i < num_packets / 2; ++i) {
+    controller->event_handler()->OnData(controller, &audio_packet_[0],
+                                        audio_packet_.size());
+  }
+
+  FillPacketWithTestWaveform();
+  for (int i = 0; i < num_packets / 2; ++i) {
+    controller->event_handler()->OnData(controller, &audio_packet_[0],
+                                        audio_packet_.size());
+  }
+
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
+  EXPECT_TRUE(audio_started_);
+  EXPECT_FALSE(audio_ended_);
+  EXPECT_FALSE(recognition_ended_);
+  recognizer_->AbortRecognition();
+  base::MessageLoop::current()->RunUntilIdle();
+  CheckFinalEventsConsistency();
+}
+
+TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
+  // Start recording, feed packets of noise for the estimation phase and then
+  // some packets with a reasonably loud waveform. Check that we don't get the
+  // volume callback during the estimation phase, then get a moderate volume
+  // for the noise samples and a higher volume for the loud audio.
+  recognizer_->StartRecognition();
+  base::MessageLoop::current()->RunUntilIdle();
+  TestAudioInputController* controller =
+      audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+  controller = audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+
+  // Feed some samples to begin with for the endpointer to do noise estimation.
+  int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs /
+                    GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
+  FillPacketWithNoise();
+  for (int i = 0; i < num_packets; ++i) {
+    controller->event_handler()->OnData(controller, &audio_packet_[0],
+                                        audio_packet_.size());
+  }
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_EQ(-1.0f, volume_);  // No audio volume set yet.
+
+  // The packet still contains the noise samples from the estimation phase.
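One subtlety in the estimation-phase check just above: volume_ is initialized to -1.0f and written only by OnAudioLevelsChange, so EXPECT_EQ(-1.0f, volume_) proves that no level callback fired while the endpointer was still estimating the environment. Restated in miniature (names from this test, simplified):

float volume_ = -1.0f;  // sentinel: no OnAudioLevelsChange received yet

virtual void OnAudioLevelsChange(int session_id, float volume,
                                 float noise_volume) OVERRIDE {
  volume_ = volume;  // the first real report overwrites the sentinel
}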
+  controller->event_handler()->OnData(controller, &audio_packet_[0],
+                                      audio_packet_.size());
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_FLOAT_EQ(0.74939233f, volume_);
+
+  FillPacketWithTestWaveform();
+  controller->event_handler()->OnData(controller, &audio_packet_[0],
+                                      audio_packet_.size());
+  base::MessageLoop::current()->RunUntilIdle();
+  EXPECT_FLOAT_EQ(0.89926866f, volume_);
+  EXPECT_FLOAT_EQ(0.75071919f, noise_volume_);
+
+  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
+  EXPECT_FALSE(audio_ended_);
+  EXPECT_FALSE(recognition_ended_);
+  recognizer_->AbortRecognition();
+  base::MessageLoop::current()->RunUntilIdle();
+  CheckFinalEventsConsistency();
+}
+
+}  // namespace content
diff --git a/content/browser/speech/speech_recognizer_unittest.cc b/content/browser/speech/speech_recognizer_unittest.cc
deleted file mode 100644
index 9b55ec5..0000000
--- a/content/browser/speech/speech_recognizer_unittest.cc
+++ /dev/null
@@ -1,497 +0,0 @@
-// Copyright (c) 2012 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include <vector>
-
-#include "content/browser/browser_thread_impl.h"
-#include "content/browser/speech/google_one_shot_remote_engine.h"
-#include "content/browser/speech/speech_recognizer.h"
-#include "content/public/browser/speech_recognition_event_listener.h"
-#include "media/audio/mock_audio_manager.h"
-#include "media/audio/fake_audio_input_stream.h"
-#include "media/audio/fake_audio_output_stream.h"
-#include "media/audio/test_audio_input_controller_factory.h"
-#include "net/base/net_errors.h"
-#include "net/url_request/test_url_fetcher_factory.h"
-#include "net/url_request/url_request_status.h"
-#include "testing/gtest/include/gtest/gtest.h"
-
-using base::MessageLoopProxy;
-using media::AudioInputController;
-using media::AudioInputStream;
-using media::AudioManager;
-using media::AudioOutputStream;
-using media::AudioParameters;
-using media::TestAudioInputController;
-using media::TestAudioInputControllerFactory;
-
-namespace content {
-
-class SpeechRecognizerTest : public SpeechRecognitionEventListener,
-                             public testing::Test {
- public:
-  SpeechRecognizerTest()
-      : io_thread_(BrowserThread::IO, &message_loop_),
-        recognition_started_(false),
-        recognition_ended_(false),
-        result_received_(false),
-        audio_started_(false),
-        audio_ended_(false),
-        sound_started_(false),
-        sound_ended_(false),
-        error_(SPEECH_RECOGNITION_ERROR_NONE),
-        volume_(-1.0f) {
-    // SpeechRecognizer takes ownership of sr_engine.
- SpeechRecognitionEngine* sr_engine = - new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */); - SpeechRecognitionEngineConfig config; - config.audio_num_bits_per_sample = SpeechRecognizer::kNumBitsPerAudioSample; - config.audio_sample_rate = SpeechRecognizer::kAudioSampleRate; - config.filter_profanities = false; - sr_engine->SetConfig(config); - - const int kTestingSessionId = 1; - const bool kOneShotMode = true; - recognizer_ = new SpeechRecognizer( - this, kTestingSessionId, kOneShotMode, sr_engine); - audio_manager_.reset(new media::MockAudioManager( - base::MessageLoop::current()->message_loop_proxy())); - recognizer_->SetAudioManagerForTests(audio_manager_.get()); - - int audio_packet_length_bytes = - (SpeechRecognizer::kAudioSampleRate * - GoogleOneShotRemoteEngine::kAudioPacketIntervalMs * - ChannelLayoutToChannelCount(SpeechRecognizer::kChannelLayout) * - SpeechRecognizer::kNumBitsPerAudioSample) / (8 * 1000); - audio_packet_.resize(audio_packet_length_bytes); - } - - void CheckEventsConsistency() { - // Note: "!x || y" == "x implies y". - EXPECT_TRUE(!recognition_ended_ || recognition_started_); - EXPECT_TRUE(!audio_ended_ || audio_started_); - EXPECT_TRUE(!sound_ended_ || sound_started_); - EXPECT_TRUE(!audio_started_ || recognition_started_); - EXPECT_TRUE(!sound_started_ || audio_started_); - EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_)); - EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_)); - } - - void CheckFinalEventsConsistency() { - // Note: "!(x ^ y)" == "(x && y) || (!x && !x)". - EXPECT_FALSE(recognition_started_ ^ recognition_ended_); - EXPECT_FALSE(audio_started_ ^ audio_ended_); - EXPECT_FALSE(sound_started_ ^ sound_ended_); - } - - // Overridden from SpeechRecognitionEventListener: - virtual void OnAudioStart(int session_id) OVERRIDE { - audio_started_ = true; - CheckEventsConsistency(); - } - - virtual void OnAudioEnd(int session_id) OVERRIDE { - audio_ended_ = true; - CheckEventsConsistency(); - } - - virtual void OnRecognitionResults( - int session_id, const SpeechRecognitionResults& results) OVERRIDE { - result_received_ = true; - } - - virtual void OnRecognitionError( - int session_id, const SpeechRecognitionError& error) OVERRIDE { - EXPECT_TRUE(recognition_started_); - EXPECT_FALSE(recognition_ended_); - error_ = error.code; - } - - virtual void OnAudioLevelsChange(int session_id, float volume, - float noise_volume) OVERRIDE { - volume_ = volume; - noise_volume_ = noise_volume; - } - - virtual void OnRecognitionEnd(int session_id) OVERRIDE { - recognition_ended_ = true; - CheckEventsConsistency(); - } - - virtual void OnRecognitionStart(int session_id) OVERRIDE { - recognition_started_ = true; - CheckEventsConsistency(); - } - - virtual void OnEnvironmentEstimationComplete(int session_id) OVERRIDE {} - - virtual void OnSoundStart(int session_id) OVERRIDE { - sound_started_ = true; - CheckEventsConsistency(); - } - - virtual void OnSoundEnd(int session_id) OVERRIDE { - sound_ended_ = true; - CheckEventsConsistency(); - } - - // testing::Test methods. - virtual void SetUp() OVERRIDE { - AudioInputController::set_factory_for_testing( - &audio_input_controller_factory_); - } - - virtual void TearDown() OVERRIDE { - AudioInputController::set_factory_for_testing(NULL); - } - - void FillPacketWithTestWaveform() { - // Fill the input with a simple pattern, a 125Hz sawtooth waveform. 
-    for (size_t i = 0; i < audio_packet_.size(); ++i)
-      audio_packet_[i] = static_cast<uint8>(i);
-  }
-
-  void FillPacketWithNoise() {
-    int value = 0;
-    int factor = 175;
-    for (size_t i = 0; i < audio_packet_.size(); ++i) {
-      value += factor;
-      audio_packet_[i] = value % 100;
-    }
-  }
-
- protected:
-  base::MessageLoopForIO message_loop_;
-  BrowserThreadImpl io_thread_;
-  scoped_refptr<SpeechRecognizer> recognizer_;
-  scoped_ptr<media::MockAudioManager> audio_manager_;
-  bool recognition_started_;
-  bool recognition_ended_;
-  bool result_received_;
-  bool audio_started_;
-  bool audio_ended_;
-  bool sound_started_;
-  bool sound_ended_;
-  SpeechRecognitionErrorCode error_;
-  net::TestURLFetcherFactory url_fetcher_factory_;
-  TestAudioInputControllerFactory audio_input_controller_factory_;
-  std::vector<uint8> audio_packet_;
-  float volume_;
-  float noise_volume_;
-};
-
-TEST_F(SpeechRecognizerTest, StopNoData) {
-  // Check for callbacks when stopping record before any audio gets recorded.
-  recognizer_->StartRecognition();
-  recognizer_->StopAudioCapture();
-  base::MessageLoop::current()->RunUntilIdle();
-  EXPECT_TRUE(recognition_started_);
-  EXPECT_FALSE(audio_started_);
-  EXPECT_FALSE(result_received_);
-  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
-  CheckFinalEventsConsistency();
-}
-
-TEST_F(SpeechRecognizerTest, CancelNoData) {
-  // Check for callbacks when canceling recognition before any audio gets
-  // recorded.
-  recognizer_->StartRecognition();
-  recognizer_->AbortRecognition();
-  base::MessageLoop::current()->RunUntilIdle();
-  EXPECT_TRUE(recognition_started_);
-  EXPECT_FALSE(audio_started_);
-  EXPECT_FALSE(result_received_);
-  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_);
-  CheckFinalEventsConsistency();
-}
-
-TEST_F(SpeechRecognizerTest, StopWithData) {
-  // Start recording, give some data and then stop. This should wait for the
-  // network callback to arrive before completion.
-  recognizer_->StartRecognition();
-  base::MessageLoop::current()->RunUntilIdle();
-  TestAudioInputController* controller =
-      audio_input_controller_factory_.controller();
-  ASSERT_TRUE(controller);
-
-  // Try sending 5 chunks of mock audio data and verify that each of them
-  // resulted immediately in a packet sent out via the network. This verifies
-  // that we are streaming out encoded data as chunks without waiting for the
-  // full recording to complete.
-  const size_t kNumChunks = 5;
-  for (size_t i = 0; i < kNumChunks; ++i) {
-    controller->event_handler()->OnData(controller, &audio_packet_[0],
-                                        audio_packet_.size());
-    base::MessageLoop::current()->RunUntilIdle();
-    net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
-    ASSERT_TRUE(fetcher);
-    EXPECT_EQ(i + 1, fetcher->upload_chunks().size());
-  }
-
-  recognizer_->StopAudioCapture();
-  base::MessageLoop::current()->RunUntilIdle();
-  EXPECT_TRUE(audio_started_);
-  EXPECT_TRUE(audio_ended_);
-  EXPECT_FALSE(recognition_ended_);
-  EXPECT_FALSE(result_received_);
-  EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
-
-  // Issue the network callback to complete the process.
- net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); - ASSERT_TRUE(fetcher); - - fetcher->set_url(fetcher->GetOriginalURL()); - net::URLRequestStatus status; - status.set_status(net::URLRequestStatus::SUCCESS); - fetcher->set_status(status); - fetcher->set_response_code(200); - fetcher->SetResponseString( - "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}"); - fetcher->delegate()->OnURLFetchComplete(fetcher); - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_TRUE(recognition_ended_); - EXPECT_TRUE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); - CheckFinalEventsConsistency(); -} - -TEST_F(SpeechRecognizerTest, CancelWithData) { - // Start recording, give some data and then cancel. - recognizer_->StartRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - base::MessageLoop::current()->RunUntilIdle(); - recognizer_->AbortRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); - EXPECT_TRUE(recognition_started_); - EXPECT_TRUE(audio_started_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_ABORTED, error_); - CheckFinalEventsConsistency(); -} - -TEST_F(SpeechRecognizerTest, ConnectionError) { - // Start recording, give some data and then stop. Issue the network callback - // with a connection error and verify that the recognizer bubbles the error up - recognizer_->StartRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - base::MessageLoop::current()->RunUntilIdle(); - net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); - ASSERT_TRUE(fetcher); - - recognizer_->StopAudioCapture(); - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_TRUE(audio_started_); - EXPECT_TRUE(audio_ended_); - EXPECT_FALSE(recognition_ended_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); - - // Issue the network callback to complete the process. - fetcher->set_url(fetcher->GetOriginalURL()); - net::URLRequestStatus status; - status.set_status(net::URLRequestStatus::FAILED); - status.set_error(net::ERR_CONNECTION_REFUSED); - fetcher->set_status(status); - fetcher->set_response_code(0); - fetcher->SetResponseString(std::string()); - fetcher->delegate()->OnURLFetchComplete(fetcher); - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_TRUE(recognition_ended_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); - CheckFinalEventsConsistency(); -} - -TEST_F(SpeechRecognizerTest, ServerError) { - // Start recording, give some data and then stop. 
Issue the network callback - // with a 500 error and verify that the recognizer bubbles the error up - recognizer_->StartRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - base::MessageLoop::current()->RunUntilIdle(); - net::TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); - ASSERT_TRUE(fetcher); - - recognizer_->StopAudioCapture(); - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_TRUE(audio_started_); - EXPECT_TRUE(audio_ended_); - EXPECT_FALSE(recognition_ended_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); - - // Issue the network callback to complete the process. - fetcher->set_url(fetcher->GetOriginalURL()); - net::URLRequestStatus status; - status.set_status(net::URLRequestStatus::SUCCESS); - fetcher->set_status(status); - fetcher->set_response_code(500); - fetcher->SetResponseString("Internal Server Error"); - fetcher->delegate()->OnURLFetchComplete(fetcher); - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_TRUE(recognition_ended_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_); - CheckFinalEventsConsistency(); -} - -TEST_F(SpeechRecognizerTest, AudioControllerErrorNoData) { - // Check if things tear down properly if AudioInputController threw an error. - recognizer_->StartRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnError(controller); - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_TRUE(recognition_started_); - EXPECT_FALSE(audio_started_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_); - CheckFinalEventsConsistency(); -} - -TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) { - // Check if things tear down properly if AudioInputController threw an error - // after giving some audio data. - recognizer_->StartRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - controller->event_handler()->OnError(controller); - base::MessageLoop::current()->RunUntilIdle(); - ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0)); - EXPECT_TRUE(recognition_started_); - EXPECT_TRUE(audio_started_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_AUDIO, error_); - CheckFinalEventsConsistency(); -} - -TEST_F(SpeechRecognizerTest, NoSpeechCallbackIssued) { - // Start recording and give a lot of packets with audio samples set to zero. - // This should trigger the no-speech detector and issue a callback. - recognizer_->StartRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - - int num_packets = (SpeechRecognizer::kNoSpeechTimeoutMs) / - GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1; - // The vector is already filled with zero value samples on create. 
- for (int i = 0; i < num_packets; ++i) { - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - } - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_TRUE(recognition_started_); - EXPECT_TRUE(audio_started_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_); - CheckFinalEventsConsistency(); -} - -TEST_F(SpeechRecognizerTest, NoSpeechCallbackNotIssued) { - // Start recording and give a lot of packets with audio samples set to zero - // and then some more with reasonably loud audio samples. This should be - // treated as normal speech input and the no-speech detector should not get - // triggered. - recognizer_->StartRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller = audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - - int num_packets = (SpeechRecognizer::kNoSpeechTimeoutMs) / - GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; - - // The vector is already filled with zero value samples on create. - for (int i = 0; i < num_packets / 2; ++i) { - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - } - - FillPacketWithTestWaveform(); - for (int i = 0; i < num_packets / 2; ++i) { - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - } - - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); - EXPECT_TRUE(audio_started_); - EXPECT_FALSE(audio_ended_); - EXPECT_FALSE(recognition_ended_); - recognizer_->AbortRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - CheckFinalEventsConsistency(); -} - -TEST_F(SpeechRecognizerTest, SetInputVolumeCallback) { - // Start recording and give a lot of packets with audio samples set to zero - // and then some more with reasonably loud audio samples. Check that we don't - // get the callback during estimation phase, then get zero for the silence - // samples and proper volume for the loud audio. - recognizer_->StartRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller = audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - - // Feed some samples to begin with for the endpointer to do noise estimation. - int num_packets = SpeechRecognizer::kEndpointerEstimationTimeMs / - GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; - FillPacketWithNoise(); - for (int i = 0; i < num_packets; ++i) { - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - } - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_EQ(-1.0f, volume_); // No audio volume set yet. - - // The vector is already filled with zero value samples on create. 
- controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_FLOAT_EQ(0.74939233f, volume_); - - FillPacketWithTestWaveform(); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - base::MessageLoop::current()->RunUntilIdle(); - EXPECT_FLOAT_EQ(0.89926866f, volume_); - EXPECT_FLOAT_EQ(0.75071919f, noise_volume_); - - EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_); - EXPECT_FALSE(audio_ended_); - EXPECT_FALSE(recognition_ended_); - recognizer_->AbortRecognition(); - base::MessageLoop::current()->RunUntilIdle(); - CheckFinalEventsConsistency(); -} - -} // namespace content -- cgit v1.1