diff options
author | janx@chromium.org <janx@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-20 13:35:43 +0000 |
---|---|---|
committer | janx@chromium.org <janx@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-20 13:35:43 +0000 |
commit | ce1adc3482a91262df06fbe9a824d29817a8771d (patch) | |
tree | 28dcc4dc66366821d7597fcbe6490d091f5ddfc9 /content/browser/speech | |
parent | a3f9516f938d3aa22914cdddfb7fe71f097981a4 (diff) | |
download | chromium_src-ce1adc3482a91262df06fbe9a824d29817a8771d.zip chromium_src-ce1adc3482a91262df06fbe9a824d29817a8771d.tar.gz chromium_src-ce1adc3482a91262df06fbe9a824d29817a8771d.tar.bz2 |
Extract interface from content::SpeechRecognizer
SpeechRecognizer's current design assumes that audio capture and endpointing
are always performed inside the browser. This is not going to be true for some
platforms, for instance Android, where we plan to delegate not only the
recognition activity, but also audio capture and endpointing, to the OS.
To allow that, SpeechRecognizer becomes an abstract interface and the existing
browser-side implementation moves to SpeechRecognizerImpl.
TBR=avi@chromium.org (gypi)
BUG=222352
Review URL: https://chromiumcodereview.appspot.com/15230003
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@201082 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content/browser/speech')
-rw-r--r-- | content/browser/speech/speech_recognition_manager_impl.cc | 11 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer.h | 142 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl.cc (renamed from content/browser/speech/speech_recognizer.cc) | 142 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl.h | 156 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl_unittest.cc (renamed from content/browser/speech/speech_recognizer_unittest.cc) | 55 |
5 files changed, 271 insertions, 235 deletions
diff --git a/content/browser/speech/speech_recognition_manager_impl.cc b/content/browser/speech/speech_recognition_manager_impl.cc index ea49579..f1dca14 100644 --- a/content/browser/speech/speech_recognition_manager_impl.cc +++ b/content/browser/speech/speech_recognition_manager_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -10,7 +10,7 @@ #include "content/browser/speech/google_one_shot_remote_engine.h" #include "content/browser/speech/google_streaming_remote_engine.h" #include "content/browser/speech/speech_recognition_engine.h" -#include "content/browser/speech/speech_recognizer.h" +#include "content/browser/speech/speech_recognizer_impl.h" #include "content/public/browser/browser_thread.h" #include "content/public/browser/content_browser_client.h" #include "content/public/browser/resource_context.h" @@ -92,9 +92,10 @@ int SpeechRecognitionManagerImpl::CreateSession( SpeechRecognitionEngineConfig remote_engine_config; remote_engine_config.language = config.language; remote_engine_config.grammars = config.grammars; - remote_engine_config.audio_sample_rate = SpeechRecognizer::kAudioSampleRate; + remote_engine_config.audio_sample_rate = + SpeechRecognizerImpl::kAudioSampleRate; remote_engine_config.audio_num_bits_per_sample = - SpeechRecognizer::kNumBitsPerAudioSample; + SpeechRecognizerImpl::kNumBitsPerAudioSample; remote_engine_config.filter_profanities = config.filter_profanities; remote_engine_config.continuous = config.continuous; remote_engine_config.interim_results = config.interim_results; @@ -117,7 +118,7 @@ int SpeechRecognitionManagerImpl::CreateSession( // The legacy api cannot use continuous mode. 
DCHECK(!config.is_legacy_api || !config.continuous); - session.recognizer = new SpeechRecognizer( + session.recognizer = new SpeechRecognizerImpl( this, session_id, !config.continuous, diff --git a/content/browser/speech/speech_recognizer.h b/content/browser/speech/speech_recognizer.h index 12da905..bb8fd97 100644 --- a/content/browser/speech/speech_recognizer.h +++ b/content/browser/speech/speech_recognizer.h @@ -1,155 +1,37 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ -#include "base/basictypes.h" #include "base/memory/ref_counted.h" -#include "base/memory/scoped_ptr.h" -#include "content/browser/speech/endpointer/endpointer.h" -#include "content/browser/speech/speech_recognition_engine.h" -#include "content/public/common/speech_recognition_error.h" -#include "content/public/common/speech_recognition_result.h" -#include "media/audio/audio_input_controller.h" -#include "net/url_request/url_request_context_getter.h" - -namespace media { -class AudioManager; -} namespace content { class SpeechRecognitionEventListener; -// Handles speech recognition for a session (identified by |session_id|), taking -// care of audio capture, silence detection/endpointer and interaction with the -// SpeechRecognitionEngine. +// Handles speech recognition for a session (identified by |session_id|). 
class CONTENT_EXPORT SpeechRecognizer - : public base::RefCountedThreadSafe<SpeechRecognizer>, - public media::AudioInputController::EventHandler, - public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) { + : public base::RefCountedThreadSafe<SpeechRecognizer> { public: - static const int kAudioSampleRate; - static const media::ChannelLayout kChannelLayout; - static const int kNumBitsPerAudioSample; - static const int kNoSpeechTimeoutMs; - static const int kEndpointerEstimationTimeMs; - - static void SetAudioManagerForTests(media::AudioManager* audio_manager); - SpeechRecognizer(SpeechRecognitionEventListener* listener, - int session_id, - bool is_single_shot, - SpeechRecognitionEngine* engine); + SpeechRecognizer(SpeechRecognitionEventListener* listener, int session_id) + : listener_(listener), session_id_(session_id) {} - void StartRecognition(); - void AbortRecognition(); - void StopAudioCapture(); - bool IsActive() const; - bool IsCapturingAudio() const; - const SpeechRecognitionEngine& recognition_engine() const; + virtual void StartRecognition() = 0; + virtual void AbortRecognition() = 0; + virtual void StopAudioCapture() = 0; + virtual bool IsActive() const = 0; + virtual bool IsCapturingAudio() const = 0; - private: + protected: friend class base::RefCountedThreadSafe<SpeechRecognizer>; - friend class SpeechRecognizerTest; - - enum FSMState { - STATE_IDLE = 0, - STATE_STARTING, - STATE_ESTIMATING_ENVIRONMENT, - STATE_WAITING_FOR_SPEECH, - STATE_RECOGNIZING, - STATE_WAITING_FINAL_RESULT, - STATE_MAX_VALUE = STATE_WAITING_FINAL_RESULT - }; - - enum FSMEvent { - EVENT_ABORT = 0, - EVENT_START, - EVENT_STOP_CAPTURE, - EVENT_AUDIO_DATA, - EVENT_ENGINE_RESULT, - EVENT_ENGINE_ERROR, - EVENT_AUDIO_ERROR, - EVENT_MAX_VALUE = EVENT_AUDIO_ERROR - }; - - struct FSMEventArgs { - explicit FSMEventArgs(FSMEvent event_value); - ~FSMEventArgs(); - - FSMEvent event; - scoped_refptr<AudioChunk> audio_data; - SpeechRecognitionResults engine_results; - 
SpeechRecognitionError engine_error; - }; - - virtual ~SpeechRecognizer(); - - // Entry point for pushing any new external event into the recognizer FSM. - void DispatchEvent(const FSMEventArgs& event_args); - - // Defines the behavior of the recognizer FSM, selecting the appropriate - // transition according to the current state and event. - FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args); - - // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc). - void ProcessAudioPipeline(const AudioChunk& raw_audio); - - // The methods below handle transitions of the recognizer FSM. - FSMState StartRecording(const FSMEventArgs& event_args); - FSMState StartRecognitionEngine(const FSMEventArgs& event_args); - FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args); - FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args); - FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args); - FSMState ProcessIntermediateResult(const FSMEventArgs& event_args); - FSMState ProcessFinalResult(const FSMEventArgs& event_args); - FSMState AbortSilently(const FSMEventArgs& event_args); - FSMState AbortWithError(const FSMEventArgs& event_args); - FSMState Abort(const SpeechRecognitionError& error); - FSMState DetectEndOfSpeech(const FSMEventArgs& event_args); - FSMState DoNothing(const FSMEventArgs& event_args) const; - FSMState NotFeasible(const FSMEventArgs& event_args); - - // Returns the time span of captured audio samples since the start of capture. - int GetElapsedTimeMs() const; - - // Calculates the input volume to be displayed in the UI, triggering the - // OnAudioLevelsChange event accordingly. - void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected); - - void CloseAudioControllerAsynchronously(); - - // Callback called on IO thread by audio_controller->Close(). - void OnAudioClosed(media::AudioInputController*); - - // AudioInputController::EventHandler methods. 
- virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {} - virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {} - virtual void OnError(media::AudioInputController* controller) OVERRIDE; - virtual void OnData(media::AudioInputController* controller, - const uint8* data, uint32 size) OVERRIDE; - - // SpeechRecognitionEngineDelegate methods. - virtual void OnSpeechRecognitionEngineResults( - const SpeechRecognitionResults& results) OVERRIDE; - virtual void OnSpeechRecognitionEngineError( - const SpeechRecognitionError& error) OVERRIDE; - static media::AudioManager* audio_manager_for_tests_; + virtual ~SpeechRecognizer() {} SpeechRecognitionEventListener* listener_; - scoped_ptr<SpeechRecognitionEngine> recognition_engine_; - Endpointer endpointer_; - scoped_refptr<media::AudioInputController> audio_controller_; int session_id_; - int num_samples_recorded_; - float audio_level_; - bool is_dispatching_event_; - bool is_single_shot_; - FSMState state_; DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer); }; diff --git a/content/browser/speech/speech_recognizer.cc b/content/browser/speech/speech_recognizer_impl.cc index 62c1b35..d207ba4 100644 --- a/content/browser/speech/speech_recognizer.cc +++ b/content/browser/speech/speech_recognizer_impl.cc @@ -1,8 +1,8 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
-#include "content/browser/speech/speech_recognizer.h" +#include "content/browser/speech/speech_recognizer_impl.h" #include "base/basictypes.h" #include "base/bind.h" @@ -12,9 +12,6 @@ #include "content/browser/speech/google_one_shot_remote_engine.h" #include "content/public/browser/browser_thread.h" #include "content/public/browser/speech_recognition_event_listener.h" -#include "content/public/common/speech_recognition_error.h" -#include "content/public/common/speech_recognition_grammar.h" -#include "content/public/common/speech_recognition_result.h" #include "net/url_request/url_request_context_getter.h" using media::AudioInputController; @@ -62,26 +59,25 @@ void KeepAudioControllerRefcountedForDtor(scoped_refptr<AudioInputController>) { } // namespace -const int SpeechRecognizer::kAudioSampleRate = 16000; -const ChannelLayout SpeechRecognizer::kChannelLayout = +const int SpeechRecognizerImpl::kAudioSampleRate = 16000; +const ChannelLayout SpeechRecognizerImpl::kChannelLayout = media::CHANNEL_LAYOUT_MONO; -const int SpeechRecognizer::kNumBitsPerAudioSample = 16; -const int SpeechRecognizer::kNoSpeechTimeoutMs = 8000; -const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; -media::AudioManager* SpeechRecognizer::audio_manager_for_tests_ = NULL; +const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16; +const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000; +const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300; +media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL; -COMPILE_ASSERT(SpeechRecognizer::kNumBitsPerAudioSample % 8 == 0, +COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, kNumBitsPerAudioSample_must_be_a_multiple_of_8); -SpeechRecognizer::SpeechRecognizer( +SpeechRecognizerImpl::SpeechRecognizerImpl( SpeechRecognitionEventListener* listener, int session_id, bool is_single_shot, SpeechRecognitionEngine* engine) - : listener_(listener), + : SpeechRecognizer(listener, session_id), 
recognition_engine_(engine), endpointer_(kAudioSampleRate), - session_id_(session_id), is_dispatching_event_(false), is_single_shot_(is_single_shot), state_(STATE_IDLE) { @@ -114,32 +110,32 @@ SpeechRecognizer::SpeechRecognizer( // of causality between events and avoid interleaved event processing due to // synchronous callbacks. -void SpeechRecognizer::StartRecognition() { +void SpeechRecognizerImpl::StartRecognition() { BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, FSMEventArgs(EVENT_START))); } -void SpeechRecognizer::AbortRecognition() { +void SpeechRecognizerImpl::AbortRecognition() { BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, FSMEventArgs(EVENT_ABORT))); } -void SpeechRecognizer::StopAudioCapture() { +void SpeechRecognizerImpl::StopAudioCapture() { BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, FSMEventArgs(EVENT_STOP_CAPTURE))); } -bool SpeechRecognizer::IsActive() const { +bool SpeechRecognizerImpl::IsActive() const { // Checking the FSM state from another thread (thus, while the FSM is // potentially concurrently evolving) is meaningless. DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); return state_ != STATE_IDLE; } -bool SpeechRecognizer::IsCapturingAudio() const { +bool SpeechRecognizerImpl::IsCapturingAudio() const { DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive(). 
const bool is_capturing_audio = state_ >= STATE_STARTING && state_ <= STATE_RECOGNIZING; @@ -149,11 +145,11 @@ bool SpeechRecognizer::IsCapturingAudio() const { } const SpeechRecognitionEngine& -SpeechRecognizer::recognition_engine() const { +SpeechRecognizerImpl::recognition_engine() const { return *(recognition_engine_.get()); } -SpeechRecognizer::~SpeechRecognizer() { +SpeechRecognizerImpl::~SpeechRecognizerImpl() { endpointer_.EndSession(); if (audio_controller_) { audio_controller_->Close(base::Bind(&KeepAudioControllerRefcountedForDtor, @@ -162,14 +158,14 @@ SpeechRecognizer::~SpeechRecognizer() { } // Invoked in the audio thread. -void SpeechRecognizer::OnError(AudioInputController* controller) { +void SpeechRecognizerImpl::OnError(AudioInputController* controller) { FSMEventArgs event_args(EVENT_AUDIO_ERROR); BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); } -void SpeechRecognizer::OnData(AudioInputController* controller, +void SpeechRecognizerImpl::OnData(AudioInputController* controller, const uint8* data, uint32 size) { if (size == 0) // This could happen when audio capture stops and is normal. 
return; @@ -178,27 +174,27 @@ void SpeechRecognizer::OnData(AudioInputController* controller, event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size), kNumBitsPerAudioSample / 8); BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); } -void SpeechRecognizer::OnAudioClosed(AudioInputController*) {} +void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {} -void SpeechRecognizer::OnSpeechRecognitionEngineResults( +void SpeechRecognizerImpl::OnSpeechRecognitionEngineResults( const SpeechRecognitionResults& results) { FSMEventArgs event_args(EVENT_ENGINE_RESULT); event_args.engine_results = results; BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); } -void SpeechRecognizer::OnSpeechRecognitionEngineError( +void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( const SpeechRecognitionError& error) { FSMEventArgs event_args(EVENT_ENGINE_ERROR); event_args.engine_error = error; BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - base::Bind(&SpeechRecognizer::DispatchEvent, + base::Bind(&SpeechRecognizerImpl::DispatchEvent, this, event_args)); } @@ -214,7 +210,7 @@ void SpeechRecognizer::OnSpeechRecognitionEngineError( // TestAudioInputController is not closing asynchronously as the real controller // does, but they will become flaky if TestAudioInputController will be fixed. 
-void SpeechRecognizer::DispatchEvent(const FSMEventArgs& event_args) { +void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) { DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); DCHECK_LE(event_args.event, EVENT_MAX_VALUE); DCHECK_LE(state_, STATE_MAX_VALUE); @@ -225,7 +221,7 @@ void SpeechRecognizer::DispatchEvent(const FSMEventArgs& event_args) { is_dispatching_event_ = true; // Guard against the delegate freeing us until we finish processing the event. - scoped_refptr<SpeechRecognizer> me(this); + scoped_refptr<SpeechRecognizerImpl> me(this); if (event_args.event == EVENT_AUDIO_DATA) { DCHECK(event_args.audio_data.get() != NULL); @@ -238,8 +234,8 @@ void SpeechRecognizer::DispatchEvent(const FSMEventArgs& event_args) { is_dispatching_event_ = false; } -SpeechRecognizer::FSMState -SpeechRecognizer::ExecuteTransitionAndGetNextState( +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( const FSMEventArgs& event_args) { const FSMEvent event = event_args.event; switch (state_) { @@ -358,7 +354,7 @@ SpeechRecognizer::ExecuteTransitionAndGetNextState( // TODO(primiano): the audio pipeline is currently serial. However, the // clipper->endpointer->vumeter chain and the sr_engine could be parallelized. // We should profile the execution to see if it would be worth or not. 
-void SpeechRecognizer::ProcessAudioPipeline(const AudioChunk& raw_audio) { +void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING; const bool route_to_sr_engine = route_to_endpointer; @@ -382,8 +378,8 @@ void SpeechRecognizer::ProcessAudioPipeline(const AudioChunk& raw_audio) { } } -SpeechRecognizer::FSMState -SpeechRecognizer::StartRecording(const FSMEventArgs&) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) { DCHECK(recognition_engine_.get() != NULL); DCHECK(!IsCapturingAudio()); AudioManager* audio_manager = (audio_manager_for_tests_ != NULL) ? @@ -391,7 +387,7 @@ SpeechRecognizer::StartRecording(const FSMEventArgs&) { BrowserMainLoop::GetAudioManager(); DCHECK(audio_manager != NULL); - DVLOG(1) << "SpeechRecognizer starting audio capture."; + DVLOG(1) << "SpeechRecognizerImpl starting audio capture."; num_samples_recorded_ = 0; audio_level_ = 0; listener_->OnRecognitionStart(session_id_); @@ -426,8 +422,8 @@ SpeechRecognizer::StartRecording(const FSMEventArgs&) { return STATE_STARTING; } -SpeechRecognizer::FSMState -SpeechRecognizer::StartRecognitionEngine(const FSMEventArgs& event_args) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) { // This is the first audio packet captured, so the recognition engine is // started and the delegate notified about the event. 
DCHECK(recognition_engine_.get() != NULL); @@ -441,8 +437,8 @@ SpeechRecognizer::StartRecognitionEngine(const FSMEventArgs& event_args) { return STATE_ESTIMATING_ENVIRONMENT; } -SpeechRecognizer::FSMState -SpeechRecognizer::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { DCHECK(endpointer_.IsEstimatingEnvironment()); if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) { endpointer_.SetUserInputMode(); @@ -453,8 +449,8 @@ SpeechRecognizer::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) { } } -SpeechRecognizer::FSMState -SpeechRecognizer::DetectUserSpeechOrTimeout(const FSMEventArgs&) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { if (endpointer_.DidStartReceivingSpeech()) { listener_->OnSoundStart(session_id_); return STATE_RECOGNIZING; @@ -464,15 +460,15 @@ SpeechRecognizer::DetectUserSpeechOrTimeout(const FSMEventArgs&) { return STATE_WAITING_FOR_SPEECH; } -SpeechRecognizer::FSMState -SpeechRecognizer::DetectEndOfSpeech(const FSMEventArgs& event_args) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { if (endpointer_.speech_input_complete()) return StopCaptureAndWaitForResult(event_args); return STATE_RECOGNIZING; } -SpeechRecognizer::FSMState -SpeechRecognizer::StopCaptureAndWaitForResult(const FSMEventArgs&) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) { DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING); DVLOG(1) << "Concluding recognition"; @@ -486,15 +482,15 @@ SpeechRecognizer::StopCaptureAndWaitForResult(const FSMEventArgs&) { return STATE_WAITING_FINAL_RESULT; } -SpeechRecognizer::FSMState -SpeechRecognizer::AbortSilently(const FSMEventArgs& event_args) { +SpeechRecognizerImpl::FSMState 
+SpeechRecognizerImpl::AbortSilently(const FSMEventArgs& event_args) { DCHECK_NE(event_args.event, EVENT_AUDIO_ERROR); DCHECK_NE(event_args.event, EVENT_ENGINE_ERROR); return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_NONE)); } -SpeechRecognizer::FSMState -SpeechRecognizer::AbortWithError(const FSMEventArgs& event_args) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::AbortWithError(const FSMEventArgs& event_args) { if (event_args.event == EVENT_AUDIO_ERROR) { return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO)); } else if (event_args.event == EVENT_ENGINE_ERROR) { @@ -503,12 +499,12 @@ SpeechRecognizer::AbortWithError(const FSMEventArgs& event_args) { return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_ABORTED)); } -SpeechRecognizer::FSMState SpeechRecognizer::Abort( +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( const SpeechRecognitionError& error) { if (IsCapturingAudio()) CloseAudioControllerAsynchronously(); - DVLOG(1) << "SpeechRecognizer canceling recognition. "; + DVLOG(1) << "SpeechRecognizerImpl canceling recognition. "; // The recognition engine is initialized only after STATE_STARTING. if (state_ > STATE_STARTING) { @@ -530,7 +526,7 @@ SpeechRecognizer::FSMState SpeechRecognizer::Abort( return STATE_IDLE; } -SpeechRecognizer::FSMState SpeechRecognizer::ProcessIntermediateResult( +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult( const FSMEventArgs& event_args) { // Provisional results can occur only during continuous (non one-shot) mode. 
// If this check is reached it means that a continuous speech recognition @@ -556,8 +552,8 @@ SpeechRecognizer::FSMState SpeechRecognizer::ProcessIntermediateResult( return STATE_RECOGNIZING; } -SpeechRecognizer::FSMState -SpeechRecognizer::ProcessFinalResult(const FSMEventArgs& event_args) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { const SpeechRecognitionResults& results = event_args.engine_results; SpeechRecognitionResults::const_iterator i = results.begin(); bool provisional_results_pending = false; @@ -599,35 +595,35 @@ SpeechRecognizer::ProcessFinalResult(const FSMEventArgs& event_args) { return STATE_IDLE; } -SpeechRecognizer::FSMState -SpeechRecognizer::DoNothing(const FSMEventArgs&) const { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const { return state_; // Just keep the current state. } -SpeechRecognizer::FSMState -SpeechRecognizer::NotFeasible(const FSMEventArgs& event_args) { +SpeechRecognizerImpl::FSMState +SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { NOTREACHED() << "Unfeasible event " << event_args.event << " in state " << state_; return state_; } -void SpeechRecognizer::CloseAudioControllerAsynchronously() { +void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { DCHECK(IsCapturingAudio()); - DVLOG(1) << "SpeechRecognizer closing audio controller."; + DVLOG(1) << "SpeechRecognizerImpl closing audio controller."; // Issues a Close on the audio controller, passing an empty callback. The only // purpose of such callback is to keep the audio controller refcounted until // Close has completed (in the audio thread) and automatically destroy it // afterwards (upon return from OnAudioClosed). 
- audio_controller_->Close(base::Bind(&SpeechRecognizer::OnAudioClosed, + audio_controller_->Close(base::Bind(&SpeechRecognizerImpl::OnAudioClosed, this, audio_controller_)); audio_controller_ = NULL; // The controller is still refcounted by Bind. } -int SpeechRecognizer::GetElapsedTimeMs() const { +int SpeechRecognizerImpl::GetElapsedTimeMs() const { return (num_samples_recorded_ * 1000) / kAudioSampleRate; } -void SpeechRecognizer::UpdateSignalAndNoiseLevels(const float& rms, +void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected) { // Calculate the input volume to display in the UI, smoothing towards the // new level. @@ -649,18 +645,18 @@ void SpeechRecognizer::UpdateSignalAndNoiseLevels(const float& rms, session_id_, clip_detected ? 1.0f : audio_level_, noise_level); } -void SpeechRecognizer::SetAudioManagerForTests( +void SpeechRecognizerImpl::SetAudioManagerForTests( AudioManager* audio_manager) { audio_manager_for_tests_ = audio_manager; } -SpeechRecognizer::FSMEventArgs::FSMEventArgs(FSMEvent event_value) +SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value) : event(event_value), audio_data(NULL), engine_error(SPEECH_RECOGNITION_ERROR_NONE) { } -SpeechRecognizer::FSMEventArgs::~FSMEventArgs() { +SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() { } } // namespace content diff --git a/content/browser/speech/speech_recognizer_impl.h b/content/browser/speech/speech_recognizer_impl.h new file mode 100644 index 0000000..2397716 --- /dev/null +++ b/content/browser/speech/speech_recognizer_impl.h @@ -0,0 +1,156 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ +#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ + +#include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "content/browser/speech/endpointer/endpointer.h" +#include "content/browser/speech/speech_recognition_engine.h" +#include "content/browser/speech/speech_recognizer.h" +#include "content/public/common/speech_recognition_error.h" +#include "content/public/common/speech_recognition_result.h" +#include "media/audio/audio_input_controller.h" +#include "net/url_request/url_request_context_getter.h" + +namespace media { +class AudioManager; +} + +namespace content { + +class SpeechRecognitionEventListener; + +// Handles speech recognition for a session (identified by |session_id|), taking +// care of audio capture, silence detection/endpointer and interaction with the +// SpeechRecognitionEngine. +class CONTENT_EXPORT SpeechRecognizerImpl + : public SpeechRecognizer, + public media::AudioInputController::EventHandler, + public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) { + public: + static const int kAudioSampleRate; + static const media::ChannelLayout kChannelLayout; + static const int kNumBitsPerAudioSample; + static const int kNoSpeechTimeoutMs; + static const int kEndpointerEstimationTimeMs; + + static void SetAudioManagerForTests(media::AudioManager* audio_manager); + + SpeechRecognizerImpl(SpeechRecognitionEventListener* listener, + int session_id, + bool is_single_shot, + SpeechRecognitionEngine* engine); + + virtual void StartRecognition() OVERRIDE; + virtual void AbortRecognition() OVERRIDE; + virtual void StopAudioCapture() OVERRIDE; + virtual bool IsActive() const OVERRIDE; + virtual bool IsCapturingAudio() const OVERRIDE; + const SpeechRecognitionEngine& recognition_engine() const; + + private: + friend class SpeechRecognizerTest; + + enum FSMState { + STATE_IDLE = 0, + STATE_STARTING, + STATE_ESTIMATING_ENVIRONMENT, + STATE_WAITING_FOR_SPEECH, + 
STATE_RECOGNIZING, + STATE_WAITING_FINAL_RESULT, + STATE_MAX_VALUE = STATE_WAITING_FINAL_RESULT + }; + + enum FSMEvent { + EVENT_ABORT = 0, + EVENT_START, + EVENT_STOP_CAPTURE, + EVENT_AUDIO_DATA, + EVENT_ENGINE_RESULT, + EVENT_ENGINE_ERROR, + EVENT_AUDIO_ERROR, + EVENT_MAX_VALUE = EVENT_AUDIO_ERROR + }; + + struct FSMEventArgs { + explicit FSMEventArgs(FSMEvent event_value); + ~FSMEventArgs(); + + FSMEvent event; + scoped_refptr<AudioChunk> audio_data; + SpeechRecognitionResults engine_results; + SpeechRecognitionError engine_error; + }; + + virtual ~SpeechRecognizerImpl(); + + // Entry point for pushing any new external event into the recognizer FSM. + void DispatchEvent(const FSMEventArgs& event_args); + + // Defines the behavior of the recognizer FSM, selecting the appropriate + // transition according to the current state and event. + FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args); + + // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc). + void ProcessAudioPipeline(const AudioChunk& raw_audio); + + // The methods below handle transitions of the recognizer FSM. 
+ FSMState StartRecording(const FSMEventArgs& event_args); + FSMState StartRecognitionEngine(const FSMEventArgs& event_args); + FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args); + FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args); + FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args); + FSMState ProcessIntermediateResult(const FSMEventArgs& event_args); + FSMState ProcessFinalResult(const FSMEventArgs& event_args); + FSMState AbortSilently(const FSMEventArgs& event_args); + FSMState AbortWithError(const FSMEventArgs& event_args); + FSMState Abort(const SpeechRecognitionError& error); + FSMState DetectEndOfSpeech(const FSMEventArgs& event_args); + FSMState DoNothing(const FSMEventArgs& event_args) const; + FSMState NotFeasible(const FSMEventArgs& event_args); + + // Returns the time span of captured audio samples since the start of capture. + int GetElapsedTimeMs() const; + + // Calculates the input volume to be displayed in the UI, triggering the + // OnAudioLevelsChange event accordingly. + void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected); + + void CloseAudioControllerAsynchronously(); + + // Callback called on IO thread by audio_controller->Close(). + void OnAudioClosed(media::AudioInputController*); + + // AudioInputController::EventHandler methods. + virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {} + virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {} + virtual void OnError(media::AudioInputController* controller) OVERRIDE; + virtual void OnData(media::AudioInputController* controller, + const uint8* data, uint32 size) OVERRIDE; + + // SpeechRecognitionEngineDelegate methods. 
+ virtual void OnSpeechRecognitionEngineResults( + const SpeechRecognitionResults& results) OVERRIDE; + virtual void OnSpeechRecognitionEngineError( + const SpeechRecognitionError& error) OVERRIDE; + + static media::AudioManager* audio_manager_for_tests_; + + scoped_ptr<SpeechRecognitionEngine> recognition_engine_; + Endpointer endpointer_; + scoped_refptr<media::AudioInputController> audio_controller_; + int num_samples_recorded_; + float audio_level_; + bool is_dispatching_event_; + bool is_single_shot_; + FSMState state_; + + DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl); +}; + +} // namespace content + +#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ diff --git a/content/browser/speech/speech_recognizer_unittest.cc b/content/browser/speech/speech_recognizer_impl_unittest.cc index 9b55ec5..8c7c2d7 100644 --- a/content/browser/speech/speech_recognizer_unittest.cc +++ b/content/browser/speech/speech_recognizer_impl_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -6,11 +6,11 @@ #include "content/browser/browser_thread_impl.h" #include "content/browser/speech/google_one_shot_remote_engine.h" -#include "content/browser/speech/speech_recognizer.h" +#include "content/browser/speech/speech_recognizer_impl.h" #include "content/public/browser/speech_recognition_event_listener.h" -#include "media/audio/mock_audio_manager.h" #include "media/audio/fake_audio_input_stream.h" #include "media/audio/fake_audio_output_stream.h" +#include "media/audio/mock_audio_manager.h" #include "media/audio/test_audio_input_controller_factory.h" #include "net/base/net_errors.h" #include "net/url_request/test_url_fetcher_factory.h" @@ -28,10 +28,10 @@ using media::TestAudioInputControllerFactory; namespace content { -class SpeechRecognizerTest : public SpeechRecognitionEventListener, - public testing::Test { +class SpeechRecognizerImplTest : public SpeechRecognitionEventListener, + public testing::Test { public: - SpeechRecognizerTest() + SpeechRecognizerImplTest() : io_thread_(BrowserThread::IO, &message_loop_), recognition_started_(false), recognition_ended_(false), @@ -46,24 +46,25 @@ class SpeechRecognizerTest : public SpeechRecognitionEventListener, SpeechRecognitionEngine* sr_engine = new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */); SpeechRecognitionEngineConfig config; - config.audio_num_bits_per_sample = SpeechRecognizer::kNumBitsPerAudioSample; - config.audio_sample_rate = SpeechRecognizer::kAudioSampleRate; + config.audio_num_bits_per_sample = + SpeechRecognizerImpl::kNumBitsPerAudioSample; + config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate; config.filter_profanities = false; sr_engine->SetConfig(config); const int kTestingSessionId = 1; const bool kOneShotMode = true; - recognizer_ = new SpeechRecognizer( + recognizer_ = new SpeechRecognizerImpl( this, kTestingSessionId, kOneShotMode, sr_engine); audio_manager_.reset(new media::MockAudioManager( base::MessageLoop::current()->message_loop_proxy())); 
recognizer_->SetAudioManagerForTests(audio_manager_.get()); int audio_packet_length_bytes = - (SpeechRecognizer::kAudioSampleRate * + (SpeechRecognizerImpl::kAudioSampleRate * GoogleOneShotRemoteEngine::kAudioPacketIntervalMs * - ChannelLayoutToChannelCount(SpeechRecognizer::kChannelLayout) * - SpeechRecognizer::kNumBitsPerAudioSample) / (8 * 1000); + ChannelLayoutToChannelCount(SpeechRecognizerImpl::kChannelLayout) * + SpeechRecognizerImpl::kNumBitsPerAudioSample) / (8 * 1000); audio_packet_.resize(audio_packet_length_bytes); } @@ -164,7 +165,7 @@ class SpeechRecognizerTest : public SpeechRecognitionEventListener, protected: base::MessageLoopForIO message_loop_; BrowserThreadImpl io_thread_; - scoped_refptr<SpeechRecognizer> recognizer_; + scoped_refptr<SpeechRecognizerImpl> recognizer_; scoped_ptr<AudioManager> audio_manager_; bool recognition_started_; bool recognition_ended_; @@ -181,7 +182,7 @@ class SpeechRecognizerTest : public SpeechRecognitionEventListener, float noise_volume_; }; -TEST_F(SpeechRecognizerTest, StopNoData) { +TEST_F(SpeechRecognizerImplTest, StopNoData) { // Check for callbacks when stopping record before any audio gets recorded. recognizer_->StartRecognition(); recognizer_->StopAudioCapture(); @@ -193,7 +194,7 @@ TEST_F(SpeechRecognizerTest, StopNoData) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, CancelNoData) { +TEST_F(SpeechRecognizerImplTest, CancelNoData) { // Check for callbacks when canceling recognition before any audio gets // recorded. recognizer_->StartRecognition(); @@ -206,7 +207,7 @@ TEST_F(SpeechRecognizerTest, CancelNoData) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, StopWithData) { +TEST_F(SpeechRecognizerImplTest, StopWithData) { // Start recording, give some data and then stop. This should wait for the // network callback to arrive before completion. 
recognizer_->StartRecognition(); @@ -256,7 +257,7 @@ TEST_F(SpeechRecognizerTest, StopWithData) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, CancelWithData) { +TEST_F(SpeechRecognizerImplTest, CancelWithData) { // Start recording, give some data and then cancel. recognizer_->StartRecognition(); base::MessageLoop::current()->RunUntilIdle(); @@ -276,7 +277,7 @@ TEST_F(SpeechRecognizerTest, CancelWithData) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, ConnectionError) { +TEST_F(SpeechRecognizerImplTest, ConnectionError) { // Start recording, give some data and then stop. Issue the network callback // with a connection error and verify that the recognizer bubbles the error up recognizer_->StartRecognition(); @@ -314,7 +315,7 @@ TEST_F(SpeechRecognizerTest, ConnectionError) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, ServerError) { +TEST_F(SpeechRecognizerImplTest, ServerError) { // Start recording, give some data and then stop. Issue the network callback // with a 500 error and verify that the recognizer bubbles the error up recognizer_->StartRecognition(); @@ -351,7 +352,7 @@ TEST_F(SpeechRecognizerTest, ServerError) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, AudioControllerErrorNoData) { +TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) { // Check if things tear down properly if AudioInputController threw an error. recognizer_->StartRecognition(); base::MessageLoop::current()->RunUntilIdle(); @@ -367,7 +368,7 @@ TEST_F(SpeechRecognizerTest, AudioControllerErrorNoData) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) { +TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) { // Check if things tear down properly if AudioInputController threw an error // after giving some audio data. 
recognizer_->StartRecognition(); @@ -387,7 +388,7 @@ TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, NoSpeechCallbackIssued) { +TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) { // Start recording and give a lot of packets with audio samples set to zero. // This should trigger the no-speech detector and issue a callback. recognizer_->StartRecognition(); @@ -396,7 +397,7 @@ TEST_F(SpeechRecognizerTest, NoSpeechCallbackIssued) { audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); - int num_packets = (SpeechRecognizer::kNoSpeechTimeoutMs) / + int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) / GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1; // The vector is already filled with zero value samples on create. for (int i = 0; i < num_packets; ++i) { @@ -411,7 +412,7 @@ TEST_F(SpeechRecognizerTest, NoSpeechCallbackIssued) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, NoSpeechCallbackNotIssued) { +TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) { // Start recording and give a lot of packets with audio samples set to zero // and then some more with reasonably loud audio samples. This should be // treated as normal speech input and the no-speech detector should not get @@ -424,7 +425,7 @@ TEST_F(SpeechRecognizerTest, NoSpeechCallbackNotIssued) { controller = audio_input_controller_factory_.controller(); ASSERT_TRUE(controller); - int num_packets = (SpeechRecognizer::kNoSpeechTimeoutMs) / + int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) / GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; // The vector is already filled with zero value samples on create. 
@@ -449,7 +450,7 @@ TEST_F(SpeechRecognizerTest, NoSpeechCallbackNotIssued) { CheckFinalEventsConsistency(); } -TEST_F(SpeechRecognizerTest, SetInputVolumeCallback) { +TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) { // Start recording and give a lot of packets with audio samples set to zero // and then some more with reasonably loud audio samples. Check that we don't // get the callback during estimation phase, then get zero for the silence @@ -463,7 +464,7 @@ TEST_F(SpeechRecognizerTest, SetInputVolumeCallback) { ASSERT_TRUE(controller); // Feed some samples to begin with for the endpointer to do noise estimation. - int num_packets = SpeechRecognizer::kEndpointerEstimationTimeMs / + int num_packets = SpeechRecognizerImpl::kEndpointerEstimationTimeMs / GoogleOneShotRemoteEngine::kAudioPacketIntervalMs; FillPacketWithNoise(); for (int i = 0; i < num_packets; ++i) { |