Speech refactoring: Reimplemented speech_recognizer as a FSM (CL1.5)

BUG=116954 TEST=none Review URL: http://codereview.chromium.org/9835049 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@132179 0039d316-1c4b-4281-b951-d872f2087c98
author: primiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2012-04-13 13:06:39 +0000
committer: primiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2012-04-13 13:06:39 +0000
commit: 2ba0644d32705803938d2022562d2e42e5ac7615 (patch)
tree: ab1a3973ce11d8fcb5855b89c6287dfd9eb66230 /content
parent: 0d2dafb39d52717a30631e7104a9c60fa6b0e57b (diff)
download: chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.zip
chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.tar.gz
chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.tar.bz2
4 files changed, 653 insertions, 253 deletions
diff --git a/content/browser/speech/google_one_shot_remote_engine.h b/content/browser/speech/google_one_shot_remote_engine.h
index 236ac94..7e47c67 100644
--- a/content/browser/speech/google_one_shot_remote_engine.h
+++ b/content/browser/speech/google_one_shot_remote_engine.h
@@ -31,7 +31,7 @@ namespace speech {
 
 class AudioChunk;
 
-struct GoogleOneShotRemoteEngineConfig {
+struct CONTENT_EXPORT GoogleOneShotRemoteEngineConfig {
   std::string language;
   std::string grammar;
   bool filter_profanities;
diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc
index 07bd75e..d9d9bab 100644
--- a/content/browser/speech/speech_recognizer_impl.cc
+++ b/content/browser/speech/speech_recognizer_impl.cc
@@ -4,6 +4,7 @@
 
 #include "content/browser/speech/speech_recognizer_impl.h"
 
+#include "base/basictypes.h"
 #include "base/bind.h"
 #include "base/time.h"
 #include "content/browser/browser_main_loop.h"
@@ -24,6 +25,7 @@ using content::SpeechRecognitionResult;
 using content::SpeechRecognizer;
 using media::AudioInputController;
 using media::AudioManager;
+using media::AudioParameters;
 
 namespace {
 
@@ -49,6 +51,7 @@ bool DetectClipping(const speech::AudioChunk& chunk) {
   const int16* samples = chunk.SamplesData16();
   const int kThreshold = num_samples / 20;
   int clipping_samples = 0;
+
   for (int i = 0; i < num_samples; ++i) {
     if (samples[i] <= -32767 || samples[i] >= 32767) {
       if (++clipping_samples > kThreshold)
@@ -69,14 +72,25 @@ SpeechRecognizer* SpeechRecognizer::Create(
     bool filter_profanities,
     const std::string& hardware_info,
     const std::string& origin_url) {
+  speech::GoogleOneShotRemoteEngineConfig remote_engine_config;
+  remote_engine_config.language = language;
+  remote_engine_config.grammar = grammar;
+  remote_engine_config.audio_sample_rate =
+      speech::SpeechRecognizerImpl::kAudioSampleRate;
+  remote_engine_config.audio_num_bits_per_sample =
+      speech::SpeechRecognizerImpl::kNumBitsPerAudioSample;
+  remote_engine_config.filter_profanities = filter_profanities;
+  remote_engine_config.hardware_info = hardware_info;
+  remote_engine_config.origin_url = origin_url;
+
+  // SpeechRecognizerImpl takes ownership of google_remote_engine.
+  speech::GoogleOneShotRemoteEngine* google_remote_engine =
+      new speech::GoogleOneShotRemoteEngine(context_getter);
+  google_remote_engine->SetConfig(remote_engine_config);
+
   return new speech::SpeechRecognizerImpl(listener,
                                           caller_id,
-                                          language,
-                                          grammar,
-                                          context_getter,
-                                          filter_profanities,
-                                          hardware_info,
-                                          origin_url);
+                                          google_remote_engine);
 }
 
 namespace speech {
@@ -87,247 +101,488 @@ const int SpeechRecognizerImpl::kNumBitsPerAudioSample = 16;
 const int SpeechRecognizerImpl::kNoSpeechTimeoutMs = 8000;
 const int SpeechRecognizerImpl::kEndpointerEstimationTimeMs = 300;
 
+COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
+               kNumBitsPerAudioSample_must_be_a_multiple_of_8);
+
 SpeechRecognizerImpl::SpeechRecognizerImpl(
     SpeechRecognitionEventListener* listener,
     int caller_id,
-    const std::string& language,
-    const std::string& grammar,
-    net::URLRequestContextGetter* context_getter,
-    bool filter_profanities,
-    const std::string& hardware_info,
-    const std::string& origin_url)
+    SpeechRecognitionEngine* engine)
     : listener_(listener),
       testing_audio_manager_(NULL),
+      recognition_engine_(engine),
       endpointer_(kAudioSampleRate),
-      context_getter_(context_getter),
       caller_id_(caller_id),
-      language_(language),
-      grammar_(grammar),
-      filter_profanities_(filter_profanities),
-      hardware_info_(hardware_info),
-      origin_url_(origin_url),
-      num_samples_recorded_(0),
-      audio_level_(0.0f) {
+      is_dispatching_event_(false),
+      state_(STATE_IDLE) {
   DCHECK(listener_ != NULL);
+  DCHECK(recognition_engine_ != NULL);
   endpointer_.set_speech_input_complete_silence_length(
       base::Time::kMicrosecondsPerSecond / 2);
   endpointer_.set_long_speech_input_complete_silence_length(
       base::Time::kMicrosecondsPerSecond);
   endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
   endpointer_.StartSession();
+  recognition_engine_->set_delegate(this);
 }
 
 SpeechRecognizerImpl::~SpeechRecognizerImpl() {
-  // Recording should have stopped earlier due to the endpointer or
-  // |StopRecording| being called.
-  DCHECK(!audio_controller_.get());
-  DCHECK(!recognition_engine_.get() ||
-         !recognition_engine_->IsRecognitionPending());
   endpointer_.EndSession();
 }
 
-void SpeechRecognizerImpl::StartRecognition() {
-  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
-  DCHECK(!audio_controller_.get());
-  DCHECK(!recognition_engine_.get() ||
-         !recognition_engine_->IsRecognitionPending());
+// -------  Methods that trigger Finite State Machine (FSM) events ------------
 
-  // The endpointer needs to estimate the environment/background noise before
-  // starting to treat the audio as user input. In |HandleOnData| we wait until
-  // such time has passed before switching to user input mode.
-  endpointer_.SetEnvironmentEstimationMode();
+// NOTE: all external events and requests should be enqueued (PostTask), even if
+// they come from the same (IO) thread, in order to preserve the causal
+// ordering between events and to avoid interleaved event processing due to
+// synchronous callbacks.
 
-  AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
-      testing_audio_manager_ : BrowserMainLoop::GetAudioManager();
-  const int samples_per_packet = kAudioSampleRate *
-      GoogleOneShotRemoteEngine::kAudioPacketIntervalMs / 1000;
-  media::AudioParameters params(
-      media::AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
-      kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet);
-  audio_controller_ = AudioInputController::Create(audio_manager, this, params);
-  DCHECK(audio_controller_.get());
-  VLOG(1) << "SpeechRecognizer starting record.";
-  num_samples_recorded_ = 0;
-  audio_controller_->Record();
+void SpeechRecognizerImpl::StartRecognition() {
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, FSMEventArgs(EVENT_START)));
 }
 
 void SpeechRecognizerImpl::AbortRecognition() {
-  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
-  DCHECK(audio_controller_.get() || recognition_engine_.get());
-
-  // Stop recording if required.
-  if (audio_controller_.get()) {
-    CloseAudioControllerAsynchronously();
-  }
-
-  VLOG(1) << "SpeechRecognizer canceling recognition.";
-  recognition_engine_.reset();
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, FSMEventArgs(EVENT_ABORT)));
 }
 
 void SpeechRecognizerImpl::StopAudioCapture() {
-  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
-
-  // If audio recording has already stopped and we are in recognition phase,
-  // silently ignore any more calls to stop recording.
-  if (!audio_controller_.get())
-    return;
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, FSMEventArgs(EVENT_STOP_CAPTURE)));
+}
 
-  CloseAudioControllerAsynchronously();
-  listener_->OnSoundEnd(caller_id_);
-  listener_->OnAudioEnd(caller_id_);
+bool SpeechRecognizerImpl::IsActive() const {
+  // Checking the FSM state from another thread (thus, while the FSM is
+  // potentially concurrently evolving) is meaningless.
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  return state_ != STATE_IDLE;
+}
 
-  // If we haven't got any audio yet end the recognition sequence here.
-  if (recognition_engine_ == NULL) {
-    // Guard against the listener freeing us until we finish our job.
-    scoped_refptr<SpeechRecognizerImpl> me(this);
-    listener_->OnRecognitionEnd(caller_id_);
-  } else {
-    recognition_engine_->AudioChunksEnded();
-  }
+bool SpeechRecognizerImpl::IsCapturingAudio() const {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); // See IsActive().
+  const bool is_capturing_audio = state_ >= STATE_STARTING &&
+                                  state_ <= STATE_RECOGNIZING;
+  DCHECK((is_capturing_audio && (audio_controller_.get() != NULL)) ||
+         (!is_capturing_audio && audio_controller_.get() == NULL));
+  return is_capturing_audio;
 }
 
 // Invoked in the audio thread.
 void SpeechRecognizerImpl::OnError(AudioInputController* controller,
                                    int error_code) {
+  FSMEventArgs event_args(EVENT_AUDIO_ERROR);
+  event_args.audio_error_code = error_code;
   BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
-                         base::Bind(&SpeechRecognizerImpl::HandleOnError,
-                                    this, error_code));
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, event_args));
 }
 
-void SpeechRecognizerImpl::HandleOnError(int error_code) {
-  LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
-
-  // Check if we are still recording before canceling recognition, as
-  // recording might have been stopped after this error was posted to the queue
-  // by |OnError|.
-  if (!audio_controller_.get())
+void SpeechRecognizerImpl::OnData(AudioInputController* controller,
+                                  const uint8* data, uint32 size) {
+  if (size == 0)  // This could happen when audio capture stops and is normal.
     return;
 
-  InformErrorAndAbortRecognition(content::SPEECH_RECOGNITION_ERROR_AUDIO);
+  FSMEventArgs event_args(EVENT_AUDIO_DATA);
+  event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size),
+                                         kNumBitsPerAudioSample / 8);
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, event_args));
 }
 
-void SpeechRecognizerImpl::OnData(AudioInputController* controller,
-                                  const uint8* data, uint32 size) {
-  if (size == 0)  // This could happen when recording stops and is normal.
-    return;
-  scoped_refptr<AudioChunk> raw_audio(
-      new AudioChunk(data,
-                     static_cast<size_t>(size),
-                     kNumBitsPerAudioSample / 8));
+void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
+
+void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
+    const content::SpeechRecognitionResult& result) {
+  FSMEventArgs event_args(EVENT_ENGINE_RESULT);
+  event_args.engine_result = result;
   BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
-                          base::Bind(&SpeechRecognizerImpl::HandleOnData,
-                                     this, raw_audio));
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, event_args));
 }
 
-void SpeechRecognizerImpl::HandleOnData(scoped_refptr<AudioChunk> raw_audio) {
-  // Check if we are still recording and if not discard this buffer, as
-  // recording might have been stopped after this buffer was posted to the queue
-  // by |OnData|.
-  if (!audio_controller_.get())
-    return;
+void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
+    const content::SpeechRecognitionError& error) {
+  FSMEventArgs event_args(EVENT_ENGINE_ERROR);
+  event_args.engine_error = error;
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          base::Bind(&SpeechRecognizerImpl::DispatchEvent,
+                                     this, event_args));
+}
+
+// -----------------------  Core FSM implementation ---------------------------
+// TODO(primiano) After the changes in the media package (r129173), this class
+// slightly violates the SpeechRecognitionEventListener interface contract. In
+// particular, it is no longer true that this class can be freed after the
+// OnRecognitionEnd event, since the asynchronous audio_controller_.Close()
+// call can still be in progress after the end event. Currently, this is not
+// a problem for the browser itself, since refcounting protects us against
+// such race conditions. However, we should fix this in the next CLs.
+// For instance, tests currently pass only because the
+// TestAudioInputController does not close asynchronously as the real controller
+// does, but they will become flaky if TestAudioInputController is fixed.
+
+void SpeechRecognizerImpl::DispatchEvent(const FSMEventArgs& event_args) {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
+  DCHECK_LE(state_, STATE_MAX_VALUE);
+
+  // Event dispatching must be sequential; otherwise it would break all the
+  // rules and the assumptions of the finite state automaton model.
+  DCHECK(!is_dispatching_event_);
+  is_dispatching_event_ = true;
 
-  bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech();
-
-  float rms;
-  endpointer_.ProcessAudio(*raw_audio, &rms);
-  bool did_clip = DetectClipping(*raw_audio);
-  num_samples_recorded_ += raw_audio->NumSamples();
-
-  if (recognition_engine_ == NULL) {
-    // This was the first audio packet recorded, so start a request to the
-    // server to send the data and inform the listener.
-    listener_->OnAudioStart(caller_id_);
-    GoogleOneShotRemoteEngineConfig google_sr_config;
-    google_sr_config.language = language_;
-    google_sr_config.grammar = grammar_;
-    google_sr_config.audio_sample_rate = kAudioSampleRate;
-    google_sr_config.audio_num_bits_per_sample = kNumBitsPerAudioSample;
-    google_sr_config.filter_profanities = filter_profanities_;
-    google_sr_config.hardware_info = hardware_info_;
-    google_sr_config.origin_url = origin_url_;
-    GoogleOneShotRemoteEngine* google_sr_engine =
-        new GoogleOneShotRemoteEngine(context_getter_.get());
-    google_sr_engine->SetConfig(google_sr_config);
-    recognition_engine_.reset(google_sr_engine);
-    recognition_engine_->set_delegate(this);
-    recognition_engine_->StartRecognition();
+  // Guard against the delegate freeing us until we finish processing the event.
+  scoped_refptr<SpeechRecognizerImpl> me(this);
+
+  if (event_args.event == EVENT_AUDIO_DATA) {
+    DCHECK(event_args.audio_data.get() != NULL);
+    ProcessAudioPipeline(*event_args.audio_data);
   }
 
-  recognition_engine_->TakeAudioChunk(*raw_audio);
+  // The audio pipeline must be processed before the event dispatch, otherwise
+  // it would take actions according to the future state instead of the current.
+  state_ = ExecuteTransitionAndGetNextState(event_args);
 
-  if (endpointer_.IsEstimatingEnvironment()) {
-    // Check if we have gathered enough audio for the endpointer to do
-    // environment estimation and should move on to detect speech/end of speech.
-    if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
-                                  kAudioSampleRate) / 1000) {
-      endpointer_.SetUserInputMode();
-      listener_->OnEnvironmentEstimationComplete(caller_id_);
-    }
-    return;  // No more processing since we are still estimating environment.
+  is_dispatching_event_ = false;
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
+    const FSMEventArgs& event_args) {
+  const FSMEvent event = event_args.event;
+  switch (state_) {
+    case STATE_IDLE:
+      switch (event) {
+        // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and
+        // EVENT_STOP_CAPTURE below once speech input extensions are fixed.
+        case EVENT_ABORT:
+          return DoNothing(event_args);
+        case EVENT_START:
+          return StartRecording(event_args);
+        case EVENT_STOP_CAPTURE:  // Corner cases related to queued messages
+        case EVENT_AUDIO_DATA:    // being lately dispatched.
+        case EVENT_ENGINE_RESULT:
+        case EVENT_ENGINE_ERROR:
+        case EVENT_AUDIO_ERROR:
+          return DoNothing(event_args);
+      }
+      break;
+    case STATE_STARTING:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          return NotFeasible(event_args);
+        case EVENT_STOP_CAPTURE:
+          return Abort(event_args);
+        case EVENT_AUDIO_DATA:
+          return StartRecognitionEngine(event_args);
+        case EVENT_ENGINE_RESULT:
+          return NotFeasible(event_args);
+        case EVENT_ENGINE_ERROR:
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
+    case STATE_ESTIMATING_ENVIRONMENT:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          return NotFeasible(event_args);
+        case EVENT_STOP_CAPTURE:
+          return StopCaptureAndWaitForResult(event_args);
+        case EVENT_AUDIO_DATA:
+          return WaitEnvironmentEstimationCompletion(event_args);
+        case EVENT_ENGINE_RESULT:
+          return ProcessIntermediateResult(event_args);
+        case EVENT_ENGINE_ERROR:
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
+    case STATE_WAITING_FOR_SPEECH:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          return NotFeasible(event_args);
+        case EVENT_STOP_CAPTURE:
+          return StopCaptureAndWaitForResult(event_args);
+        case EVENT_AUDIO_DATA:
+          return DetectUserSpeechOrTimeout(event_args);
+        case EVENT_ENGINE_RESULT:
+          return ProcessIntermediateResult(event_args);
+        case EVENT_ENGINE_ERROR:
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
+    case STATE_RECOGNIZING:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          return NotFeasible(event_args);
+        case EVENT_STOP_CAPTURE:
+          return StopCaptureAndWaitForResult(event_args);
+        case EVENT_AUDIO_DATA:
+          return DetectEndOfSpeech(event_args);
+        case EVENT_ENGINE_RESULT:
+          return ProcessIntermediateResult(event_args);
+        case EVENT_ENGINE_ERROR:
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
+    case STATE_WAITING_FINAL_RESULT:
+      switch (event) {
+        case EVENT_ABORT:
+          return Abort(event_args);
+        case EVENT_START:
+          return NotFeasible(event_args);
+        case EVENT_STOP_CAPTURE:
+        case EVENT_AUDIO_DATA:
+          return DoNothing(event_args);
+        case EVENT_ENGINE_RESULT:
+          return ProcessFinalResult(event_args);
+        case EVENT_ENGINE_ERROR:
+        case EVENT_AUDIO_ERROR:
+          return Abort(event_args);
+      }
+      break;
   }
+  return NotFeasible(event_args);
+}
 
-  // Check if we have waited too long without hearing any speech.
-  bool speech_was_heard_after_packet = endpointer_.DidStartReceivingSpeech();
-  if (!speech_was_heard_after_packet &&
-      num_samples_recorded_ >= (kNoSpeechTimeoutMs / 1000) * kAudioSampleRate) {
-    InformErrorAndAbortRecognition(
-        content::SPEECH_RECOGNITION_ERROR_NO_SPEECH);
-    return;
+// ----------- Contract for all the FSM evolution functions below -------------
+//  - Are guaranteed to be executed in the IO thread;
+//  - Are guaranteed to be not reentrant (themselves and each other);
+//  - event_args members are guaranteed to be stable during the call;
+//  - The class won't be freed in the meanwhile due to callbacks;
+//  - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.
+
+// TODO(primiano) The audio pipeline is currently serial. However, the
+// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
+// We should profile the execution to see whether that would be worthwhile.
+void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
+  const bool route_to_endpointer = state_ >= STATE_ESTIMATING_ENVIRONMENT &&
+                                   state_ <= STATE_RECOGNIZING;
+  const bool route_to_sr_engine = route_to_endpointer;
+  const bool route_to_vumeter = state_ >= STATE_WAITING_FOR_SPEECH &&
+                                state_ <= STATE_RECOGNIZING;
+  const bool clip_detected = DetectClipping(raw_audio);
+  float rms = 0.0f;
+
+  num_samples_recorded_ += raw_audio.NumSamples();
+
+  if (route_to_endpointer)
+    endpointer_.ProcessAudio(raw_audio, &rms);
+
+  if (route_to_vumeter) {
+    DCHECK(route_to_endpointer); // Depends on endpointer due to |rms|.
+    UpdateSignalAndNoiseLevels(rms, clip_detected);
   }
+  if (route_to_sr_engine) {
+    DCHECK(recognition_engine_.get() != NULL);
+    recognition_engine_->TakeAudioChunk(raw_audio);
+  }
+}
 
-  if (!speech_was_heard_before_packet && speech_was_heard_after_packet)
-    listener_->OnSoundStart(caller_id_);
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
+  DCHECK(recognition_engine_.get() != NULL);
+  DCHECK(!IsCapturingAudio());
+  AudioManager* audio_manager = (testing_audio_manager_ != NULL) ?
+                                 testing_audio_manager_ :
+                                 BrowserMainLoop::GetAudioManager();
+  DCHECK(audio_manager != NULL);
 
-  // Calculate the input volume to display in the UI, smoothing towards the
-  // new level.
-  float level = (rms - kAudioMeterMinDb) /
-      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
-  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
-  if (level > audio_level_) {
-    audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
+  DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
+  num_samples_recorded_ = 0;
+  audio_level_ = 0;
+  listener_->OnRecognitionStart(caller_id_);
+
+  if (!audio_manager->HasAudioInputDevices()) {
+    return AbortWithError(SpeechRecognitionError(
+        content::SPEECH_RECOGNITION_ERROR_AUDIO,
+        content::SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
+  }
+
+  if (audio_manager->IsRecordingInProcess()) {
+    return AbortWithError(SpeechRecognitionError(
+        content::SPEECH_RECOGNITION_ERROR_AUDIO,
+        content::SPEECH_AUDIO_ERROR_DETAILS_IN_USE));
+  }
+
+  const int samples_per_packet = (kAudioSampleRate *
+      recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;
+  AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
+                         kAudioSampleRate, kNumBitsPerAudioSample,
+                         samples_per_packet);
+  audio_controller_ = AudioInputController::Create(audio_manager, this, params);
+
+  if (audio_controller_.get() == NULL) {
+    return AbortWithError(
+        SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
+  }
+
+  // The endpointer needs to estimate the environment/background noise before
+  // starting to treat the audio as user input. We wait in the state
+  // ESTIMATING_ENVIRONMENT until such interval has elapsed before switching
+  // to user input mode.
+  endpointer_.SetEnvironmentEstimationMode();
+  audio_controller_->Record();
+  return STATE_STARTING;
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::StartRecognitionEngine(const FSMEventArgs& event_args) {
+  // This is the first audio packet captured, so the recognition engine is
+  // started and the delegate notified about the event.
+  DCHECK(recognition_engine_.get() != NULL);
+  recognition_engine_->StartRecognition();
+  listener_->OnAudioStart(caller_id_);
+
+  // This is a little hack, since TakeAudioChunk() is already called by
+  // ProcessAudioPipeline(). It is the best tradeoff, unless we allow dropping
+  // the first audio chunk captured after opening the audio device.
+  recognition_engine_->TakeAudioChunk(*(event_args.audio_data));
+  return STATE_ESTIMATING_ENVIRONMENT;
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::WaitEnvironmentEstimationCompletion(const FSMEventArgs&) {
+  DCHECK(endpointer_.IsEstimatingEnvironment());
+  if (GetElapsedTimeMs() >= kEndpointerEstimationTimeMs) {
+    endpointer_.SetUserInputMode();
+    listener_->OnEnvironmentEstimationComplete(caller_id_);
+    return STATE_WAITING_FOR_SPEECH;
   } else {
-    audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
+    return STATE_ESTIMATING_ENVIRONMENT;
+  }
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
+  if (endpointer_.DidStartReceivingSpeech()) {
+    listener_->OnSoundStart(caller_id_);
+    return STATE_RECOGNIZING;
+  } else if (GetElapsedTimeMs() >= kNoSpeechTimeoutMs) {
+    return AbortWithError(
+        SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH));
   }
+  return STATE_WAITING_FOR_SPEECH;
+}
 
-  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
-      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
-  noise_level = std::min(std::max(0.0f, noise_level),
-      kAudioMeterRangeMaxUnclipped);
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
+  if (endpointer_.speech_input_complete()) {
+    return StopCaptureAndWaitForResult(event_args);
+  }
+  return STATE_RECOGNIZING;
+}
 
-  listener_->OnAudioLevelsChange(caller_id_, did_clip ? 1.0f : audio_level_,
-                                 noise_level);
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::StopCaptureAndWaitForResult(const FSMEventArgs&) {
+  DCHECK(state_ >= STATE_ESTIMATING_ENVIRONMENT && state_ <= STATE_RECOGNIZING);
 
-  if (endpointer_.speech_input_complete())
-    StopAudioCapture();
+  DVLOG(1) << "Concluding recognition";
+  CloseAudioControllerAsynchronously();
+  recognition_engine_->AudioChunksEnded();
+
+  if (state_ > STATE_WAITING_FOR_SPEECH)
+    listener_->OnSoundEnd(caller_id_);
+
+  listener_->OnAudioEnd(caller_id_);
+  return STATE_WAITING_FINAL_RESULT;
 }
 
-void SpeechRecognizerImpl::OnAudioClosed(AudioInputController*) {}
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::Abort(const FSMEventArgs& event_args) {
+  // TODO(primiano) Should raise SPEECH_RECOGNITION_ERROR_ABORTED when there is
+  // no other specific error source (i.e., on an explicit abort request).
+  // However, SPEECH_RECOGNITION_ERROR_ABORTED is not currently caught by
+  // ChromeSpeechRecognitionManagerDelegate and would cause an exception.
+  // JS support will probably need it in the future.
+  if (event_args.event == EVENT_AUDIO_ERROR) {
+    return AbortWithError(
+        SpeechRecognitionError(content::SPEECH_RECOGNITION_ERROR_AUDIO));
+  } else if (event_args.event == EVENT_ENGINE_ERROR) {
+    return AbortWithError(event_args.engine_error);
+  }
+  return AbortWithError(NULL);
+}
+
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
+    const SpeechRecognitionError& error) {
+  return AbortWithError(&error);
+}
+
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::AbortWithError(
+    const SpeechRecognitionError* error) {
+  if (IsCapturingAudio())
+    CloseAudioControllerAsynchronously();
+
+  DVLOG(1) << "SpeechRecognizerImpl canceling recognition. ";
+
+  // The recognition engine is initialized only after STATE_STARTING.
+  if (state_ > STATE_STARTING) {
+    DCHECK(recognition_engine_.get() != NULL);
+    recognition_engine_->EndRecognition();
+  }
+
+  if (state_ > STATE_WAITING_FOR_SPEECH && state_ < STATE_WAITING_FINAL_RESULT)
+    listener_->OnSoundEnd(caller_id_);
+
+  if (state_ > STATE_STARTING && state_ < STATE_WAITING_FINAL_RESULT)
+    listener_->OnAudioEnd(caller_id_);
+
+  if (error != NULL)
+    listener_->OnRecognitionError(caller_id_, *error);
 
-void SpeechRecognizerImpl::OnSpeechRecognitionEngineResult(
-    const content::SpeechRecognitionResult& result) {
-  // Guard against the listener freeing us until we finish our job.
-  scoped_refptr<SpeechRecognizerImpl> me(this);
-  listener_->OnRecognitionResult(caller_id_, result);
   listener_->OnRecognitionEnd(caller_id_);
+
+  return STATE_IDLE;
 }
 
-void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
-    const content::SpeechRecognitionError& error) {
-  InformErrorAndAbortRecognition(error.code);
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {
+  // This is in preparation for future speech recognition functions.
+  NOTREACHED();
+  return state_;
+}
+
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
+  const SpeechRecognitionResult& result = event_args.engine_result;
+  DVLOG(1) << "Got valid result";
+  recognition_engine_->EndRecognition();
+  listener_->OnRecognitionResult(caller_id_, result);
+  listener_->OnRecognitionEnd(caller_id_);
+  return STATE_IDLE;
 }
 
-void SpeechRecognizerImpl::InformErrorAndAbortRecognition(
-    content::SpeechRecognitionErrorCode error) {
-  DCHECK_NE(error, content::SPEECH_RECOGNITION_ERROR_NONE);
-  AbortRecognition();
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::DoNothing(const FSMEventArgs&) const {
+  return state_;  // Just keep the current state.
+}
 
-  // Guard against the listener freeing us until we finish our job.
-  scoped_refptr<SpeechRecognizerImpl> me(this);
-  listener_->OnRecognitionError(caller_id_, error);
+SpeechRecognizerImpl::FSMState
+SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
+  NOTREACHED() << "Unfeasible event " << event_args.event
+               << " in state " << state_;
+  return state_;
 }
 
 void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
-  VLOG(1) << "SpeechRecognizer stopping record.";
+  DCHECK(IsCapturingAudio());
+  DVLOG(1) << "SpeechRecognizerImpl stopping audio capture.";
   // Issues a Close on the audio controller, passing an empty callback. The only
   // purpose of such callback is to keep the audio controller refcounted until
   // Close has completed (in the audio thread) and automatically destroy it
@@ -337,12 +592,30 @@ void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
   audio_controller_ = NULL;  // The controller is still refcounted by Bind.
 }
 
-bool SpeechRecognizerImpl::IsActive() const {
-  return (recognition_engine_.get() != NULL);
+int SpeechRecognizerImpl::GetElapsedTimeMs() const {
+  return (num_samples_recorded_ * 1000) / kAudioSampleRate;
 }
 
-bool SpeechRecognizerImpl::IsCapturingAudio() const {
-  return (audio_controller_.get() != NULL);
+void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
+                                                      bool clip_detected) {
+  // Calculate the input volume to display in the UI, smoothing towards the
+  // new level.
+  // TODO(primiano) Do we really need all this floating-point arithmetic here?
+  // It might be quite expensive on mobile.
+  float level = (rms - kAudioMeterMinDb) /
+      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
+  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
+  const float smoothing_factor = (level > audio_level_) ? kUpSmoothingFactor :
+                                                          kDownSmoothingFactor;
+  audio_level_ += (level - audio_level_) * smoothing_factor;
+
+  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
+      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
+  noise_level = std::min(std::max(0.0f, noise_level),
+                         kAudioMeterRangeMaxUnclipped);
+
+  listener_->OnAudioLevelsChange(
+      caller_id_, clip_detected ? 1.0f : audio_level_, noise_level);
 }
 
 const SpeechRecognitionEngine&
@@ -355,5 +628,14 @@ void SpeechRecognizerImpl::SetAudioManagerForTesting(
   testing_audio_manager_ = audio_manager;
 }
 
+SpeechRecognizerImpl::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
+    : event(event_value),
+      audio_error_code(0),
+      audio_data(NULL),
+      engine_error(content::SPEECH_RECOGNITION_ERROR_NONE) {
+}
+
+SpeechRecognizerImpl::FSMEventArgs::~FSMEventArgs() {
+}
 
 }  // namespace speech
diff --git a/content/browser/speech/speech_recognizer_impl.h b/content/browser/speech/speech_recognizer_impl.h
index 516dfea..a2cce74 100644
--- a/content/browser/speech/speech_recognizer_impl.h
+++ b/content/browser/speech/speech_recognizer_impl.h
@@ -12,6 +12,7 @@
 #include "content/browser/speech/speech_recognition_engine.h"
 #include "content/public/browser/speech_recognizer.h"
 #include "content/public/common/speech_recognition_error.h"
+#include "content/public/common/speech_recognition_result.h"
 #include "media/audio/audio_input_controller.h"
 #include "net/url_request/url_request_context_getter.h"
 
@@ -27,8 +28,13 @@ class AudioManager;
 
 namespace speech {
 
-// Records audio, sends recorded audio to server and translates server response
-// to recognition result.
+// TODO(primiano) Next CL: Remove the Impl suffix and the exported
+// /content/public/browser/speech_recognizer.h interface since this class should
+// not be visible outside (currently we need it for speech input extension API).
+
+// Handles speech recognition for a session (identified by |caller_id|), taking
+// care of audio capture, silence detection/endpointer and interaction with the
+// SpeechRecognitionEngine.
 class CONTENT_EXPORT SpeechRecognizerImpl
     : public NON_EXPORTED_BASE(content::SpeechRecognizer),
       public media::AudioInputController::EventHandler,
@@ -41,14 +47,9 @@ class CONTENT_EXPORT SpeechRecognizerImpl
   static const int kEndpointerEstimationTimeMs;
 
   SpeechRecognizerImpl(
-    content::SpeechRecognitionEventListener* listener,
-    int caller_id,
-    const std::string& language,
-    const std::string& grammar,
-    net::URLRequestContextGetter* context_getter,
-    bool filter_profanities,
-    const std::string& hardware_info,
-    const std::string& origin_url);
+      content::SpeechRecognitionEventListener* listener,
+      int caller_id,
+      SpeechRecognitionEngine* engine);
   virtual ~SpeechRecognizerImpl();
 
   // content::SpeechRecognizer methods.
@@ -59,14 +60,86 @@ class CONTENT_EXPORT SpeechRecognizerImpl
   virtual bool IsCapturingAudio() const OVERRIDE;
   const SpeechRecognitionEngine& recognition_engine() const;
 
+ private:
+  friend class SpeechRecognizerImplTest;
+
+  enum FSMState {
+    STATE_IDLE = 0,
+    STATE_STARTING,
+    STATE_ESTIMATING_ENVIRONMENT,
+    STATE_WAITING_FOR_SPEECH,
+    STATE_RECOGNIZING,
+    STATE_WAITING_FINAL_RESULT,
+    STATE_MAX_VALUE = STATE_WAITING_FINAL_RESULT
+  };
+
+  enum FSMEvent {
+    EVENT_ABORT = 0,
+    EVENT_START,
+    EVENT_STOP_CAPTURE,
+    EVENT_AUDIO_DATA,
+    EVENT_ENGINE_RESULT,
+    EVENT_ENGINE_ERROR,
+    EVENT_AUDIO_ERROR,
+    EVENT_MAX_VALUE = EVENT_AUDIO_ERROR
+  };
+
+  struct FSMEventArgs {
+    explicit FSMEventArgs(FSMEvent event_value);
+    ~FSMEventArgs();
+
+    FSMEvent event;
+    int audio_error_code;
+    scoped_refptr<AudioChunk> audio_data;
+    content::SpeechRecognitionResult engine_result;
+    content::SpeechRecognitionError engine_error;
+  };
+
+  // Entry point for pushing any new external event into the recognizer FSM.
+  void DispatchEvent(const FSMEventArgs& event_args);
+
+  // Defines the behavior of the recognizer FSM, selecting the appropriate
+  // transition according to the current state and event.
+  FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args);
+
+  // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc).
+  void ProcessAudioPipeline(const AudioChunk& raw_audio);
+
+  // The methods below handle transitions of the recognizer FSM.
+  FSMState StartRecording(const FSMEventArgs& event_args);
+  FSMState StartRecognitionEngine(const FSMEventArgs& event_args);
+  FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args);
+  FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args);
+  FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args);
+  FSMState ProcessIntermediateResult(const FSMEventArgs& event_args);
+  FSMState ProcessFinalResult(const FSMEventArgs& event_args);
+  FSMState Abort(const FSMEventArgs& event_args);
+  FSMState AbortWithError(const content::SpeechRecognitionError* error);
+  FSMState AbortWithError(const content::SpeechRecognitionError& error);
+  FSMState DetectEndOfSpeech(const FSMEventArgs& event_args);
+  FSMState DoNothing(const FSMEventArgs& event_args) const;
+  FSMState NotFeasible(const FSMEventArgs& event_args);
+
+  // Returns the time span of captured audio samples since the start of capture.
+  int GetElapsedTimeMs() const;
+
+  // Calculates the input volume to be displayed in the UI, triggering the
+  // OnAudioLevelsChange event accordingly.
+  void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected);
+
+  void CloseAudioControllerAsynchronously();
+  void SetAudioManagerForTesting(media::AudioManager* audio_manager);
+
+  // Callback called on IO thread by audio_controller->Close().
+  void OnAudioClosed(media::AudioInputController*);
+
   // AudioInputController::EventHandler methods.
   virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {}
   virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {}
   virtual void OnError(media::AudioInputController* controller,
                        int error_code) OVERRIDE;
   virtual void OnData(media::AudioInputController* controller,
-                      const uint8* data,
-                      uint32 size) OVERRIDE;
+                      const uint8* data, uint32 size) OVERRIDE;
 
   // SpeechRecognitionEngineDelegate methods.
   virtual void OnSpeechRecognitionEngineResult(
@@ -74,40 +147,16 @@ class CONTENT_EXPORT SpeechRecognizerImpl
   virtual void OnSpeechRecognitionEngineError(
       const content::SpeechRecognitionError& error) OVERRIDE;
 
- private:
-  friend class SpeechRecognizerImplTest;
-
-  void InformErrorAndAbortRecognition(
-      content::SpeechRecognitionErrorCode error);
-  void SendRecordedAudioToServer();
-
-  void HandleOnError(int error_code);  // Handles OnError in the IO thread.
-
-  // Handles OnData in the IO thread.
-  void HandleOnData(scoped_refptr<AudioChunk> raw_audio);
-
-  void OnAudioClosed(media::AudioInputController*);
-
-  // Helper method which closes the audio controller and frees it asynchronously
-  // without blocking the IO thread.
-  void CloseAudioControllerAsynchronously();
-
-  void SetAudioManagerForTesting(media::AudioManager* audio_manager);
-
   content::SpeechRecognitionEventListener* listener_;
   media::AudioManager* testing_audio_manager_;
   scoped_ptr<SpeechRecognitionEngine> recognition_engine_;
   Endpointer endpointer_;
   scoped_refptr<media::AudioInputController> audio_controller_;
-  scoped_refptr<net::URLRequestContextGetter> context_getter_;
   int caller_id_;
-  std::string language_;
-  std::string grammar_;
-  bool filter_profanities_;
-  std::string hardware_info_;
-  std::string origin_url_;
   int num_samples_recorded_;
   float audio_level_;
+  bool is_dispatching_event_;
+  FSMState state_;
 
   DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl);
 };
diff --git a/content/browser/speech/speech_recognizer_impl_unittest.cc b/content/browser/speech/speech_recognizer_impl_unittest.cc
index 01b7e4c..5dbe6cc 100644
--- a/content/browser/speech/speech_recognizer_impl_unittest.cc
+++ b/content/browser/speech/speech_recognizer_impl_unittest.cc
@@ -17,6 +17,7 @@
 #include "net/url_request/url_request_status.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
+using base::MessageLoopProxy;
 using content::BrowserThread;
 using content::BrowserThreadImpl;
 using media::AudioInputController;
@@ -97,16 +98,28 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener,
   SpeechRecognizerImplTest()
       : io_thread_(BrowserThread::IO, &message_loop_),
         audio_manager_(new MockAudioManager()),
-        audio_ended_(false),
+        recognition_started_(false),
         recognition_ended_(false),
         result_received_(false),
         audio_started_(false),
+        audio_ended_(false),
+        sound_started_(false),
+        sound_ended_(false),
         error_(content::SPEECH_RECOGNITION_ERROR_NONE),
         volume_(-1.0f) {
-    recognizer_ = new SpeechRecognizerImpl(
-        this, 1, std::string(), std::string(), NULL, false, std::string(),
-        std::string());
+    // SpeechRecognizerImpl takes ownership of sr_engine.
+    GoogleOneShotRemoteEngine* sr_engine =
+        new GoogleOneShotRemoteEngine(NULL /* URLRequestContextGetter */);
+    GoogleOneShotRemoteEngineConfig config;
+    config.audio_num_bits_per_sample =
+        SpeechRecognizerImpl::kNumBitsPerAudioSample;
+    config.audio_sample_rate = SpeechRecognizerImpl::kAudioSampleRate;
+    config.filter_profanities = false;
+    sr_engine->SetConfig(config);
+
+    recognizer_ = new SpeechRecognizerImpl(this, 1, sr_engine);
     recognizer_->SetAudioManagerForTesting(audio_manager_.get());
+
     int audio_packet_length_bytes =
         (SpeechRecognizerImpl::kAudioSampleRate *
          GoogleOneShotRemoteEngine::kAudioPacketIntervalMs *
@@ -115,13 +128,33 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener,
     audio_packet_.resize(audio_packet_length_bytes);
   }
 
+  void CheckEventsConsistency() {
+    // Note: "!x || y" == "x implies y".
+    EXPECT_TRUE(!recognition_ended_ || recognition_started_);
+    EXPECT_TRUE(!audio_ended_ || audio_started_);
+    EXPECT_TRUE(!sound_ended_ || sound_started_);
+    EXPECT_TRUE(!audio_started_ || recognition_started_);
+    EXPECT_TRUE(!sound_started_ || audio_started_);
+    EXPECT_TRUE(!audio_ended_ || (sound_ended_ || !sound_started_));
+    EXPECT_TRUE(!recognition_ended_ || (audio_ended_ || !audio_started_));
+  }
+
+  void CheckFinalEventsConsistency() {
+    // Note: "!(x ^ y)" == "(x && y) || (!x && !y)".
+    EXPECT_FALSE(recognition_started_ ^ recognition_ended_);
+    EXPECT_FALSE(audio_started_ ^ audio_ended_);
+    EXPECT_FALSE(sound_started_ ^ sound_ended_);
+  }
+
   // Overridden from content::SpeechRecognitionEventListener:
   virtual void OnAudioStart(int caller_id) OVERRIDE {
     audio_started_ = true;
+    CheckEventsConsistency();
   }
 
   virtual void OnAudioEnd(int caller_id) OVERRIDE {
     audio_ended_ = true;
+    CheckEventsConsistency();
   }
 
   virtual void OnRecognitionResult(
@@ -130,8 +163,9 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener,
   }
 
   virtual void OnRecognitionError(
-      int caller_id,
-      const content::SpeechRecognitionError& error) OVERRIDE {
+      int caller_id, const content::SpeechRecognitionError& error) OVERRIDE {
+    EXPECT_TRUE(recognition_started_);
+    EXPECT_FALSE(recognition_ended_);
     error_ = error.code;
   }
 
@@ -143,12 +177,25 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener,
 
   virtual void OnRecognitionEnd(int caller_id) OVERRIDE {
     recognition_ended_ = true;
+    CheckEventsConsistency();
+  }
+
+  virtual void OnRecognitionStart(int caller_id) OVERRIDE {
+    recognition_started_ = true;
+    CheckEventsConsistency();
   }
 
-  virtual void OnRecognitionStart(int caller_id) OVERRIDE {}
   virtual void OnEnvironmentEstimationComplete(int caller_id) OVERRIDE {}
-  virtual void OnSoundStart(int caller_id) OVERRIDE {}
-  virtual void OnSoundEnd(int caller_id) OVERRIDE {}
+
+  virtual void OnSoundStart(int caller_id) OVERRIDE {
+    sound_started_ = true;
+    CheckEventsConsistency();
+  }
+
+  virtual void OnSoundEnd(int caller_id) OVERRIDE {
+    sound_ended_ = true;
+    CheckEventsConsistency();
+  }
 
   // testing::Test methods.
   virtual void SetUp() OVERRIDE {
@@ -180,10 +227,13 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener,
   BrowserThreadImpl io_thread_;
   scoped_refptr<SpeechRecognizerImpl> recognizer_;
   scoped_ptr<AudioManager> audio_manager_;
-  bool audio_ended_;
+  bool recognition_started_;
   bool recognition_ended_;
   bool result_received_;
   bool audio_started_;
+  bool audio_ended_;
+  bool sound_started_;
+  bool sound_ended_;
   content::SpeechRecognitionErrorCode error_;
   TestURLFetcherFactory url_fetcher_factory_;
   TestAudioInputControllerFactory audio_input_controller_factory_;
@@ -196,11 +246,12 @@ TEST_F(SpeechRecognizerImplTest, StopNoData) {
   // Check for callbacks when stopping record before any audio gets recorded.
   recognizer_->StartRecognition();
   recognizer_->AbortRecognition();
-  EXPECT_FALSE(audio_ended_);
-  EXPECT_FALSE(recognition_ended_);
-  EXPECT_FALSE(result_received_);
+  MessageLoop::current()->RunAllPending();
+  EXPECT_TRUE(recognition_started_);
   EXPECT_FALSE(audio_started_);
+  EXPECT_FALSE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, CancelNoData) {
@@ -208,17 +259,19 @@ TEST_F(SpeechRecognizerImplTest, CancelNoData) {
   // recorded.
   recognizer_->StartRecognition();
   recognizer_->StopAudioCapture();
-  EXPECT_TRUE(audio_ended_);
-  EXPECT_TRUE(recognition_ended_);
-  EXPECT_FALSE(result_received_);
+  MessageLoop::current()->RunAllPending();
+  EXPECT_TRUE(recognition_started_);
   EXPECT_FALSE(audio_started_);
+  EXPECT_FALSE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, StopWithData) {
   // Start recording, give some data and then stop. This should wait for the
   // network callback to arrive before completion.
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
@@ -238,6 +291,7 @@ TEST_F(SpeechRecognizerImplTest, StopWithData) {
   }
 
   recognizer_->StopAudioCapture();
+  MessageLoop::current()->RunAllPending();
   EXPECT_TRUE(audio_started_);
   EXPECT_TRUE(audio_ended_);
   EXPECT_FALSE(recognition_ended_);
@@ -256,16 +310,17 @@ TEST_F(SpeechRecognizerImplTest, StopWithData) {
   fetcher->SetResponseString(
       "{\"status\":0,\"hypotheses\":[{\"utterance\":\"123\"}]}");
   fetcher->delegate()->OnURLFetchComplete(fetcher);
-
+  MessageLoop::current()->RunAllPending();
   EXPECT_TRUE(recognition_ended_);
   EXPECT_TRUE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, CancelWithData) {
-  // Start recording, give some data and then cancel. This should create
-  // a network request but give no callbacks.
+  // Start recording, give some data and then cancel.
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
@@ -273,18 +328,20 @@ TEST_F(SpeechRecognizerImplTest, CancelWithData) {
                                       audio_packet_.size());
   MessageLoop::current()->RunAllPending();
   recognizer_->AbortRecognition();
+  MessageLoop::current()->RunAllPending();
   ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0));
+  EXPECT_TRUE(recognition_started_);
   EXPECT_TRUE(audio_started_);
-  EXPECT_FALSE(audio_ended_);
-  EXPECT_FALSE(recognition_ended_);
   EXPECT_FALSE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, ConnectionError) {
   // Start recording, give some data and then stop. Issue the network callback
   // with a connection error and verify that the recognizer bubbles the error up
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
@@ -295,6 +352,7 @@ TEST_F(SpeechRecognizerImplTest, ConnectionError) {
   ASSERT_TRUE(fetcher);
 
   recognizer_->StopAudioCapture();
+  MessageLoop::current()->RunAllPending();
   EXPECT_TRUE(audio_started_);
   EXPECT_TRUE(audio_ended_);
   EXPECT_FALSE(recognition_ended_);
@@ -310,16 +368,18 @@ TEST_F(SpeechRecognizerImplTest, ConnectionError) {
   fetcher->set_response_code(0);
   fetcher->SetResponseString("");
   fetcher->delegate()->OnURLFetchComplete(fetcher);
-
-  EXPECT_FALSE(recognition_ended_);
+  MessageLoop::current()->RunAllPending();
+  EXPECT_TRUE(recognition_ended_);
   EXPECT_FALSE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, ServerError) {
   // Start recording, give some data and then stop. Issue the network callback
   // with a 500 error and verify that the recognizer bubbles the error up
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
@@ -330,6 +390,7 @@ TEST_F(SpeechRecognizerImplTest, ServerError) {
   ASSERT_TRUE(fetcher);
 
   recognizer_->StopAudioCapture();
+  MessageLoop::current()->RunAllPending();
   EXPECT_TRUE(audio_started_);
   EXPECT_TRUE(audio_ended_);
   EXPECT_FALSE(recognition_ended_);
@@ -344,31 +405,34 @@ TEST_F(SpeechRecognizerImplTest, ServerError) {
   fetcher->set_response_code(500);
   fetcher->SetResponseString("Internal Server Error");
   fetcher->delegate()->OnURLFetchComplete(fetcher);
-
-  EXPECT_FALSE(recognition_ended_);
+  MessageLoop::current()->RunAllPending();
+  EXPECT_TRUE(recognition_ended_);
   EXPECT_FALSE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorNoData) {
   // Check if things tear down properly if AudioInputController threw an error.
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
   controller->event_handler()->OnError(controller, 0);
   MessageLoop::current()->RunAllPending();
+  EXPECT_TRUE(recognition_started_);
   EXPECT_FALSE(audio_started_);
-  EXPECT_FALSE(audio_ended_);
-  EXPECT_FALSE(recognition_ended_);
   EXPECT_FALSE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_AUDIO, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) {
   // Check if things tear down properly if AudioInputController threw an error
   // after giving some audio data.
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
@@ -377,36 +441,35 @@ TEST_F(SpeechRecognizerImplTest, AudioControllerErrorWithData) {
   controller->event_handler()->OnError(controller, 0);
   MessageLoop::current()->RunAllPending();
   ASSERT_TRUE(url_fetcher_factory_.GetFetcherByID(0));
+  EXPECT_TRUE(recognition_started_);
   EXPECT_TRUE(audio_started_);
-  EXPECT_FALSE(audio_ended_);
-  EXPECT_FALSE(recognition_ended_);
   EXPECT_FALSE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_AUDIO, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackIssued) {
   // Start recording and give a lot of packets with audio samples set to zero.
   // This should trigger the no-speech detector and issue a callback.
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
-  controller = audio_input_controller_factory_.controller();
-  ASSERT_TRUE(controller);
 
   int num_packets = (SpeechRecognizerImpl::kNoSpeechTimeoutMs) /
-                     GoogleOneShotRemoteEngine::kAudioPacketIntervalMs;
+                     GoogleOneShotRemoteEngine::kAudioPacketIntervalMs + 1;
   // The vector is already filled with zero value samples on create.
   for (int i = 0; i < num_packets; ++i) {
     controller->event_handler()->OnData(controller, &audio_packet_[0],
                                         audio_packet_.size());
   }
   MessageLoop::current()->RunAllPending();
+  EXPECT_TRUE(recognition_started_);
   EXPECT_TRUE(audio_started_);
-  EXPECT_FALSE(audio_ended_);
-  EXPECT_FALSE(recognition_ended_);
   EXPECT_FALSE(result_received_);
   EXPECT_EQ(content::SPEECH_RECOGNITION_ERROR_NO_SPEECH, error_);
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
@@ -415,6 +478,7 @@ TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
   // treated as normal speech input and the no-speech detector should not get
   // triggered.
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
@@ -442,6 +506,8 @@ TEST_F(SpeechRecognizerImplTest, NoSpeechCallbackNotIssued) {
   EXPECT_FALSE(audio_ended_);
   EXPECT_FALSE(recognition_ended_);
   recognizer_->AbortRecognition();
+  MessageLoop::current()->RunAllPending();
+  CheckFinalEventsConsistency();
 }
 
 TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
@@ -450,6 +516,7 @@ TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
   // get the callback during estimation phase, then get zero for the silence
   // samples and proper volume for the loud audio.
   recognizer_->StartRecognition();
+  MessageLoop::current()->RunAllPending();
   TestAudioInputController* controller =
       audio_input_controller_factory_.controller();
   ASSERT_TRUE(controller);
@@ -484,6 +551,8 @@ TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
   EXPECT_FALSE(audio_ended_);
   EXPECT_FALSE(recognition_ended_);
   recognizer_->AbortRecognition();
+  MessageLoop::current()->RunAllPending();
+  CheckFinalEventsConsistency();
 }
 
 }  // namespace speech
author	primiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2012-04-13 13:06:39 +0000
committer	primiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2012-04-13 13:06:39 +0000
commit	2ba0644d32705803938d2022562d2e42e5ac7615 (patch)
tree	ab1a3973ce11d8fcb5855b89c6287dfd9eb66230 /content
parent	0d2dafb39d52717a30631e7104a9c60fa6b0e57b (diff)
download	chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.zip chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.tar.gz chromium_src-2ba0644d32705803938d2022562d2e42e5ac7615.tar.bz2