Adds SpeechRecognizer which provides a simple interface to record and recognize speech.

Also adds a unit test that checks that the callbacks fire as expected. TEST=unit_tests --gtest_filter=SpeechRecognizerTest.* BUG=none Review URL: http://codereview.chromium.org/3124009 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@55918 0039d316-1c4b-4281-b951-d872f2087c98
author: satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-12 19:57:31 +0000
committer: satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-12 19:57:31 +0000
commit: 4b9a8c385aaa926cedf5ba7c213c0fc247ef2cb4 (patch)
tree: 50d0565e66f421d6ce0b734e1c18eb614b7f5a06 /chrome/browser/speech
parent: df9a4de661636de81fccb6cfa552de94e84efa50 (diff)
download: chromium_src-4b9a8c385aaa926cedf5ba7c213c0fc247ef2cb4.zip
chromium_src-4b9a8c385aaa926cedf5ba7c213c0fc247ef2cb4.tar.gz
chromium_src-4b9a8c385aaa926cedf5ba7c213c0fc247ef2cb4.tar.bz2
3 files changed, 446 insertions, 0 deletions
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
new file mode 100644
index 0000000..0f8b116
--- /dev/null
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/speech/speech_recognizer.h"
+
+#include "base/ref_counted.h"
+#include "base/scoped_ptr.h"
+#include "chrome/browser/chrome_thread.h"
+#include "chrome/browser/profile.h"
+#include "chrome/common/net/url_request_context_getter.h"
+
+using media::AudioInputController;
+using std::list;
+using std::string;
+
+namespace {
+// URL the recorded audio is sent to for recognition. The pointer itself is
+// const as well, so the constant cannot be re-pointed at runtime.
+const char* const kDefaultSpeechRecognitionUrl =
+    "http://www.google.com/speech-api/v1/recognize?lang=en-us&client=chromium";
+const int kAudioPacketIntervalMs = 100;  // Record 100ms long audio packets.
+const int kNumAudioChannels = 1;  // Speech is recorded as mono.
+const int kNumBitsPerAudioSample = 16;
+}  // namespace
+
+namespace speech_input {
+
+// Remembers the delegate that receives the recognition events and the render
+// view id that is passed back in every delegate callback.
+SpeechRecognizer::SpeechRecognizer(Delegate* delegate, int render_view_id)
+    : delegate_(delegate), render_view_id_(render_view_id) {}
+
+// Only verifies (in DCHECK-enabled builds) that the recognizer is idle when it
+// goes away; nothing is torn down here.
+SpeechRecognizer::~SpeechRecognizer() {
+  // Recording should have stopped earlier due to the endpointer or
+  // |StopRecording| being called.
+  DCHECK(!audio_controller_.get());
+  // No recognition request may still be in flight.
+  DCHECK(!request_.get() || !request_->HasPendingRequest());
+  // All buffered audio must already have been released.
+  DCHECK(audio_buffers_.empty());
+}
+
+// Creates the audio input controller and starts recording. Must be called on
+// the IO thread while no recording and no recognition request is in progress.
+// Audio is requested as linear PCM with |kNumAudioChannels| channels and
+// |kNumBitsPerAudioSample| bits per sample at the telephone sample rate.
+// Returns true if recording was started and false if the audio input
+// controller could not be created.
+bool SpeechRecognizer::StartRecording() {
+  DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));
+  DCHECK(!audio_controller_.get());
+  DCHECK(!request_.get() || !request_->HasPendingRequest());
+
+  // The last argument is sample rate * packet interval (in seconds), i.e. the
+  // amount of audio covering |kAudioPacketIntervalMs| milliseconds.
+  audio_controller_ = AudioInputController::Create(this,
+      AudioManager::AUDIO_PCM_LINEAR, kNumAudioChannels,
+      AudioManager::kTelephoneSampleRate, kNumBitsPerAudioSample,
+      (AudioManager::kTelephoneSampleRate * kAudioPacketIntervalMs) / 1000);
+  if (!audio_controller_.get()) {
+    // Report the failure to the caller instead of dereferencing a NULL
+    // controller below.
+    LOG(WARNING) << "SpeechRecognizer failed to create the audio controller.";
+    return false;
+  }
+  LOG(INFO) << "SpeechRecognizer starting record.";
+  audio_controller_->Record();
+
+  return true;
+}
+
+// Aborts the current recognition sequence: stops the recording if one is in
+// progress, discards the audio buffered so far and destroys the recognition
+// request, if any. Must be called on the IO thread. No delegate callback is
+// made from here.
+void SpeechRecognizer::CancelRecognition() {
+  DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));
+  DCHECK(audio_controller_.get() || request_.get());
+
+  const bool still_recording = audio_controller_.get() != NULL;
+  if (still_recording) {
+    // Recording has not been stopped yet, so shut the controller down first.
+    LOG(INFO) << "SpeechRecognizer stopping record.";
+    audio_controller_->Close();
+    audio_controller_ = NULL;  // Drops our reference to the controller.
+  }
+
+  LOG(INFO) << "SpeechRecognizer canceling recognition.";
+  ReleaseAudioBuffers();
+  request_.reset();
+}
+
+// Stops audio capture and, if any audio was buffered, sends it off in a speech
+// recognition request. Must be called on the IO thread.
+//
+// If recording was in progress the delegate receives DidCompleteRecording.
+// When no audio has been buffered it also receives DidCompleteRecognition
+// right away; otherwise the sequence is completed later from
+// |SetRecognitionResult|. Calling this after recording has already stopped
+// does nothing.
+void SpeechRecognizer::StopRecording() {
+  DCHECK(ChromeThread::CurrentlyOn(ChromeThread::IO));
+
+  // If audio recording has already stopped and we are in recognition phase,
+  // silently ignore any more calls to stop recording.
+  if (!audio_controller_.get())
+    return;
+
+  LOG(INFO) << "SpeechRecognizer stopping record.";
+  audio_controller_->Close();
+  audio_controller_ = NULL;  // Releases the ref ptr.
+  delegate_->DidCompleteRecording(render_view_id_);
+
+  // If we haven't got any audio yet end the recognition sequence here.
+  if (audio_buffers_.empty()) {
+    // Guard against the delegate freeing us until we finish our job.
+    scoped_refptr<SpeechRecognizer> me(this);
+    delegate_->DidCompleteRecognition(render_view_id_);
+    return;
+  }
+
+  // We now have recorded audio in our buffers, so start a recognition request.
+  // Since the http request takes a single string as POST data, allocate
+  // one and copy over bytes from the audio buffers to the string.
+  size_t audio_buffer_length = 0;
+  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
+       it != audio_buffers_.end(); ++it) {
+    audio_buffer_length += (*it)->length();
+  }
+  string data;
+  data.reserve(audio_buffer_length);
+  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
+       it != audio_buffers_.end(); ++it) {
+    data.append(*(*it));
+  }
+
+  // A request left over from an earlier, already completed recognition is
+  // expected when the recognizer gets reused and is simply replaced below.
+  // Only a request that is still pending would indicate a bug. This is the
+  // same precondition that |StartRecording| checks.
+  DCHECK(!request_.get() || !request_->HasPendingRequest());
+  request_.reset(new SpeechRecognitionRequest(
+      Profile::GetDefaultRequestContext(),
+      GURL(kDefaultSpeechRecognitionUrl),
+      this));
+  request_->Send(data);
+  ReleaseAudioBuffers();  // No need to keep the audio anymore.
+}
+
+// Deletes every audio chunk owned by |audio_buffers_| and leaves the queue
+// empty.
+void SpeechRecognizer::ReleaseAudioBuffers() {
+  while (!audio_buffers_.empty()) {
+    delete audio_buffers_.front();
+    audio_buffers_.pop_front();
+  }
+}
+
+// Invoked in the audio thread. Does no work itself and only forwards the error
+// code to |HandleOnError| on the IO thread.
+void SpeechRecognizer::OnError(AudioInputController* controller,
+                               int error_code) {
+  ChromeThread::PostTask(
+      ChromeThread::IO, FROM_HERE,
+      NewRunnableMethod(this, &SpeechRecognizer::HandleOnError, error_code));
+}
+
+// Counterpart of |OnError| that is posted to the IO thread. If recording is
+// still in progress the recognition is canceled and the delegate is told that
+// both recording and recognition have completed.
+void SpeechRecognizer::HandleOnError(int error_code) {
+  LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
+
+  // The error may have been queued by |OnError| before recording got stopped.
+  // Only act on it if we are still recording; otherwise there is nothing left
+  // to cancel.
+  if (audio_controller_.get()) {
+    CancelRecognition();
+    delegate_->DidCompleteRecording(render_view_id_);
+    delegate_->DidCompleteRecognition(render_view_id_);
+  }
+}
+
+// Copies the received audio into a heap allocated string and posts it to
+// |HandleOnData| on the IO thread, which takes ownership of the copy.
+void SpeechRecognizer::OnData(AudioInputController* controller,
+                              const uint8* data, uint32 size) {
+  // An empty packet could arrive when recording stops and is normal; there is
+  // nothing to forward in that case.
+  if (size != 0) {
+    const char* bytes = reinterpret_cast<const char*>(data);
+    string* audio_chunk = new string(bytes, size);
+    ChromeThread::PostTask(
+        ChromeThread::IO, FROM_HERE,
+        NewRunnableMethod(this, &SpeechRecognizer::HandleOnData, audio_chunk));
+  }
+}
+
+// Counterpart of |OnData| that is posted to the IO thread. Takes ownership of
+// |data|: it is either queued in |audio_buffers_| or deleted right away.
+void SpeechRecognizer::HandleOnData(string* data) {
+  if (audio_controller_.get()) {
+    // Still recording, so keep the buffer.
+    // TODO(satish): Once we have streaming POST, start sending the data
+    // received here as POST chunks.
+    audio_buffers_.push_back(data);
+    return;
+  }
+
+  // Recording was stopped after |OnData| posted this buffer to the queue, so
+  // the audio is no longer wanted.
+  delete data;
+}
+
+// Callback from the recognition request. Forwards the result to the delegate,
+// tagged with our render view id, and then signals that the recognition
+// sequence has completed.
+void SpeechRecognizer::SetRecognitionResult(bool error, const string16& value) {
+  // Guard against the delegate freeing us until we finish our job. The guard
+  // is taken before the first delegate call so that |this| is kept alive
+  // across both callbacks, not just the last one.
+  scoped_refptr<SpeechRecognizer> me(this);
+  delegate_->SetRecognitionResult(render_view_id_, error, value);
+  delegate_->DidCompleteRecognition(render_view_id_);
+}
+
+}  // namespace speech_input
diff --git a/chrome/browser/speech/speech_recognizer.h b/chrome/browser/speech/speech_recognizer.h
new file mode 100644
index 0000000..fd8c7c4
--- /dev/null
+++ b/chrome/browser/speech/speech_recognizer.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_
+#define CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_
+
+#include "base/ref_counted.h"
+#include "base/scoped_ptr.h"
+#include "media/audio/audio_input_controller.h"
+#include "chrome/browser/speech/speech_recognition_request.h"
+#include <list>
+#include <string>
+
+namespace speech_input {
+
+// Records audio, sends recorded audio to server and translates server response
+// to recognition result.
+//
+// StartRecording, StopRecording and CancelRecognition check (via DCHECK) that
+// they are called on the IO thread. The AudioInputController::EventHandler
+// callbacks OnError and OnData post their work to the IO thread instead of
+// touching the recognizer state directly.
+class SpeechRecognizer
+    : public base::RefCountedThreadSafe<SpeechRecognizer>,
+      public media::AudioInputController::EventHandler,
+      public SpeechRecognitionRequestDelegate {
+ public:
+  // Implemented by the caller to receive recognition events.
+  class Delegate {
+   public:
+    // Invoked when the recognition request reports its outcome. |error| and
+    // |value| are passed through unchanged from the request's callback.
+    virtual void SetRecognitionResult(int render_view_id, bool error,
+                                      const string16& value) = 0;
+
+    // Invoked when audio recording stops, either due to the end pointer
+    // detecting silence in user input or if |StopRecording| was called. The
+    // delegate has to wait until |DidCompleteRecognition| is invoked before
+    // destroying the |SpeechRecognizer| object.
+    virtual void DidCompleteRecording(int render_view_id) = 0;
+
+    // This is guaranteed to be the last method invoked in the recognition
+    // sequence and the |SpeechRecognizer| object can be freed up if necessary.
+    virtual void DidCompleteRecognition(int render_view_id) = 0;
+
+   protected:
+    // Protected so that a delegate cannot be deleted through a Delegate*.
+    virtual ~Delegate() {}
+  };
+
+  // |delegate| receives the recognition events; |render_view_id| is handed
+  // back as the first argument of every delegate callback.
+  SpeechRecognizer(Delegate* delegate, int render_view_id);
+  ~SpeechRecognizer();
+
+  // Starts audio recording and does recognition after recording ends. The same
+  // SpeechRecognizer instance can be used multiple times for speech recognition
+  // though each recognition request can be made only after the previous one
+  // completes (i.e. after receiving Delegate::DidCompleteRecognition).
+  bool StartRecording();
+
+  // Stops recording audio and starts recognition.
+  void StopRecording();
+
+  // Stops recording audio and cancels recognition. Any audio recorded so far
+  // gets discarded.
+  void CancelRecognition();
+
+  // AudioInputController::EventHandler methods.
+  // OnCreated and OnRecording are intentionally empty.
+  void OnCreated(media::AudioInputController* controller) { }
+  void OnRecording(media::AudioInputController* controller) { }
+  // Posts the error code to HandleOnError on the IO thread.
+  void OnError(media::AudioInputController* controller, int error_code);
+  // Posts a copy of the |size| bytes at |data| to HandleOnData on the IO
+  // thread. Packets with |size| == 0 are ignored.
+  void OnData(media::AudioInputController* controller, const uint8* data,
+              uint32 size);
+
+  // SpeechRecognitionRequest::Delegate methods.
+  void SetRecognitionResult(bool error, const string16& value);
+
+ private:
+  // Deletes all strings held in |audio_buffers_| and clears the list.
+  void ReleaseAudioBuffers();
+
+  void HandleOnError(int error_code);  // Handles OnError in the IO thread.
+
+  // Handles OnData in the IO thread. Takes ownership of |data|.
+  void HandleOnData(std::string* data);
+
+  // Receiver of the recognition events. Held as a raw pointer; Delegate's
+  // destructor is protected, so it is never deleted through this pointer.
+  Delegate* delegate_;
+  // Passed back to |delegate_| with every callback.
+  int render_view_id_;
+
+  // Buffer holding the recorded audio. Owns the strings inside the list.
+  typedef std::list<std::string*> AudioBufferQueue;
+  AudioBufferQueue audio_buffers_;
+
+  // The recognition request created by StopRecording once audio is available;
+  // destroyed by CancelRecognition.
+  scoped_ptr<SpeechRecognitionRequest> request_;
+  // Non-NULL only while recording is in progress: set by StartRecording and
+  // cleared by StopRecording and CancelRecognition.
+  scoped_refptr<media::AudioInputController> audio_controller_;
+
+  DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer);
+};
+
+// This typedef works around an issue with certain versions of Visual Studio,
+// which get confused between multiple Delegate classes and report a C2500
+// error. (The error showed up on the try bots; the workaround was not needed
+// on the author's machine.)
+typedef SpeechRecognizer::Delegate SpeechRecognizerDelegate;
+
+}  // namespace speech_input
+
+#endif  // CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_
diff --git a/chrome/browser/speech/speech_recognizer_unittest.cc b/chrome/browser/speech/speech_recognizer_unittest.cc
new file mode 100644
index 0000000..1068364
--- /dev/null
+++ b/chrome/browser/speech/speech_recognizer_unittest.cc
@@ -0,0 +1,167 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/chrome_thread.h"
+#include "chrome/browser/speech/speech_recognizer.h"
+#include "chrome/common/net/test_url_fetcher_factory.h"
+#include "media/audio/test_audio_input_controller_factory.h"
+#include "net/url_request/url_request_status.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using media::AudioInputController;
+using media::TestAudioInputController;
+using media::TestAudioInputControllerFactory;
+
+namespace speech_input {
+
+// Test fixture that also acts as the SpeechRecognizer's delegate. Each delegate
+// callback just records that it fired, so the tests can check which callbacks
+// were made for a given sequence of recognizer calls.
+class SpeechRecognizerTest : public SpeechRecognizerDelegate,
+                             public testing::Test {
+ public:
+  // |io_thread_| is bound to |message_loop_| with the IO thread id; presumably
+  // this is what lets the recognizer's IO thread DCHECKs pass in the test.
+  // The recognizer is created with this fixture as delegate and 1 as render
+  // view id.
+  SpeechRecognizerTest()
+      : io_thread_(ChromeThread::IO, &message_loop_),
+        ALLOW_THIS_IN_INITIALIZER_LIST(
+            recognizer_(new SpeechRecognizer(this, 1))) {
+  }
+
+  // Convenience helper that starts recording and expects it to succeed. Not
+  // used by the tests below.
+  void StartTest() {
+    EXPECT_TRUE(recognizer_->StartRecording());
+  }
+
+  // SpeechRecognizer::Delegate methods.
+  // Each one only sets the matching flag; the arguments are ignored.
+  virtual void SetRecognitionResult(int render_view_id, bool error,
+                                    const string16& result) {
+    result_received_ = true;
+  }
+
+  virtual void DidCompleteRecording(int render_view_id) {
+    recording_complete_ = true;
+  }
+
+  virtual void DidCompleteRecognition(int render_view_id) {
+    recognition_complete_ = true;
+  }
+
+  // testing::Test methods.
+  // Resets the callback flags and installs the test factories for URLFetcher
+  // and AudioInputController.
+  virtual void SetUp() {
+    result_received_ = false;
+    recording_complete_ = false;
+    recognition_complete_ = false;
+    URLFetcher::set_factory(&url_fetcher_factory_);
+    AudioInputController::set_factory(&audio_input_controller_factory_);
+  }
+
+  // Uninstalls the test factories again.
+  virtual void TearDown() {
+    URLFetcher::set_factory(NULL);
+    AudioInputController::set_factory(NULL);
+  }
+
+ protected:
+  MessageLoopForIO message_loop_;
+  ChromeThread io_thread_;
+  scoped_refptr<SpeechRecognizer> recognizer_;
+  // Flags set by the delegate callbacks above; all are false after SetUp.
+  bool recording_complete_;
+  bool recognition_complete_;
+  bool result_received_;
+  // Test factories; the tests query them for the fetcher and the audio
+  // controller that were created while the recognizer ran.
+  TestURLFetcherFactory url_fetcher_factory_;
+  TestAudioInputControllerFactory audio_input_controller_factory_;
+};
+
+TEST_F(SpeechRecognizerTest, StopNoData) {
+  // Check for callbacks when stopping record before any audio gets recorded.
+  // Both completion callbacks fire, but no recognition result is delivered.
+  EXPECT_TRUE(recognizer_->StartRecording());
+  recognizer_->StopRecording();
+  EXPECT_TRUE(recording_complete_);
+  EXPECT_TRUE(recognition_complete_);
+  EXPECT_FALSE(result_received_);
+}
+
+TEST_F(SpeechRecognizerTest, CancelNoData) {
+  // Check for callbacks when canceling recognition before any audio gets
+  // recorded. Canceling must not trigger any delegate callback.
+  EXPECT_TRUE(recognizer_->StartRecording());
+  recognizer_->CancelRecognition();
+  EXPECT_FALSE(recording_complete_);
+  EXPECT_FALSE(recognition_complete_);
+  EXPECT_FALSE(result_received_);
+}
+
+TEST_F(SpeechRecognizerTest, StopWithData) {
+  uint8 data[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+
+  // Start recording, give some data and then stop. This should wait for the
+  // network callback to arrive before completion.
+  EXPECT_TRUE(recognizer_->StartRecording());
+  TestAudioInputController* controller =
+      audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+  controller->event_handler()->OnData(controller, data, sizeof(data));
+  // OnData only posts the audio to the IO thread; run the pending task so the
+  // recognizer has buffered it before recording is stopped.
+  MessageLoop::current()->RunAllPending();
+  recognizer_->StopRecording();
+  EXPECT_TRUE(recording_complete_);
+  EXPECT_FALSE(recognition_complete_);
+  EXPECT_FALSE(result_received_);
+
+  // Issue the network callback to complete the process.
+  TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
+  ASSERT_TRUE(fetcher);
+  URLRequestStatus status;
+  status.set_status(URLRequestStatus::SUCCESS);
+  fetcher->delegate()->OnURLFetchComplete(fetcher, fetcher->original_url(),
+                                          status, 200, ResponseCookies(), "");
+  EXPECT_TRUE(recognition_complete_);
+  EXPECT_TRUE(result_received_);
+}
+
+TEST_F(SpeechRecognizerTest, CancelWithData) {
+  // Feed one audio packet to the recognizer and then cancel. The audio has to
+  // be thrown away: no network request gets created and none of the delegate
+  // callbacks fire.
+  uint8 audio_packet[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+
+  EXPECT_TRUE(recognizer_->StartRecording());
+  TestAudioInputController* controller =
+      audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+  controller->event_handler()->OnData(controller, audio_packet,
+                                      sizeof(audio_packet));
+  // Let the posted audio reach the recognizer before canceling.
+  MessageLoop::current()->RunAllPending();
+  recognizer_->CancelRecognition();
+  EXPECT_EQ(NULL, url_fetcher_factory_.GetFetcherByID(0));
+  EXPECT_FALSE(recording_complete_);
+  EXPECT_FALSE(recognition_complete_);
+  EXPECT_FALSE(result_received_);
+}
+
+TEST_F(SpeechRecognizerTest, AudioControllerErrorNoData) {
+  // Check if things tear down properly if AudioInputController threw an error.
+  EXPECT_TRUE(recognizer_->StartRecording());
+  TestAudioInputController* controller =
+      audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+  controller->event_handler()->OnError(controller, 0);
+  // OnError only posts the error to the IO thread; run the pending task so
+  // that the recognizer actually handles it.
+  MessageLoop::current()->RunAllPending();
+  // Both completion callbacks are expected, but no recognition result.
+  EXPECT_TRUE(recording_complete_);
+  EXPECT_TRUE(recognition_complete_);
+  EXPECT_FALSE(result_received_);
+}
+
+TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) {
+  // Check if things tear down properly if AudioInputController threw an error
+  // after giving some audio data. The buffered audio must not be sent: no
+  // network request is created and no result is reported, but both completion
+  // callbacks fire.
+  uint8 audio_packet[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+
+  EXPECT_TRUE(recognizer_->StartRecording());
+  TestAudioInputController* controller =
+      audio_input_controller_factory_.controller();
+  ASSERT_TRUE(controller);
+  // Both events are queued first and only handled once the loop runs.
+  controller->event_handler()->OnData(controller, audio_packet,
+                                      sizeof(audio_packet));
+  controller->event_handler()->OnError(controller, 0);
+  MessageLoop::current()->RunAllPending();
+  EXPECT_EQ(NULL, url_fetcher_factory_.GetFetcherByID(0));
+  EXPECT_TRUE(recording_complete_);
+  EXPECT_TRUE(recognition_complete_);
+  EXPECT_FALSE(result_received_);
+}
+
+}  // namespace speech_input
author	satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-08-12 19:57:31 +0000
committer	satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-08-12 19:57:31 +0000
commit	4b9a8c385aaa926cedf5ba7c213c0fc247ef2cb4 (patch)
tree	50d0565e66f421d6ce0b734e1c18eb614b7f5a06 /chrome/browser/speech
parent	df9a4de661636de81fccb6cfa552de94e84efa50 (diff)
download	chromium_src-4b9a8c385aaa926cedf5ba7c213c0fc247ef2cb4.zip chromium_src-4b9a8c385aaa926cedf5ba7c213c0fc247ef2cb4.tar.gz chromium_src-4b9a8c385aaa926cedf5ba7c213c0fc247ef2cb4.tar.bz2