author    | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000
committer | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000
commit    | 50fab53bddb2c3cb24d5682c913a03226ccf49ef (patch)
tree      | bb04af83ca5f2be010e32c2e10cfd245117a4847 /content/browser/speech/speech_recognizer.cc
parent    | 5c557f37629dc12dfd99e8fb55c235c8c46a8098 (diff)
Move core pieces of speech from chrome to content.
TBR=satish
Review URL: http://codereview.chromium.org/6591024
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@76165 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content/browser/speech/speech_recognizer.cc')
-rw-r--r-- | content/browser/speech/speech_recognizer.cc | 262
1 file changed, 262 insertions, 0 deletions
diff --git a/content/browser/speech/speech_recognizer.cc b/content/browser/speech/speech_recognizer.cc
new file mode 100644
index 0000000..fdc1a4c
--- /dev/null
+++ b/content/browser/speech/speech_recognizer.cc
@@ -0,0 +1,262 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/browser/speech/speech_recognizer.h"
+
+#include "base/time.h"
+#include "chrome/browser/profiles/profile.h"
+#include "chrome/common/net/url_request_context_getter.h"
+#include "content/browser/browser_thread.h"
+
+using media::AudioInputController;
+using std::string;
+
+namespace {
+
+// The following constants are related to the volume level indicator shown in
+// the UI for recorded audio.
+// Multiplier used when new volume is greater than previous level.
+const float kUpSmoothingFactor = 0.9f;
+// Multiplier used when new volume is lesser than previous level.
+const float kDownSmoothingFactor = 0.4f;
+const float kAudioMeterMinDb = 10.0f;  // Lower bar for volume meter.
+const float kAudioMeterDbRange = 25.0f;
+}  // namespace
+
+namespace speech_input {
+
+const int SpeechRecognizer::kAudioSampleRate = 16000;
+const int SpeechRecognizer::kAudioPacketIntervalMs = 100;
+const int SpeechRecognizer::kNumAudioChannels = 1;
+const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
+const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
+const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;
+
+SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
+                                   int caller_id,
+                                   const std::string& language,
+                                   const std::string& grammar,
+                                   const std::string& hardware_info,
+                                   const std::string& origin_url)
+    : delegate_(delegate),
+      caller_id_(caller_id),
+      language_(language),
+      grammar_(grammar),
+      hardware_info_(hardware_info),
+      origin_url_(origin_url),
+      codec_(AudioEncoder::CODEC_SPEEX),
+      encoder_(NULL),
+      endpointer_(kAudioSampleRate),
+      num_samples_recorded_(0),
+      audio_level_(0.0f) {
+  endpointer_.set_speech_input_complete_silence_length(
+      base::Time::kMicrosecondsPerSecond / 2);
+  endpointer_.set_long_speech_input_complete_silence_length(
+      base::Time::kMicrosecondsPerSecond);
+  endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
+  endpointer_.StartSession();
+}
+
+SpeechRecognizer::~SpeechRecognizer() {
+  // Recording should have stopped earlier due to the endpointer or
+  // |StopRecording| being called.
+  DCHECK(!audio_controller_.get());
+  DCHECK(!request_.get() || !request_->HasPendingRequest());
+  DCHECK(!encoder_.get());
+  endpointer_.EndSession();
+}
+
+bool SpeechRecognizer::StartRecording() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  DCHECK(!audio_controller_.get());
+  DCHECK(!request_.get() || !request_->HasPendingRequest());
+  DCHECK(!encoder_.get());
+
+  // The endpointer needs to estimate the environment/background noise before
+  // starting to treat the audio as user input. In |HandleOnData| we wait until
+  // such time has passed before switching to user input mode.
+  endpointer_.SetEnvironmentEstimationMode();
+
+  encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,
+                                      kNumBitsPerAudioSample));
+  int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
+  AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
+                         kAudioSampleRate, kNumBitsPerAudioSample,
+                         samples_per_packet);
+  audio_controller_ = AudioInputController::Create(this, params);
+  DCHECK(audio_controller_.get());
+  VLOG(1) << "SpeechRecognizer starting record.";
+  num_samples_recorded_ = 0;
+  audio_controller_->Record();
+
+  return true;
+}
+
+void SpeechRecognizer::CancelRecognition() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+  DCHECK(audio_controller_.get() || request_.get());
+
+  // Stop recording if required.
+  if (audio_controller_.get()) {
+    VLOG(1) << "SpeechRecognizer stopping record.";
+    audio_controller_->Close();
+    audio_controller_ = NULL;  // Releases the ref ptr.
+  }
+
+  VLOG(1) << "SpeechRecognizer canceling recognition.";
+  encoder_.reset();
+  request_.reset();
+}
+
+void SpeechRecognizer::StopRecording() {
+  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+
+  // If audio recording has already stopped and we are in recognition phase,
+  // silently ignore any more calls to stop recording.
+  if (!audio_controller_.get())
+    return;
+
+  VLOG(1) << "SpeechRecognizer stopping record.";
+  audio_controller_->Close();
+  audio_controller_ = NULL;  // Releases the ref ptr.
+  encoder_->Flush();
+
+  delegate_->DidCompleteRecording(caller_id_);
+
+  // Since the http request takes a single string as POST data, allocate
+  // one and copy over bytes from the audio buffers to the string.
+  // And if we haven't got any audio yet, end the recognition sequence here.
+  string mime_type = encoder_->mime_type();
+  string data;
+  encoder_->GetEncodedData(&data);
+  encoder_.reset();
+
+  if (data.empty()) {
+    // Guard against the delegate freeing us until we finish our job.
+    scoped_refptr<SpeechRecognizer> me(this);
+    delegate_->DidCompleteRecognition(caller_id_);
+  } else {
+    DCHECK(!request_.get());
+    request_.reset(new SpeechRecognitionRequest(
+        Profile::GetDefaultRequestContext(), this));
+    request_->Send(language_, grammar_, hardware_info_, origin_url_,
+                   mime_type, data);
+  }
+}
+
+void SpeechRecognizer::ReleaseAudioBuffers() {
+}
+
+// Invoked in the audio thread.
+void SpeechRecognizer::OnError(AudioInputController* controller,
+                               int error_code) {
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          NewRunnableMethod(this,
+                                            &SpeechRecognizer::HandleOnError,
+                                            error_code));
+}
+
+void SpeechRecognizer::HandleOnError(int error_code) {
+  LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
+
+  // Check if we are still recording before canceling recognition, as
+  // recording might have been stopped after this error was posted to the
+  // queue by |OnError|.
+  if (!audio_controller_.get())
+    return;
+
+  InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
+}
+
+void SpeechRecognizer::OnData(AudioInputController* controller,
+                              const uint8* data, uint32 size) {
+  if (size == 0)  // This could happen when recording stops and is normal.
+    return;
+
+  string* str_data = new string(reinterpret_cast<const char*>(data), size);
+  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+                          NewRunnableMethod(this,
+                                            &SpeechRecognizer::HandleOnData,
+                                            str_data));
+}
+
+void SpeechRecognizer::HandleOnData(string* data) {
+  // Check if we are still recording and if not discard this buffer, as
+  // recording might have been stopped after this buffer was posted to the
+  // queue by |OnData|.
+  if (!audio_controller_.get()) {
+    delete data;
+    return;
+  }
+
+  const short* samples = reinterpret_cast<const short*>(data->data());
+  DCHECK((data->length() % sizeof(short)) == 0);
+  int num_samples = data->length() / sizeof(short);
+
+  encoder_->Encode(samples, num_samples);
+  float rms;
+  endpointer_.ProcessAudio(samples, num_samples, &rms);
+  delete data;
+  num_samples_recorded_ += num_samples;
+
+  if (endpointer_.IsEstimatingEnvironment()) {
+    // Check if we have gathered enough audio for the endpointer to do
+    // environment estimation and should move on to detect speech/end of
+    // speech.
+    if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
+                                  kAudioSampleRate) / 1000) {
+      endpointer_.SetUserInputMode();
+      delegate_->DidCompleteEnvironmentEstimation(caller_id_);
+    }
+    return;  // No more processing since we are still estimating environment.
+  }
+
+  // Check if we have waited too long without hearing any speech.
+  if (!endpointer_.DidStartReceivingSpeech() &&
+      num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
+    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
+    return;
+  }
+
+  // Calculate the input volume to display in the UI, smoothing towards the
+  // new level.
+  float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange;
+  level = std::min(std::max(0.0f, level), 1.0f);
+  if (level > audio_level_) {
+    audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
+  } else {
+    audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
+  }
+  delegate_->SetInputVolume(caller_id_, audio_level_);
+
+  if (endpointer_.speech_input_complete()) {
+    StopRecording();
+  }
+
+  // TODO(satish): Once we have streaming POST, start sending the data received
+  // here as POST chunks.
+}
+
+void SpeechRecognizer::SetRecognitionResult(
+    bool error, const SpeechInputResultArray& result) {
+  if (result.empty()) {
+    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);
+    return;
+  }
+
+  delegate_->SetRecognitionResult(caller_id_, error, result);
+
+  // Guard against the delegate freeing us until we finish our job.
+  scoped_refptr<SpeechRecognizer> me(this);
+  delegate_->DidCompleteRecognition(caller_id_);
+}
+
+void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
+  CancelRecognition();
+
+  // Guard against the delegate freeing us until we finish our job.
+  scoped_refptr<SpeechRecognizer> me(this);
+  delegate_->OnRecognizerError(caller_id_, error);
+}
+
+}  // namespace speech_input
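
As an aside, the volume-meter math in |HandleOnData| reads well in isolation: the RMS value reported by the endpointer is mapped into [0, 1] using kAudioMeterMinDb and kAudioMeterDbRange, then the displayed level is smoothed asymmetrically so the meter rises quickly (kUpSmoothingFactor) and falls more slowly (kDownSmoothingFactor). The following is a minimal standalone sketch of just that calculation; SmoothAudioLevel, the driver loop, and the RMS readings are illustrative inventions, not part of the Chromium source.

// Standalone sketch (illustrative only, not Chromium code) of the UI
// volume-meter smoothing performed in SpeechRecognizer::HandleOnData().
// The constants mirror the ones defined in the diff above; the RMS
// readings in main() are made up.
#include <algorithm>
#include <cstddef>
#include <cstdio>

namespace {

const float kUpSmoothingFactor = 0.9f;    // New volume above current level.
const float kDownSmoothingFactor = 0.4f;  // New volume below current level.
const float kAudioMeterMinDb = 10.0f;     // Lower bar for the volume meter.
const float kAudioMeterDbRange = 25.0f;

// Normalizes an RMS value to [0, 1] and smooths the meter towards it,
// rising faster than it falls, as HandleOnData() does before calling
// delegate_->SetInputVolume().
float SmoothAudioLevel(float audio_level, float rms) {
  float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange;
  level = std::min(std::max(0.0f, level), 1.0f);
  const float factor =
      (level > audio_level) ? kUpSmoothingFactor : kDownSmoothingFactor;
  return audio_level + (level - audio_level) * factor;
}

}  // namespace

int main() {
  // Hypothetical per-packet RMS readings, one per 100 ms audio packet.
  const float rms_values[] = { 12.0f, 30.0f, 35.0f, 20.0f, 11.0f };
  float audio_level = 0.0f;
  for (size_t i = 0; i < sizeof(rms_values) / sizeof(rms_values[0]); ++i) {
    audio_level = SmoothAudioLevel(audio_level, rms_values[i]);
    std::printf("rms=%.1f dB -> meter level %.3f\n",
                rms_values[i], audio_level);
  }
  return 0;
}

For the other constants at the top of the file: each audio packet carries (16000 * 100) / 1000 = 1600 samples, environment estimation completes after 300 ms (4800 samples), and the no-speech timeout fires once 8 * 16000 = 128000 samples have been recorded without detected speech.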