author    | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000
committer | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000
commit    | 50fab53bddb2c3cb24d5682c913a03226ccf49ef (patch)
tree      | bb04af83ca5f2be010e32c2e10cfd245117a4847 /content/browser
parent    | 5c557f37629dc12dfd99e8fb55c235c8c46a8098 (diff)
Move core pieces of speech from chrome to content.
TBR=satish
Review URL: http://codereview.chromium.org/6591024
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@76165 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content/browser')
20 files changed, 3095 insertions, 0 deletions
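The centerpiece of the new directory is the AudioEncoder interface (see the audio_encoder diffs just below). As orientation, a caller would drive it roughly as follows. This is a sketch written against the interface in this change, not code from the CL; the function name `EncodeUtterance`, the codec choice, and the parameter values are illustrative:

```cpp
// Sketch only: feeding recorded PCM through the AudioEncoder interface
// added by this change. Assumes 16-bit mono samples from the recorder.
#include <string>

#include "base/scoped_ptr.h"
#include "content/browser/speech/audio_encoder.h"

void EncodeUtterance(const short* samples, int num_samples, int sampling_rate) {
  scoped_ptr<speech_input::AudioEncoder> encoder(
      speech_input::AudioEncoder::Create(
          speech_input::AudioEncoder::CODEC_FLAC,  // or CODEC_SPEEX
          sampling_rate,
          16 /* bits_per_sample; the Speex encoder ignores this */));

  encoder->Encode(samples, num_samples);  // May be called once per chunk.
  encoder->Flush();                       // Emits any buffered encoded bytes.

  std::string encoded;
  if (encoder->GetEncodedData(&encoded)) {
    // |encoded| holds the compressed stream; encoder->mime_type() gives the
    // matching content type, e.g. "audio/x-flac; rate=16000".
  }
}
```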
diff --git a/content/browser/speech/OWNERS b/content/browser/speech/OWNERS new file mode 100644 index 0000000..2ad1bbd --- /dev/null +++ b/content/browser/speech/OWNERS @@ -0,0 +1 @@ +satish@chromium.org diff --git a/content/browser/speech/audio_encoder.cc b/content/browser/speech/audio_encoder.cc new file mode 100644 index 0000000..c24f45f --- /dev/null +++ b/content/browser/speech/audio_encoder.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/audio_encoder.h" + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/scoped_ptr.h" +#include "base/stl_util-inl.h" +#include "base/string_number_conversions.h" +#include "third_party/flac/flac.h" +#include "third_party/speex/speex.h" + +using std::string; + +namespace { + +//-------------------------------- FLACEncoder --------------------------------- + +const char* const kContentTypeFLAC = "audio/x-flac; rate="; +const int kFLACCompressionLevel = 0; // 0 for speed + +class FLACEncoder : public speech_input::AudioEncoder { + public: + FLACEncoder(int sampling_rate, int bits_per_sample); + virtual ~FLACEncoder(); + virtual void Encode(const short* samples, int num_samples); + virtual void Flush(); + + private: + static FLAC__StreamEncoderWriteStatus WriteCallback( + const FLAC__StreamEncoder* encoder, + const FLAC__byte buffer[], + size_t bytes, + unsigned samples, + unsigned current_frame, + void* client_data); + + FLAC__StreamEncoder* encoder_; + bool is_encoder_initialized_; + + DISALLOW_COPY_AND_ASSIGN(FLACEncoder); +}; + +FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback( + const FLAC__StreamEncoder* encoder, + const FLAC__byte buffer[], + size_t bytes, + unsigned samples, + unsigned current_frame, + void* client_data) { + FLACEncoder* me = static_cast<FLACEncoder*>(client_data); + DCHECK(me->encoder_ == encoder); + me->AppendToBuffer(new string(reinterpret_cast<const char*>(buffer), bytes)); + return FLAC__STREAM_ENCODER_WRITE_STATUS_OK; +} + +FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample) + : AudioEncoder(std::string(kContentTypeFLAC) + + base::IntToString(sampling_rate)), + encoder_(FLAC__stream_encoder_new()), + is_encoder_initialized_(false) { + FLAC__stream_encoder_set_channels(encoder_, 1); + FLAC__stream_encoder_set_bits_per_sample(encoder_, bits_per_sample); + FLAC__stream_encoder_set_sample_rate(encoder_, sampling_rate); + FLAC__stream_encoder_set_compression_level(encoder_, kFLACCompressionLevel); + + // Initializing the encoder will cause sync bytes to be written to + // its output stream, so we wait until the first call to this method + // before doing so. +} + +FLACEncoder::~FLACEncoder() { + FLAC__stream_encoder_delete(encoder_); +} + +void FLACEncoder::Encode(const short* samples, int num_samples) { + if (!is_encoder_initialized_) { + const FLAC__StreamEncoderInitStatus encoder_status = + FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL, + NULL, this); + DCHECK(encoder_status == FLAC__STREAM_ENCODER_INIT_STATUS_OK); + is_encoder_initialized_ = true; + } + + // FLAC encoder wants samples as int32s. 
+ scoped_ptr<FLAC__int32> flac_samples(new FLAC__int32[num_samples]); + FLAC__int32* flac_samples_ptr = flac_samples.get(); + for (int i = 0; i < num_samples; ++i) + flac_samples_ptr[i] = samples[i]; + + FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples); +} + +void FLACEncoder::Flush() { + FLAC__stream_encoder_finish(encoder_); +} + +//-------------------------------- SpeexEncoder -------------------------------- + +const char* const kContentTypeSpeex = "audio/x-speex-with-header-byte; rate="; +const int kSpeexEncodingQuality = 8; +const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). + +// Since the frame length gets written out as a byte in the encoded packet, +// make sure it is within the byte range. +COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); + +class SpeexEncoder : public speech_input::AudioEncoder { + public: + explicit SpeexEncoder(int sampling_rate); + virtual ~SpeexEncoder(); + virtual void Encode(const short* samples, int num_samples); + virtual void Flush() {} + + private: + void* encoder_state_; + SpeexBits bits_; + int samples_per_frame_; + char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. + DISALLOW_COPY_AND_ASSIGN(SpeexEncoder); +}; + +SpeexEncoder::SpeexEncoder(int sampling_rate) + : AudioEncoder(std::string(kContentTypeSpeex) + + base::IntToString(sampling_rate)) { + // speex_bits_init() does not initialize all of the |bits_| struct. + memset(&bits_, 0, sizeof(bits_)); + speex_bits_init(&bits_); + encoder_state_ = speex_encoder_init(&speex_wb_mode); + DCHECK(encoder_state_); + speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); + DCHECK(samples_per_frame_ > 0); + int quality = kSpeexEncodingQuality; + speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); + int vbr = 1; + speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); + memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); +} + +SpeexEncoder::~SpeexEncoder() { + speex_bits_destroy(&bits_); + speex_encoder_destroy(encoder_state_); +} + +void SpeexEncoder::Encode(const short* samples, int num_samples) { + // Drop incomplete frames, typically those which come in when recording stops. + num_samples -= (num_samples % samples_per_frame_); + for (int i = 0; i < num_samples; i += samples_per_frame_) { + speex_bits_reset(&bits_); + speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), + &bits_); + + // Encode the frame and place the size of the frame as the first byte. This + // is the packet format for MIME type x-speex-with-header-byte. 
+ int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, + kMaxSpeexFrameLength); + encoded_frame_data_[0] = static_cast<char>(frame_length); + AppendToBuffer(new string(encoded_frame_data_, frame_length + 1)); + } +} + +} // namespace + +namespace speech_input { + +AudioEncoder* AudioEncoder::Create(Codec codec, + int sampling_rate, + int bits_per_sample) { + if (codec == CODEC_FLAC) + return new FLACEncoder(sampling_rate, bits_per_sample); + return new SpeexEncoder(sampling_rate); +} + +AudioEncoder::AudioEncoder(const std::string& mime_type) + : mime_type_(mime_type) { +} + +AudioEncoder::~AudioEncoder() { + STLDeleteElements(&audio_buffers_); +} + +bool AudioEncoder::GetEncodedData(std::string* encoded_data) { + if (!audio_buffers_.size()) + return false; + + int audio_buffer_length = 0; + for (AudioBufferQueue::iterator it = audio_buffers_.begin(); + it != audio_buffers_.end(); ++it) { + audio_buffer_length += (*it)->length(); + } + encoded_data->reserve(audio_buffer_length); + for (AudioBufferQueue::iterator it = audio_buffers_.begin(); + it != audio_buffers_.end(); ++it) { + encoded_data->append(*(*it)); + } + + return true; +} + +void AudioEncoder::AppendToBuffer(std::string* item) { + audio_buffers_.push_back(item); +} + +} // namespace speech_input diff --git a/content/browser/speech/audio_encoder.h b/content/browser/speech/audio_encoder.h new file mode 100644 index 0000000..c70bfd0 --- /dev/null +++ b/content/browser/speech/audio_encoder.h @@ -0,0 +1,59 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_AUDIO_ENCODER_H_ +#define CONTENT_BROWSER_SPEECH_AUDIO_ENCODER_H_ + +#include <list> +#include <string> + +#include "base/basictypes.h" + +namespace speech_input { + +// Provides a simple interface to encode raw audio using the various speech +// codecs. +class AudioEncoder { + public: + enum Codec { + CODEC_FLAC, + CODEC_SPEEX, + }; + + static AudioEncoder* Create(Codec codec, + int sampling_rate, + int bits_per_sample); + + virtual ~AudioEncoder(); + + // Encodes each frame of raw audio in |samples| to the internal buffer. Use + // |GetEncodedData| to read the result after this call or when recording + // completes. + virtual void Encode(const short* samples, int num_samples) = 0; + + // Finish encoding and flush any pending encoded bits out. + virtual void Flush() = 0; + + // Copies the encoded audio to the given string. Returns true if the output + // is not empty. + bool GetEncodedData(std::string* encoded_data); + + const std::string& mime_type() { return mime_type_; } + + protected: + AudioEncoder(const std::string& mime_type); + + void AppendToBuffer(std::string* item); + + private: + // Buffer holding the recorded audio. Owns the strings inside the list. + typedef std::list<std::string*> AudioBufferQueue; + AudioBufferQueue audio_buffers_; + std::string mime_type_; + DISALLOW_COPY_AND_ASSIGN(AudioEncoder); +}; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_AUDIO_ENCODER_H_ diff --git a/content/browser/speech/endpointer/endpointer.cc b/content/browser/speech/endpointer/endpointer.cc new file mode 100644 index 0000000..69c79a6 --- /dev/null +++ b/content/browser/speech/endpointer/endpointer.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "content/browser/speech/endpointer/endpointer.h" + +#include "base/time.h" + +using base::Time; + +namespace { +static const int kFrameRate = 50; // 1 frame = 20ms of audio. +} + +namespace speech_input { + +Endpointer::Endpointer(int sample_rate) + : speech_input_possibly_complete_silence_length_us_(-1), + speech_input_complete_silence_length_us_(-1), + audio_frame_time_us_(0), + sample_rate_(sample_rate), + frame_size_(0) { + Reset(); + + frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); + + speech_input_minimum_length_us_ = + static_cast<int64>(1.7 * Time::kMicrosecondsPerSecond); + speech_input_complete_silence_length_us_ = + static_cast<int64>(0.5 * Time::kMicrosecondsPerSecond); + long_speech_input_complete_silence_length_us_ = -1; + long_speech_length_us_ = -1; + speech_input_possibly_complete_silence_length_us_ = + 1 * Time::kMicrosecondsPerSecond; + + // Set the default configuration for Push To Talk mode. + EnergyEndpointerParams ep_config; + ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); + ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); + ep_config.set_endpoint_margin(0.2f); + ep_config.set_onset_window(0.15f); + ep_config.set_speech_on_window(0.4f); + ep_config.set_offset_window(0.15f); + ep_config.set_onset_detect_dur(0.09f); + ep_config.set_onset_confirm_dur(0.075f); + ep_config.set_on_maintain_dur(0.10f); + ep_config.set_offset_confirm_dur(0.12f); + ep_config.set_decision_threshold(1000.0f); + ep_config.set_min_decision_threshold(50.0f); + ep_config.set_fast_update_dur(0.2f); + ep_config.set_sample_rate(static_cast<float>(sample_rate)); + ep_config.set_min_fundamental_frequency(57.143f); + ep_config.set_max_fundamental_frequency(400.0f); + ep_config.set_contamination_rejection_period(0.25f); + energy_endpointer_.Init(ep_config); +} + +void Endpointer::Reset() { + old_ep_status_ = EP_PRE_SPEECH; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + speech_previously_detected_ = false; + speech_input_complete_ = false; + audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. + speech_end_time_us_ = -1; + speech_start_time_us_ = -1; +} + +void Endpointer::StartSession() { + Reset(); + energy_endpointer_.StartSession(); +} + +void Endpointer::EndSession() { + energy_endpointer_.EndSession(); +} + +void Endpointer::SetEnvironmentEstimationMode() { + Reset(); + energy_endpointer_.SetEnvironmentEstimationMode(); +} + +void Endpointer::SetUserInputMode() { + energy_endpointer_.SetUserInputMode(); +} + +EpStatus Endpointer::Status(int64 *time) { + return energy_endpointer_.Status(time); +} + +EpStatus Endpointer::ProcessAudio(const int16* audio_data, int num_samples, + float* rms_out) { + EpStatus ep_status = EP_PRE_SPEECH; + + // Process the input data in blocks of frame_size_, dropping any incomplete + // frames at the end (which is ok since typically the caller will be recording + // audio in multiples of our frame size). + int sample_index = 0; + while (sample_index + frame_size_ <= num_samples) { + // Have the endpointer process the frame. + energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, + audio_data + sample_index, + frame_size_, + rms_out); + sample_index += frame_size_; + audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) / + sample_rate_; + + // Get the status of the endpointer. + int64 ep_time; + ep_status = energy_endpointer_.Status(&ep_time); + + // Handle state changes. 
+ if ((EP_SPEECH_PRESENT == ep_status) && + (EP_POSSIBLE_ONSET == old_ep_status_)) { + speech_end_time_us_ = -1; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + // Trigger SpeechInputDidStart event on first detection. + if (false == speech_previously_detected_) { + speech_previously_detected_ = true; + speech_start_time_us_ = ep_time; + } + } + if ((EP_PRE_SPEECH == ep_status) && + (EP_POSSIBLE_OFFSET == old_ep_status_)) { + speech_end_time_us_ = ep_time; + waiting_for_speech_possibly_complete_timeout_ = true; + waiting_for_speech_complete_timeout_ = true; + } + if (ep_time > speech_input_minimum_length_us_) { + // Speech possibly complete timeout. + if ((waiting_for_speech_possibly_complete_timeout_) && + (ep_time - speech_end_time_us_ > + speech_input_possibly_complete_silence_length_us_)) { + waiting_for_speech_possibly_complete_timeout_ = false; + } + if (waiting_for_speech_complete_timeout_) { + // The length of the silence timeout period can be held constant, or it + // can be changed after a fixed amount of time from the beginning of + // speech. + bool has_stepped_silence = + (long_speech_length_us_ > 0) && + (long_speech_input_complete_silence_length_us_ > 0); + int64 requested_silence_length; + if (has_stepped_silence && + (ep_time - speech_start_time_us_) > long_speech_length_us_) { + requested_silence_length = + long_speech_input_complete_silence_length_us_; + } else { + requested_silence_length = + speech_input_complete_silence_length_us_; + } + + // Speech complete timeout. + if ((ep_time - speech_end_time_us_) > requested_silence_length) { + waiting_for_speech_complete_timeout_ = false; + speech_input_complete_ = true; + } + } + } + old_ep_status_ = ep_status; + } + return ep_status; +} + +} // namespace speech diff --git a/content/browser/speech/endpointer/endpointer.h b/content/browser/speech/endpointer/endpointer.h new file mode 100644 index 0000000..be4bd65 --- /dev/null +++ b/content/browser/speech/endpointer/endpointer.h @@ -0,0 +1,148 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ + +#include "base/basictypes.h" +#include "content/browser/speech/endpointer/energy_endpointer.h" + +class EpStatus; + +namespace speech_input { + +// A simple interface to the underlying energy-endpointer implementation, this +// class lets callers provide audio as being recorded and let them poll to find +// when the user has stopped speaking. +// +// There are two events that may trigger the end of speech: +// +// speechInputPossiblyComplete event: +// +// Signals that silence/noise has been detected for a *short* amount of +// time after some speech has been detected. It can be used for low latency +// UI feedback. To disable it, set it to a large amount. +// +// speechInputComplete event: +// +// This event is intended to signal end of input and to stop recording. +// The amount of time to wait after speech is set by +// speech_input_complete_silence_length_ and optionally two other +// parameters (see below). +// This time can be held constant, or can change as more speech is detected. +// In the latter case, the time changes after a set amount of time from the +// *beginning* of speech. 
This is motivated by the expectation that there
+// will be two distinct types of inputs: short search queries and longer
+// dictation style input.
+//
+// Three parameters are used to define the piecewise constant timeout function.
+// The timeout length is speech_input_complete_silence_length until
+// long_speech_length, when it changes to
+// long_speech_input_complete_silence_length.
+class Endpointer {
+ public:
+  explicit Endpointer(int sample_rate);
+
+  // Start the endpointer. This should be called at the beginning of a session.
+  void StartSession();
+
+  // Stop the endpointer.
+  void EndSession();
+
+  // Start environment estimation. Audio will be used for environment
+  // estimation, i.e. noise level estimation.
+  void SetEnvironmentEstimationMode();
+
+  // Start user input. This should be called when the user indicates start of
+  // input, e.g. by pressing a button.
+  void SetUserInputMode();
+
+  // Process a segment of audio, which may be more than one frame.
+  // The status of the last frame will be returned.
+  EpStatus ProcessAudio(const int16* audio_data, int num_samples,
+                        float* rms_out);
+
+  // Get the status of the endpointer.
+  EpStatus Status(int64* time_us);
+
+  // Returns true if the endpointer detected reasonable audio levels above
+  // background noise which could be user speech, false if not.
+  bool DidStartReceivingSpeech() const {
+    return speech_previously_detected_;
+  }
+
+  bool IsEstimatingEnvironment() const {
+    return energy_endpointer_.estimating_environment();
+  }
+
+  void set_speech_input_complete_silence_length(int64 time_us) {
+    speech_input_complete_silence_length_us_ = time_us;
+  }
+
+  void set_long_speech_input_complete_silence_length(int64 time_us) {
+    long_speech_input_complete_silence_length_us_ = time_us;
+  }
+
+  void set_speech_input_possibly_complete_silence_length(int64 time_us) {
+    speech_input_possibly_complete_silence_length_us_ = time_us;
+  }
+
+  void set_long_speech_length(int64 time_us) {
+    long_speech_length_us_ = time_us;
+  }
+
+  bool speech_input_complete() const {
+    return speech_input_complete_;
+  }
+
+ private:
+  // Reset internal states. Helper method common to the initial input
+  // utterance and following input utterances.
+  void Reset();
+
+  // Minimum allowable length of speech input.
+  int64 speech_input_minimum_length_us_;
+
+  // The speechInputPossiblyComplete event signals that silence/noise has been
+  // detected for a *short* amount of time after some speech has been detected.
+  // This property specifies the time period.
+  int64 speech_input_possibly_complete_silence_length_us_;
+
+  // The speechInputComplete event signals that silence/noise has been
+  // detected for a *long* amount of time after some speech has been detected.
+  // This property specifies the time period.
+  int64 speech_input_complete_silence_length_us_;
+
+  // Same as above, this specifies the required silence period after speech
+  // detection. This period is used instead of
+  // speech_input_complete_silence_length_ when the utterance is longer than
+  // long_speech_length_. This parameter is optional.
+  int64 long_speech_input_complete_silence_length_us_;
+
+  // The period of time after which the endpointer should consider
+  // long_speech_input_complete_silence_length_ as a valid silence period
+  // instead of speech_input_complete_silence_length_. This parameter is
+  // optional.
+  int64 long_speech_length_us_;
+
+  // First speech onset time, used in determination of speech complete timeout.
+  int64 speech_start_time_us_;
+
+  // Most recent end time, used in determination of speech complete timeout.
+  int64 speech_end_time_us_;
+
+  int64 audio_frame_time_us_;
+  EpStatus old_ep_status_;
+  bool waiting_for_speech_possibly_complete_timeout_;
+  bool waiting_for_speech_complete_timeout_;
+  bool speech_previously_detected_;
+  bool speech_input_complete_;
+  EnergyEndpointer energy_endpointer_;
+  int sample_rate_;
+  int32 frame_size_;
+};
+
+}  // namespace speech_input
+
+#endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
diff --git a/content/browser/speech/endpointer/endpointer_unittest.cc b/content/browser/speech/endpointer/endpointer_unittest.cc
new file mode 100644
index 0000000..3d1583e
--- /dev/null
+++ b/content/browser/speech/endpointer/endpointer_unittest.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/task.h"
+#include "content/browser/speech/endpointer/endpointer.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace {
+const int kFrameRate = 50;  // 20 ms long frames for AMR encoding.
+const int kSampleRate = 8000;  // 8 k samples per second for AMR encoding.
+
+// At 8000 samples per second a 20 ms frame is 160 samples, which corresponds
+// to the AMR codec.
+const int kFrameSize = kSampleRate / kFrameRate;  // 160 samples.
+COMPILE_ASSERT(kFrameSize == 160, invalid_frame_size);
+}
+
+namespace speech_input {
+
+class FrameProcessor {
+ public:
+  // Process a single frame of test audio samples.
+  virtual EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) = 0;
+};
+
+void RunEndpointerEventsTest(FrameProcessor* processor) {
+  int16 samples[kFrameSize];
+
+  // We will create a white noise signal of 150 frames. The frames from 50 to
+  // 100 will have more power, and the endpointer should fire on those frames.
+  const int kNumFrames = 150;
+
+  // Create a random sequence of samples.
+  srand(1);
+  float gain = 0.0;
+  int64 time = 0;
+  for (int frame_count = 0; frame_count < kNumFrames; ++frame_count) {
+    // The frames from 50 to 100 will have more power, and the endpointer
+    // should detect those frames as speech.
+    if ((frame_count >= 50) && (frame_count < 100)) {
+      gain = 2000.0;
+    } else {
+      gain = 1.0;
+    }
+    // Create random samples.
+    for (int i = 0; i < kFrameSize; ++i) {
+      float randNum = static_cast<float>(rand() - (RAND_MAX / 2)) /
+                      static_cast<float>(RAND_MAX);
+      samples[i] = static_cast<int16>(gain * randNum);
+    }
+
+    EpStatus ep_status = processor->ProcessFrame(time, samples, kFrameSize);
+    time += static_cast<int64>(kFrameSize * (1e6 / kSampleRate));
+
+    // Check the status at key frames.
+    if (20 == frame_count)
+      EXPECT_EQ(EP_PRE_SPEECH, ep_status);
+    if (70 == frame_count)
+      EXPECT_EQ(EP_SPEECH_PRESENT, ep_status);
+    if (120 == frame_count)
+      EXPECT_EQ(EP_PRE_SPEECH, ep_status);
+  }
+}
+
+// This test instantiates and initializes a stand-alone endpointer module.
+// The test creates FrameData objects with random noise and sends them
+// to the endpointer module. The energy of the first 50 frames is low,
+// followed by 50 high energy frames, and another 50 low energy frames.
+// We test that the correct start and end frames were detected.
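As a usage note for the Endpointer interface above: the piecewise silence timeout is configured entirely through the setters, and the TestEmbeddedEndpointerEvents test below exercises exactly this path. Here is a minimal sketch; the function name `DetectEndOfSpeech` and every duration are illustrative assumptions, not defaults from the CL:

```cpp
// Sketch only: wiring up Endpointer's silence timeouts. int16/int64 come
// from base/basictypes.h via the endpointer header.
#include "content/browser/speech/endpointer/endpointer.h"

void DetectEndOfSpeech(const int16* audio, int num_samples) {
  const int kSampleRate = 8000;
  speech_input::Endpointer endpointer(kSampleRate);

  const int64 kMicrosecondsPerMillisecond = 1000;
  // Low-latency "possibly complete" feedback after 300 ms of silence.
  endpointer.set_speech_input_possibly_complete_silence_length(
      300 * kMicrosecondsPerMillisecond);
  // Declare input complete after 500 ms of silence for short utterances...
  endpointer.set_speech_input_complete_silence_length(
      500 * kMicrosecondsPerMillisecond);
  // ...but once the utterance runs past 3 s (dictation-style input), wait
  // for a longer 1 s silence before completing.
  endpointer.set_long_speech_length(3000 * kMicrosecondsPerMillisecond);
  endpointer.set_long_speech_input_complete_silence_length(
      1000 * kMicrosecondsPerMillisecond);

  endpointer.StartSession();
  endpointer.SetUserInputMode();  // E.g. the user pressed the mic button.
  endpointer.ProcessAudio(audio, num_samples, NULL);
  if (endpointer.speech_input_complete()) {
    // Stop recording and hand the captured audio to the recognizer.
  }
  endpointer.EndSession();
}
```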
+class EnergyEndpointerFrameProcessor : public FrameProcessor {
+ public:
+  explicit EnergyEndpointerFrameProcessor(EnergyEndpointer* endpointer)
+      : endpointer_(endpointer) {}
+
+  EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) {
+    endpointer_->ProcessAudioFrame(time, samples, kFrameSize, NULL);
+    int64 ep_time;
+    return endpointer_->Status(&ep_time);
+  }
+
+ private:
+  EnergyEndpointer* endpointer_;
+};
+
+TEST(EndpointerTest, TestEnergyEndpointerEvents) {
+  // Initialize endpointer and configure it. We specify the parameters
+  // here for a 20ms window, and a 20ms step size, which corresponds to
+  // the narrow band AMR codec.
+  EnergyEndpointerParams ep_config;
+  ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
+  ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
+  ep_config.set_endpoint_margin(0.2f);
+  ep_config.set_onset_window(0.15f);
+  ep_config.set_speech_on_window(0.4f);
+  ep_config.set_offset_window(0.15f);
+  ep_config.set_onset_detect_dur(0.09f);
+  ep_config.set_onset_confirm_dur(0.075f);
+  ep_config.set_on_maintain_dur(0.10f);
+  ep_config.set_offset_confirm_dur(0.12f);
+  ep_config.set_decision_threshold(100.0f);
+  EnergyEndpointer endpointer;
+  endpointer.Init(ep_config);
+
+  endpointer.StartSession();
+
+  EnergyEndpointerFrameProcessor frame_processor(&endpointer);
+  RunEndpointerEventsTest(&frame_processor);
+
+  endpointer.EndSession();
+}
+
+// Test endpointer wrapper class.
+class EndpointerFrameProcessor : public FrameProcessor {
+ public:
+  explicit EndpointerFrameProcessor(Endpointer* endpointer)
+      : endpointer_(endpointer) {}
+
+  EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) {
+    endpointer_->ProcessAudio(samples, kFrameSize, NULL);
+    int64 ep_time;
+    return endpointer_->Status(&ep_time);
+  }
+
+ private:
+  Endpointer* endpointer_;
+};
+
+TEST(EndpointerTest, TestEmbeddedEndpointerEvents) {
+  const int kSampleRate = 8000;  // 8 k samples per second for AMR encoding.
+
+  Endpointer endpointer(kSampleRate);
+  const int64 kMicrosecondsPerMillisecond = 1000;
+  const int64 short_timeout = 300 * kMicrosecondsPerMillisecond;
+  endpointer.set_speech_input_possibly_complete_silence_length(short_timeout);
+  const int64 long_timeout = 500 * kMicrosecondsPerMillisecond;
+  endpointer.set_speech_input_complete_silence_length(long_timeout);
+  endpointer.StartSession();
+
+  EndpointerFrameProcessor frame_processor(&endpointer);
+  RunEndpointerEventsTest(&frame_processor);
+
+  endpointer.EndSession();
+}
+
+}  // namespace speech_input
diff --git a/content/browser/speech/endpointer/energy_endpointer.cc b/content/browser/speech/endpointer/energy_endpointer.cc
new file mode 100644
index 0000000..c806aed
--- /dev/null
+++ b/content/browser/speech/endpointer/energy_endpointer.cc
@@ -0,0 +1,369 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To know more about the algorithm used and the original code which this is
+// based on, see
+// https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
+
+#include "content/browser/speech/endpointer/energy_endpointer.h"
+
+#include <math.h>
+
+#include "base/logging.h"
+
+namespace {
+
+// Returns the RMS (quadratic mean) of the input signal.
+float RMS(const int16* samples, int num_samples) { + int64 ssq_int64 = 0; + int64 sum_int64 = 0; + for (int i = 0; i < num_samples; ++i) { + sum_int64 += samples[i]; + ssq_int64 += samples[i] * samples[i]; + } + // now convert to floats. + double sum = static_cast<double>(sum_int64); + sum /= num_samples; + double ssq = static_cast<double>(ssq_int64); + return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); +} + +int64 Secs2Usecs(float seconds) { + return static_cast<int64>(0.5 + (1.0e6 * seconds)); +} + +} // namespace + +namespace speech_input { + +// Stores threshold-crossing histories for making decisions about the speech +// state. +class EnergyEndpointer::HistoryRing { + public: + HistoryRing() : insertion_index_(0) {} + + // Resets the ring to |size| elements each with state |initial_state| + void SetRing(int size, bool initial_state); + + // Inserts a new entry into the ring and drops the oldest entry. + void Insert(int64 time_us, bool decision); + + // Returns the time in microseconds of the most recently added entry. + int64 EndTime() const; + + // Returns the sum of all intervals during which 'decision' is true within + // the time in seconds specified by 'duration'. The returned interval is + // in seconds. + float RingSum(float duration_sec); + + private: + struct DecisionPoint { + int64 time_us; + bool decision; + }; + + std::vector<DecisionPoint> decision_points_; + int insertion_index_; // Index at which the next item gets added/inserted. + + DISALLOW_COPY_AND_ASSIGN(HistoryRing); +}; + +void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { + insertion_index_ = 0; + decision_points_.clear(); + DecisionPoint init = { -1, initial_state }; + decision_points_.resize(size, init); +} + +void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) { + decision_points_[insertion_index_].time_us = time_us; + decision_points_[insertion_index_].decision = decision; + insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); +} + +int64 EnergyEndpointer::HistoryRing::EndTime() const { + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + return decision_points_[ind].time_us; +} + +float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { + if (!decision_points_.size()) + return 0.0; + + int64 sum_us = 0; + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + int64 end_us = decision_points_[ind].time_us; + bool is_on = decision_points_[ind].decision; + int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec)); + if (start_us < 0) + start_us = 0; + size_t n_summed = 1; // n points ==> (n-1) intervals + while ((decision_points_[ind].time_us > start_us) && + (n_summed < decision_points_.size())) { + --ind; + if (ind < 0) + ind = decision_points_.size() - 1; + if (is_on) + sum_us += end_us - decision_points_[ind].time_us; + is_on = decision_points_[ind].decision; + end_us = decision_points_[ind].time_us; + n_summed++; + } + + return 1.0e-6f * sum_us; // Returns total time that was super threshold. 
+} + +EnergyEndpointer::EnergyEndpointer() + : status_(EP_PRE_SPEECH), + offset_confirm_dur_sec_(0), + endpointer_time_us_(0), + fast_update_frames_(0), + frame_counter_(0), + max_window_dur_(4.0), + sample_rate_(0), + history_(new HistoryRing()), + decision_threshold_(0), + estimating_environment_(false), + noise_level_(0), + rms_adapt_(0), + start_lag_(0), + end_lag_(0), + user_input_start_time_us_(0) { +} + +EnergyEndpointer::~EnergyEndpointer() { +} + +int EnergyEndpointer::TimeToFrame(float time) const { + return static_cast<int32>(0.5 + (time / params_.frame_period())); +} + +void EnergyEndpointer::Restart(bool reset_threshold) { + status_ = EP_PRE_SPEECH; + user_input_start_time_us_ = 0; + + if (reset_threshold) { + decision_threshold_ = params_.decision_threshold(); + rms_adapt_ = decision_threshold_; + noise_level_ = params_.decision_threshold() / 2.0f; + frame_counter_ = 0; // Used for rapid initial update of levels. + } + + // Set up the memories to hold the history windows. + history_->SetRing(TimeToFrame(max_window_dur_), false); + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; +} + +void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { + params_ = params; + + // Find the longest history interval to be used, and make the ring + // large enough to accommodate that number of frames. NOTE: This + // depends upon ep_frame_period being set correctly in the factory + // that did this instantiation. + max_window_dur_ = params_.onset_window(); + if (params_.speech_on_window() > max_window_dur_) + max_window_dur_ = params_.speech_on_window(); + if (params_.offset_window() > max_window_dur_) + max_window_dur_ = params_.offset_window(); + Restart(true); + + offset_confirm_dur_sec_ = params_.offset_window() - + params_.offset_confirm_dur(); + if (offset_confirm_dur_sec_ < 0.0) + offset_confirm_dur_sec_ = 0.0; + + user_input_start_time_us_ = 0; + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; + // The initial value of the noise and speech levels is inconsequential. + // The level of the first frame will overwrite these values. + noise_level_ = params_.decision_threshold() / 2.0f; + fast_update_frames_ = + static_cast<int64>(params_.fast_update_dur() / params_.frame_period()); + + frame_counter_ = 0; // Used for rapid initial update of levels. 
+
+  sample_rate_ = params_.sample_rate();
+  start_lag_ = static_cast<int>(sample_rate_ /
+                                params_.max_fundamental_frequency());
+  end_lag_ = static_cast<int>(sample_rate_ /
+                              params_.min_fundamental_frequency());
+}
+
+void EnergyEndpointer::StartSession() {
+  Restart(true);
+}
+
+void EnergyEndpointer::EndSession() {
+  status_ = EP_POST_SPEECH;
+}
+
+void EnergyEndpointer::SetEnvironmentEstimationMode() {
+  Restart(true);
+  estimating_environment_ = true;
+}
+
+void EnergyEndpointer::SetUserInputMode() {
+  estimating_environment_ = false;
+  user_input_start_time_us_ = endpointer_time_us_;
+}
+
+void EnergyEndpointer::ProcessAudioFrame(int64 time_us,
+                                         const int16* samples,
+                                         int num_samples,
+                                         float* rms_out) {
+  endpointer_time_us_ = time_us;
+  float rms = RMS(samples, num_samples);
+
+  // Check that this is user input audio vs. pre-input adaptation audio.
+  // Input audio starts when the user indicates start of input, by e.g.
+  // pressing push-to-talk. Audio received prior to that is used to update
+  // noise and speech level estimates.
+  if (!estimating_environment_) {
+    bool decision = false;
+    if ((endpointer_time_us_ - user_input_start_time_us_) <
+        Secs2Usecs(params_.contamination_rejection_period())) {
+      decision = false;
+      DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_;
+    } else {
+      decision = (rms > decision_threshold_);
+    }
+
+    history_->Insert(endpointer_time_us_, decision);
+
+    switch (status_) {
+      case EP_PRE_SPEECH:
+        if (history_->RingSum(params_.onset_window()) >
+            params_.onset_detect_dur()) {
+          status_ = EP_POSSIBLE_ONSET;
+        }
+        break;
+
+      case EP_POSSIBLE_ONSET: {
+        float tsum = history_->RingSum(params_.onset_window());
+        if (tsum > params_.onset_confirm_dur()) {
+          status_ = EP_SPEECH_PRESENT;
+        } else {  // If signal is not maintained, drop back to pre-speech.
+          if (tsum <= params_.onset_detect_dur())
+            status_ = EP_PRE_SPEECH;
+        }
+        break;
+      }
+
+      case EP_SPEECH_PRESENT: {
+        // To induce hysteresis in the state residency, we allow a
+        // smaller residency time in the on_ring than was required to
+        // enter the EP_SPEECH_PRESENT state.
+        float on_time = history_->RingSum(params_.speech_on_window());
+        if (on_time < params_.on_maintain_dur())
+          status_ = EP_POSSIBLE_OFFSET;
+        break;
+      }
+
+      case EP_POSSIBLE_OFFSET:
+        if (history_->RingSum(params_.offset_window()) <=
+            offset_confirm_dur_sec_) {
+          // Note that this offset time may be beyond the end
+          // of the input buffer in a real-time system. It will be up
+          // to the RecognizerSession to decide what to do.
+          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
+        } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
+          if (history_->RingSum(params_.speech_on_window()) >=
+              params_.on_maintain_dur())
+            status_ = EP_SPEECH_PRESENT;
+        }
+        break;
+
+      default:
+        LOG(WARNING) << "Invalid case in switch: " << status_;
+        break;
+    }
+
+    // If this is a quiet, non-speech region, slowly adapt the detection
+    // threshold to be about 6dB above the average RMS.
+    if ((!decision) && (status_ == EP_PRE_SPEECH)) {
+      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
+      rms_adapt_ = decision_threshold_;
+    } else {
+      // If this is in a speech region, adapt the decision threshold to
+      // be about 10dB below the average RMS. If the noise level is high,
+      // the threshold is pushed up.
+      // Adaptation up to a higher level is 5 times faster than decay to
+      // a lower level.
+      if ((status_ == EP_SPEECH_PRESENT) && decision) {
+        if (rms_adapt_ > rms) {
+          rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
+        } else {
+          rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
+        }
+        float target_threshold = 0.3f * rms_adapt_ + noise_level_;
+        decision_threshold_ = (0.90f * decision_threshold_) +
+                              (0.10f * target_threshold);
+      }
+    }
+
+    // Set a floor.
+    if (decision_threshold_ < params_.min_decision_threshold())
+      decision_threshold_ = params_.min_decision_threshold();
+  }
+
+  // Update speech and noise levels.
+  UpdateLevels(rms);
+  ++frame_counter_;
+
+  if (rms_out) {
+    *rms_out = -120.0;
+    if ((noise_level_ > 0.0) && ((rms / noise_level_) > 0.000001))
+      *rms_out = static_cast<float>(20.0 * log10(rms / noise_level_));
+  }
+}
+
+void EnergyEndpointer::UpdateLevels(float rms) {
+  // Update quickly initially. We assume this is noise and that
+  // speech is 6dB above the noise.
+  if (frame_counter_ < fast_update_frames_) {
+    // Alpha increases from 0 to (k-1)/k where k is the number of time
+    // steps in the initial adaptation period.
+    float alpha = static_cast<float>(frame_counter_) /
+                  static_cast<float>(fast_update_frames_);
+    noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
+    DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_
+             << ", fast_update_frames_ " << fast_update_frames_;
+  } else {
+    // Update noise level. The noise level adapts quickly downward, but
+    // slowly upward. The noise_level_ parameter is not currently used
+    // for threshold adaptation. It is used for UI feedback.
+    if (noise_level_ < rms)
+      noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
+    else
+      noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
+  }
+  if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
+    decision_threshold_ = noise_level_ * 2;  // 6dB above noise level.
+    // Set a floor.
+    if (decision_threshold_ < params_.min_decision_threshold())
+      decision_threshold_ = params_.min_decision_threshold();
+  }
+}
+
+EpStatus EnergyEndpointer::Status(int64* status_time) const {
+  *status_time = history_->EndTime();
+  return status_;
+}
+
+}  // namespace speech_input
diff --git a/content/browser/speech/endpointer/energy_endpointer.h b/content/browser/speech/endpointer/energy_endpointer.h
new file mode 100644
index 0000000..b10d8b7
--- /dev/null
+++ b/content/browser/speech/endpointer/energy_endpointer.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// The EnergyEndpointer class finds likely speech onset and offset points.
+//
+// The implementation described here is about the simplest possible.
+// It is based on timings of threshold crossings for overall signal
+// RMS. It is suitable for lightweight applications.
+//
+// As written, the basic idea is that one specifies intervals that
+// must be occupied by super- and sub-threshold energy levels, and
+// defers decisions re onset and offset times until these
+// specifications have been met. Three basic intervals are tested: an
+// onset window, a speech-on window, and an offset window. We require
+// super-threshold to exceed some minimum total durations in the onset
+// and speech-on windows before declaring the speech onset time, and
+// we specify a required sub-threshold residency in the offset window
+// before declaring speech offset.
As the various residency requirements are +// met, the EnergyEndpointer instance assumes various states, and can return the +// ID of these states to the client (see EpStatus below). +// +// The levels of the speech and background noise are continuously updated. It is +// important that the background noise level be estimated initially for +// robustness in noisy conditions. The first frames are assumed to be background +// noise and a fast update rate is used for the noise level. The duration for +// fast update is controlled by the fast_update_dur_ paramter. +// +// If used in noisy conditions, the endpointer should be started and run in the +// EnvironmentEstimation mode, for at least 200ms, before switching to +// UserInputMode. +// Audio feedback contamination can appear in the input audio, if not cut +// out or handled by echo cancellation. Audio feedback can trigger a false +// accept. The false accepts can be ignored by setting +// ep_contamination_rejection_period. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ + +#include <vector> + +#include "base/basictypes.h" +#include "base/scoped_ptr.h" +#include "content/browser/speech/endpointer/energy_endpointer_params.h" + +namespace speech_input { + +// Endpointer status codes +enum EpStatus { + EP_PRE_SPEECH = 10, + EP_POSSIBLE_ONSET, + EP_SPEECH_PRESENT, + EP_POSSIBLE_OFFSET, + EP_POST_SPEECH, +}; + +class EnergyEndpointer { + public: + // The default construction MUST be followed by Init(), before any + // other use can be made of the instance. + EnergyEndpointer(); + virtual ~EnergyEndpointer(); + + void Init(const EnergyEndpointerParams& params); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. + void SetEnvironmentEstimationMode(); + + // Start user input. This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Computes the next input frame and modifies EnergyEndpointer status as + // appropriate based on the computation. + void ProcessAudioFrame(int64 time_us, + const int16* samples, int num_samples, + float* rms_out); + + // Returns the current state of the EnergyEndpointer and the time + // corresponding to the most recently computed frame. + EpStatus Status(int64* status_time_us) const; + + bool estimating_environment() const { + return estimating_environment_; + } + + private: + class HistoryRing; + + // Resets the endpointer internal state. If reset_threshold is true, the + // state will be reset completely, including adaptive thresholds and the + // removal of all history information. + void Restart(bool reset_threshold); + + // Update internal speech and noise levels. + void UpdateLevels(float rms); + + // Returns the number of frames (or frame number) corresponding to + // the 'time' (in seconds). + int TimeToFrame(float time) const; + + EpStatus status_; // The current state of this instance. + float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH + int64 endpointer_time_us_; // Time of the most recently received audio frame. + int64 fast_update_frames_; // Number of frames for initial level adaptation. + int64 frame_counter_; // Number of frames seen. Used for initial adaptation. 
+ float max_window_dur_; // Largest search window size (seconds) + float sample_rate_; // Sampling rate. + + // Ring buffers to hold the speech activity history. + scoped_ptr<HistoryRing> history_; + + // Configuration parameters. + EnergyEndpointerParams params_; + + // RMS which must be exceeded to conclude frame is speech. + float decision_threshold_; + + // Flag to indicate that audio should be used to estimate environment, prior + // to receiving user input. + bool estimating_environment_; + + // Estimate of the background noise level. Used externally for UI feedback. + float noise_level_; + + // An adaptive threshold used to update decision_threshold_ when appropriate. + float rms_adapt_; + + // Start lag corresponds to the highest fundamental frequency. + int start_lag_; + + // End lag corresponds to the lowest fundamental frequency. + int end_lag_; + + // Time when mode switched from environment estimation to user input. This + // is used to time forced rejection of audio feedback contamination. + int64 user_input_start_time_us_; + + DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer); +}; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ diff --git a/content/browser/speech/endpointer/energy_endpointer_params.cc b/content/browser/speech/endpointer/energy_endpointer_params.cc new file mode 100644 index 0000000..e110b24 --- /dev/null +++ b/content/browser/speech/endpointer/energy_endpointer_params.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/endpointer/energy_endpointer_params.h" + +namespace speech_input { + +EnergyEndpointerParams::EnergyEndpointerParams() { + SetDefaults(); +} + +void EnergyEndpointerParams::SetDefaults() { + frame_period_ = 0.01f; + frame_duration_ = 0.01f; + endpoint_margin_ = 0.2f; + onset_window_ = 0.15f; + speech_on_window_ = 0.4f; + offset_window_ = 0.15f; + onset_detect_dur_ = 0.09f; + onset_confirm_dur_ = 0.075f; + on_maintain_dur_ = 0.10f; + offset_confirm_dur_ = 0.12f; + decision_threshold_ = 150.0f; + min_decision_threshold_ = 50.0f; + fast_update_dur_ = 0.2f; + sample_rate_ = 8000.0f; + min_fundamental_frequency_ = 57.143f; + max_fundamental_frequency_ = 400.0f; + contamination_rejection_period_ = 0.25f; +} + +void EnergyEndpointerParams::operator=(const EnergyEndpointerParams& source) { + frame_period_ = source.frame_period(); + frame_duration_ = source.frame_duration(); + endpoint_margin_ = source.endpoint_margin(); + onset_window_ = source.onset_window(); + speech_on_window_ = source.speech_on_window(); + offset_window_ = source.offset_window(); + onset_detect_dur_ = source.onset_detect_dur(); + onset_confirm_dur_ = source.onset_confirm_dur(); + on_maintain_dur_ = source.on_maintain_dur(); + offset_confirm_dur_ = source.offset_confirm_dur(); + decision_threshold_ = source.decision_threshold(); + min_decision_threshold_ = source.min_decision_threshold(); + fast_update_dur_ = source.fast_update_dur(); + sample_rate_ = source.sample_rate(); + min_fundamental_frequency_ = source.min_fundamental_frequency(); + max_fundamental_frequency_ = source.max_fundamental_frequency(); + contamination_rejection_period_ = source.contamination_rejection_period(); +} + +} // namespace speech_input diff --git a/content/browser/speech/endpointer/energy_endpointer_params.h b/content/browser/speech/endpointer/energy_endpointer_params.h new file mode 
100644 index 0000000..5fd923d --- /dev/null +++ b/content/browser/speech/endpointer/energy_endpointer_params.h @@ -0,0 +1,137 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ + +#include "base/basictypes.h" + +namespace speech_input { + +// Input parameters for the EnergyEndpointer class. +class EnergyEndpointerParams { + public: + EnergyEndpointerParams(); + + void SetDefaults(); + + void operator=(const EnergyEndpointerParams& source); + + // Accessors and mutators + float frame_period() const { return frame_period_; } + void set_frame_period(float frame_period) { + frame_period_ = frame_period; + } + + float frame_duration() const { return frame_duration_; } + void set_frame_duration(float frame_duration) { + frame_duration_ = frame_duration; + } + + float endpoint_margin() const { return endpoint_margin_; } + void set_endpoint_margin(float endpoint_margin) { + endpoint_margin_ = endpoint_margin; + } + + float onset_window() const { return onset_window_; } + void set_onset_window(float onset_window) { onset_window_ = onset_window; } + + float speech_on_window() const { return speech_on_window_; } + void set_speech_on_window(float speech_on_window) { + speech_on_window_ = speech_on_window; + } + + float offset_window() const { return offset_window_; } + void set_offset_window(float offset_window) { + offset_window_ = offset_window; + } + + float onset_detect_dur() const { return onset_detect_dur_; } + void set_onset_detect_dur(float onset_detect_dur) { + onset_detect_dur_ = onset_detect_dur; + } + + float onset_confirm_dur() const { return onset_confirm_dur_; } + void set_onset_confirm_dur(float onset_confirm_dur) { + onset_confirm_dur_ = onset_confirm_dur; + } + + float on_maintain_dur() const { return on_maintain_dur_; } + void set_on_maintain_dur(float on_maintain_dur) { + on_maintain_dur_ = on_maintain_dur; + } + + float offset_confirm_dur() const { return offset_confirm_dur_; } + void set_offset_confirm_dur(float offset_confirm_dur) { + offset_confirm_dur_ = offset_confirm_dur; + } + + float decision_threshold() const { return decision_threshold_; } + void set_decision_threshold(float decision_threshold) { + decision_threshold_ = decision_threshold; + } + + float min_decision_threshold() const { return min_decision_threshold_; } + void set_min_decision_threshold(float min_decision_threshold) { + min_decision_threshold_ = min_decision_threshold; + } + + float fast_update_dur() const { return fast_update_dur_; } + void set_fast_update_dur(float fast_update_dur) { + fast_update_dur_ = fast_update_dur; + } + + float sample_rate() const { return sample_rate_; } + void set_sample_rate(float sample_rate) { sample_rate_ = sample_rate; } + + float min_fundamental_frequency() const { return min_fundamental_frequency_; } + void set_min_fundamental_frequency(float min_fundamental_frequency) { + min_fundamental_frequency_ = min_fundamental_frequency; + } + + float max_fundamental_frequency() const { return max_fundamental_frequency_; } + void set_max_fundamental_frequency(float max_fundamental_frequency) { + max_fundamental_frequency_ = max_fundamental_frequency; + } + + float contamination_rejection_period() const { + return contamination_rejection_period_; + } + void set_contamination_rejection_period( + float 
contamination_rejection_period) { + contamination_rejection_period_ = contamination_rejection_period; + } + + private: + float frame_period_; // Frame period + float frame_duration_; // Window size + float onset_window_; // Interval scanned for onset activity + float speech_on_window_; // Inverval scanned for ongoing speech + float offset_window_; // Interval scanned for offset evidence + float offset_confirm_dur_; // Silence duration required to confirm offset + float decision_threshold_; // Initial rms detection threshold + float min_decision_threshold_; // Minimum rms detection threshold + float fast_update_dur_; // Period for initial estimation of levels. + float sample_rate_; // Expected sample rate. + + // Time to add on either side of endpoint threshold crossings + float endpoint_margin_; + // Total dur within onset_window required to enter ONSET state + float onset_detect_dur_; + // Total on time within onset_window required to enter SPEECH_ON state + float onset_confirm_dur_; + // Minimum dur in SPEECH_ON state required to maintain ON state + float on_maintain_dur_; + // Minimum fundamental frequency for autocorrelation. + float min_fundamental_frequency_; + // Maximum fundamental frequency for autocorrelation. + float max_fundamental_frequency_; + // Period after start of user input that above threshold values are ignored. + // This is to reject audio feedback contamination. + float contamination_rejection_period_; +}; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ diff --git a/content/browser/speech/speech_input_browsertest.cc b/content/browser/speech/speech_input_browsertest.cc new file mode 100644 index 0000000..c827f47 --- /dev/null +++ b/content/browser/speech/speech_input_browsertest.cc @@ -0,0 +1,207 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/command_line.h" +#include "base/file_path.h" +#include "base/string_number_conversions.h" +#include "base/utf_string_conversions.h" +#include "chrome/browser/ui/browser.h" +#include "chrome/common/chrome_switches.h" +#include "chrome/test/in_process_browser_test.h" +#include "chrome/test/ui_test_utils.h" +#include "content/browser/renderer_host/render_view_host.h" +#include "content/browser/speech/speech_input_dispatcher_host.h" +#include "content/browser/speech/speech_input_manager.h" +#include "content/browser/tab_contents/tab_contents.h" +#include "third_party/WebKit/Source/WebKit/chromium/public/WebInputEvent.h" + +namespace speech_input { +class FakeSpeechInputManager; +} + +// This class does not need to be refcounted (typically done by PostTask) since +// it will outlive the test and gets released only when the test shuts down. +// Disabling refcounting here saves a bit of unnecessary code and the factory +// method can return a plain pointer below as required by the real code. +DISABLE_RUNNABLE_METHOD_REFCOUNT(speech_input::FakeSpeechInputManager); + +namespace speech_input { + +const char* kTestResult = "Pictures of the moon"; + +class FakeSpeechInputManager : public SpeechInputManager { + public: + FakeSpeechInputManager() + : caller_id_(0), + delegate_(NULL) { + } + + std::string grammar() { + return grammar_; + } + + // SpeechInputManager methods. 
+ virtual void StartRecognition(Delegate* delegate, + int caller_id, + int render_process_id, + int render_view_id, + const gfx::Rect& element_rect, + const std::string& language, + const std::string& grammar, + const std::string& origin_url) { + VLOG(1) << "StartRecognition invoked."; + EXPECT_EQ(0, caller_id_); + EXPECT_EQ(NULL, delegate_); + caller_id_ = caller_id; + delegate_ = delegate; + grammar_ = grammar; + // Give the fake result in a short while. + MessageLoop::current()->PostTask(FROM_HERE, NewRunnableMethod(this, + &FakeSpeechInputManager::SetFakeRecognitionResult)); + } + virtual void CancelRecognition(int caller_id) { + VLOG(1) << "CancelRecognition invoked."; + EXPECT_EQ(caller_id_, caller_id); + caller_id_ = 0; + delegate_ = NULL; + } + virtual void StopRecording(int caller_id) { + VLOG(1) << "StopRecording invoked."; + EXPECT_EQ(caller_id_, caller_id); + // Nothing to do here since we aren't really recording. + } + virtual void CancelAllRequestsWithDelegate(Delegate* delegate) { + VLOG(1) << "CancelAllRequestsWithDelegate invoked."; + } + + private: + void SetFakeRecognitionResult() { + if (caller_id_) { // Do a check in case we were cancelled.. + VLOG(1) << "Setting fake recognition result."; + delegate_->DidCompleteRecording(caller_id_); + SpeechInputResultArray results; + results.push_back(SpeechInputResultItem(ASCIIToUTF16(kTestResult), 1.0)); + delegate_->SetRecognitionResult(caller_id_, results); + delegate_->DidCompleteRecognition(caller_id_); + caller_id_ = 0; + delegate_ = NULL; + VLOG(1) << "Finished setting fake recognition result."; + } + } + + int caller_id_; + Delegate* delegate_; + std::string grammar_; +}; + +class SpeechInputBrowserTest : public InProcessBrowserTest { + public: + // InProcessBrowserTest methods + GURL testUrl(const FilePath::CharType* filename) { + const FilePath kTestDir(FILE_PATH_LITERAL("speech")); + return ui_test_utils::GetTestUrl(kTestDir, FilePath(filename)); + } + + protected: + void LoadAndRunSpeechInputTest(const FilePath::CharType* filename) { + // The test page calculates the speech button's coordinate in the page on + // load & sets that coordinate in the URL fragment. We send mouse down & up + // events at that coordinate to trigger speech recognition. + GURL test_url = testUrl(filename); + ui_test_utils::NavigateToURL(browser(), test_url); + std::string coords = browser()->GetSelectedTabContents()->GetURL().ref(); + VLOG(1) << "Coordinates given by script: " << coords; + int comma_pos = coords.find(','); + ASSERT_NE(-1, comma_pos); + int x = 0; + ASSERT_TRUE(base::StringToInt(coords.substr(0, comma_pos).c_str(), &x)); + int y = 0; + ASSERT_TRUE(base::StringToInt(coords.substr(comma_pos + 1).c_str(), &y)); + + WebKit::WebMouseEvent mouse_event; + mouse_event.type = WebKit::WebInputEvent::MouseDown; + mouse_event.button = WebKit::WebMouseEvent::ButtonLeft; + mouse_event.x = x; + mouse_event.y = y; + mouse_event.clickCount = 1; + TabContents* tab_contents = browser()->GetSelectedTabContents(); + tab_contents->render_view_host()->ForwardMouseEvent(mouse_event); + mouse_event.type = WebKit::WebInputEvent::MouseUp; + tab_contents->render_view_host()->ForwardMouseEvent(mouse_event); + + // The fake speech input manager would receive the speech input + // request and return the test string as recognition result. The test page + // then sets the URL fragment as 'pass' if it received the expected string. 
+ ui_test_utils::WaitForNavigations(&tab_contents->controller(), 1);
+ EXPECT_EQ("pass", browser()->GetSelectedTabContents()->GetURL().ref());
+ }
+
+ // InProcessBrowserTest methods.
+ virtual void SetUpInProcessBrowserTestFixture() {
+ speech_input_manager_ = &fake_speech_input_manager_;
+
+ // Inject the fake manager factory so that the test result is returned to
+ // the web page.
+ SpeechInputDispatcherHost::set_manager_accessor(&fakeManagerAccessor);
+ }
+
+ virtual void TearDownInProcessBrowserTestFixture() {
+ speech_input_manager_ = NULL;
+ }
+
+ // Factory method.
+ static SpeechInputManager* fakeManagerAccessor() {
+ return speech_input_manager_;
+ }
+
+ FakeSpeechInputManager fake_speech_input_manager_;
+
+ // This is used by the static |fakeManagerAccessor|, and it is a pointer
+ // rather than a direct instance per the style guide.
+ static SpeechInputManager* speech_input_manager_;
+};
+
+SpeechInputManager* SpeechInputBrowserTest::speech_input_manager_ = NULL;
+
+// Marked as FLAKY due to http://crbug.com/51337
+//
+// TODO(satish): Once this flakiness has been fixed, add a second test here to
+// check for sending many clicks in succession to the speech button and verify
+// that it doesn't cause any crashes and works as expected. This should act as
+// the test for http://crbug.com/59173
+//
+// TODO(satish): Similar to above, once this flakiness has been fixed add
+// another test here to check that when speech recognition is in progress and
+// a renderer crashes, we get a call to
+// SpeechInputManager::CancelAllRequestsWithDelegate.
+//
+// Marked as DISABLED due to http://crbug.com/71227
+#if defined(GOOGLE_CHROME_BUILD)
+#define MAYBE_TestBasicRecognition DISABLED_TestBasicRecognition
+#elif defined(OS_WIN)
+#define MAYBE_TestBasicRecognition FLAKY_TestBasicRecognition
+#else
+#define MAYBE_TestBasicRecognition TestBasicRecognition
+#endif
+IN_PROC_BROWSER_TEST_F(SpeechInputBrowserTest, MAYBE_TestBasicRecognition) {
+ LoadAndRunSpeechInputTest(FILE_PATH_LITERAL("basic_recognition.html"));
+ EXPECT_TRUE(fake_speech_input_manager_.grammar().empty());
+}
+
+// Marked as FLAKY due to http://crbug.com/51337
+// Marked as DISABLED due to http://crbug.com/71227
+#if defined(GOOGLE_CHROME_BUILD)
+#define MAYBE_GrammarAttribute DISABLED_GrammarAttribute
+#elif defined(OS_WIN)
+#define MAYBE_GrammarAttribute FLAKY_GrammarAttribute
+#else
+#define MAYBE_GrammarAttribute GrammarAttribute
+#endif
+IN_PROC_BROWSER_TEST_F(SpeechInputBrowserTest, MAYBE_GrammarAttribute) {
+ LoadAndRunSpeechInputTest(FILE_PATH_LITERAL("grammar_attribute.html"));
+ EXPECT_EQ("http://example.com/grammar.xml",
+ fake_speech_input_manager_.grammar());
+}
+
+} // namespace speech_input
diff --git a/content/browser/speech/speech_input_dispatcher_host.cc b/content/browser/speech/speech_input_dispatcher_host.cc
new file mode 100644
index 0000000..84e2a95
--- /dev/null
+++ b/content/browser/speech/speech_input_dispatcher_host.cc
@@ -0,0 +1,225 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
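+
+// Editor's sketch of the injection seam exercised by the browser test above
+// (|g_fake| is a hypothetical fake instance, not part of this file):
+//   static SpeechInputManager* FakeAccessor() { return &g_fake; }
+//   SpeechInputDispatcherHost::set_manager_accessor(&FakeAccessor);
+// With the accessor swapped, manager() below returns the fake instead of the
+// real SpeechInputManager::Get().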
+ +#include "content/browser/speech/speech_input_dispatcher_host.h" + +#include "base/lazy_instance.h" +#include "chrome/common/speech_input_messages.h" + +namespace speech_input { + +//----------------------------- SpeechInputCallers ----------------------------- + +// A singleton class to map the tuple +// (render-process-id, render-view-id, requestid) to a single ID which is passed +// through rest of the speech code. +class SpeechInputDispatcherHost::SpeechInputCallers { + public: + // Creates a new ID for a given tuple. + int CreateId(int render_process_id, int render_view_id, int request_id); + + // Returns the ID for a tuple assuming the ID was created earlier. + int GetId(int render_process_id, int render_view_id, int request_id); + + // Removes the ID and associated tuple from the map. + void RemoveId(int id); + + // Getters for the various tuple elements for the given ID. + int render_process_id(int id); + int render_view_id(int id); + int request_id(int id); + + private: + struct CallerInfo { + int render_process_id; + int render_view_id; + int request_id; + }; + friend struct base::DefaultLazyInstanceTraits<SpeechInputCallers>; + + SpeechInputCallers(); + + std::map<int, CallerInfo> callers_; + int next_id_; +}; + +static base::LazyInstance<SpeechInputDispatcherHost::SpeechInputCallers> + g_speech_input_callers(base::LINKER_INITIALIZED); + +SpeechInputDispatcherHost::SpeechInputCallers::SpeechInputCallers() + : next_id_(1) { +} + +int SpeechInputDispatcherHost::SpeechInputCallers::GetId(int render_process_id, + int render_view_id, + int request_id) { + for (std::map<int, CallerInfo>::iterator it = callers_.begin(); + it != callers_.end(); it++) { + const CallerInfo& item = it->second; + if (item.render_process_id == render_process_id && + item.render_view_id == render_view_id && + item.request_id == request_id) { + return it->first; + } + } + + // Not finding an entry here is valid since a cancel/stop may have been issued + // by the renderer and before it received our response the user may have + // clicked the button to stop again. The caller of this method should take + // care of this case. + return 0; +} + +int SpeechInputDispatcherHost::SpeechInputCallers::CreateId( + int render_process_id, + int render_view_id, + int request_id) { + CallerInfo info; + info.render_process_id = render_process_id; + info.render_view_id = render_view_id; + info.request_id = request_id; + callers_[next_id_] = info; + return next_id_++; +} + +void SpeechInputDispatcherHost::SpeechInputCallers::RemoveId(int id) { + callers_.erase(id); +} + +int SpeechInputDispatcherHost::SpeechInputCallers::render_process_id(int id) { + return callers_[id].render_process_id; +} + +int SpeechInputDispatcherHost::SpeechInputCallers::render_view_id(int id) { + return callers_[id].render_view_id; +} + +int SpeechInputDispatcherHost::SpeechInputCallers::request_id(int id) { + return callers_[id].request_id; +} + +//-------------------------- SpeechInputDispatcherHost ------------------------- + +SpeechInputManager::AccessorMethod* + SpeechInputDispatcherHost::manager_accessor_ = &SpeechInputManager::Get; + +SpeechInputDispatcherHost::SpeechInputDispatcherHost(int render_process_id) + : render_process_id_(render_process_id), + may_have_pending_requests_(false) { + // This is initialized by Browser. Do not add any non-trivial + // initialization here, instead do it lazily when required (e.g. see the + // method |manager()|) or add an Init() method. 
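+ // Illustrative round trip through the caller map above (editor's sketch;
+ // pid/vid/rid stand for the render process, render view and request IDs):
+ //   int id = g_speech_input_callers.Get().CreateId(pid, vid, rid);
+ //   DCHECK_EQ(id, g_speech_input_callers.Get().GetId(pid, vid, rid));
+ //   g_speech_input_callers.Get().RemoveId(id);  // GetId() now returns 0.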
+}
+
+SpeechInputDispatcherHost::~SpeechInputDispatcherHost() {
+ // If the renderer crashed for some reason or if we didn't receive a proper
+ // Cancel/Stop call for an existing session, cancel such active sessions now.
+ // We first check if this dispatcher received any speech IPC request so that
+ // we don't end up creating the speech input manager for web pages that don't
+ // use speech input.
+ if (may_have_pending_requests_)
+ manager()->CancelAllRequestsWithDelegate(this);
+}
+
+SpeechInputManager* SpeechInputDispatcherHost::manager() {
+ return (*manager_accessor_)();
+}
+
+bool SpeechInputDispatcherHost::OnMessageReceived(
+ const IPC::Message& message, bool* message_was_ok) {
+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+
+ uint32 message_type = message.type();
+ if (message_type == SpeechInputHostMsg_StartRecognition::ID ||
+ message_type == SpeechInputHostMsg_CancelRecognition::ID ||
+ message_type == SpeechInputHostMsg_StopRecording::ID) {
+ if (!SpeechInputManager::IsFeatureEnabled()) {
+ *message_was_ok = false;
+ return true;
+ }
+
+ may_have_pending_requests_ = true;
+ IPC_BEGIN_MESSAGE_MAP_EX(SpeechInputDispatcherHost, message,
+ *message_was_ok)
+ IPC_MESSAGE_HANDLER(SpeechInputHostMsg_StartRecognition,
+ OnStartRecognition)
+ IPC_MESSAGE_HANDLER(SpeechInputHostMsg_CancelRecognition,
+ OnCancelRecognition)
+ IPC_MESSAGE_HANDLER(SpeechInputHostMsg_StopRecording,
+ OnStopRecording)
+ IPC_END_MESSAGE_MAP()
+ return true;
+ }
+
+ return false;
+}
+
+void SpeechInputDispatcherHost::OnStartRecognition(
+ const SpeechInputHostMsg_StartRecognition_Params &params) {
+ int caller_id = g_speech_input_callers.Get().CreateId(
+ render_process_id_, params.render_view_id, params.request_id);
+ manager()->StartRecognition(this, caller_id,
+ render_process_id_,
+ params.render_view_id, params.element_rect,
+ params.language, params.grammar,
+ params.origin_url);
+}
+
+void SpeechInputDispatcherHost::OnCancelRecognition(int render_view_id,
+ int request_id) {
+ int caller_id = g_speech_input_callers.Get().GetId(
+ render_process_id_, render_view_id, request_id);
+ if (caller_id) {
+ manager()->CancelRecognition(caller_id);
+ // Request sequence ended, so remove mapping.
+ g_speech_input_callers.Get().RemoveId(caller_id); + } +} + +void SpeechInputDispatcherHost::OnStopRecording(int render_view_id, + int request_id) { + int caller_id = g_speech_input_callers.Get().GetId( + render_process_id_, render_view_id, request_id); + if (caller_id) + manager()->StopRecording(caller_id); +} + +void SpeechInputDispatcherHost::SetRecognitionResult( + int caller_id, const SpeechInputResultArray& result) { + VLOG(1) << "SpeechInputDispatcherHost::SetRecognitionResult enter"; + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + int caller_render_view_id = + g_speech_input_callers.Get().render_view_id(caller_id); + int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); + Send(new SpeechInputMsg_SetRecognitionResult(caller_render_view_id, + caller_request_id, + result)); + VLOG(1) << "SpeechInputDispatcherHost::SetRecognitionResult exit"; +} + +void SpeechInputDispatcherHost::DidCompleteRecording(int caller_id) { + VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecording enter"; + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + int caller_render_view_id = + g_speech_input_callers.Get().render_view_id(caller_id); + int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); + Send(new SpeechInputMsg_RecordingComplete(caller_render_view_id, + caller_request_id)); + VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecording exit"; +} + +void SpeechInputDispatcherHost::DidCompleteRecognition(int caller_id) { + VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecognition enter"; + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + int caller_render_view_id = + g_speech_input_callers.Get().render_view_id(caller_id); + int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); + Send(new SpeechInputMsg_RecognitionComplete(caller_render_view_id, + caller_request_id)); + // Request sequence ended, so remove mapping. + g_speech_input_callers.Get().RemoveId(caller_id); + VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecognition exit"; +} + +} // namespace speech_input diff --git a/content/browser/speech/speech_input_dispatcher_host.h b/content/browser/speech/speech_input_dispatcher_host.h new file mode 100644 index 0000000..abd93da --- /dev/null +++ b/content/browser/speech/speech_input_dispatcher_host.h @@ -0,0 +1,63 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_ +#define CONTENT_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_ + +#include "base/scoped_ptr.h" +#include "content/browser/browser_message_filter.h" +#include "content/browser/speech/speech_input_manager.h" + +struct SpeechInputHostMsg_StartRecognition_Params; + +namespace speech_input { + +// SpeechInputDispatcherHost is a delegate for Speech API messages used by +// RenderMessageFilter. +// It's the complement of SpeechInputDispatcher (owned by RenderView). +class SpeechInputDispatcherHost : public BrowserMessageFilter, + public SpeechInputManager::Delegate { + public: + class SpeechInputCallers; + + explicit SpeechInputDispatcherHost(int render_process_id); + + // SpeechInputManager::Delegate methods. + virtual void SetRecognitionResult(int caller_id, + const SpeechInputResultArray& result); + virtual void DidCompleteRecording(int caller_id); + virtual void DidCompleteRecognition(int caller_id); + + // BrowserMessageFilter implementation. 
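+ // Editor's note (conceptual sketch of the dispatch implemented in the .cc
+ // file): only the three speech messages are claimed by this filter,
+ // in effect
+ //   switch (message.type()) {
+ //     case SpeechInputHostMsg_StartRecognition::ID:   // -> OnStartRecognition
+ //     case SpeechInputHostMsg_CancelRecognition::ID:  // -> OnCancelRecognition
+ //     case SpeechInputHostMsg_StopRecording::ID:      // -> OnStopRecording
+ //       return true;  // handled here
+ //     default:
+ //       return false;  // left for other filters
+ //   }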
+ virtual bool OnMessageReceived(const IPC::Message& message,
+ bool* message_was_ok);
+
+ // Sets the singleton accessor; useful for tests.
+ static void set_manager_accessor(SpeechInputManager::AccessorMethod* method) {
+ manager_accessor_ = method;
+ }
+
+ private:
+ virtual ~SpeechInputDispatcherHost();
+
+ void OnStartRecognition(
+ const SpeechInputHostMsg_StartRecognition_Params &params);
+ void OnCancelRecognition(int render_view_id, int request_id);
+ void OnStopRecording(int render_view_id, int request_id);
+
+ // Returns the speech input manager to forward events to, creating one if
+ // needed.
+ SpeechInputManager* manager();
+
+ int render_process_id_;
+ bool may_have_pending_requests_; // Set if we received any speech IPC request
+
+ static SpeechInputManager::AccessorMethod* manager_accessor_;
+
+ DISALLOW_COPY_AND_ASSIGN(SpeechInputDispatcherHost);
+};
+
+} // namespace speech_input
+
+#endif // CONTENT_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_
diff --git a/content/browser/speech/speech_input_manager.h b/content/browser/speech/speech_input_manager.h
new file mode 100644
index 0000000..a6ba61f
--- /dev/null
+++ b/content/browser/speech/speech_input_manager.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CONTENT_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_
+#define CONTENT_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_
+
+#include "base/basictypes.h"
+#include "chrome/common/speech_input_result.h"
+#include "ui/gfx/rect.h"
+
+namespace speech_input {
+
+// This is the gatekeeper for speech recognition in the browser process. It
+// handles requests received from various render views and makes sure only one
+// of them can use speech recognition at a time. It also sends recognition
+// results and status events to the render views when required.
+// This class is a singleton and accessed via the Get method.
+class SpeechInputManager {
+ public:
+ // Implemented by the dispatcher host to relay events to the render views.
+ class Delegate {
+ public:
+ virtual void SetRecognitionResult(
+ int caller_id,
+ const SpeechInputResultArray& result) = 0;
+ virtual void DidCompleteRecording(int caller_id) = 0;
+ virtual void DidCompleteRecognition(int caller_id) = 0;
+
+ protected:
+ virtual ~Delegate() {}
+ };
+
+ // Whether the speech input feature is enabled, based on the browser channel
+ // information and command line flags.
+ static bool IsFeatureEnabled();
+
+ // Factory method to access the singleton. We have this method here instead of
+ // using Singleton directly in the calling code to aid tests in injecting
+ // mocks.
+ static SpeechInputManager* Get();
+ // Factory method type, useful for tests.
+ typedef SpeechInputManager* (AccessorMethod)();
+
+ virtual ~SpeechInputManager() {}
+
+ // Handlers for requests from render views.
+
+ // |delegate| is a weak pointer and should remain valid until
+ // its |DidCompleteRecognition| method is called or recognition is canceled.
+ // |render_process_id| is the ID of the renderer process initiating the
+ // request.
+ // |element_rect| is the display bounds of the HTML element requesting speech
+ // input (in page coordinates).
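+ // Editor's note on the expected callback order (as implemented by the fake
+ // manager and the recognizer elsewhere in this change): a successful
+ // session runs
+ //   StartRecognition() ... DidCompleteRecording() ->
+ //   SetRecognitionResult() -> DidCompleteRecognition(),
+ // while CancelRecognition() ends a session with no further callbacks.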
+ virtual void StartRecognition(Delegate* delegate, + int caller_id, + int render_process_id, + int render_view_id, + const gfx::Rect& element_rect, + const std::string& language, + const std::string& grammar, + const std::string& origin_url) = 0; + virtual void CancelRecognition(int caller_id) = 0; + virtual void StopRecording(int caller_id) = 0; + + virtual void CancelAllRequestsWithDelegate(Delegate* delegate) = 0; +}; + +// This typedef is to workaround the issue with certain versions of +// Visual Studio where it gets confused between multiple Delegate +// classes and gives a C2500 error. (I saw this error on the try bots - +// the workaround was not needed for my machine). +typedef SpeechInputManager::Delegate SpeechInputManagerDelegate; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_ diff --git a/content/browser/speech/speech_recognition_request.cc b/content/browser/speech/speech_recognition_request.cc new file mode 100644 index 0000000..d127437 --- /dev/null +++ b/content/browser/speech/speech_recognition_request.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/speech_recognition_request.h" + +#include <vector> + +#include "base/json/json_reader.h" +#include "base/string_util.h" +#include "base/values.h" +#include "chrome/common/net/url_request_context_getter.h" +#include "net/base/escape.h" +#include "net/base/load_flags.h" +#include "net/url_request/url_request_context.h" +#include "net/url_request/url_request_status.h" +#include "ui/base/l10n/l10n_util.h" + +namespace { + +const char* const kDefaultSpeechRecognitionUrl = + "https://www.google.com/speech-api/v1/recognize?client=chromium&"; +const char* const kHypothesesString = "hypotheses"; +const char* const kUtteranceString = "utterance"; +const char* const kConfidenceString = "confidence"; + +bool ParseServerResponse(const std::string& response_body, + speech_input::SpeechInputResultArray* result) { + if (response_body.empty()) { + LOG(WARNING) << "ParseServerResponse: Response was empty."; + return false; + } + DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; + + // Parse the response, ignoring comments. 
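+ // A response this parser accepts looks like (sample taken from the unit
+ // tests for this file):
+ //   {"hypotheses":[{"utterance":"hello","confidence":0.9},
+ //                  {"utterance":"123456","confidence":0.5}]}
+ // A missing "confidence" field is not an error; it defaults to 0.0 below.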
+ std::string error_msg; + scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( + response_body, false, NULL, &error_msg)); + if (response_value == NULL) { + LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg; + return false; + } + + if (!response_value->IsType(Value::TYPE_DICTIONARY)) { + VLOG(1) << "ParseServerResponse: Unexpected response type " + << response_value->GetType(); + return false; + } + const DictionaryValue* response_object = + static_cast<DictionaryValue*>(response_value.get()); + + // Get the hypotheses + Value* hypotheses_value = NULL; + if (!response_object->Get(kHypothesesString, &hypotheses_value)) { + VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; + return false; + } + DCHECK(hypotheses_value); + if (!hypotheses_value->IsType(Value::TYPE_LIST)) { + VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " + << hypotheses_value->GetType(); + return false; + } + const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); + if (hypotheses_list->GetSize() == 0) { + VLOG(1) << "ParseServerResponse: hypotheses list is empty."; + return false; + } + + size_t index = 0; + for (; index < hypotheses_list->GetSize(); ++index) { + Value* hypothesis = NULL; + if (!hypotheses_list->Get(index, &hypothesis)) { + LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; + break; + } + DCHECK(hypothesis); + if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { + LOG(WARNING) << "ParseServerResponse: Unexpected value type " + << hypothesis->GetType(); + break; + } + + const DictionaryValue* hypothesis_value = + static_cast<DictionaryValue*>(hypothesis); + string16 utterance; + if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { + LOG(WARNING) << "ParseServerResponse: Missing utterance value."; + break; + } + + // It is not an error if the 'confidence' field is missing. + double confidence = 0.0; + hypothesis_value->GetDouble(kConfidenceString, &confidence); + + result->push_back(speech_input::SpeechInputResultItem(utterance, + confidence)); + } + + if (index < hypotheses_list->GetSize()) { + result->clear(); + return false; + } + + return true; +} + +} // namespace + +namespace speech_input { + +int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0; + +SpeechRecognitionRequest::SpeechRecognitionRequest( + URLRequestContextGetter* context, Delegate* delegate) + : url_context_(context), + delegate_(delegate) { + DCHECK(delegate); +} + +SpeechRecognitionRequest::~SpeechRecognitionRequest() {} + +bool SpeechRecognitionRequest::Send(const std::string& language, + const std::string& grammar, + const std::string& hardware_info, + const std::string& origin_url, + const std::string& content_type, + const std::string& audio_data) { + DCHECK(!url_fetcher_.get()); + + std::vector<std::string> parts; + + std::string lang_param = language; + if (lang_param.empty() && url_context_) { + // If no language is provided then we use the first from the accepted + // language list. If this list is empty then it defaults to "en-US". 
+ // Example of the contents of this list: "es,en-GB;q=0.8", "" + net::URLRequestContext* request_context = + url_context_->GetURLRequestContext(); + DCHECK(request_context); + std::string accepted_language_list = request_context->accept_language(); + size_t separator = accepted_language_list.find_first_of(",;"); + lang_param = accepted_language_list.substr(0, separator); + } + if (lang_param.empty()) + lang_param = "en-US"; + parts.push_back("lang=" + EscapeQueryParamValue(lang_param, true)); + + if (!grammar.empty()) + parts.push_back("lm=" + EscapeQueryParamValue(grammar, true)); + if (!hardware_info.empty()) + parts.push_back("xhw=" + EscapeQueryParamValue(hardware_info, true)); + // TODO(satish): Remove this hardcoded value once the page is allowed to + // set this via an attribute. + parts.push_back("maxresults=3"); + + GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); + + url_fetcher_.reset(URLFetcher::Create(url_fetcher_id_for_tests, + url, + URLFetcher::POST, + this)); + url_fetcher_->set_upload_data(content_type, audio_data); + url_fetcher_->set_request_context(url_context_); + url_fetcher_->set_referrer(origin_url); + + // The speech recognition API does not require user identification as part + // of requests, so we don't send cookies or auth data for these requests to + // prevent any accidental connection between users who are logged into the + // domain for other services (e.g. bookmark sync) with the speech requests. + url_fetcher_->set_load_flags( + net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES | + net::LOAD_DO_NOT_SEND_AUTH_DATA); + url_fetcher_->Start(); + return true; +} + +void SpeechRecognitionRequest::OnURLFetchComplete( + const URLFetcher* source, + const GURL& url, + const net::URLRequestStatus& status, + int response_code, + const ResponseCookies& cookies, + const std::string& data) { + DCHECK_EQ(url_fetcher_.get(), source); + + bool error = !status.is_success() || response_code != 200; + SpeechInputResultArray result; + if (!error) + error = !ParseServerResponse(data, &result); + url_fetcher_.reset(); + + DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result."; + delegate_->SetRecognitionResult(error, result); +} + +} // namespace speech_input diff --git a/content/browser/speech/speech_recognition_request.h b/content/browser/speech/speech_recognition_request.h new file mode 100644 index 0000000..3036d59 --- /dev/null +++ b/content/browser/speech/speech_recognition_request.h @@ -0,0 +1,82 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_ +#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_ +#pragma once + +#include <string> + +#include "base/basictypes.h" +#include "base/ref_counted.h" +#include "base/scoped_ptr.h" +#include "chrome/common/net/url_fetcher.h" +#include "chrome/common/speech_input_result.h" +#include "googleurl/src/gurl.h" + +class URLFetcher; +class URLRequestContextGetter; + +namespace speech_input { + +// Provides a simple interface for sending recorded speech data to the server +// and get back recognition results. +class SpeechRecognitionRequest : public URLFetcher::Delegate { + public: + // ID passed to URLFetcher::Create(). Used for testing. + static int url_fetcher_id_for_tests; + + // Interface for receiving callbacks from this object. 
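+ // A minimal implementation might look like (editor's sketch; the class
+ // name is hypothetical):
+ //   class ResultLogger : public SpeechRecognitionRequest::Delegate {
+ //     virtual void SetRecognitionResult(
+ //         bool error, const SpeechInputResultArray& result) {
+ //       if (!error && !result.empty())
+ //         VLOG(1) << "hypotheses received: " << result.size();
+ //     }
+ //   };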
+ class Delegate {
+ public:
+ virtual void SetRecognitionResult(
+ bool error, const SpeechInputResultArray& result) = 0;
+
+ protected:
+ virtual ~Delegate() {}
+ };
+
+ // |context| is the URL request context through which the request will be
+ // sent to the server.
+ SpeechRecognitionRequest(URLRequestContextGetter* context,
+ Delegate* delegate);
+
+ virtual ~SpeechRecognitionRequest();
+
+ // Sends a new request with the given audio data; returns true if successful.
+ // The same object can be used to send multiple requests but only after the
+ // previous request has completed.
+ bool Send(const std::string& language,
+ const std::string& grammar,
+ const std::string& hardware_info,
+ const std::string& origin_url,
+ const std::string& content_type,
+ const std::string& audio_data);
+
+ bool HasPendingRequest() { return url_fetcher_ != NULL; }
+
+ // URLFetcher::Delegate methods.
+ virtual void OnURLFetchComplete(const URLFetcher* source,
+ const GURL& url,
+ const net::URLRequestStatus& status,
+ int response_code,
+ const ResponseCookies& cookies,
+ const std::string& data);
+
+ private:
+ scoped_refptr<URLRequestContextGetter> url_context_;
+ Delegate* delegate_;
+ scoped_ptr<URLFetcher> url_fetcher_;
+
+ DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionRequest);
+};
+
+// This typedef is to workaround the issue with certain versions of
+// Visual Studio where it gets confused between multiple Delegate
+// classes and gives a C2500 error. (I saw this error on the try bots -
+// the workaround was not needed for my machine).
+typedef SpeechRecognitionRequest::Delegate SpeechRecognitionRequestDelegate;
+
+} // namespace speech_input
+
+#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_
diff --git a/content/browser/speech/speech_recognition_request_unittest.cc b/content/browser/speech/speech_recognition_request_unittest.cc
new file mode 100644
index 0000000..e90f5cd
--- /dev/null
+++ b/content/browser/speech/speech_recognition_request_unittest.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/utf_string_conversions.h"
+#include "chrome/common/net/url_request_context_getter.h"
+#include "chrome/common/net/test_url_fetcher_factory.h"
+#include "content/browser/speech/speech_recognition_request.h"
+#include "net/url_request/url_request_status.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace speech_input {
+
+class SpeechRecognitionRequestTest : public SpeechRecognitionRequestDelegate,
+ public testing::Test {
+ public:
+ SpeechRecognitionRequestTest() : error_(false) { }
+
+ // Creates a speech recognition request and invokes its URL fetcher delegate
+ // with the given test data.
+ void CreateAndTestRequest(bool success, const std::string& http_response);
+
+ // SpeechRecognitionRequestDelegate methods.
+ virtual void SetRecognitionResult(bool error,
+ const SpeechInputResultArray& result) {
+ error_ = error;
+ result_ = result;
+ }
+
+ // testing::Test methods.
+ virtual void SetUp() { + URLFetcher::set_factory(&url_fetcher_factory_); + } + + virtual void TearDown() { + URLFetcher::set_factory(NULL); + } + + protected: + MessageLoop message_loop_; + TestURLFetcherFactory url_fetcher_factory_; + bool error_; + SpeechInputResultArray result_; +}; + +void SpeechRecognitionRequestTest::CreateAndTestRequest( + bool success, const std::string& http_response) { + SpeechRecognitionRequest request(NULL, this); + request.Send(std::string(), std::string(), std::string(), std::string(), + std::string(), std::string()); + TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); + ASSERT_TRUE(fetcher); + net::URLRequestStatus status; + status.set_status(success ? net::URLRequestStatus::SUCCESS : + net::URLRequestStatus::FAILED); + fetcher->delegate()->OnURLFetchComplete(fetcher, fetcher->original_url(), + status, success ? 200 : 500, + ResponseCookies(), + http_response); + // Parsed response will be available in result_. +} + +TEST_F(SpeechRecognitionRequestTest, BasicTest) { + // Normal success case with one result. + CreateAndTestRequest(true, + "{\"hypotheses\":[{\"utterance\":\"123456\",\"confidence\":0.9}]}"); + EXPECT_FALSE(error_); + EXPECT_EQ(1U, result_.size()); + EXPECT_EQ(ASCIIToUTF16("123456"), result_[0].utterance); + EXPECT_EQ(0.9, result_[0].confidence); + + // Normal success case with multiple results. + CreateAndTestRequest(true, + "{\"hypotheses\":[{\"utterance\":\"hello\",\"confidence\":0.9}," + "{\"utterance\":\"123456\",\"confidence\":0.5}]}"); + EXPECT_FALSE(error_); + EXPECT_EQ(2u, result_.size()); + EXPECT_EQ(ASCIIToUTF16("hello"), result_[0].utterance); + EXPECT_EQ(0.9, result_[0].confidence); + EXPECT_EQ(ASCIIToUTF16("123456"), result_[1].utterance); + EXPECT_EQ(0.5, result_[1].confidence); + + // Http failure case. + CreateAndTestRequest(false, ""); + EXPECT_TRUE(error_); + EXPECT_EQ(0U, result_.size()); + + // Malformed JSON case. + CreateAndTestRequest(true, "{\"hypotheses\":[{\"unknownkey\":\"hello\"}]}"); + EXPECT_TRUE(error_); + EXPECT_EQ(0U, result_.size()); +} + +} // namespace speech_input diff --git a/content/browser/speech/speech_recognizer.cc b/content/browser/speech/speech_recognizer.cc new file mode 100644 index 0000000..fdc1a4c --- /dev/null +++ b/content/browser/speech/speech_recognizer.cc @@ -0,0 +1,262 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/speech_recognizer.h" + +#include "base/time.h" +#include "chrome/browser/profiles/profile.h" +#include "chrome/common/net/url_request_context_getter.h" +#include "content/browser/browser_thread.h" + +using media::AudioInputController; +using std::string; + +namespace { + +// The following constants are related to the volume level indicator shown in +// the UI for recorded audio. +// Multiplier used when new volume is greater than previous level. +const float kUpSmoothingFactor = 0.9f; +// Multiplier used when new volume is lesser than previous level. +const float kDownSmoothingFactor = 0.4f; +const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. 
+const float kAudioMeterDbRange = 25.0f; +} // namespace + +namespace speech_input { + +const int SpeechRecognizer::kAudioSampleRate = 16000; +const int SpeechRecognizer::kAudioPacketIntervalMs = 100; +const int SpeechRecognizer::kNumAudioChannels = 1; +const int SpeechRecognizer::kNumBitsPerAudioSample = 16; +const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; +const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; + +SpeechRecognizer::SpeechRecognizer(Delegate* delegate, + int caller_id, + const std::string& language, + const std::string& grammar, + const std::string& hardware_info, + const std::string& origin_url) + : delegate_(delegate), + caller_id_(caller_id), + language_(language), + grammar_(grammar), + hardware_info_(hardware_info), + origin_url_(origin_url), + codec_(AudioEncoder::CODEC_SPEEX), + encoder_(NULL), + endpointer_(kAudioSampleRate), + num_samples_recorded_(0), + audio_level_(0.0f) { + endpointer_.set_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond / 2); + endpointer_.set_long_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond); + endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); + endpointer_.StartSession(); +} + +SpeechRecognizer::~SpeechRecognizer() { + // Recording should have stopped earlier due to the endpointer or + // |StopRecording| being called. + DCHECK(!audio_controller_.get()); + DCHECK(!request_.get() || !request_->HasPendingRequest()); + DCHECK(!encoder_.get()); + endpointer_.EndSession(); +} + +bool SpeechRecognizer::StartRecording() { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + DCHECK(!audio_controller_.get()); + DCHECK(!request_.get() || !request_->HasPendingRequest()); + DCHECK(!encoder_.get()); + + // The endpointer needs to estimate the environment/background noise before + // starting to treat the audio as user input. In |HandleOnData| we wait until + // such time has passed before switching to user input mode. + endpointer_.SetEnvironmentEstimationMode(); + + encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, + kNumBitsPerAudioSample)); + int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; + AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, + kAudioSampleRate, kNumBitsPerAudioSample, + samples_per_packet); + audio_controller_ = AudioInputController::Create(this, params); + DCHECK(audio_controller_.get()); + VLOG(1) << "SpeechRecognizer starting record."; + num_samples_recorded_ = 0; + audio_controller_->Record(); + + return true; +} + +void SpeechRecognizer::CancelRecognition() { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + DCHECK(audio_controller_.get() || request_.get()); + + // Stop recording if required. + if (audio_controller_.get()) { + VLOG(1) << "SpeechRecognizer stopping record."; + audio_controller_->Close(); + audio_controller_ = NULL; // Releases the ref ptr. + } + + VLOG(1) << "SpeechRecognizer canceling recognition."; + encoder_.reset(); + request_.reset(); +} + +void SpeechRecognizer::StopRecording() { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + + // If audio recording has already stopped and we are in recognition phase, + // silently ignore any more calls to stop recording. + if (!audio_controller_.get()) + return; + + VLOG(1) << "SpeechRecognizer stopping record."; + audio_controller_->Close(); + audio_controller_ = NULL; // Releases the ref ptr. 
+ encoder_->Flush();
+
+ delegate_->DidCompleteRecording(caller_id_);
+
+ // Since the HTTP request takes a single string as POST data, allocate
+ // one and copy over bytes from the audio buffers to the string.
+ // If we haven't received any audio yet, end the recognition sequence here.
+ string mime_type = encoder_->mime_type();
+ string data;
+ encoder_->GetEncodedData(&data);
+ encoder_.reset();
+
+ if (data.empty()) {
+ // Guard against the delegate freeing us until we finish our job.
+ scoped_refptr<SpeechRecognizer> me(this);
+ delegate_->DidCompleteRecognition(caller_id_);
+ } else {
+ DCHECK(!request_.get());
+ request_.reset(new SpeechRecognitionRequest(
+ Profile::GetDefaultRequestContext(), this));
+ request_->Send(language_, grammar_, hardware_info_, origin_url_,
+ mime_type, data);
+ }
+}
+
+void SpeechRecognizer::ReleaseAudioBuffers() {
+}
+
+// Invoked in the audio thread.
+void SpeechRecognizer::OnError(AudioInputController* controller,
+ int error_code) {
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+ NewRunnableMethod(this,
+ &SpeechRecognizer::HandleOnError,
+ error_code));
+}
+
+void SpeechRecognizer::HandleOnError(int error_code) {
+ LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
+
+ // Check if we are still recording before canceling recognition, as
+ // recording might have been stopped after this error was posted to the queue
+ // by |OnError|.
+ if (!audio_controller_.get())
+ return;
+
+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
+}
+
+void SpeechRecognizer::OnData(AudioInputController* controller,
+ const uint8* data, uint32 size) {
+ if (size == 0) // This could happen when recording stops and is normal.
+ return;
+
+ string* str_data = new string(reinterpret_cast<const char*>(data), size);
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+ NewRunnableMethod(this,
+ &SpeechRecognizer::HandleOnData,
+ str_data));
+}
+
+void SpeechRecognizer::HandleOnData(string* data) {
+ // Check if we are still recording and if not discard this buffer, as
+ // recording might have been stopped after this buffer was posted to the queue
+ // by |OnData|.
+ if (!audio_controller_.get()) {
+ delete data;
+ return;
+ }
+
+ const short* samples = reinterpret_cast<const short*>(data->data());
+ DCHECK((data->length() % sizeof(short)) == 0);
+ int num_samples = data->length() / sizeof(short);
+
+ encoder_->Encode(samples, num_samples);
+ float rms;
+ endpointer_.ProcessAudio(samples, num_samples, &rms);
+ delete data;
+ num_samples_recorded_ += num_samples;
+
+ if (endpointer_.IsEstimatingEnvironment()) {
+ // Check if we have gathered enough audio for the endpointer to do
+ // environment estimation and should move on to detect speech/end of speech.
+ if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
+ kAudioSampleRate) / 1000) {
+ endpointer_.SetUserInputMode();
+ delegate_->DidCompleteEnvironmentEstimation(caller_id_);
+ }
+ return; // No more processing since we are still estimating environment.
+ }
+
+ // Check if we have waited too long without hearing any speech.
+ if (!endpointer_.DidStartReceivingSpeech() &&
+ num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
+ return;
+ }
+
+ // Calculate the input volume to display in the UI, smoothing towards the
+ // new level.
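+ // Worked example (editor's note; constants defined at the top of this
+ // file): an rms of 20 dB maps to a raw level of (20 - 10) / 25 = 0.4. From
+ // a previous |audio_level_| of 0.1 the meter rises to
+ // 0.1 + (0.4 - 0.1) * 0.9 = 0.37 in one packet (kUpSmoothingFactor), while
+ // falling levels close only 0.4 of the gap (kDownSmoothingFactor).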
+ float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange; + level = std::min(std::max(0.0f, level), 1.0f); + if (level > audio_level_) { + audio_level_ += (level - audio_level_) * kUpSmoothingFactor; + } else { + audio_level_ += (level - audio_level_) * kDownSmoothingFactor; + } + delegate_->SetInputVolume(caller_id_, audio_level_); + + if (endpointer_.speech_input_complete()) { + StopRecording(); + } + + // TODO(satish): Once we have streaming POST, start sending the data received + // here as POST chunks. +} + +void SpeechRecognizer::SetRecognitionResult( + bool error, const SpeechInputResultArray& result) { + if (result.empty()) { + InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS); + return; + } + + delegate_->SetRecognitionResult(caller_id_, error, result); + + // Guard against the delegate freeing us until we finish our job. + scoped_refptr<SpeechRecognizer> me(this); + delegate_->DidCompleteRecognition(caller_id_); +} + +void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { + CancelRecognition(); + + // Guard against the delegate freeing us until we finish our job. + scoped_refptr<SpeechRecognizer> me(this); + delegate_->OnRecognizerError(caller_id_, error); +} + +} // namespace speech_input diff --git a/content/browser/speech/speech_recognizer.h b/content/browser/speech/speech_recognizer.h new file mode 100644 index 0000000..a54a59d --- /dev/null +++ b/content/browser/speech/speech_recognizer.h @@ -0,0 +1,151 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ +#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ + +#include <list> +#include <string> +#include <utility> + +#include "base/ref_counted.h" +#include "base/scoped_ptr.h" +#include "content/browser/speech/audio_encoder.h" +#include "content/browser/speech/endpointer/endpointer.h" +#include "content/browser/speech/speech_recognition_request.h" +#include "media/audio/audio_input_controller.h" + +namespace speech_input { + +// Records audio, sends recorded audio to server and translates server response +// to recognition result. +class SpeechRecognizer + : public base::RefCountedThreadSafe<SpeechRecognizer>, + public media::AudioInputController::EventHandler, + public SpeechRecognitionRequestDelegate { + public: + enum ErrorCode { + RECOGNIZER_NO_ERROR, + RECOGNIZER_ERROR_CAPTURE, + RECOGNIZER_ERROR_NO_SPEECH, + RECOGNIZER_ERROR_NO_RESULTS, + }; + + // Implemented by the caller to receive recognition events. + class Delegate { + public: + virtual void SetRecognitionResult( + int caller_id, + bool error, + const SpeechInputResultArray& result) = 0; + + // Invoked when audio recording stops, either due to the end pointer + // detecting silence in user input or if |StopRecording| was called. The + // delegate has to wait until |DidCompleteRecognition| is invoked before + // destroying the |SpeechRecognizer| object. + virtual void DidCompleteRecording(int caller_id) = 0; + + // This is guaranteed to be the last method invoked in the recognition + // sequence and the |SpeechRecognizer| object can be freed up if necessary. + virtual void DidCompleteRecognition(int caller_id) = 0; + + // Invoked if there was an error while recording or recognizing audio. The + // session has already been cancelled when this call is made and the DidXxxx + // callbacks will not be issued. 
It is safe to destroy/release the + // |SpeechRecognizer| object while processing this call. + virtual void OnRecognizerError(int caller_id, + SpeechRecognizer::ErrorCode error) = 0; + + // At the start of recognition, a short amount of audio is recorded to + // estimate the environment/background noise and this callback is issued + // after that is complete. Typically the delegate brings up any speech + // recognition UI once this callback is received. + virtual void DidCompleteEnvironmentEstimation(int caller_id) = 0; + + // Informs of a change in the captured audio level, useful if displaying + // a microphone volume indicator while recording. + // The value of |volume| is in the [0.0, 1.0] range. + virtual void SetInputVolume(int caller_id, float volume) = 0; + + protected: + virtual ~Delegate() {} + }; + + SpeechRecognizer(Delegate* delegate, + int caller_id, + const std::string& language, + const std::string& grammar, + const std::string& hardware_info, + const std::string& origin_url); + ~SpeechRecognizer(); + + // Starts audio recording and does recognition after recording ends. The same + // SpeechRecognizer instance can be used multiple times for speech recognition + // though each recognition request can be made only after the previous one + // completes (i.e. after receiving Delegate::DidCompleteRecognition). + bool StartRecording(); + + // Stops recording audio and starts recognition. + void StopRecording(); + + // Stops recording audio and cancels recognition. Any audio recorded so far + // gets discarded. + void CancelRecognition(); + + // AudioInputController::EventHandler methods. + virtual void OnCreated(media::AudioInputController* controller) { } + virtual void OnRecording(media::AudioInputController* controller) { } + virtual void OnError(media::AudioInputController* controller, int error_code); + virtual void OnData(media::AudioInputController* controller, + const uint8* data, + uint32 size); + + // SpeechRecognitionRequest::Delegate methods. + virtual void SetRecognitionResult(bool error, + const SpeechInputResultArray& result); + + static const int kAudioSampleRate; + static const int kAudioPacketIntervalMs; // Duration of each audio packet. + static const int kNumAudioChannels; + static const int kNumBitsPerAudioSample; + static const int kNoSpeechTimeoutSec; + static const int kEndpointerEstimationTimeMs; + + private: + void ReleaseAudioBuffers(); + void InformErrorAndCancelRecognition(ErrorCode error); + void SendRecordedAudioToServer(); + + void HandleOnError(int error_code); // Handles OnError in the IO thread. + + // Handles OnData in the IO thread. Takes ownership of |data|. + void HandleOnData(std::string* data); + + Delegate* delegate_; + int caller_id_; + std::string language_; + std::string grammar_; + std::string hardware_info_; + std::string origin_url_; + + scoped_ptr<SpeechRecognitionRequest> request_; + scoped_refptr<media::AudioInputController> audio_controller_; + AudioEncoder::Codec codec_; + scoped_ptr<AudioEncoder> encoder_; + Endpointer endpointer_; + int num_samples_recorded_; + float audio_level_; + + DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer); +}; + +// This typedef is to workaround the issue with certain versions of +// Visual Studio where it gets confused between multiple Delegate +// classes and gives a C2500 error. (I saw this error on the try bots - +// the workaround was not needed for my machine). 
+typedef SpeechRecognizer::Delegate SpeechRecognizerDelegate; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ diff --git a/content/browser/speech/speech_recognizer_unittest.cc b/content/browser/speech/speech_recognizer_unittest.cc new file mode 100644 index 0000000..8365396 --- /dev/null +++ b/content/browser/speech/speech_recognizer_unittest.cc @@ -0,0 +1,300 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <vector> + +#include "chrome/common/net/test_url_fetcher_factory.h" +#include "content/browser/browser_thread.h" +#include "content/browser/speech/speech_recognizer.h" +#include "media/audio/test_audio_input_controller_factory.h" +#include "net/url_request/url_request_status.h" +#include "testing/gtest/include/gtest/gtest.h" + +using media::AudioInputController; +using media::TestAudioInputController; +using media::TestAudioInputControllerFactory; + +namespace speech_input { + +class SpeechRecognizerTest : public SpeechRecognizerDelegate, + public testing::Test { + public: + SpeechRecognizerTest() + : io_thread_(BrowserThread::IO, &message_loop_), + ALLOW_THIS_IN_INITIALIZER_LIST( + recognizer_(new SpeechRecognizer(this, 1, std::string(), + std::string(), std::string(), + std::string()))), + recording_complete_(false), + recognition_complete_(false), + result_received_(false), + error_(SpeechRecognizer::RECOGNIZER_NO_ERROR), + volume_(-1.0f) { + int audio_packet_length_bytes = + (SpeechRecognizer::kAudioSampleRate * + SpeechRecognizer::kAudioPacketIntervalMs * + SpeechRecognizer::kNumAudioChannels * + SpeechRecognizer::kNumBitsPerAudioSample) / (8 * 1000); + audio_packet_.resize(audio_packet_length_bytes); + } + + // SpeechRecognizer::Delegate methods. + virtual void SetRecognitionResult(int caller_id, + bool error, + const SpeechInputResultArray& result) { + result_received_ = true; + } + + virtual void DidCompleteRecording(int caller_id) { + recording_complete_ = true; + } + + virtual void DidCompleteRecognition(int caller_id) { + recognition_complete_ = true; + } + + virtual void DidCompleteEnvironmentEstimation(int caller_id) { + } + + virtual void OnRecognizerError(int caller_id, + SpeechRecognizer::ErrorCode error) { + error_ = error; + } + + virtual void SetInputVolume(int caller_id, float volume) { + volume_ = volume; + } + + // testing::Test methods. + virtual void SetUp() { + URLFetcher::set_factory(&url_fetcher_factory_); + AudioInputController::set_factory(&audio_input_controller_factory_); + } + + virtual void TearDown() { + URLFetcher::set_factory(NULL); + AudioInputController::set_factory(NULL); + } + + void FillPacketWithTestWaveform() { + // Fill the input with a simple pattern, a 125Hz sawtooth waveform. + for (size_t i = 0; i < audio_packet_.size(); ++i) + audio_packet_[i] = static_cast<uint8>(i); + } + + protected: + MessageLoopForIO message_loop_; + BrowserThread io_thread_; + scoped_refptr<SpeechRecognizer> recognizer_; + bool recording_complete_; + bool recognition_complete_; + bool result_received_; + SpeechRecognizer::ErrorCode error_; + TestURLFetcherFactory url_fetcher_factory_; + TestAudioInputControllerFactory audio_input_controller_factory_; + std::vector<uint8> audio_packet_; + float volume_; +}; + +TEST_F(SpeechRecognizerTest, StopNoData) { + // Check for callbacks when stopping record before any audio gets recorded. 
+ EXPECT_TRUE(recognizer_->StartRecording());
+ recognizer_->StopRecording();
+ EXPECT_TRUE(recording_complete_);
+ EXPECT_TRUE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+}
+
+TEST_F(SpeechRecognizerTest, CancelNoData) {
+ // Check for callbacks when canceling recognition before any audio gets
+ // recorded.
+ EXPECT_TRUE(recognizer_->StartRecording());
+ recognizer_->CancelRecognition();
+ EXPECT_FALSE(recording_complete_);
+ EXPECT_FALSE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+}
+
+TEST_F(SpeechRecognizerTest, StopWithData) {
+ // Start recording, give some data and then stop. This should wait for the
+ // network callback to arrive before completion.
+ EXPECT_TRUE(recognizer_->StartRecording());
+ TestAudioInputController* controller =
+ audio_input_controller_factory_.controller();
+ ASSERT_TRUE(controller);
+ controller = audio_input_controller_factory_.controller();
+ ASSERT_TRUE(controller);
+ controller->event_handler()->OnData(controller, &audio_packet_[0],
+ audio_packet_.size());
+ MessageLoop::current()->RunAllPending();
+ recognizer_->StopRecording();
+ EXPECT_TRUE(recording_complete_);
+ EXPECT_FALSE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+
+ // Issue the network callback to complete the process.
+ TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
+ ASSERT_TRUE(fetcher);
+ net::URLRequestStatus status;
+ status.set_status(net::URLRequestStatus::SUCCESS);
+ fetcher->delegate()->OnURLFetchComplete(
+ fetcher, fetcher->original_url(), status, 200, ResponseCookies(),
+ "{\"hypotheses\":[{\"utterance\":\"123\"}]}");
+ EXPECT_TRUE(recognition_complete_);
+ EXPECT_TRUE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+}
+
+TEST_F(SpeechRecognizerTest, CancelWithData) {
+ // Start recording, give some data and then cancel. This should not create
+ // a network request and finish immediately.
+ EXPECT_TRUE(recognizer_->StartRecording());
+ TestAudioInputController* controller =
+ audio_input_controller_factory_.controller();
+ ASSERT_TRUE(controller);
+ controller->event_handler()->OnData(controller, &audio_packet_[0],
+ audio_packet_.size());
+ MessageLoop::current()->RunAllPending();
+ recognizer_->CancelRecognition();
+ EXPECT_EQ(NULL, url_fetcher_factory_.GetFetcherByID(0));
+ EXPECT_FALSE(recording_complete_);
+ EXPECT_FALSE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+}
+
+TEST_F(SpeechRecognizerTest, AudioControllerErrorNoData) {
+ // Check if things tear down properly if AudioInputController threw an error.
+ EXPECT_TRUE(recognizer_->StartRecording());
+ TestAudioInputController* controller =
+ audio_input_controller_factory_.controller();
+ ASSERT_TRUE(controller);
+ controller->event_handler()->OnError(controller, 0);
+ MessageLoop::current()->RunAllPending();
+ EXPECT_FALSE(recording_complete_);
+ EXPECT_FALSE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_CAPTURE, error_);
+}
+
+TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) {
+ // Check if things tear down properly if AudioInputController threw an error
+ // after giving some audio data.
+ EXPECT_TRUE(recognizer_->StartRecording()); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + controller->event_handler()->OnError(controller, 0); + MessageLoop::current()->RunAllPending(); + EXPECT_EQ(NULL, url_fetcher_factory_.GetFetcherByID(0)); + EXPECT_FALSE(recording_complete_); + EXPECT_FALSE(recognition_complete_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_CAPTURE, error_); +} + +TEST_F(SpeechRecognizerTest, NoSpeechCallbackIssued) { + // Start recording and give a lot of packets with audio samples set to zero. + // This should trigger the no-speech detector and issue a callback. + EXPECT_TRUE(recognizer_->StartRecording()); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller = audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + + int num_packets = (SpeechRecognizer::kNoSpeechTimeoutSec * 1000) / + SpeechRecognizer::kAudioPacketIntervalMs; + // The vector is already filled with zero value samples on create. + for (int i = 0; i < num_packets; ++i) { + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + } + MessageLoop::current()->RunAllPending(); + EXPECT_FALSE(recording_complete_); + EXPECT_FALSE(recognition_complete_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_NO_SPEECH, error_); +} + +TEST_F(SpeechRecognizerTest, NoSpeechCallbackNotIssued) { + // Start recording and give a lot of packets with audio samples set to zero + // and then some more with reasonably loud audio samples. This should be + // treated as normal speech input and the no-speech detector should not get + // triggered. + EXPECT_TRUE(recognizer_->StartRecording()); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller = audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + + int num_packets = (SpeechRecognizer::kNoSpeechTimeoutSec * 1000) / + SpeechRecognizer::kAudioPacketIntervalMs; + + // The vector is already filled with zero value samples on create. + for (int i = 0; i < num_packets / 2; ++i) { + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + } + + FillPacketWithTestWaveform(); + for (int i = 0; i < num_packets / 2; ++i) { + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + } + + MessageLoop::current()->RunAllPending(); + EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); + EXPECT_FALSE(recording_complete_); + EXPECT_FALSE(recognition_complete_); + recognizer_->CancelRecognition(); +} + +TEST_F(SpeechRecognizerTest, SetInputVolumeCallback) { + // Start recording and give a lot of packets with audio samples set to zero + // and then some more with reasonably loud audio samples. Check that we don't + // get the callback during estimation phase, then get zero for the silence + // samples and proper volume for the loud audio. 
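+ // Packet arithmetic for this test (editor's note): environment estimation
+ // lasts kEndpointerEstimationTimeMs / kAudioPacketIntervalMs = 300 / 100 =
+ // 3 packets, so no volume callback is expected until the three packets fed
+ // first below have been processed.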
+ EXPECT_TRUE(recognizer_->StartRecording()); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller = audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + + // Feed some samples to begin with for the endpointer to do noise estimation. + int num_packets = SpeechRecognizer::kEndpointerEstimationTimeMs / + SpeechRecognizer::kAudioPacketIntervalMs; + for (int i = 0; i < num_packets; ++i) { + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + } + MessageLoop::current()->RunAllPending(); + EXPECT_EQ(-1.0f, volume_); // No audio volume set yet. + + // The vector is already filled with zero value samples on create. + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + MessageLoop::current()->RunAllPending(); + EXPECT_EQ(0, volume_); + + FillPacketWithTestWaveform(); + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + MessageLoop::current()->RunAllPending(); + EXPECT_FLOAT_EQ(0.9f, volume_); + + EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); + EXPECT_FALSE(recording_complete_); + EXPECT_FALSE(recognition_complete_); + recognizer_->CancelRecognition(); +} + +} // namespace speech_input |
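For reference, the unit tests above size each fake audio packet from the recognizer's constants. A minimal standalone sketch of that arithmetic (editor's addition; the constant values are copied from speech_recognizer.cc above, everything else here is illustrative only):

  #include <cassert>

  int main() {
    // Constants as defined in speech_recognizer.cc.
    const int kAudioSampleRate = 16000;      // Hz, mono input
    const int kAudioPacketIntervalMs = 100;  // one OnData() call per 100 ms
    const int kNumAudioChannels = 1;
    const int kNumBitsPerAudioSample = 16;

    // Bytes per packet: samples/s * s/packet * channels * bytes/sample.
    const int packet_bytes =
        (kAudioSampleRate * kAudioPacketIntervalMs * kNumAudioChannels *
         kNumBitsPerAudioSample) / (8 * 1000);
    assert(packet_bytes == 3200);  // 1600 16-bit samples, as in the fixture
    return 0;
  }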