Add the option of compressing speech input audio using FLAC.

In the process, added a generic AudioEncoder interface which could create the requested codec. Right now the codec is set to FLAC. In a future CL, we'll determine the codec to use dynamically based on bandwidth considerations. This CL depends on http://codereview.chromium.org/6205006/ going in first. BUG=61677 TEST=none Review URL: http://codereview.chromium.org/6111009 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@71599 0039d316-1c4b-4281-b951-d872f2087c98
author: satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-01-17 16:18:21 +0000
committer: satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-01-17 16:18:21 +0000
commit: 79d58c7d9b15c855b17ee6aef8b0ecd1a931e369 (patch)
tree: 1deb80c6b5663c47a114463a574c55ab68716976 /chrome/browser/speech
parent: 63b5a598d0be8179129603c508cb1fe33fbc72ea (diff)
download: chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.zip
chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.tar.gz
chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.tar.bz2
5 files changed, 283 insertions, 116 deletions
diff --git a/chrome/browser/speech/audio_encoder.cc b/chrome/browser/speech/audio_encoder.cc
new file mode 100644
index 0000000..f9a934b
--- /dev/null
+++ b/chrome/browser/speech/audio_encoder.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/speech/audio_encoder.h"
+
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "base/scoped_ptr.h"
+#include "base/stl_util-inl.h"
+#include "base/string_number_conversions.h"
+#include "third_party/flac/flac.h"
+#include "third_party/speex/speex.h"
+
+using std::string;
+
+namespace {
+
+//-------------------------------- FLACEncoder ---------------------------------
+
+const char* const kContentTypeFLAC = "audio/x-flac; rate=";
+const int kFLACCompressionLevel = 0;  // 0 for speed
+
+// Encodes single-channel PCM audio as FLAC. libFLAC delivers the encoded bytes
+// through WriteCallback(), which appends them to the AudioEncoder buffer.
+class FLACEncoder : public speech_input::AudioEncoder {
+ public:
+  FLACEncoder(int sampling_rate, int bits_per_sample);
+  virtual ~FLACEncoder();
+  virtual void Encode(const short* samples, int num_samples);
+  virtual void Flush();
+
+ private:
+  // Write callback registered with libFLAC in Encode(). |client_data| is the
+  // FLACEncoder that registered it.
+  static FLAC__StreamEncoderWriteStatus WriteCallback(
+      const FLAC__StreamEncoder* encoder,
+      const FLAC__byte buffer[],
+      size_t bytes,
+      unsigned samples,
+      unsigned current_frame,
+      void* client_data);
+
+  FLAC__StreamEncoder* encoder_;
+  bool is_encoder_initialized_;  // True once Encode() has set up the stream.
+
+  DISALLOW_COPY_AND_ASSIGN(FLACEncoder);
+};
+
+FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback(
+    const FLAC__StreamEncoder* encoder,
+    const FLAC__byte buffer[],
+    size_t bytes,
+    unsigned samples,
+    unsigned current_frame,
+    void* client_data) {
+  FLACEncoder* me = static_cast<FLACEncoder*>(client_data);
+  DCHECK(me->encoder_ == encoder);
+  // Each chunk libFLAC emits becomes one heap string owned by the base class.
+  me->AppendToBuffer(new string(reinterpret_cast<const char*>(buffer), bytes));
+  return FLAC__STREAM_ENCODER_WRITE_STATUS_OK;
+}
+
+FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample)
+    : AudioEncoder(std::string(kContentTypeFLAC) +
+                   base::IntToString(sampling_rate)),
+      encoder_(FLAC__stream_encoder_new()),
+      is_encoder_initialized_(false) {
+  FLAC__stream_encoder_set_channels(encoder_, 1);
+  FLAC__stream_encoder_set_bits_per_sample(encoder_, bits_per_sample);
+  FLAC__stream_encoder_set_sample_rate(encoder_, sampling_rate);
+  FLAC__stream_encoder_set_compression_level(encoder_, kFLACCompressionLevel);
+
+  // Initializing the encoder will cause sync bytes to be written to
+  // its output stream, so we wait until the first call to Encode()
+  // before doing so.
+}
+
+FLACEncoder::~FLACEncoder() {
+  FLAC__stream_encoder_delete(encoder_);
+}
+
+void FLACEncoder::Encode(const short* samples, int num_samples) {
+  if (!is_encoder_initialized_) {
+    const FLAC__StreamEncoderInitStatus encoder_status =
+        FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL,
+                                         NULL, this);
+    DCHECK(encoder_status == FLAC__STREAM_ENCODER_INIT_STATUS_OK);
+    is_encoder_initialized_ = true;
+  }
+
+  // FLAC encoder wants samples as int32s. The copy is allocated with new[],
+  // so hold it in a scoped_array, which releases it with delete[].
+  scoped_array<FLAC__int32> flac_samples(new FLAC__int32[num_samples]);
+  FLAC__int32* flac_samples_ptr = flac_samples.get();
+  for (int i = 0; i < num_samples; ++i)
+    flac_samples_ptr[i] = samples[i];
+
+  // A single channel, so the per-channel array is just the one pointer.
+  FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples);
+}
+
+void FLACEncoder::Flush() {
+  FLAC__stream_encoder_finish(encoder_);
+}
+
+//-------------------------------- SpeexEncoder --------------------------------
+
+const char* const kContentTypeSpeex = "audio/x-speex-with-header-byte; rate=";
+const int kSpeexEncodingQuality = 8;
+const int kMaxSpeexFrameLength = 110;  // (44kbps rate sampled at 32kHz).
+
+// Since the frame length gets written out as a byte in the encoded packet,
+// make sure it is within the byte range.
+COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
+
+// Encodes audio with the Speex wideband mode, one length-prefixed packet per
+// frame.
+class SpeexEncoder : public speech_input::AudioEncoder {
+ public:
+  explicit SpeexEncoder(int sampling_rate);
+  virtual ~SpeexEncoder();
+  virtual void Encode(const short* samples, int num_samples);
+  virtual void Flush() {}
+
+ private:
+  void* encoder_state_;
+  SpeexBits bits_;
+  int samples_per_frame_;
+  char encoded_frame_data_[kMaxSpeexFrameLength + 1];  // +1 for the frame size.
+  DISALLOW_COPY_AND_ASSIGN(SpeexEncoder);
+};
+
+SpeexEncoder::SpeexEncoder(int sampling_rate)
+    : AudioEncoder(std::string(kContentTypeSpeex) +
+                   base::IntToString(sampling_rate)) {
+  // speex_bits_init() does not initialize all of the |bits_| struct.
+  memset(&bits_, 0, sizeof(bits_));
+  speex_bits_init(&bits_);
+  encoder_state_ = speex_encoder_init(&speex_wb_mode);
+  DCHECK(encoder_state_);
+  speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
+  DCHECK(samples_per_frame_ > 0);
+  int quality = kSpeexEncodingQuality;
+  speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
+  int vbr = 1;
+  speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
+  memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
+}
+
+// Releases the Speex state allocated in the constructor.
+SpeexEncoder::~SpeexEncoder() {
+  speex_bits_destroy(&bits_);
+  speex_encoder_destroy(encoder_state_);
+}
+
+void SpeexEncoder::Encode(const short* samples, int num_samples) {
+  // Drop incomplete frames, typically those which come in when recording stops.
+  num_samples -= (num_samples % samples_per_frame_);
+  for (int i = 0; i < num_samples; i += samples_per_frame_) {
+    speex_bits_reset(&bits_);
+    speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
+                     &bits_);
+
+    // Encode the frame and place the size of the frame as the first byte. This
+    // is the packet format for MIME type x-speex-with-header-byte.
+    int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
+                                        kMaxSpeexFrameLength);
+    encoded_frame_data_[0] = static_cast<char>(frame_length);
+    AppendToBuffer(new string(encoded_frame_data_, frame_length + 1));
+  }
+}
+
+}  // namespace
+
+namespace speech_input {
+
+// Creates the encoder for |codec|; any value other than CODEC_FLAC yields a
+// Speex encoder. |bits_per_sample| is only used by the FLAC encoder.
+AudioEncoder* AudioEncoder::Create(Codec codec,
+                                   int sampling_rate,
+                                   int bits_per_sample) {
+  if (codec == CODEC_FLAC)
+    return new FLACEncoder(sampling_rate, bits_per_sample);
+  return new SpeexEncoder(sampling_rate);
+}
+
+AudioEncoder::AudioEncoder(const std::string& mime_type)
+    : mime_type_(mime_type) {
+}
+
+AudioEncoder::~AudioEncoder() {
+  STLDeleteElements(&audio_buffers_);
+}
+
+bool AudioEncoder::GetEncodedData(std::string* encoded_data) {
+  if (audio_buffers_.empty())
+    return false;
+
+  // Sum the chunk sizes first so the output string grows only once.
+  size_t audio_buffer_length = 0;
+  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
+       it != audio_buffers_.end(); ++it) {
+    audio_buffer_length += (*it)->length();
+  }
+  // Chunks are appended after whatever |encoded_data| already holds.
+  encoded_data->reserve(encoded_data->size() + audio_buffer_length);
+  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
+       it != audio_buffers_.end(); ++it) {
+    encoded_data->append(*(*it));
+  }
+
+  return true;
+}
+
+// Takes ownership of |item|; it is deleted in the destructor.
+void AudioEncoder::AppendToBuffer(std::string* item) {
+  audio_buffers_.push_back(item);
+}
+
+}  // namespace speech_input
diff --git a/chrome/browser/speech/audio_encoder.h b/chrome/browser/speech/audio_encoder.h
new file mode 100644
index 0000000..e17a413
--- /dev/null
+++ b/chrome/browser/speech/audio_encoder.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_
+#define CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_
+
+#include <list>
+#include <string>
+
+#include "base/basictypes.h"
+
+namespace speech_input {
+
+// Provides a simple interface to encode raw audio using the various speech
+// codecs.
+class AudioEncoder {
+ public:
+  enum Codec {
+    CODEC_FLAC,
+    CODEC_SPEEX,
+  };
+
+  // Creates an encoder for |codec|. The caller owns the returned object.
+  static AudioEncoder* Create(Codec codec,
+                              int sampling_rate,
+                              int bits_per_sample);
+
+  virtual ~AudioEncoder();
+
+  // Encodes each frame of raw audio in |samples| to the internal buffer. Use
+  // |GetEncodedData| to read the result after this call or when recording
+  // completes.
+  virtual void Encode(const short* samples, int num_samples) = 0;
+
+  // Finish encoding and flush any pending encoded bits out.
+  virtual void Flush() = 0;
+
+  // Appends the encoded audio to the given string. Returns false, leaving the
+  // string untouched, if nothing has been encoded yet.
+  bool GetEncodedData(std::string* encoded_data);
+
+  // MIME type describing the encoded audio, as passed to the constructor.
+  const std::string& mime_type() const { return mime_type_; }
+
+ protected:
+  explicit AudioEncoder(const std::string& mime_type);
+
+  // Adds |item| to the encoded output. Takes ownership of |item|.
+  void AppendToBuffer(std::string* item);
+
+ private:
+  // Buffer holding the recorded audio. Owns the strings inside the list.
+  typedef std::list<std::string*> AudioBufferQueue;
+  AudioBufferQueue audio_buffers_;
+  std::string mime_type_;
+  DISALLOW_COPY_AND_ASSIGN(AudioEncoder);
+};
+
+}  // namespace speech_input
+
+#endif  // CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index 277393c..6d46a72 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -10,21 +10,11 @@
 #include "chrome/browser/browser_thread.h"
 #include "chrome/browser/profiles/profile.h"
 #include "chrome/common/net/url_request_context_getter.h"
-#include "third_party/speex/speex.h"
 
 using media::AudioInputController;
-using std::list;
 using std::string;
 
 namespace {
-const char* const kContentTypeSpeex =
-    "audio/x-speex-with-header-byte; rate=16000";
-const int kSpeexEncodingQuality = 8;
-const int kMaxSpeexFrameLength = 110;  // (44kbps rate sampled at 32kHz).
-
-// Since the frame length gets written out as a byte in the encoded packet,
-// make sure it is within the byte range.
-COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
 
 // The following constants are related to the volume level indicator shown in
 // the UI for recorded audio.
@@ -45,68 +35,6 @@ const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
 const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
 const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;
 
-// Provides a simple interface to encode raw audio using the Speex codec.
-class SpeexEncoder {
- public:
-  SpeexEncoder();
-  ~SpeexEncoder();
-
-  int samples_per_frame() const { return samples_per_frame_; }
-
-  // Encodes each frame of raw audio in |samples| and adds the
-  // encoded frames as a set of strings to the |encoded_frames| list.
-  // Ownership of the newly added strings is transferred to the caller.
-  void Encode(const short* samples,
-              int num_samples,
-              std::list<std::string*>* encoded_frames);
-
- private:
-  SpeexBits bits_;
-  void* encoder_state_;
-  int samples_per_frame_;
-  char encoded_frame_data_[kMaxSpeexFrameLength + 1];  // +1 for the frame size.
-};
-
-SpeexEncoder::SpeexEncoder() {
-  // speex_bits_init() does not initialize all of the |bits_| struct.
-  memset(&bits_, 0, sizeof(bits_));
-  speex_bits_init(&bits_);
-  encoder_state_ = speex_encoder_init(&speex_wb_mode);
-  DCHECK(encoder_state_);
-  speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
-  DCHECK(samples_per_frame_ > 0);
-  int quality = kSpeexEncodingQuality;
-  speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
-  int vbr = 1;
-  speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
-  memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
-}
-
-SpeexEncoder::~SpeexEncoder() {
-  speex_bits_destroy(&bits_);
-  speex_encoder_destroy(encoder_state_);
-}
-
-void SpeexEncoder::Encode(const short* samples,
-                          int num_samples,
-                          std::list<std::string*>* encoded_frames) {
-  // Drop incomplete frames, typically those which come in when recording stops.
-  num_samples -= (num_samples % samples_per_frame_);
-  for (int i = 0; i < num_samples; i += samples_per_frame_) {
-    speex_bits_reset(&bits_);
-    speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
-                     &bits_);
-
-    // Encode the frame and place the size of the frame as the first byte. This
-    // is the packet format for MIME type x-speex-with-header-byte.
-    int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
-                                        kMaxSpeexFrameLength);
-    encoded_frame_data_[0] = static_cast<char>(frame_length);
-    encoded_frames->push_back(new string(encoded_frame_data_,
-                                         frame_length + 1));
-  }
-}
-
 SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
                                    int caller_id,
                                    const std::string& language,
@@ -117,7 +45,8 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
       language_(language),
       grammar_(grammar),
       hardware_info_(hardware_info),
-      encoder_(new SpeexEncoder()),
+      codec_(AudioEncoder::CODEC_SPEEX),
+      encoder_(NULL),
       endpointer_(kAudioSampleRate),
       num_samples_recorded_(0),
       audio_level_(0.0f) {
@@ -134,7 +63,7 @@ SpeechRecognizer::~SpeechRecognizer() {
   // |StopRecording| being called.
   DCHECK(!audio_controller_.get());
   DCHECK(!request_.get() || !request_->HasPendingRequest());
-  DCHECK(audio_buffers_.empty());
+  DCHECK(!encoder_.get());
   endpointer_.EndSession();
 }
 
@@ -142,14 +71,16 @@ bool SpeechRecognizer::StartRecording() {
   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
   DCHECK(!audio_controller_.get());
   DCHECK(!request_.get() || !request_->HasPendingRequest());
+  DCHECK(!encoder_.get());
 
   // The endpointer needs to estimate the environment/background noise before
   // starting to treat the audio as user input. In |HandleOnData| we wait until
   // such time has passed before switching to user input mode.
   endpointer_.SetEnvironmentEstimationMode();
 
+  encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,
+                                      kNumBitsPerAudioSample));
   int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
-  DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
   AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
                          kAudioSampleRate, kNumBitsPerAudioSample,
                          samples_per_packet);
@@ -174,7 +105,7 @@ void SpeechRecognizer::CancelRecognition() {
   }
 
   VLOG(1) << "SpeechRecognizer canceling recognition.";
-  ReleaseAudioBuffers();
+  encoder_.reset();
   request_.reset();
 }
 
@@ -189,44 +120,29 @@ void SpeechRecognizer::StopRecording() {
   VLOG(1) << "SpeechRecognizer stopping record.";
   audio_controller_->Close();
   audio_controller_ = NULL;  // Releases the ref ptr.
+  encoder_->Flush();
 
   delegate_->DidCompleteRecording(caller_id_);
 
-  // If we haven't got any audio yet end the recognition sequence here.
-  if (audio_buffers_.empty()) {
-    // Guard against the delegate freeing us until we finish our job.
-    scoped_refptr<SpeechRecognizer> me(this);
-    delegate_->DidCompleteRecognition(caller_id_);
-    return;
-  }
-
-  // We now have recorded audio in our buffers, so start a recognition request.
   // Since the http request takes a single string as POST data, allocate
   // one and copy over bytes from the audio buffers to the string.
-  int audio_buffer_length = 0;
-  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
-       it != audio_buffers_.end(); it++) {
-    audio_buffer_length += (*it)->length();
-  }
+  // If we haven't got any audio yet, end the recognition sequence here.
   string data;
-  data.reserve(audio_buffer_length);
-  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
-       it != audio_buffers_.end(); it++) {
-    data.append(*(*it));
+  if (!encoder_->GetEncodedData(&data)) {
+    // Guard against the delegate freeing us until we finish our job.
+    scoped_refptr<SpeechRecognizer> me(this);
+    delegate_->DidCompleteRecognition(caller_id_);
+  } else {
+    DCHECK(!request_.get());
+    request_.reset(new SpeechRecognitionRequest(
+        Profile::GetDefaultRequestContext(), this));
+    request_->Send(language_, grammar_, hardware_info_, encoder_->mime_type(),
+                   data);
   }
-
-  DCHECK(!request_.get());
-  request_.reset(new SpeechRecognitionRequest(
-      Profile::GetDefaultRequestContext(), this));
-  request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data);
-  ReleaseAudioBuffers();  // No need to keep the audio anymore.
+  encoder_.reset();
 }
 
 void SpeechRecognizer::ReleaseAudioBuffers() {
-  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
-       it != audio_buffers_.end(); it++)
-    delete *it;
-  audio_buffers_.clear();
 }
 
 // Invoked in the audio thread.
@@ -275,7 +191,7 @@ void SpeechRecognizer::HandleOnData(string* data) {
   DCHECK((data->length() % sizeof(short)) == 0);
   int num_samples = data->length() / sizeof(short);
 
-  encoder_->Encode(samples, num_samples, &audio_buffers_);
+  encoder_->Encode(samples, num_samples);
   float rms;
   endpointer_.ProcessAudio(samples, num_samples, &rms);
   delete data;
diff --git a/chrome/browser/speech/speech_recognizer.h b/chrome/browser/speech/speech_recognizer.h
index cafba28..5e8511f 100644
--- a/chrome/browser/speech/speech_recognizer.h
+++ b/chrome/browser/speech/speech_recognizer.h
@@ -11,14 +11,13 @@
 
 #include "base/ref_counted.h"
 #include "base/scoped_ptr.h"
+#include "chrome/browser/speech/audio_encoder.h"
 #include "chrome/browser/speech/endpointer/endpointer.h"
 #include "chrome/browser/speech/speech_recognition_request.h"
 #include "media/audio/audio_input_controller.h"
 
 namespace speech_input {
 
-class SpeexEncoder;
-
 // Records audio, sends recorded audio to server and translates server response
 // to recognition result.
 class SpeechRecognizer
@@ -128,13 +127,10 @@ class SpeechRecognizer
   std::string grammar_;
   std::string hardware_info_;
 
-  // Buffer holding the recorded audio. Owns the strings inside the list.
-  typedef std::list<std::string*> AudioBufferQueue;
-  AudioBufferQueue audio_buffers_;
-
   scoped_ptr<SpeechRecognitionRequest> request_;
   scoped_refptr<media::AudioInputController> audio_controller_;
-  scoped_ptr<SpeexEncoder> encoder_;
+  AudioEncoder::Codec codec_;
+  scoped_ptr<AudioEncoder> encoder_;
   Endpointer endpointer_;
   int num_samples_recorded_;
   float audio_level_;
diff --git a/chrome/browser/speech/speech_recognizer_unittest.cc b/chrome/browser/speech/speech_recognizer_unittest.cc
index 372c48c..05830d5d 100644
--- a/chrome/browser/speech/speech_recognizer_unittest.cc
+++ b/chrome/browser/speech/speech_recognizer_unittest.cc
@@ -38,10 +38,6 @@ class SpeechRecognizerTest : public SpeechRecognizerDelegate,
     audio_packet_.resize(audio_packet_length_bytes);
   }
 
-  void StartTest() {
-    EXPECT_TRUE(recognizer_->StartRecording());
-  }
-
   // SpeechRecognizer::Delegate methods.
   virtual void SetRecognitionResult(int caller_id,
                                     bool error,
author	satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-01-17 16:18:21 +0000
committer	satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-01-17 16:18:21 +0000
commit	79d58c7d9b15c855b17ee6aef8b0ecd1a931e369 (patch)
tree	1deb80c6b5663c47a114463a574c55ab68716976 /chrome/browser/speech
parent	63b5a598d0be8179129603c508cb1fe33fbc72ea (diff)
download	chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.zip chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.tar.gz chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.tar.bz2