summaryrefslogtreecommitdiffstats
path: root/chrome/browser/speech
diff options
context:
space:
mode:
authorsatish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-01-17 16:18:21 +0000
committersatish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2011-01-17 16:18:21 +0000
commit79d58c7d9b15c855b17ee6aef8b0ecd1a931e369 (patch)
tree1deb80c6b5663c47a114463a574c55ab68716976 /chrome/browser/speech
parent63b5a598d0be8179129603c508cb1fe33fbc72ea (diff)
downloadchromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.zip
chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.tar.gz
chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.tar.bz2
Add the option of compressing speech input audio using FLAC.
In the process, added a generic AudioEncoder interface which could create the requested codec. Right now the codec is set to FLAC. In a future CL, we'll determine the codec to use dynamically based on bandwidth considerations. This CL depends on http://codereview.chromium.org/6205006/ going in first. BUG=61677 TEST=none Review URL: http://codereview.chromium.org/6111009 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@71599 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/speech')
-rw-r--r--chrome/browser/speech/audio_encoder.cc200
-rw-r--r--chrome/browser/speech/audio_encoder.h59
-rw-r--r--chrome/browser/speech/speech_recognizer.cc126
-rw-r--r--chrome/browser/speech/speech_recognizer.h10
-rw-r--r--chrome/browser/speech/speech_recognizer_unittest.cc4
5 files changed, 283 insertions, 116 deletions
diff --git a/chrome/browser/speech/audio_encoder.cc b/chrome/browser/speech/audio_encoder.cc
new file mode 100644
index 0000000..f9a934b
--- /dev/null
+++ b/chrome/browser/speech/audio_encoder.cc
@@ -0,0 +1,200 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/speech/audio_encoder.h"
+
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "base/scoped_ptr.h"
+#include "base/stl_util-inl.h"
+#include "base/string_number_conversions.h"
+#include "third_party/flac/flac.h"
+#include "third_party/speex/speex.h"
+
+using std::string;
+
+namespace {
+
+//-------------------------------- FLACEncoder ---------------------------------
+
+// MIME type prefix for FLAC audio; the sampling rate gets appended to it.
+const char kContentTypeFLAC[] = "audio/x-flac; rate=";
+// Compression level passed to the FLAC encoder. 0 for speed.
+const int kFLACCompressionLevel = 0;
+
+// AudioEncoder implementation which compresses audio with the FLAC codec.
+// Encoded bytes are handed to WriteCallback() by the FLAC library and are
+// queued in the base class through AppendToBuffer().
+class FLACEncoder : public speech_input::AudioEncoder {
+ public:
+  FLACEncoder(int sampling_rate, int bits_per_sample);
+  virtual ~FLACEncoder();
+
+  // AudioEncoder methods.
+  virtual void Encode(const short* samples, int num_samples);
+  virtual void Flush();
+
+ private:
+  // Write callback registered with the FLAC stream encoder in Encode().
+  // |client_data| is the FLACEncoder instance which owns |encoder|.
+  static FLAC__StreamEncoderWriteStatus WriteCallback(
+      const FLAC__StreamEncoder* encoder,
+      const FLAC__byte buffer[],
+      size_t bytes,
+      unsigned samples,
+      unsigned current_frame,
+      void* client_data);
+
+  FLAC__StreamEncoder* encoder_;
+  // False until the first Encode() call has initialized the encoder's stream.
+  bool is_encoder_initialized_;
+
+  DISALLOW_COPY_AND_ASSIGN(FLACEncoder);
+};
+
+// Receives |bytes| bytes of encoded output in |buffer| and queues a copy of
+// them on the FLACEncoder passed as |client_data|. Always reports success.
+FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback(
+    const FLAC__StreamEncoder* encoder,
+    const FLAC__byte buffer[],
+    size_t bytes,
+    unsigned samples,
+    unsigned current_frame,
+    void* client_data) {
+  FLACEncoder* const self = static_cast<FLACEncoder*>(client_data);
+  DCHECK(self->encoder_ == encoder);
+
+  const char* const encoded_bytes = reinterpret_cast<const char*>(buffer);
+  self->AppendToBuffer(new string(encoded_bytes, bytes));
+  return FLAC__STREAM_ENCODER_WRITE_STATUS_OK;
+}
+
+// Creates the FLAC stream encoder and configures it for single-channel audio
+// with the given sampling rate and sample width. The MIME type reported to
+// the base class is kContentTypeFLAC followed by |sampling_rate|.
+FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample)
+    : AudioEncoder(std::string(kContentTypeFLAC) +
+                   base::IntToString(sampling_rate)),
+      encoder_(FLAC__stream_encoder_new()),
+      is_encoder_initialized_(false) {
+  // The setters below all operate on |encoder_|, so catch a failed creation
+  // early (same check as done for the Speex encoder state).
+  DCHECK(encoder_);
+  FLAC__stream_encoder_set_channels(encoder_, 1);
+  FLAC__stream_encoder_set_bits_per_sample(encoder_, bits_per_sample);
+  FLAC__stream_encoder_set_sample_rate(encoder_, sampling_rate);
+  FLAC__stream_encoder_set_compression_level(encoder_, kFLACCompressionLevel);
+
+  // Initializing the encoder will cause sync bytes to be written to
+  // its output stream, so we wait until the first call to Encode()
+  // before doing so.
+}
+
+// Releases the FLAC encoder instance created in the constructor.
+FLACEncoder::~FLACEncoder() {
+  FLAC__stream_encoder_delete(encoder_);
+}
+
+// Feeds |num_samples| samples from |samples| to the FLAC encoder. The
+// encoder's output stream is initialized lazily on the first call. Encoded
+// output is delivered through WriteCallback().
+void FLACEncoder::Encode(const short* samples, int num_samples) {
+  if (!is_encoder_initialized_) {
+    const FLAC__StreamEncoderInitStatus encoder_status =
+        FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL,
+                                         NULL, this);
+    DCHECK(encoder_status == FLAC__STREAM_ENCODER_INIT_STATUS_OK);
+    is_encoder_initialized_ = true;
+  }
+
+  // FLAC encoder wants samples as int32s. The buffer is allocated with new[],
+  // so it has to be owned by a scoped_array (which uses delete[]) and not by
+  // a scoped_ptr (which uses plain delete).
+  scoped_array<FLAC__int32> flac_samples(new FLAC__int32[num_samples]);
+  FLAC__int32* flac_samples_ptr = flac_samples.get();
+  for (int i = 0; i < num_samples; ++i)
+    flac_samples_ptr[i] = samples[i];
+
+  // The encoder takes an array of per-channel buffers; there is one channel.
+  FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples);
+}
+
+// Finishes encoding so that any pending encoded output gets written out.
+void FLACEncoder::Flush() {
+  FLAC__stream_encoder_finish(encoder_);
+}
+
+//-------------------------------- SpeexEncoder --------------------------------
+
+// MIME type prefix for Speex audio; the sampling rate gets appended to it.
+const char kContentTypeSpeex[] = "audio/x-speex-with-header-byte; rate=";
+// Quality setting handed to the Speex encoder.
+const int kSpeexEncodingQuality = 8;
+// Largest encoded frame we expect (44kbps rate sampled at 32kHz).
+const int kMaxSpeexFrameLength = 110;
+
+// The frame length is written out as a single byte in the encoded packet, so
+// it has to fit within the byte range.
+COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
+
+// AudioEncoder implementation which compresses audio with the Speex codec.
+// Each encoded frame is stored prefixed with a byte holding its length, which
+// is the packet format for MIME type x-speex-with-header-byte.
+class SpeexEncoder : public speech_input::AudioEncoder {
+ public:
+  explicit SpeexEncoder(int sampling_rate);
+
+  // Releases the Speex bit buffer and encoder state set up by the
+  // constructor; without this they would leak for every recording.
+  virtual ~SpeexEncoder() {
+    speex_bits_destroy(&bits_);
+    speex_encoder_destroy(encoder_state_);
+  }
+
+  // AudioEncoder methods.
+  virtual void Encode(const short* samples, int num_samples);
+  // Nothing to flush, Encode() emits every complete frame right away.
+  virtual void Flush() {}
+
+ private:
+  void* encoder_state_;
+  SpeexBits bits_;
+  // Number of samples in one Speex frame, as reported by the encoder.
+  int samples_per_frame_;
+  char encoded_frame_data_[kMaxSpeexFrameLength + 1];  // +1 for the frame size.
+
+  DISALLOW_COPY_AND_ASSIGN(SpeexEncoder);
+};
+
+// Sets up the Speex bit buffer and a wideband-mode encoder with variable bit
+// rate enabled at quality kSpeexEncodingQuality. The MIME type reported to
+// the base class is kContentTypeSpeex followed by |sampling_rate|.
+SpeexEncoder::SpeexEncoder(int sampling_rate)
+    : AudioEncoder(std::string(kContentTypeSpeex) +
+                   base::IntToString(sampling_rate)) {
+  // speex_bits_init() does not initialize all of the |bits_| struct, so zero
+  // it out beforehand.
+  memset(&bits_, 0, sizeof(bits_));
+  speex_bits_init(&bits_);
+
+  encoder_state_ = speex_encoder_init(&speex_wb_mode);
+  DCHECK(encoder_state_);
+
+  speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
+  DCHECK(samples_per_frame_ > 0);
+
+  int encoding_quality = kSpeexEncodingQuality;
+  speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &encoding_quality);
+
+  int enable_vbr = 1;
+  speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &enable_vbr);
+
+  memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
+}
+
+// Encodes every complete frame found in |samples| and queues each one, with
+// its length byte in front, through AppendToBuffer().
+void SpeexEncoder::Encode(const short* samples, int num_samples) {
+  // Only whole frames are encoded. A trailing incomplete frame, typically one
+  // which comes in when recording stops, is dropped.
+  const int num_frames = num_samples / samples_per_frame_;
+  const short* frame = samples;
+  for (int n = 0; n < num_frames; ++n, frame += samples_per_frame_) {
+    speex_bits_reset(&bits_);
+    speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(frame), &bits_);
+
+    // The encoded frame goes after the first byte, which holds the size of the
+    // frame. This is the packet format for MIME type
+    // x-speex-with-header-byte.
+    char* const payload = encoded_frame_data_ + 1;
+    const int frame_length =
+        speex_bits_write(&bits_, payload, kMaxSpeexFrameLength);
+    encoded_frame_data_[0] = static_cast<char>(frame_length);
+    AppendToBuffer(new string(encoded_frame_data_, frame_length + 1));
+  }
+}
+
+} // namespace
+
+namespace speech_input {
+
+// Returns a newly allocated encoder for |codec|. Any codec other than
+// CODEC_FLAC gets the Speex encoder, which does not use |bits_per_sample|.
+AudioEncoder* AudioEncoder::Create(Codec codec,
+                                   int sampling_rate,
+                                   int bits_per_sample) {
+  if (codec != CODEC_FLAC)
+    return new SpeexEncoder(sampling_rate);
+
+  return new FLACEncoder(sampling_rate, bits_per_sample);
+}
+
+// Remembers the MIME type which is later returned by mime_type().
+AudioEncoder::AudioEncoder(const std::string& mime_type)
+    : mime_type_(mime_type) {}
+
+// Frees every queued chunk; the list owns the strings it points to.
+AudioEncoder::~AudioEncoder() {
+  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
+       it != audio_buffers_.end(); ++it) {
+    delete *it;
+  }
+  audio_buffers_.clear();
+}
+
+// Appends all queued encoded chunks, in the order they were produced, to
+// |encoded_data|. Returns false and leaves |encoded_data| untouched when
+// nothing has been queued. The queued chunks are kept, not consumed.
+bool AudioEncoder::GetEncodedData(std::string* encoded_data) {
+  // empty() is constant time, whereas size() on a list may walk every node.
+  if (audio_buffers_.empty())
+    return false;
+
+  // String lengths are unsigned, so add them up in a size_t rather than an
+  // int.
+  size_t audio_buffer_length = 0;
+  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
+       it != audio_buffers_.end(); ++it) {
+    audio_buffer_length += (*it)->length();
+  }
+
+  // reserve() takes the total capacity, so account for anything the caller
+  // already placed in the string.
+  encoded_data->reserve(encoded_data->size() + audio_buffer_length);
+  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
+       it != audio_buffers_.end(); ++it) {
+    encoded_data->append(*(*it));
+  }
+
+  return true;
+}
+
+// Queues |item| at the end of the encoded audio. The encoder takes ownership
+// of |item| and deletes it on destruction.
+void AudioEncoder::AppendToBuffer(std::string* item) {
+  audio_buffers_.push_back(item);
+}
+
+} // namespace speech_input
diff --git a/chrome/browser/speech/audio_encoder.h b/chrome/browser/speech/audio_encoder.h
new file mode 100644
index 0000000..e17a413
--- /dev/null
+++ b/chrome/browser/speech/audio_encoder.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_
+#define CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_
+
+#include <list>
+#include <string>
+
+#include "base/basictypes.h"
+
+namespace speech_input {
+
+// Provides a simple interface to encode raw audio using the various speech
+// codecs.
+class AudioEncoder {
+ public:
+  enum Codec {
+    CODEC_FLAC,
+    CODEC_SPEEX,
+  };
+
+  // Creates an encoder for |codec|. The returned object is allocated with new
+  // and the caller is responsible for deleting it.
+  static AudioEncoder* Create(Codec codec,
+                              int sampling_rate,
+                              int bits_per_sample);
+
+  virtual ~AudioEncoder();
+
+  // Encodes each frame of raw audio in |samples| to the internal buffer. Use
+  // |GetEncodedData| to read the result after this call or when recording
+  // completes.
+  virtual void Encode(const short* samples, int num_samples) = 0;
+
+  // Finish encoding and flush any pending encoded bits out.
+  virtual void Flush() = 0;
+
+  // Copies the encoded audio to the given string. Returns true if the output
+  // is not empty.
+  bool GetEncodedData(std::string* encoded_data);
+
+  // MIME type describing the encoded audio, as given by the concrete encoder.
+  const std::string& mime_type() const { return mime_type_; }
+
+ protected:
+  explicit AudioEncoder(const std::string& mime_type);
+
+  // Adds a chunk of encoded audio to the internal buffer. Takes ownership of
+  // |item|.
+  void AppendToBuffer(std::string* item);
+
+ private:
+  // Buffer holding the recorded audio. Owns the strings inside the list.
+  typedef std::list<std::string*> AudioBufferQueue;
+  AudioBufferQueue audio_buffers_;
+  std::string mime_type_;
+
+  DISALLOW_COPY_AND_ASSIGN(AudioEncoder);
+};
+
+} // namespace speech_input
+
+#endif // CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_
diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc
index 277393c..6d46a72 100644
--- a/chrome/browser/speech/speech_recognizer.cc
+++ b/chrome/browser/speech/speech_recognizer.cc
@@ -10,21 +10,11 @@
#include "chrome/browser/browser_thread.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
-#include "third_party/speex/speex.h"
using media::AudioInputController;
-using std::list;
using std::string;
namespace {
-const char* const kContentTypeSpeex =
- "audio/x-speex-with-header-byte; rate=16000";
-const int kSpeexEncodingQuality = 8;
-const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz).
-
-// Since the frame length gets written out as a byte in the encoded packet,
-// make sure it is within the byte range.
-COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
@@ -45,68 +35,6 @@ const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;
-// Provides a simple interface to encode raw audio using the Speex codec.
-class SpeexEncoder {
- public:
- SpeexEncoder();
- ~SpeexEncoder();
-
- int samples_per_frame() const { return samples_per_frame_; }
-
- // Encodes each frame of raw audio in |samples| and adds the
- // encoded frames as a set of strings to the |encoded_frames| list.
- // Ownership of the newly added strings is transferred to the caller.
- void Encode(const short* samples,
- int num_samples,
- std::list<std::string*>* encoded_frames);
-
- private:
- SpeexBits bits_;
- void* encoder_state_;
- int samples_per_frame_;
- char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.
-};
-
-SpeexEncoder::SpeexEncoder() {
- // speex_bits_init() does not initialize all of the |bits_| struct.
- memset(&bits_, 0, sizeof(bits_));
- speex_bits_init(&bits_);
- encoder_state_ = speex_encoder_init(&speex_wb_mode);
- DCHECK(encoder_state_);
- speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
- DCHECK(samples_per_frame_ > 0);
- int quality = kSpeexEncodingQuality;
- speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
- int vbr = 1;
- speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
- memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
-}
-
-SpeexEncoder::~SpeexEncoder() {
- speex_bits_destroy(&bits_);
- speex_encoder_destroy(encoder_state_);
-}
-
-void SpeexEncoder::Encode(const short* samples,
- int num_samples,
- std::list<std::string*>* encoded_frames) {
- // Drop incomplete frames, typically those which come in when recording stops.
- num_samples -= (num_samples % samples_per_frame_);
- for (int i = 0; i < num_samples; i += samples_per_frame_) {
- speex_bits_reset(&bits_);
- speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
- &bits_);
-
- // Encode the frame and place the size of the frame as the first byte. This
- // is the packet format for MIME type x-speex-with-header-byte.
- int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
- kMaxSpeexFrameLength);
- encoded_frame_data_[0] = static_cast<char>(frame_length);
- encoded_frames->push_back(new string(encoded_frame_data_,
- frame_length + 1));
- }
-}
-
SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
int caller_id,
const std::string& language,
@@ -117,7 +45,8 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
language_(language),
grammar_(grammar),
hardware_info_(hardware_info),
- encoder_(new SpeexEncoder()),
+ codec_(AudioEncoder::CODEC_SPEEX),
+ encoder_(NULL),
endpointer_(kAudioSampleRate),
num_samples_recorded_(0),
audio_level_(0.0f) {
@@ -134,7 +63,7 @@ SpeechRecognizer::~SpeechRecognizer() {
// |StopRecording| being called.
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
- DCHECK(audio_buffers_.empty());
+ DCHECK(!encoder_.get());
endpointer_.EndSession();
}
@@ -142,14 +71,16 @@ bool SpeechRecognizer::StartRecording() {
DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
+ DCHECK(!encoder_.get());
// The endpointer needs to estimate the environment/background noise before
// starting to treat the audio as user input. In |HandleOnData| we wait until
// such time has passed before switching to user input mode.
endpointer_.SetEnvironmentEstimationMode();
+ encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,
+ kNumBitsPerAudioSample));
int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
- DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
kAudioSampleRate, kNumBitsPerAudioSample,
samples_per_packet);
@@ -174,7 +105,7 @@ void SpeechRecognizer::CancelRecognition() {
}
VLOG(1) << "SpeechRecognizer canceling recognition.";
- ReleaseAudioBuffers();
+ encoder_.reset();
request_.reset();
}
@@ -189,44 +120,29 @@ void SpeechRecognizer::StopRecording() {
VLOG(1) << "SpeechRecognizer stopping record.";
audio_controller_->Close();
audio_controller_ = NULL; // Releases the ref ptr.
+ encoder_->Flush();
delegate_->DidCompleteRecording(caller_id_);
- // If we haven't got any audio yet end the recognition sequence here.
- if (audio_buffers_.empty()) {
- // Guard against the delegate freeing us until we finish our job.
- scoped_refptr<SpeechRecognizer> me(this);
- delegate_->DidCompleteRecognition(caller_id_);
- return;
- }
-
- // We now have recorded audio in our buffers, so start a recognition request.
// Since the http request takes a single string as POST data, allocate
// one and copy over bytes from the audio buffers to the string.
- int audio_buffer_length = 0;
- for (AudioBufferQueue::iterator it = audio_buffers_.begin();
- it != audio_buffers_.end(); it++) {
- audio_buffer_length += (*it)->length();
- }
+ // If no audio has been encoded yet, end the recognition sequence here.
string data;
- data.reserve(audio_buffer_length);
- for (AudioBufferQueue::iterator it = audio_buffers_.begin();
- it != audio_buffers_.end(); it++) {
- data.append(*(*it));
+ if (!encoder_->GetEncodedData(&data)) {
+ // Guard against the delegate freeing us until we finish our job.
+ scoped_refptr<SpeechRecognizer> me(this);
+ delegate_->DidCompleteRecognition(caller_id_);
+ } else {
+ DCHECK(!request_.get());
+ request_.reset(new SpeechRecognitionRequest(
+ Profile::GetDefaultRequestContext(), this));
+ request_->Send(language_, grammar_, hardware_info_, encoder_->mime_type(),
+ data);
}
-
- DCHECK(!request_.get());
- request_.reset(new SpeechRecognitionRequest(
- Profile::GetDefaultRequestContext(), this));
- request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data);
- ReleaseAudioBuffers(); // No need to keep the audio anymore.
+ encoder_.reset();
}
void SpeechRecognizer::ReleaseAudioBuffers() {
- for (AudioBufferQueue::iterator it = audio_buffers_.begin();
- it != audio_buffers_.end(); it++)
- delete *it;
- audio_buffers_.clear();
}
// Invoked in the audio thread.
@@ -275,7 +191,7 @@ void SpeechRecognizer::HandleOnData(string* data) {
DCHECK((data->length() % sizeof(short)) == 0);
int num_samples = data->length() / sizeof(short);
- encoder_->Encode(samples, num_samples, &audio_buffers_);
+ encoder_->Encode(samples, num_samples);
float rms;
endpointer_.ProcessAudio(samples, num_samples, &rms);
delete data;
diff --git a/chrome/browser/speech/speech_recognizer.h b/chrome/browser/speech/speech_recognizer.h
index cafba28..5e8511f 100644
--- a/chrome/browser/speech/speech_recognizer.h
+++ b/chrome/browser/speech/speech_recognizer.h
@@ -11,14 +11,13 @@
#include "base/ref_counted.h"
#include "base/scoped_ptr.h"
+#include "chrome/browser/speech/audio_encoder.h"
#include "chrome/browser/speech/endpointer/endpointer.h"
#include "chrome/browser/speech/speech_recognition_request.h"
#include "media/audio/audio_input_controller.h"
namespace speech_input {
-class SpeexEncoder;
-
// Records audio, sends recorded audio to server and translates server response
// to recognition result.
class SpeechRecognizer
@@ -128,13 +127,10 @@ class SpeechRecognizer
std::string grammar_;
std::string hardware_info_;
- // Buffer holding the recorded audio. Owns the strings inside the list.
- typedef std::list<std::string*> AudioBufferQueue;
- AudioBufferQueue audio_buffers_;
-
scoped_ptr<SpeechRecognitionRequest> request_;
scoped_refptr<media::AudioInputController> audio_controller_;
- scoped_ptr<SpeexEncoder> encoder_;
+ AudioEncoder::Codec codec_;
+ scoped_ptr<AudioEncoder> encoder_;
Endpointer endpointer_;
int num_samples_recorded_;
float audio_level_;
diff --git a/chrome/browser/speech/speech_recognizer_unittest.cc b/chrome/browser/speech/speech_recognizer_unittest.cc
index 372c48c..05830d5d 100644
--- a/chrome/browser/speech/speech_recognizer_unittest.cc
+++ b/chrome/browser/speech/speech_recognizer_unittest.cc
@@ -38,10 +38,6 @@ class SpeechRecognizerTest : public SpeechRecognizerDelegate,
audio_packet_.resize(audio_packet_length_bytes);
}
- void StartTest() {
- EXPECT_TRUE(recognizer_->StartRecording());
- }
-
// SpeechRecognizer::Delegate methods.
virtual void SetRecognitionResult(int caller_id,
bool error,