diff options
author | satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-01-17 16:18:21 +0000 |
---|---|---|
committer | satish@chromium.org <satish@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-01-17 16:18:21 +0000 |
commit | 79d58c7d9b15c855b17ee6aef8b0ecd1a931e369 (patch) | |
tree | 1deb80c6b5663c47a114463a574c55ab68716976 /chrome/browser/speech | |
parent | 63b5a598d0be8179129603c508cb1fe33fbc72ea (diff) | |
download | chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.zip chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.tar.gz chromium_src-79d58c7d9b15c855b17ee6aef8b0ecd1a931e369.tar.bz2 |
Add the option of compressing speech input audio using FLAC.
In the process, added a generic AudioEncoder interface whose factory method (AudioEncoder::Create) returns an encoder for the requested codec.
Right now the codec is hard-coded in SpeechRecognizer (this patch initializes it to CODEC_SPEEX; FLAC is available via CODEC_FLAC). In a future CL, we'll determine the codec to use dynamically
based on bandwidth considerations.
This CL depends on http://codereview.chromium.org/6205006/ going in first.
BUG=61677
TEST=none
Review URL: http://codereview.chromium.org/6111009
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@71599 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/speech')
-rw-r--r-- | chrome/browser/speech/audio_encoder.cc | 200 | ||||
-rw-r--r-- | chrome/browser/speech/audio_encoder.h | 59 | ||||
-rw-r--r-- | chrome/browser/speech/speech_recognizer.cc | 126 | ||||
-rw-r--r-- | chrome/browser/speech/speech_recognizer.h | 10 | ||||
-rw-r--r-- | chrome/browser/speech/speech_recognizer_unittest.cc | 4 |
5 files changed, 283 insertions, 116 deletions
diff --git a/chrome/browser/speech/audio_encoder.cc b/chrome/browser/speech/audio_encoder.cc new file mode 100644 index 0000000..f9a934b --- /dev/null +++ b/chrome/browser/speech/audio_encoder.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/browser/speech/audio_encoder.h" + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/scoped_ptr.h" +#include "base/stl_util-inl.h" +#include "base/string_number_conversions.h" +#include "third_party/flac/flac.h" +#include "third_party/speex/speex.h" + +using std::string; + +namespace { + +//-------------------------------- FLACEncoder --------------------------------- + +const char* const kContentTypeFLAC = "audio/x-flac; rate="; +const int kFLACCompressionLevel = 0; // 0 for speed + +class FLACEncoder : public speech_input::AudioEncoder { + public: + FLACEncoder(int sampling_rate, int bits_per_sample); + virtual ~FLACEncoder(); + virtual void Encode(const short* samples, int num_samples); + virtual void Flush(); + + private: + static FLAC__StreamEncoderWriteStatus WriteCallback( + const FLAC__StreamEncoder* encoder, + const FLAC__byte buffer[], + size_t bytes, + unsigned samples, + unsigned current_frame, + void* client_data); + + FLAC__StreamEncoder* encoder_; + bool is_encoder_initialized_; + + DISALLOW_COPY_AND_ASSIGN(FLACEncoder); +}; + +FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback( + const FLAC__StreamEncoder* encoder, + const FLAC__byte buffer[], + size_t bytes, + unsigned samples, + unsigned current_frame, + void* client_data) { + FLACEncoder* me = static_cast<FLACEncoder*>(client_data); + DCHECK(me->encoder_ == encoder); + me->AppendToBuffer(new string(reinterpret_cast<const char*>(buffer), bytes)); + return FLAC__STREAM_ENCODER_WRITE_STATUS_OK; +} + +FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample) + : 
AudioEncoder(std::string(kContentTypeFLAC) + + base::IntToString(sampling_rate)), + encoder_(FLAC__stream_encoder_new()), + is_encoder_initialized_(false) { + FLAC__stream_encoder_set_channels(encoder_, 1); + FLAC__stream_encoder_set_bits_per_sample(encoder_, bits_per_sample); + FLAC__stream_encoder_set_sample_rate(encoder_, sampling_rate); + FLAC__stream_encoder_set_compression_level(encoder_, kFLACCompressionLevel); + + // Initializing the encoder will cause sync bytes to be written to + // its output stream, so we wait until the first call to this method + // before doing so. +} + +FLACEncoder::~FLACEncoder() { + FLAC__stream_encoder_delete(encoder_); +} + +void FLACEncoder::Encode(const short* samples, int num_samples) { + if (!is_encoder_initialized_) { + const FLAC__StreamEncoderInitStatus encoder_status = + FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL, + NULL, this); + DCHECK(encoder_status == FLAC__STREAM_ENCODER_INIT_STATUS_OK); + is_encoder_initialized_ = true; + } + + // FLAC encoder wants samples as int32s. + scoped_ptr<FLAC__int32> flac_samples(new FLAC__int32[num_samples]); + FLAC__int32* flac_samples_ptr = flac_samples.get(); + for (int i = 0; i < num_samples; ++i) + flac_samples_ptr[i] = samples[i]; + + FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples); +} + +void FLACEncoder::Flush() { + FLAC__stream_encoder_finish(encoder_); +} + +//-------------------------------- SpeexEncoder -------------------------------- + +const char* const kContentTypeSpeex = "audio/x-speex-with-header-byte; rate="; +const int kSpeexEncodingQuality = 8; +const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). + +// Since the frame length gets written out as a byte in the encoded packet, +// make sure it is within the byte range. 
+COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); + +class SpeexEncoder : public speech_input::AudioEncoder { + public: + SpeexEncoder(int sampling_rate); + virtual void Encode(const short* samples, int num_samples); + virtual void Flush() {} + + private: + void* encoder_state_; + SpeexBits bits_; + int samples_per_frame_; + char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. + DISALLOW_COPY_AND_ASSIGN(SpeexEncoder); +}; + +SpeexEncoder::SpeexEncoder(int sampling_rate) + : AudioEncoder(std::string(kContentTypeSpeex) + + base::IntToString(sampling_rate)) { + // speex_bits_init() does not initialize all of the |bits_| struct. + memset(&bits_, 0, sizeof(bits_)); + speex_bits_init(&bits_); + encoder_state_ = speex_encoder_init(&speex_wb_mode); + DCHECK(encoder_state_); + speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); + DCHECK(samples_per_frame_ > 0); + int quality = kSpeexEncodingQuality; + speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); + int vbr = 1; + speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); + memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); +} + +void SpeexEncoder::Encode(const short* samples, int num_samples) { + // Drop incomplete frames, typically those which come in when recording stops. + num_samples -= (num_samples % samples_per_frame_); + for (int i = 0; i < num_samples; i += samples_per_frame_) { + speex_bits_reset(&bits_); + speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), + &bits_); + + // Encode the frame and place the size of the frame as the first byte. This + // is the packet format for MIME type x-speex-with-header-byte. 
+ int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, + kMaxSpeexFrameLength); + encoded_frame_data_[0] = static_cast<char>(frame_length); + AppendToBuffer(new string(encoded_frame_data_, frame_length + 1)); + } +} + +} // namespace + +namespace speech_input { + +AudioEncoder* AudioEncoder::Create(Codec codec, + int sampling_rate, + int bits_per_sample) { + if (codec == CODEC_FLAC) + return new FLACEncoder(sampling_rate, bits_per_sample); + return new SpeexEncoder(sampling_rate); +} + +AudioEncoder::AudioEncoder(const std::string& mime_type) + : mime_type_(mime_type) { +} + +AudioEncoder::~AudioEncoder() { + STLDeleteElements(&audio_buffers_); +} + +bool AudioEncoder::GetEncodedData(std::string* encoded_data) { + if (!audio_buffers_.size()) + return false; + + int audio_buffer_length = 0; + for (AudioBufferQueue::iterator it = audio_buffers_.begin(); + it != audio_buffers_.end(); ++it) { + audio_buffer_length += (*it)->length(); + } + encoded_data->reserve(audio_buffer_length); + for (AudioBufferQueue::iterator it = audio_buffers_.begin(); + it != audio_buffers_.end(); ++it) { + encoded_data->append(*(*it)); + } + + return true; +} + +void AudioEncoder::AppendToBuffer(std::string* item) { + audio_buffers_.push_back(item); +} + +} // namespace speech_input diff --git a/chrome/browser/speech/audio_encoder.h b/chrome/browser/speech/audio_encoder.h new file mode 100644 index 0000000..e17a413 --- /dev/null +++ b/chrome/browser/speech/audio_encoder.h @@ -0,0 +1,59 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_ +#define CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_ + +#include <list> +#include <string> + +#include "base/basictypes.h" + +namespace speech_input { + +// Provides a simple interface to encode raw audio using the various speech +// codecs. 
+class AudioEncoder { + public: + enum Codec { + CODEC_FLAC, + CODEC_SPEEX, + }; + + static AudioEncoder* Create(Codec codec, + int sampling_rate, + int bits_per_sample); + + virtual ~AudioEncoder(); + + // Encodes each frame of raw audio in |samples| to the internal buffer. Use + // |GetEncodedData| to read the result after this call or when recording + // completes. + virtual void Encode(const short* samples, int num_samples) = 0; + + // Finish encoding and flush any pending encoded bits out. + virtual void Flush() = 0; + + // Copies the encoded audio to the given string. Returns true if the output + // is not empty. + bool GetEncodedData(std::string* encoded_data); + + const std::string& mime_type() { return mime_type_; } + + protected: + AudioEncoder(const std::string& mime_type); + + void AppendToBuffer(std::string* item); + + private: + // Buffer holding the recorded audio. Owns the strings inside the list. + typedef std::list<std::string*> AudioBufferQueue; + AudioBufferQueue audio_buffers_; + std::string mime_type_; + DISALLOW_COPY_AND_ASSIGN(AudioEncoder); +}; + +} // namespace speech_input + +#endif // CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_ diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc index 277393c..6d46a72 100644 --- a/chrome/browser/speech/speech_recognizer.cc +++ b/chrome/browser/speech/speech_recognizer.cc @@ -10,21 +10,11 @@ #include "chrome/browser/browser_thread.h" #include "chrome/browser/profiles/profile.h" #include "chrome/common/net/url_request_context_getter.h" -#include "third_party/speex/speex.h" using media::AudioInputController; -using std::list; using std::string; namespace { -const char* const kContentTypeSpeex = - "audio/x-speex-with-header-byte; rate=16000"; -const int kSpeexEncodingQuality = 8; -const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). 
- -// Since the frame length gets written out as a byte in the encoded packet, -// make sure it is within the byte range. -COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); // The following constants are related to the volume level indicator shown in // the UI for recorded audio. @@ -45,68 +35,6 @@ const int SpeechRecognizer::kNumBitsPerAudioSample = 16; const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; -// Provides a simple interface to encode raw audio using the Speex codec. -class SpeexEncoder { - public: - SpeexEncoder(); - ~SpeexEncoder(); - - int samples_per_frame() const { return samples_per_frame_; } - - // Encodes each frame of raw audio in |samples| and adds the - // encoded frames as a set of strings to the |encoded_frames| list. - // Ownership of the newly added strings is transferred to the caller. - void Encode(const short* samples, - int num_samples, - std::list<std::string*>* encoded_frames); - - private: - SpeexBits bits_; - void* encoder_state_; - int samples_per_frame_; - char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. -}; - -SpeexEncoder::SpeexEncoder() { - // speex_bits_init() does not initialize all of the |bits_| struct. 
- memset(&bits_, 0, sizeof(bits_)); - speex_bits_init(&bits_); - encoder_state_ = speex_encoder_init(&speex_wb_mode); - DCHECK(encoder_state_); - speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); - DCHECK(samples_per_frame_ > 0); - int quality = kSpeexEncodingQuality; - speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); - int vbr = 1; - speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); - memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); -} - -SpeexEncoder::~SpeexEncoder() { - speex_bits_destroy(&bits_); - speex_encoder_destroy(encoder_state_); -} - -void SpeexEncoder::Encode(const short* samples, - int num_samples, - std::list<std::string*>* encoded_frames) { - // Drop incomplete frames, typically those which come in when recording stops. - num_samples -= (num_samples % samples_per_frame_); - for (int i = 0; i < num_samples; i += samples_per_frame_) { - speex_bits_reset(&bits_); - speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), - &bits_); - - // Encode the frame and place the size of the frame as the first byte. This - // is the packet format for MIME type x-speex-with-header-byte. - int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, - kMaxSpeexFrameLength); - encoded_frame_data_[0] = static_cast<char>(frame_length); - encoded_frames->push_back(new string(encoded_frame_data_, - frame_length + 1)); - } -} - SpeechRecognizer::SpeechRecognizer(Delegate* delegate, int caller_id, const std::string& language, @@ -117,7 +45,8 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate, language_(language), grammar_(grammar), hardware_info_(hardware_info), - encoder_(new SpeexEncoder()), + codec_(AudioEncoder::CODEC_SPEEX), + encoder_(NULL), endpointer_(kAudioSampleRate), num_samples_recorded_(0), audio_level_(0.0f) { @@ -134,7 +63,7 @@ SpeechRecognizer::~SpeechRecognizer() { // |StopRecording| being called. 
DCHECK(!audio_controller_.get()); DCHECK(!request_.get() || !request_->HasPendingRequest()); - DCHECK(audio_buffers_.empty()); + DCHECK(!encoder_.get()); endpointer_.EndSession(); } @@ -142,14 +71,16 @@ bool SpeechRecognizer::StartRecording() { DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); DCHECK(!audio_controller_.get()); DCHECK(!request_.get() || !request_->HasPendingRequest()); + DCHECK(!encoder_.get()); // The endpointer needs to estimate the environment/background noise before // starting to treat the audio as user input. In |HandleOnData| we wait until // such time has passed before switching to user input mode. endpointer_.SetEnvironmentEstimationMode(); + encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, + kNumBitsPerAudioSample)); int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; - DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, kAudioSampleRate, kNumBitsPerAudioSample, samples_per_packet); @@ -174,7 +105,7 @@ void SpeechRecognizer::CancelRecognition() { } VLOG(1) << "SpeechRecognizer canceling recognition."; - ReleaseAudioBuffers(); + encoder_.reset(); request_.reset(); } @@ -189,44 +120,29 @@ void SpeechRecognizer::StopRecording() { VLOG(1) << "SpeechRecognizer stopping record."; audio_controller_->Close(); audio_controller_ = NULL; // Releases the ref ptr. + encoder_->Flush(); delegate_->DidCompleteRecording(caller_id_); - // If we haven't got any audio yet end the recognition sequence here. - if (audio_buffers_.empty()) { - // Guard against the delegate freeing us until we finish our job. - scoped_refptr<SpeechRecognizer> me(this); - delegate_->DidCompleteRecognition(caller_id_); - return; - } - - // We now have recorded audio in our buffers, so start a recognition request. // Since the http request takes a single string as POST data, allocate // one and copy over bytes from the audio buffers to the string. 
- int audio_buffer_length = 0; - for (AudioBufferQueue::iterator it = audio_buffers_.begin(); - it != audio_buffers_.end(); it++) { - audio_buffer_length += (*it)->length(); - } + // And If we haven't got any audio yet end the recognition sequence here. string data; - data.reserve(audio_buffer_length); - for (AudioBufferQueue::iterator it = audio_buffers_.begin(); - it != audio_buffers_.end(); it++) { - data.append(*(*it)); + if (!encoder_->GetEncodedData(&data)) { + // Guard against the delegate freeing us until we finish our job. + scoped_refptr<SpeechRecognizer> me(this); + delegate_->DidCompleteRecognition(caller_id_); + } else { + DCHECK(!request_.get()); + request_.reset(new SpeechRecognitionRequest( + Profile::GetDefaultRequestContext(), this)); + request_->Send(language_, grammar_, hardware_info_, encoder_->mime_type(), + data); } - - DCHECK(!request_.get()); - request_.reset(new SpeechRecognitionRequest( - Profile::GetDefaultRequestContext(), this)); - request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data); - ReleaseAudioBuffers(); // No need to keep the audio anymore. + encoder_.reset(); } void SpeechRecognizer::ReleaseAudioBuffers() { - for (AudioBufferQueue::iterator it = audio_buffers_.begin(); - it != audio_buffers_.end(); it++) - delete *it; - audio_buffers_.clear(); } // Invoked in the audio thread. 
@@ -275,7 +191,7 @@ void SpeechRecognizer::HandleOnData(string* data) { DCHECK((data->length() % sizeof(short)) == 0); int num_samples = data->length() / sizeof(short); - encoder_->Encode(samples, num_samples, &audio_buffers_); + encoder_->Encode(samples, num_samples); float rms; endpointer_.ProcessAudio(samples, num_samples, &rms); delete data; diff --git a/chrome/browser/speech/speech_recognizer.h b/chrome/browser/speech/speech_recognizer.h index cafba28..5e8511f 100644 --- a/chrome/browser/speech/speech_recognizer.h +++ b/chrome/browser/speech/speech_recognizer.h @@ -11,14 +11,13 @@ #include "base/ref_counted.h" #include "base/scoped_ptr.h" +#include "chrome/browser/speech/audio_encoder.h" #include "chrome/browser/speech/endpointer/endpointer.h" #include "chrome/browser/speech/speech_recognition_request.h" #include "media/audio/audio_input_controller.h" namespace speech_input { -class SpeexEncoder; - // Records audio, sends recorded audio to server and translates server response // to recognition result. class SpeechRecognizer @@ -128,13 +127,10 @@ class SpeechRecognizer std::string grammar_; std::string hardware_info_; - // Buffer holding the recorded audio. Owns the strings inside the list. 
- typedef std::list<std::string*> AudioBufferQueue; - AudioBufferQueue audio_buffers_; - scoped_ptr<SpeechRecognitionRequest> request_; scoped_refptr<media::AudioInputController> audio_controller_; - scoped_ptr<SpeexEncoder> encoder_; + AudioEncoder::Codec codec_; + scoped_ptr<AudioEncoder> encoder_; Endpointer endpointer_; int num_samples_recorded_; float audio_level_; diff --git a/chrome/browser/speech/speech_recognizer_unittest.cc b/chrome/browser/speech/speech_recognizer_unittest.cc index 372c48c..05830d5d 100644 --- a/chrome/browser/speech/speech_recognizer_unittest.cc +++ b/chrome/browser/speech/speech_recognizer_unittest.cc @@ -38,10 +38,6 @@ class SpeechRecognizerTest : public SpeechRecognizerDelegate, audio_packet_.resize(audio_packet_length_bytes); } - void StartTest() { - EXPECT_TRUE(recognizer_->StartRecording()); - } - // SpeechRecognizer::Delegate methods. virtual void SetRecognitionResult(int caller_id, bool error, |