diff options
author | primiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-03-13 23:57:51 +0000 |
---|---|---|
committer | primiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-03-13 23:57:51 +0000 |
commit | fad64e7a123b6ddd2ba8af13441c74f8f37966ee (patch) | |
tree | 28f09e77787e4a77ed30d45743086f12b62f58ee /content/browser/speech | |
parent | 4a5aebb91b0784ef133a926773b0b9e517f288d9 (diff) | |
download | chromium_src-fad64e7a123b6ddd2ba8af13441c74f8f37966ee.zip chromium_src-fad64e7a123b6ddd2ba8af13441c74f8f37966ee.tar.gz chromium_src-fad64e7a123b6ddd2ba8af13441c74f8f37966ee.tar.bz2 |
Added AudioBuffer/AudioChunk abstractions for speech recognition and improved speech_recognizer_impl_unittest.
audio_encoder - Introduced AudioBuffer class in order to hide the current string-based implementation (which involved a lot of dirty and distributed casts) and make room for future implementations based on a circular buffer.
speech_recognizer_impl_unittest
- Created MockAudioManager class, in order to avoid using the true audio manager on trybots, which could lead to errors accessing the audio device.
BUG=116954
TEST=speech_recognizer_impl_unittest should never raise errors related to the audio driver (e.g., device in use, no microphone attached, etc.).
Review URL: http://codereview.chromium.org/9646031
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@126512 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content/browser/speech')
-rw-r--r-- | content/browser/speech/audio_buffer.cc | 91 | ||||
-rw-r--r-- | content/browser/speech/audio_buffer.h | 73 | ||||
-rw-r--r-- | content/browser/speech/audio_encoder.cc | 74 | ||||
-rw-r--r-- | content/browser/speech/audio_encoder.h | 28 | ||||
-rw-r--r-- | content/browser/speech/endpointer/endpointer.cc | 6 | ||||
-rw-r--r-- | content/browser/speech/endpointer/endpointer.h | 5 | ||||
-rw-r--r-- | content/browser/speech/endpointer/endpointer_unittest.cc | 4 | ||||
-rw-r--r-- | content/browser/speech/speech_recognition_request.cc | 5 | ||||
-rw-r--r-- | content/browser/speech/speech_recognition_request.h | 4 | ||||
-rw-r--r-- | content/browser/speech/speech_recognition_request_unittest.cc | 7 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl.cc | 51 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl.h | 4 | ||||
-rw-r--r-- | content/browser/speech/speech_recognizer_impl_unittest.cc | 46 |
13 files changed, 304 insertions, 94 deletions
diff --git a/content/browser/speech/audio_buffer.cc b/content/browser/speech/audio_buffer.cc new file mode 100644 index 0000000..5b887d7 --- /dev/null +++ b/content/browser/speech/audio_buffer.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/logging.h" +#include "base/stl_util.h" +#include "content/browser/speech/audio_buffer.h" + +namespace speech { + +AudioChunk::AudioChunk(int bytes_per_sample) + : bytes_per_sample_(bytes_per_sample) { +} + +AudioChunk::AudioChunk(const uint8* data, size_t length, int bytes_per_sample) + : data_string_(reinterpret_cast<const char*>(data), length), + bytes_per_sample_(bytes_per_sample) { + DCHECK_EQ(length % bytes_per_sample, 0U); +} + +bool AudioChunk::IsEmpty() const { + return data_string_.empty(); +} + +size_t AudioChunk::NumSamples() const { + return data_string_.size() / bytes_per_sample_; +} + +const std::string& AudioChunk::AsString() const { + return data_string_; +} + +int16 AudioChunk::GetSample16(size_t index) const { + DCHECK(index < (data_string_.size() / sizeof(int16))); + return SamplesData16()[index]; +} + +const int16* AudioChunk::SamplesData16() const { + return reinterpret_cast<const int16*>(data_string_.data()); +} + + +AudioBuffer::AudioBuffer(int bytes_per_sample) + : bytes_per_sample_(bytes_per_sample) { + DCHECK(bytes_per_sample == 1 || + bytes_per_sample == 2 || + bytes_per_sample == 4); +} + +AudioBuffer::~AudioBuffer() { + Clear(); +} + +void AudioBuffer::Enqueue(const uint8* data, size_t length) { + AudioChunk* chunk = new AudioChunk(data, length, bytes_per_sample_); + chunks_.push_back(chunk); +} + +scoped_ptr<AudioChunk> AudioBuffer::DequeueSingleChunk() { + DCHECK(!chunks_.empty()); + AudioChunk* chunk = *chunks_.begin(); + chunks_.weak_erase(chunks_.begin()); + return scoped_ptr<AudioChunk>(chunk); +} + +scoped_ptr<AudioChunk> 
AudioBuffer::DequeueAll() { + AudioChunk* chunk = new AudioChunk(bytes_per_sample_); + size_t resulting_length = 0; + ChunksContainer::const_iterator it; + // In order to improve performance, calulate in advance the total length + // and then copy the chunks. + for (it = chunks_.begin(); it != chunks_.end(); ++it) { + resulting_length += (*it)->data_string_.length(); + } + chunk->data_string_.reserve(resulting_length); + for (it = chunks_.begin(); it != chunks_.end(); ++it) { + chunk->data_string_.append((*it)->data_string_); + } + Clear(); + return scoped_ptr<AudioChunk>(chunk); +} + +void AudioBuffer::Clear() { + chunks_.erase(chunks_.begin(), chunks_.end()); +} + +bool AudioBuffer::IsEmpty() const { + return chunks_.empty(); +} + +} // namespace speech diff --git a/content/browser/speech/audio_buffer.h b/content/browser/speech/audio_buffer.h new file mode 100644 index 0000000..c1d5103 --- /dev/null +++ b/content/browser/speech/audio_buffer.h @@ -0,0 +1,73 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_AUDIO_BUFFER_H_ +#define CONTENT_BROWSER_SPEECH_AUDIO_BUFFER_H_ +#pragma once + +#include <string> + +#include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "base/memory/scoped_vector.h" +#include "content/common/content_export.h" + +namespace speech { + +// Models a chunk derived from an AudioBuffer. 
+class CONTENT_EXPORT AudioChunk { + public: + explicit AudioChunk(int bytes_per_sample); + AudioChunk(const uint8* data, size_t length, int bytes_per_sample); + + bool IsEmpty() const; + int bytes_per_sample() const { return bytes_per_sample_; } + size_t NumSamples() const; + const std::string& AsString() const; + int16 GetSample16(size_t index) const; + const int16* SamplesData16() const; + friend class AudioBuffer; + + private: + std::string data_string_; + int bytes_per_sample_; + + DISALLOW_COPY_AND_ASSIGN(AudioChunk); +}; + +// Models an audio buffer. The current implementation relies on on-demand +// allocations of AudioChunk(s) (which uses a string as storage). +class AudioBuffer { + public: + explicit AudioBuffer(int bytes_per_sample); + ~AudioBuffer(); + + // Enqueues a copy of |length| bytes of |data| buffer. + void Enqueue(const uint8* data, size_t length); + + // Dequeues, in FIFO order, a single chunk respecting the length of the + // corresponding Enqueue call (in a nutshell: multiple Enqueue calls followed + // by Dequeue calls will return the individual chunks without merging them). + scoped_ptr<AudioChunk> DequeueSingleChunk(); + + // Dequeues all previously enqueued chunks, merging them in a single chunk. + scoped_ptr<AudioChunk> DequeueAll(); + + // Removes and frees all the enqueued chunks. + void Clear(); + + // Checks whether the buffer is empty. 
+ bool IsEmpty() const; + + private: + typedef ScopedVector<AudioChunk> ChunksContainer; + ChunksContainer chunks_; + int bytes_per_sample_; + + DISALLOW_COPY_AND_ASSIGN(AudioBuffer); +}; + +} // namespace speech + +#endif // CONTENT_BROWSER_SPEECH_AUDIO_BUFFER_H_ diff --git a/content/browser/speech/audio_encoder.cc b/content/browser/speech/audio_encoder.cc index 83e0475..92ccdce 100644 --- a/content/browser/speech/audio_encoder.cc +++ b/content/browser/speech/audio_encoder.cc @@ -9,10 +9,12 @@ #include "base/memory/scoped_ptr.h" #include "base/stl_util.h" #include "base/string_number_conversions.h" +#include "content/browser/speech/audio_buffer.h" #include "third_party/flac/flac.h" #include "third_party/speex/speex.h" using std::string; +using speech::AudioChunk; namespace { @@ -25,8 +27,8 @@ class FLACEncoder : public speech::AudioEncoder { public: FLACEncoder(int sampling_rate, int bits_per_sample); virtual ~FLACEncoder(); - virtual void Encode(const short* samples, int num_samples); - virtual void Flush(); + virtual void Encode(const AudioChunk& raw_audio) OVERRIDE; + virtual void Flush() OVERRIDE; private: static FLAC__StreamEncoderWriteStatus WriteCallback( @@ -52,13 +54,14 @@ FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback( void* client_data) { FLACEncoder* me = static_cast<FLACEncoder*>(client_data); DCHECK(me->encoder_ == encoder); - me->AppendToBuffer(new string(reinterpret_cast<const char*>(buffer), bytes)); + me->encoded_audio_buffer_.Enqueue(buffer, bytes); return FLAC__STREAM_ENCODER_WRITE_STATUS_OK; } FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample) : AudioEncoder(std::string(kContentTypeFLAC) + - base::IntToString(sampling_rate)), + base::IntToString(sampling_rate), + bits_per_sample), encoder_(FLAC__stream_encoder_new()), is_encoder_initialized_(false) { FLAC__stream_encoder_set_channels(encoder_, 1); @@ -75,20 +78,22 @@ FLACEncoder::~FLACEncoder() { FLAC__stream_encoder_delete(encoder_); } -void FLACEncoder::Encode(const 
short* samples, int num_samples) { +void FLACEncoder::Encode(const AudioChunk& raw_audio) { + DCHECK_EQ(raw_audio.bytes_per_sample(), 2); if (!is_encoder_initialized_) { const FLAC__StreamEncoderInitStatus encoder_status = FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL, NULL, this); - DCHECK(encoder_status == FLAC__STREAM_ENCODER_INIT_STATUS_OK); + DCHECK_EQ(encoder_status, FLAC__STREAM_ENCODER_INIT_STATUS_OK); is_encoder_initialized_ = true; } // FLAC encoder wants samples as int32s. + const int num_samples = raw_audio.NumSamples(); scoped_array<FLAC__int32> flac_samples(new FLAC__int32[num_samples]); FLAC__int32* flac_samples_ptr = flac_samples.get(); for (int i = 0; i < num_samples; ++i) - flac_samples_ptr[i] = samples[i]; + flac_samples_ptr[i] = static_cast<FLAC__int32>(raw_audio.GetSample16(i)); FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples); } @@ -109,10 +114,10 @@ COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); class SpeexEncoder : public speech::AudioEncoder { public: - explicit SpeexEncoder(int sampling_rate); + explicit SpeexEncoder(int sampling_rate, int bits_per_sample); virtual ~SpeexEncoder(); - virtual void Encode(const short* samples, int num_samples); - virtual void Flush() {} + virtual void Encode(const AudioChunk& raw_audio) OVERRIDE; + virtual void Flush() OVERRIDE {} private: void* encoder_state_; @@ -122,9 +127,10 @@ class SpeexEncoder : public speech::AudioEncoder { DISALLOW_COPY_AND_ASSIGN(SpeexEncoder); }; -SpeexEncoder::SpeexEncoder(int sampling_rate) +SpeexEncoder::SpeexEncoder(int sampling_rate, int bits_per_sample) : AudioEncoder(std::string(kContentTypeSpeex) + - base::IntToString(sampling_rate)) { + base::IntToString(sampling_rate), + bits_per_sample) { // speex_bits_init() does not initialize all of the |bits_| struct. 
memset(&bits_, 0, sizeof(bits_)); speex_bits_init(&bits_); @@ -144,20 +150,23 @@ SpeexEncoder::~SpeexEncoder() { speex_encoder_destroy(encoder_state_); } -void SpeexEncoder::Encode(const short* samples, int num_samples) { +void SpeexEncoder::Encode(const AudioChunk& raw_audio) { + spx_int16_t* src_buffer = + const_cast<spx_int16_t*>(raw_audio.SamplesData16()); + int num_samples = raw_audio.NumSamples(); // Drop incomplete frames, typically those which come in when recording stops. num_samples -= (num_samples % samples_per_frame_); for (int i = 0; i < num_samples; i += samples_per_frame_) { speex_bits_reset(&bits_); - speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), - &bits_); + speex_encode_int(encoder_state_, src_buffer + i, &bits_); // Encode the frame and place the size of the frame as the first byte. This // is the packet format for MIME type x-speex-with-header-byte. int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, kMaxSpeexFrameLength); encoded_frame_data_[0] = static_cast<char>(frame_length); - AppendToBuffer(new string(encoded_frame_data_, frame_length + 1)); + encoded_audio_buffer_.Enqueue( + reinterpret_cast<uint8*>(&encoded_frame_data_[0]), frame_length + 1); } } @@ -170,39 +179,20 @@ AudioEncoder* AudioEncoder::Create(Codec codec, int bits_per_sample) { if (codec == CODEC_FLAC) return new FLACEncoder(sampling_rate, bits_per_sample); - return new SpeexEncoder(sampling_rate); + return new SpeexEncoder(sampling_rate, bits_per_sample); } -AudioEncoder::AudioEncoder(const std::string& mime_type) - : mime_type_(mime_type) { +AudioEncoder::AudioEncoder(const std::string& mime_type, int bits_per_sample) + : encoded_audio_buffer_(1), /* Byte granularity of encoded samples. 
*/ + mime_type_(mime_type), + bits_per_sample_(bits_per_sample) { } AudioEncoder::~AudioEncoder() { - STLDeleteElements(&audio_buffers_); } -bool AudioEncoder::GetEncodedDataAndClear(std::string* encoded_data) { - if (!audio_buffers_.size()) - return false; - - int audio_buffer_length = 0; - for (AudioBufferQueue::iterator it = audio_buffers_.begin(); - it != audio_buffers_.end(); ++it) { - audio_buffer_length += (*it)->length(); - } - encoded_data->reserve(audio_buffer_length); - for (AudioBufferQueue::iterator it = audio_buffers_.begin(); - it != audio_buffers_.end(); ++it) { - encoded_data->append(*(*it)); - } - - STLDeleteElements(&audio_buffers_); - - return true; -} - -void AudioEncoder::AppendToBuffer(std::string* item) { - audio_buffers_.push_back(item); +scoped_ptr<AudioChunk> AudioEncoder::GetEncodedDataAndClear() { + return encoded_audio_buffer_.DequeueAll(); } } // namespace speech diff --git a/content/browser/speech/audio_encoder.h b/content/browser/speech/audio_encoder.h index 92bc645..65ceb97 100644 --- a/content/browser/speech/audio_encoder.h +++ b/content/browser/speech/audio_encoder.h @@ -9,9 +9,11 @@ #include <string> #include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "content/browser/speech/audio_buffer.h" namespace speech { - +class AudioChunk; // Provides a simple interface to encode raw audio using the various speech // codecs. class AudioEncoder { @@ -27,30 +29,28 @@ class AudioEncoder { virtual ~AudioEncoder(); - // Encodes each frame of raw audio in |samples| to the internal buffer. Use - // |GetEncodedData| to read the result after this call or when recording - // completes. - virtual void Encode(const short* samples, int num_samples) = 0; + // Encodes |raw audio| to the internal buffer. Use + // |GetEncodedDataAndClear| to read the result after this call or when + // audio capture completes. + virtual void Encode(const AudioChunk& raw_audio) = 0; // Finish encoding and flush any pending encoded bits out. 
virtual void Flush() = 0; - // Copies the encoded audio to the given string. Returns true if the output - // is not empty. - bool GetEncodedDataAndClear(std::string* encoded_data); + // Merges, retrieves and clears all the accumulated encoded audio chunks. + scoped_ptr<AudioChunk> GetEncodedDataAndClear(); const std::string& mime_type() { return mime_type_; } + int bits_per_sample() { return bits_per_sample_; } protected: - AudioEncoder(const std::string& mime_type); - - void AppendToBuffer(std::string* item); + AudioEncoder(const std::string& mime_type, int bits_per_sample); + AudioBuffer encoded_audio_buffer_; private: - // Buffer holding the recorded audio. Owns the strings inside the list. - typedef std::list<std::string*> AudioBufferQueue; - AudioBufferQueue audio_buffers_; std::string mime_type_; + int bits_per_sample_; + DISALLOW_COPY_AND_ASSIGN(AudioEncoder); }; diff --git a/content/browser/speech/endpointer/endpointer.cc b/content/browser/speech/endpointer/endpointer.cc index fe3e0bf..b4a54c1 100644 --- a/content/browser/speech/endpointer/endpointer.cc +++ b/content/browser/speech/endpointer/endpointer.cc @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
+#include "content/browser/speech/audio_buffer.h" #include "content/browser/speech/endpointer/endpointer.h" #include "base/time.h" @@ -88,8 +89,9 @@ EpStatus Endpointer::Status(int64 *time) { return energy_endpointer_.Status(time); } -EpStatus Endpointer::ProcessAudio(const int16* audio_data, int num_samples, - float* rms_out) { +EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { + const int16* audio_data = raw_audio.SamplesData16(); + const int num_samples = raw_audio.NumSamples(); EpStatus ep_status = EP_PRE_SPEECH; // Process the input data in blocks of frame_size_, dropping any incomplete diff --git a/content/browser/speech/endpointer/endpointer.h b/content/browser/speech/endpointer/endpointer.h index 9ba2018..89ec3a9 100644 --- a/content/browser/speech/endpointer/endpointer.h +++ b/content/browser/speech/endpointer/endpointer.h @@ -13,6 +13,8 @@ class EpStatus; namespace speech { +class AudioChunk; + // A simple interface to the underlying energy-endpointer implementation, this // class lets callers provide audio as being recorded and let them poll to find // when the user has stopped speaking. @@ -61,8 +63,7 @@ class CONTENT_EXPORT Endpointer { // Process a segment of audio, which may be more than one frame. // The status of the last frame will be returned. - EpStatus ProcessAudio(const int16* audio_data, int num_samples, - float* rms_out); + EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); // Get the status of the endpointer. EpStatus Status(int64 *time_us); diff --git a/content/browser/speech/endpointer/endpointer_unittest.cc b/content/browser/speech/endpointer/endpointer_unittest.cc index 240e5dc..37f2339 100644 --- a/content/browser/speech/endpointer/endpointer_unittest.cc +++ b/content/browser/speech/endpointer/endpointer_unittest.cc @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
+#include "content/browser/speech/audio_buffer.h" #include "content/browser/speech/endpointer/endpointer.h" #include "testing/gtest/include/gtest/gtest.h" @@ -116,7 +117,8 @@ class EndpointerFrameProcessor : public FrameProcessor { : endpointer_(endpointer) {} EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) { - endpointer_->ProcessAudio(samples, kFrameSize, NULL); + AudioChunk frame(reinterpret_cast<uint8*>(samples), kFrameSize * 2, 2); + endpointer_->ProcessAudio(frame, NULL); int64 ep_time; return endpointer_->Status(&ep_time); } diff --git a/content/browser/speech/speech_recognition_request.cc b/content/browser/speech/speech_recognition_request.cc index 89dde84..a143699 100644 --- a/content/browser/speech/speech_recognition_request.cc +++ b/content/browser/speech/speech_recognition_request.cc @@ -10,6 +10,7 @@ #include "base/string_number_conversions.h" #include "base/string_util.h" #include "base/values.h" +#include "content/browser/speech/audio_buffer.h" #include "content/common/net/url_fetcher_impl.h" #include "content/public/common/speech_recognition_result.h" #include "net/base/escape.h" @@ -201,10 +202,10 @@ void SpeechRecognitionRequest::Start(const std::string& language, url_fetcher_->Start(); } -void SpeechRecognitionRequest::UploadAudioChunk(const std::string& audio_data, +void SpeechRecognitionRequest::UploadAudioChunk(const AudioChunk& audio_chunk, bool is_last_chunk) { DCHECK(url_fetcher_.get()); - url_fetcher_->AppendChunkToUpload(audio_data, is_last_chunk); + url_fetcher_->AppendChunkToUpload(audio_chunk.AsString(), is_last_chunk); } void SpeechRecognitionRequest::OnURLFetchComplete( diff --git a/content/browser/speech/speech_recognition_request.h b/content/browser/speech/speech_recognition_request.h index b6ce077..2f29e0a 100644 --- a/content/browser/speech/speech_recognition_request.h +++ b/content/browser/speech/speech_recognition_request.h @@ -27,6 +27,8 @@ class URLRequestContextGetter; namespace speech { +class AudioChunk; 
+ // Provides a simple interface for sending recorded speech data to the server // and get back recognition results. class SpeechRecognitionRequest : public content::URLFetcherDelegate { @@ -61,7 +63,7 @@ class SpeechRecognitionRequest : public content::URLFetcherDelegate { const std::string& content_type); // Send a single chunk of audio immediately to the server. - CONTENT_EXPORT void UploadAudioChunk(const std::string& audio_data, + CONTENT_EXPORT void UploadAudioChunk(const AudioChunk& audio_chunk, bool is_last_chunk); CONTENT_EXPORT bool HasPendingRequest() { return url_fetcher_ != NULL; } diff --git a/content/browser/speech/speech_recognition_request_unittest.cc b/content/browser/speech/speech_recognition_request_unittest.cc index 37b82f8..822e254 100644 --- a/content/browser/speech/speech_recognition_request_unittest.cc +++ b/content/browser/speech/speech_recognition_request_unittest.cc @@ -4,6 +4,7 @@ #include "base/message_loop.h" #include "base/utf_string_conversions.h" +#include "content/browser/speech/audio_buffer.h" #include "content/browser/speech/speech_recognition_request.h" #include "content/public/common/speech_recognition_result.h" #include "content/test/test_url_fetcher_factory.h" @@ -39,7 +40,11 @@ void SpeechRecognitionRequestTest::CreateAndTestRequest( SpeechRecognitionRequest request(NULL, this); request.Start(std::string(), std::string(), false, std::string(), std::string(), std::string()); - request.UploadAudioChunk(std::string(" "), true); + unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'}; + AudioChunk dummy_audio_chunk(&dummy_audio_buffer_data[0], + sizeof(dummy_audio_buffer_data), + 2 /* bytes per sample */); + request.UploadAudioChunk(dummy_audio_chunk, true); TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); ASSERT_TRUE(fetcher); diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc index bbcfbd2..c5342d8 100644 --- 
a/content/browser/speech/speech_recognizer_impl.cc +++ b/content/browser/speech/speech_recognizer_impl.cc @@ -7,6 +7,7 @@ #include "base/bind.h" #include "base/time.h" #include "content/browser/browser_main_loop.h" +#include "content/browser/speech/audio_buffer.h" #include "content/public/browser/speech_recognizer_delegate.h" #include "content/public/browser/browser_thread.h" #include "content/public/common/speech_recognition_result.h" @@ -38,9 +39,11 @@ const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb; const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f; // Returns true if more than 5% of the samples are at min or max value. -bool Clipping(const int16* samples, int num_samples) { - int clipping_samples = 0; +bool DetectClipping(const speech::AudioChunk& chunk) { + const int num_samples = chunk.NumSamples(); + const int16* samples = chunk.SamplesData16(); const int kThreshold = num_samples / 20; + int clipping_samples = 0; for (int i = 0; i < num_samples; ++i) { if (samples[i] <= -32767 || samples[i] >= 32767) { if (++clipping_samples > kThreshold) @@ -174,11 +177,13 @@ void SpeechRecognizerImpl::StopRecording() { // of silence in case encoder had no data already. std::vector<short> samples((kAudioSampleRate * kAudioPacketIntervalMs) / 1000); - encoder_->Encode(&samples[0], samples.size()); + AudioChunk dummy_chunk(reinterpret_cast<uint8*>(&samples[0]), + samples.size() * sizeof(short), + encoder_->bits_per_sample() / 8); + encoder_->Encode(dummy_chunk); encoder_->Flush(); - string encoded_data; - encoder_->GetEncodedDataAndClear(&encoded_data); - DCHECK(!encoded_data.empty()); + scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); + DCHECK(!encoded_data->IsEmpty()); encoder_.reset(); // If we haven't got any audio yet end the recognition sequence here. 
@@ -187,7 +192,7 @@ void SpeechRecognizerImpl::StopRecording() { scoped_refptr<SpeechRecognizerImpl> me(this); delegate_->DidCompleteRecognition(caller_id_); } else { - request_->UploadAudioChunk(encoded_data, true /* is_last_chunk */); + request_->UploadAudioChunk(*encoded_data, true /* is_last_chunk */); } } @@ -215,33 +220,28 @@ void SpeechRecognizerImpl::OnData(AudioInputController* controller, const uint8* data, uint32 size) { if (size == 0) // This could happen when recording stops and is normal. return; - - string* str_data = new string(reinterpret_cast<const char*>(data), size); + AudioChunk* raw_audio = new AudioChunk(data, static_cast<size_t>(size), + kNumBitsPerAudioSample / 8); BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, base::Bind(&SpeechRecognizerImpl::HandleOnData, - this, str_data)); + this, raw_audio)); } -void SpeechRecognizerImpl::HandleOnData(string* data) { +void SpeechRecognizerImpl::HandleOnData(AudioChunk* raw_audio) { + scoped_ptr<AudioChunk> free_raw_audio_on_return(raw_audio); // Check if we are still recording and if not discard this buffer, as // recording might have been stopped after this buffer was posted to the queue // by |OnData|. 
- if (!audio_controller_.get()) { - delete data; + if (!audio_controller_.get()) return; - } bool speech_was_heard_before_packet = endpointer_.DidStartReceivingSpeech(); - const short* samples = reinterpret_cast<const short*>(data->data()); - DCHECK_EQ((data->length() % sizeof(short)), 0U); - int num_samples = data->length() / sizeof(short); - encoder_->Encode(samples, num_samples); + encoder_->Encode(*raw_audio); float rms; - endpointer_.ProcessAudio(samples, num_samples, &rms); - bool did_clip = Clipping(samples, num_samples); - delete data; - num_samples_recorded_ += num_samples; + endpointer_.ProcessAudio(*raw_audio, &rms); + bool did_clip = DetectClipping(*raw_audio); + num_samples_recorded_ += raw_audio->NumSamples(); if (request_ == NULL) { // This was the first audio packet recorded, so start a request to the @@ -252,10 +252,9 @@ void SpeechRecognizerImpl::HandleOnData(string* data) { hardware_info_, origin_url_, encoder_->mime_type()); } - string encoded_data; - encoder_->GetEncodedDataAndClear(&encoded_data); - DCHECK(!encoded_data.empty()); - request_->UploadAudioChunk(encoded_data, false /* is_last_chunk */); + scoped_ptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); + DCHECK(!encoded_data->IsEmpty()); + request_->UploadAudioChunk(*encoded_data, false /* is_last_chunk */); if (endpointer_.IsEstimatingEnvironment()) { // Check if we have gathered enough audio for the endpointer to do diff --git a/content/browser/speech/speech_recognizer_impl.h b/content/browser/speech/speech_recognizer_impl.h index 25e0c0c..eaec3e3 100644 --- a/content/browser/speech/speech_recognizer_impl.h +++ b/content/browser/speech/speech_recognizer_impl.h @@ -75,8 +75,8 @@ class CONTENT_EXPORT SpeechRecognizerImpl void HandleOnError(int error_code); // Handles OnError in the IO thread. - // Handles OnData in the IO thread. Takes ownership of |data|. - void HandleOnData(std::string* data); + // Handles OnData in the IO thread. Takes ownership of |raw_audio|. 
+ void HandleOnData(AudioChunk* raw_audio); // Helper method which closes the audio controller and blocks until done. void CloseAudioControllerSynchronously(); diff --git a/content/browser/speech/speech_recognizer_impl_unittest.cc b/content/browser/speech/speech_recognizer_impl_unittest.cc index cfd2c33..baf3a3e 100644 --- a/content/browser/speech/speech_recognizer_impl_unittest.cc +++ b/content/browser/speech/speech_recognizer_impl_unittest.cc @@ -9,6 +9,8 @@ #include "content/public/browser/speech_recognizer_delegate.h" #include "content/test/test_url_fetcher_factory.h" #include "media/audio/audio_manager.h" +#include "media/audio/fake_audio_input_stream.h" +#include "media/audio/fake_audio_output_stream.h" #include "media/audio/test_audio_input_controller_factory.h" #include "net/base/net_errors.h" #include "net/url_request/url_request_status.h" @@ -20,6 +22,48 @@ using media::AudioInputController; using media::TestAudioInputController; using media::TestAudioInputControllerFactory; +namespace { + +class MockAudioManager : public AudioManager { + public: + MockAudioManager() { + audio_thread_.reset(new base::Thread("MockAudioThread")); + CHECK(audio_thread_->Start()); + } + virtual bool HasAudioOutputDevices() OVERRIDE { return true; } + virtual bool HasAudioInputDevices() OVERRIDE { return true; } + virtual string16 GetAudioInputDeviceModel() OVERRIDE { return string16(); } + virtual bool CanShowAudioInputSettings() OVERRIDE { return false; } + virtual void ShowAudioInputSettings() OVERRIDE {} + virtual void GetAudioInputDeviceNames( + media::AudioDeviceNames* device_names) OVERRIDE {} + virtual AudioOutputStream* MakeAudioOutputStream( + const AudioParameters& params) OVERRIDE { + return FakeAudioOutputStream::MakeFakeStream(params); + } + virtual AudioOutputStream* MakeAudioOutputStreamProxy( + const AudioParameters& params) OVERRIDE { + NOTREACHED(); + return NULL; + } + virtual AudioInputStream* MakeAudioInputStream( + const AudioParameters& params, const 
std::string& device_id) OVERRIDE { + return FakeAudioInputStream::MakeFakeStream(params); + } + virtual void MuteAll() OVERRIDE {} + virtual void UnMuteAll() OVERRIDE {} + virtual bool IsRecordingInProcess() OVERRIDE { return false; } + virtual scoped_refptr<base::MessageLoopProxy> GetMessageLoop() OVERRIDE { + return audio_thread_->message_loop_proxy(); + } + virtual void Init() OVERRIDE {}; + private: + scoped_ptr<base::Thread> audio_thread_; + DISALLOW_COPY_AND_ASSIGN(MockAudioManager); +}; +} // namespace + + namespace speech { class SpeechRecognizerTest : public content::SpeechRecognizerDelegate, @@ -27,7 +71,7 @@ class SpeechRecognizerTest : public content::SpeechRecognizerDelegate, public: SpeechRecognizerTest() : io_thread_(BrowserThread::IO, &message_loop_), - audio_manager_(AudioManager::Create()), + audio_manager_(new MockAudioManager()), recording_complete_(false), recognition_complete_(false), result_received_(false), |