summaryrefslogtreecommitdiffstats
path: root/content
diff options
context:
space:
mode:
authorhenrika@chromium.org <henrika@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-06-17 14:41:38 +0000
committerhenrika@chromium.org <henrika@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-06-17 14:41:38 +0000
commit2e50f6d799fac2b15198f6635a2f0b5455fbad10 (patch)
tree9ac7d0760652aec1de1c919810d6131429386ba8 /content
parenta96e6fd7c438f27a553e46b94f52019aa4120a30 (diff)
downloadchromium_src-2e50f6d799fac2b15198f6635a2f0b5455fbad10.zip
chromium_src-2e50f6d799fac2b15198f6635a2f0b5455fbad10.tar.gz
chromium_src-2e50f6d799fac2b15198f6635a2f0b5455fbad10.tar.bz2
Moves WebSpeech to the low-latency audio backend.
Also uses https://codereview.chromium.org/16955009 to resolve iOS linking issues. BUG=112472 TEST=https://www.google.com/intl/en/chrome/demos/speech.html and content_unittests on Linux, Mac and Windows. Review URL: https://chromiumcodereview.appspot.com/16658006 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@206723 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content')
-rw-r--r--content/browser/speech/speech_recognizer_impl.cc180
-rw-r--r--content/browser/speech/speech_recognizer_impl.h6
-rw-r--r--content/browser/speech/speech_recognizer_impl_unittest.cc2
3 files changed, 174 insertions, 14 deletions
diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc
index d6902da..ddb255e 100644
--- a/content/browser/speech/speech_recognizer_impl.cc
+++ b/content/browser/speech/speech_recognizer_impl.cc
@@ -10,16 +10,57 @@
#include "content/browser/browser_main_loop.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_one_shot_remote_engine.h"
-#include "content/public/browser/browser_thread.h"
#include "content/public/browser/speech_recognition_event_listener.h"
+#include "media/base/audio_converter.h"
#include "net/url_request/url_request_context_getter.h"
+#if defined(OS_WIN)
+#include "media/audio/win/core_audio_util_win.h"
+#endif
+
+using media::AudioBus;
+using media::AudioConverter;
using media::AudioInputController;
using media::AudioManager;
using media::AudioParameters;
using media::ChannelLayout;
namespace content {
+
+// Private class which encapsulates the audio converter and the
+// AudioConverter::InputCallback. It handles resampling, buffering and
+// channel mixing between input and output parameters.
+class SpeechRecognizerImpl::OnDataConverter
+ : public media::AudioConverter::InputCallback {
+ public:
+ OnDataConverter(const AudioParameters& input_params,
+ const AudioParameters& output_params);
+ virtual ~OnDataConverter();
+
+ // Converts input |data| buffer into an AudioChunk where the input format
+ // is given by |input_parameters_| and the output format by
+ // |output_parameters_|.
+ scoped_refptr<AudioChunk> Convert(const uint8* data, size_t size);
+
+ private:
+ // media::AudioConverter::InputCallback implementation.
+ virtual double ProvideInput(AudioBus* dest,
+ base::TimeDelta buffer_delay) OVERRIDE;
+
+ // Handles resampling, buffering, and channel mixing between input and output
+ // parameters.
+ AudioConverter audio_converter_;
+
+ scoped_ptr<AudioBus> input_bus_;
+ scoped_ptr<AudioBus> output_bus_;
+ const AudioParameters input_parameters_;
+ const AudioParameters output_parameters_;
+ bool waiting_for_input_;
+ scoped_ptr<uint8[]> converted_data_;
+
+ DISALLOW_COPY_AND_ASSIGN(OnDataConverter);
+};
+
namespace {
// The following constants are related to the volume level indicator shown in
@@ -70,6 +111,65 @@ media::AudioManager* SpeechRecognizerImpl::audio_manager_for_tests_ = NULL;
COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
kNumBitsPerAudioSample_must_be_a_multiple_of_8);
+// SpeechRecognizerImpl::OnDataConverter implementation
+
+SpeechRecognizerImpl::OnDataConverter::OnDataConverter(
+ const AudioParameters& input_params, const AudioParameters& output_params)
+ : audio_converter_(input_params, output_params, false),
+ input_bus_(AudioBus::Create(input_params)),
+ output_bus_(AudioBus::Create(output_params)),
+ input_parameters_(input_params),
+ output_parameters_(output_params),
+ waiting_for_input_(false),
+ converted_data_(new uint8[output_parameters_.GetBytesPerBuffer()]) {
+ audio_converter_.AddInput(this);
+}
+
+SpeechRecognizerImpl::OnDataConverter::~OnDataConverter() {
+ // It should now be safe to unregister the converter since no more OnData()
+ // callbacks are outstanding at this point.
+ audio_converter_.RemoveInput(this);
+}
+
+scoped_refptr<AudioChunk> SpeechRecognizerImpl::OnDataConverter::Convert(
+ const uint8* data, size_t size) {
+ CHECK_EQ(size, static_cast<size_t>(input_parameters_.GetBytesPerBuffer()));
+
+ input_bus_->FromInterleaved(
+ data, input_bus_->frames(), input_parameters_.bits_per_sample() / 8);
+
+ waiting_for_input_ = true;
+ audio_converter_.Convert(output_bus_.get());
+
+ output_bus_->ToInterleaved(
+ output_bus_->frames(), output_parameters_.bits_per_sample() / 8,
+ converted_data_.get());
+
+ // TODO(primiano): Refactor AudioChunk to avoid the extra-copy here
+ // (see http://crbug.com/249316 for details).
+ return scoped_refptr<AudioChunk>(new AudioChunk(
+ converted_data_.get(),
+ output_parameters_.GetBytesPerBuffer(),
+ output_parameters_.bits_per_sample() / 8));
+}
+
+double SpeechRecognizerImpl::OnDataConverter::ProvideInput(
+ AudioBus* dest, base::TimeDelta buffer_delay) {
+ // The audio converter should never ask for more than one bus in each call
+ // to Convert(). If it does, we have a serious issue in our design since we
+ // might miss recorded chunks of 100 ms audio data.
+ CHECK(waiting_for_input_);
+
+ // Read from the input bus to feed the converter.
+ input_bus_->CopyTo(dest);
+
+ // |input_bus_| should only be provided once.
+ waiting_for_input_ = false;
+ return 1;
+}
+
+// SpeechRecognizerImpl implementation
+
SpeechRecognizerImpl::SpeechRecognizerImpl(
SpeechRecognitionEventListener* listener,
int session_id,
@@ -169,9 +269,10 @@ void SpeechRecognizerImpl::OnData(AudioInputController* controller,
if (size == 0) // This could happen when audio capture stops and is normal.
return;
+ // Convert audio from native format to fixed format used by WebSpeech.
FSMEventArgs event_args(EVENT_AUDIO_DATA);
- event_args.audio_data = new AudioChunk(data, static_cast<size_t>(size),
- kNumBitsPerAudioSample / 8);
+ event_args.audio_data = audio_converter_->Convert(data, size);
+
BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
base::Bind(&SpeechRecognizerImpl::DispatchEvent,
this, event_args));
@@ -387,9 +488,10 @@ SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
DCHECK(recognition_engine_.get() != NULL);
DCHECK(!IsCapturingAudio());
- AudioManager* audio_manager = (audio_manager_for_tests_ != NULL) ?
- audio_manager_for_tests_ :
- BrowserMainLoop::GetAudioManager();
+ const bool unit_test_is_active = (audio_manager_for_tests_ != NULL);
+ AudioManager* audio_manager = unit_test_is_active ?
+ audio_manager_for_tests_ :
+ AudioManager::Get();
DCHECK(audio_manager != NULL);
DVLOG(1) << "SpeechRecognizerImpl starting audio capture.";
@@ -402,14 +504,66 @@ SpeechRecognizerImpl::StartRecording(const FSMEventArgs&) {
SPEECH_AUDIO_ERROR_DETAILS_NO_MIC));
}
- const int samples_per_packet = (kAudioSampleRate *
- recognition_engine_->GetDesiredAudioChunkDurationMs()) / 1000;
- AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
- kAudioSampleRate, kNumBitsPerAudioSample,
- samples_per_packet);
- audio_controller_ = AudioInputController::Create(audio_manager, this, params);
+ int chunk_duration_ms = recognition_engine_->GetDesiredAudioChunkDurationMs();
+
+ // TODO(xians): use the correct input device here.
+ AudioParameters in_params = audio_manager->GetInputStreamParameters(
+ media::AudioManagerBase::kDefaultDeviceId);
+ if (!in_params.IsValid() && !unit_test_is_active) {
+ DLOG(ERROR) << "Invalid native audio input parameters";
+ return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
+ }
+
+ // Audio converter shall provide audio based on these parameters as output.
+ // Hard coded, WebSpeech specific parameters are utilized here.
+ int frames_per_buffer = (kAudioSampleRate * chunk_duration_ms) / 1000;
+ AudioParameters output_parameters = AudioParameters(
+ AudioParameters::AUDIO_PCM_LOW_LATENCY, kChannelLayout, kAudioSampleRate,
+ kNumBitsPerAudioSample, frames_per_buffer);
+
+ // Audio converter will receive audio based on these parameters as input.
+ // On Windows we start by verifying that Core Audio is supported. If not,
+ // the WaveIn API is used and we might as well avoid all audio conversions
+ // since WaveIn does the conversion for us.
+ // TODO(henrika): this code should be moved to platform dependent audio
+ // managers.
+ bool use_native_audio_params = true;
+#if defined(OS_WIN)
+ use_native_audio_params = media::CoreAudioUtil::IsSupported();
+ DVLOG_IF(1, !use_native_audio_params) << "Reverting to WaveIn for WebSpeech";
+#endif
+
+ AudioParameters input_parameters = output_parameters;
+ if (use_native_audio_params && !unit_test_is_active) {
+ // Use native audio parameters but avoid opening up at the native buffer
+ // size. Instead use same frame size (in milliseconds) as WebSpeech uses.
+ // We rely on internal buffers in the audio back-end to fulfill this request
+ // and the idea is to simplify the audio conversion since each Convert()
+ // call will then render exactly one ProvideInput() call.
+ // Due to implementation details in the audio converter, 2 milliseconds
+ // are added to the default frame size (100 ms) to ensure there is enough
+ // data to generate 100 ms of output when resampling.
+ frames_per_buffer =
+ ((in_params.sample_rate() * (chunk_duration_ms + 2)) / 1000.0) + 0.5;
+ input_parameters.Reset(in_params.format(),
+ in_params.channel_layout(),
+ in_params.channels(),
+ in_params.input_channels(),
+ in_params.sample_rate(),
+ in_params.bits_per_sample(),
+ frames_per_buffer);
+ }
+
+ // Create an audio converter which converts data between native input format
+ // and WebSpeech specific output format.
+ audio_converter_.reset(
+ new OnDataConverter(input_parameters, output_parameters));
+
+ // TODO(xians): use the correct input device here.
+ audio_controller_ = AudioInputController::Create(
+ audio_manager, this, input_parameters);
- if (audio_controller_.get() == NULL) {
+ if (!audio_controller_) {
return Abort(SpeechRecognitionError(SPEECH_RECOGNITION_ERROR_AUDIO));
}
diff --git a/content/browser/speech/speech_recognizer_impl.h b/content/browser/speech/speech_recognizer_impl.h
index 05ef2a0..e75560e 100644
--- a/content/browser/speech/speech_recognizer_impl.h
+++ b/content/browser/speech/speech_recognizer_impl.h
@@ -149,6 +149,12 @@ class CONTENT_EXPORT SpeechRecognizerImpl
bool is_single_shot_;
FSMState state_;
+ class OnDataConverter;
+
+ // Converts data between native input format and a WebSpeech specific
+ // output format.
+ scoped_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_;
+
DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl);
};
diff --git a/content/browser/speech/speech_recognizer_impl_unittest.cc b/content/browser/speech/speech_recognizer_impl_unittest.cc
index 04a4724..058f87e 100644
--- a/content/browser/speech/speech_recognizer_impl_unittest.cc
+++ b/content/browser/speech/speech_recognizer_impl_unittest.cc
@@ -484,7 +484,7 @@ TEST_F(SpeechRecognizerImplTest, SetInputVolumeCallback) {
controller->event_handler()->OnData(controller, &audio_packet_[0],
audio_packet_.size());
base::MessageLoop::current()->RunUntilIdle();
- EXPECT_FLOAT_EQ(0.89926866f, volume_);
+ EXPECT_NEAR(0.89926866f, volume_, 0.00001f);
EXPECT_FLOAT_EQ(0.75071919f, noise_volume_);
EXPECT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);