author    | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000
committer | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000
commit    | 50fab53bddb2c3cb24d5682c913a03226ccf49ef (patch)
tree      | bb04af83ca5f2be010e32c2e10cfd245117a4847 /content/browser
parent    | 5c557f37629dc12dfd99e8fb55c235c8c46a8098 (diff)
Move core pieces of speech from chrome to content.
TBR=satish
Review URL: http://codereview.chromium.org/6591024
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@76165 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content/browser')
20 files changed, 3095 insertions, 0 deletions
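The centerpiece of the new directory is the AudioEncoder interface (see the audio_encoder diffs just below). As orientation, a caller would drive it roughly as follows. This is a sketch written against the interface in this change, not code from the CL; the function name `EncodeUtterance`, the codec choice, and the parameter values are illustrative:

```cpp
// Sketch only: feeding recorded PCM through the AudioEncoder interface
// added by this change. Assumes 16-bit mono samples from the recorder.
#include <string>

#include "base/scoped_ptr.h"
#include "content/browser/speech/audio_encoder.h"

void EncodeUtterance(const short* samples, int num_samples, int sampling_rate) {
  scoped_ptr<speech_input::AudioEncoder> encoder(
      speech_input::AudioEncoder::Create(
          speech_input::AudioEncoder::CODEC_FLAC,  // or CODEC_SPEEX
          sampling_rate,
          16 /* bits_per_sample; the Speex encoder ignores this */));

  encoder->Encode(samples, num_samples);  // May be called once per chunk.
  encoder->Flush();                       // Emits any buffered encoded bytes.

  std::string encoded;
  if (encoder->GetEncodedData(&encoded)) {
    // |encoded| holds the compressed stream; encoder->mime_type() gives the
    // matching content type, e.g. "audio/x-flac; rate=16000".
  }
}
```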
diff --git a/content/browser/speech/OWNERS b/content/browser/speech/OWNERS new file mode 100644 index 0000000..2ad1bbd --- /dev/null +++ b/content/browser/speech/OWNERS @@ -0,0 +1 @@ +satish@chromium.org diff --git a/content/browser/speech/audio_encoder.cc b/content/browser/speech/audio_encoder.cc new file mode 100644 index 0000000..c24f45f --- /dev/null +++ b/content/browser/speech/audio_encoder.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/audio_encoder.h" + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/scoped_ptr.h" +#include "base/stl_util-inl.h" +#include "base/string_number_conversions.h" +#include "third_party/flac/flac.h" +#include "third_party/speex/speex.h" + +using std::string; + +namespace { + +//-------------------------------- FLACEncoder --------------------------------- + +const char* const kContentTypeFLAC = "audio/x-flac; rate="; +const int kFLACCompressionLevel = 0; // 0 for speed + +class FLACEncoder : public speech_input::AudioEncoder { + public: + FLACEncoder(int sampling_rate, int bits_per_sample); + virtual ~FLACEncoder(); + virtual void Encode(const short* samples, int num_samples); + virtual void Flush(); + + private: + static FLAC__StreamEncoderWriteStatus WriteCallback( + const FLAC__StreamEncoder* encoder, + const FLAC__byte buffer[], + size_t bytes, + unsigned samples, + unsigned current_frame, + void* client_data); + + FLAC__StreamEncoder* encoder_; + bool is_encoder_initialized_; + + DISALLOW_COPY_AND_ASSIGN(FLACEncoder); +}; + +FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback( + const FLAC__StreamEncoder* encoder, + const FLAC__byte buffer[], + size_t bytes, + unsigned samples, + unsigned current_frame, + void* client_data) { + FLACEncoder* me = static_cast<FLACEncoder*>(client_data); + DCHECK(me->encoder_ == encoder); + me->AppendToBuffer(new string(reinterpret_cast<const char*>(buffer), bytes)); + return FLAC__STREAM_ENCODER_WRITE_STATUS_OK; +} + +FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample) + : AudioEncoder(std::string(kContentTypeFLAC) + + base::IntToString(sampling_rate)), + encoder_(FLAC__stream_encoder_new()), + is_encoder_initialized_(false) { + FLAC__stream_encoder_set_channels(encoder_, 1); + FLAC__stream_encoder_set_bits_per_sample(encoder_, bits_per_sample); + FLAC__stream_encoder_set_sample_rate(encoder_, sampling_rate); + FLAC__stream_encoder_set_compression_level(encoder_, kFLACCompressionLevel); + + // Initializing the encoder will cause sync bytes to be written to + // its output stream, so we wait until the first call to this method + // before doing so. +} + +FLACEncoder::~FLACEncoder() { + FLAC__stream_encoder_delete(encoder_); +} + +void FLACEncoder::Encode(const short* samples, int num_samples) { + if (!is_encoder_initialized_) { + const FLAC__StreamEncoderInitStatus encoder_status = + FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL, + NULL, this); + DCHECK(encoder_status == FLAC__STREAM_ENCODER_INIT_STATUS_OK); + is_encoder_initialized_ = true; + } + + // FLAC encoder wants samples as int32s. 
+ scoped_ptr<FLAC__int32> flac_samples(new FLAC__int32[num_samples]); + FLAC__int32* flac_samples_ptr = flac_samples.get(); + for (int i = 0; i < num_samples; ++i) + flac_samples_ptr[i] = samples[i]; + + FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples); +} + +void FLACEncoder::Flush() { + FLAC__stream_encoder_finish(encoder_); +} + +//-------------------------------- SpeexEncoder -------------------------------- + +const char* const kContentTypeSpeex = "audio/x-speex-with-header-byte; rate="; +const int kSpeexEncodingQuality = 8; +const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). + +// Since the frame length gets written out as a byte in the encoded packet, +// make sure it is within the byte range. +COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); + +class SpeexEncoder : public speech_input::AudioEncoder { + public: + explicit SpeexEncoder(int sampling_rate); + virtual ~SpeexEncoder(); + virtual void Encode(const short* samples, int num_samples); + virtual void Flush() {} + + private: + void* encoder_state_; + SpeexBits bits_; + int samples_per_frame_; + char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. + DISALLOW_COPY_AND_ASSIGN(SpeexEncoder); +}; + +SpeexEncoder::SpeexEncoder(int sampling_rate) + : AudioEncoder(std::string(kContentTypeSpeex) + + base::IntToString(sampling_rate)) { + // speex_bits_init() does not initialize all of the |bits_| struct. + memset(&bits_, 0, sizeof(bits_)); + speex_bits_init(&bits_); + encoder_state_ = speex_encoder_init(&speex_wb_mode); + DCHECK(encoder_state_); + speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); + DCHECK(samples_per_frame_ > 0); + int quality = kSpeexEncodingQuality; + speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); + int vbr = 1; + speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); + memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); +} + +SpeexEncoder::~SpeexEncoder() { + speex_bits_destroy(&bits_); + speex_encoder_destroy(encoder_state_); +} + +void SpeexEncoder::Encode(const short* samples, int num_samples) { + // Drop incomplete frames, typically those which come in when recording stops. + num_samples -= (num_samples % samples_per_frame_); + for (int i = 0; i < num_samples; i += samples_per_frame_) { + speex_bits_reset(&bits_); + speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), + &bits_); + + // Encode the frame and place the size of the frame as the first byte. This + // is the packet format for MIME type x-speex-with-header-byte. 
+ int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, + kMaxSpeexFrameLength); + encoded_frame_data_[0] = static_cast<char>(frame_length); + AppendToBuffer(new string(encoded_frame_data_, frame_length + 1)); + } +} + +} // namespace + +namespace speech_input { + +AudioEncoder* AudioEncoder::Create(Codec codec, + int sampling_rate, + int bits_per_sample) { + if (codec == CODEC_FLAC) + return new FLACEncoder(sampling_rate, bits_per_sample); + return new SpeexEncoder(sampling_rate); +} + +AudioEncoder::AudioEncoder(const std::string& mime_type) + : mime_type_(mime_type) { +} + +AudioEncoder::~AudioEncoder() { + STLDeleteElements(&audio_buffers_); +} + +bool AudioEncoder::GetEncodedData(std::string* encoded_data) { + if (!audio_buffers_.size()) + return false; + + int audio_buffer_length = 0; + for (AudioBufferQueue::iterator it = audio_buffers_.begin(); + it != audio_buffers_.end(); ++it) { + audio_buffer_length += (*it)->length(); + } + encoded_data->reserve(audio_buffer_length); + for (AudioBufferQueue::iterator it = audio_buffers_.begin(); + it != audio_buffers_.end(); ++it) { + encoded_data->append(*(*it)); + } + + return true; +} + +void AudioEncoder::AppendToBuffer(std::string* item) { + audio_buffers_.push_back(item); +} + +} // namespace speech_input diff --git a/content/browser/speech/audio_encoder.h b/content/browser/speech/audio_encoder.h new file mode 100644 index 0000000..c70bfd0 --- /dev/null +++ b/content/browser/speech/audio_encoder.h @@ -0,0 +1,59 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_AUDIO_ENCODER_H_ +#define CONTENT_BROWSER_SPEECH_AUDIO_ENCODER_H_ + +#include <list> +#include <string> + +#include "base/basictypes.h" + +namespace speech_input { + +// Provides a simple interface to encode raw audio using the various speech +// codecs. +class AudioEncoder { + public: + enum Codec { + CODEC_FLAC, + CODEC_SPEEX, + }; + + static AudioEncoder* Create(Codec codec, + int sampling_rate, + int bits_per_sample); + + virtual ~AudioEncoder(); + + // Encodes each frame of raw audio in |samples| to the internal buffer. Use + // |GetEncodedData| to read the result after this call or when recording + // completes. + virtual void Encode(const short* samples, int num_samples) = 0; + + // Finish encoding and flush any pending encoded bits out. + virtual void Flush() = 0; + + // Copies the encoded audio to the given string. Returns true if the output + // is not empty. + bool GetEncodedData(std::string* encoded_data); + + const std::string& mime_type() { return mime_type_; } + + protected: + AudioEncoder(const std::string& mime_type); + + void AppendToBuffer(std::string* item); + + private: + // Buffer holding the recorded audio. Owns the strings inside the list. + typedef std::list<std::string*> AudioBufferQueue; + AudioBufferQueue audio_buffers_; + std::string mime_type_; + DISALLOW_COPY_AND_ASSIGN(AudioEncoder); +}; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_AUDIO_ENCODER_H_ diff --git a/content/browser/speech/endpointer/endpointer.cc b/content/browser/speech/endpointer/endpointer.cc new file mode 100644 index 0000000..69c79a6 --- /dev/null +++ b/content/browser/speech/endpointer/endpointer.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "content/browser/speech/endpointer/endpointer.h" + +#include "base/time.h" + +using base::Time; + +namespace { +static const int kFrameRate = 50; // 1 frame = 20ms of audio. +} + +namespace speech_input { + +Endpointer::Endpointer(int sample_rate) + : speech_input_possibly_complete_silence_length_us_(-1), + speech_input_complete_silence_length_us_(-1), + audio_frame_time_us_(0), + sample_rate_(sample_rate), + frame_size_(0) { + Reset(); + + frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); + + speech_input_minimum_length_us_ = + static_cast<int64>(1.7 * Time::kMicrosecondsPerSecond); + speech_input_complete_silence_length_us_ = + static_cast<int64>(0.5 * Time::kMicrosecondsPerSecond); + long_speech_input_complete_silence_length_us_ = -1; + long_speech_length_us_ = -1; + speech_input_possibly_complete_silence_length_us_ = + 1 * Time::kMicrosecondsPerSecond; + + // Set the default configuration for Push To Talk mode. + EnergyEndpointerParams ep_config; + ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); + ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); + ep_config.set_endpoint_margin(0.2f); + ep_config.set_onset_window(0.15f); + ep_config.set_speech_on_window(0.4f); + ep_config.set_offset_window(0.15f); + ep_config.set_onset_detect_dur(0.09f); + ep_config.set_onset_confirm_dur(0.075f); + ep_config.set_on_maintain_dur(0.10f); + ep_config.set_offset_confirm_dur(0.12f); + ep_config.set_decision_threshold(1000.0f); + ep_config.set_min_decision_threshold(50.0f); + ep_config.set_fast_update_dur(0.2f); + ep_config.set_sample_rate(static_cast<float>(sample_rate)); + ep_config.set_min_fundamental_frequency(57.143f); + ep_config.set_max_fundamental_frequency(400.0f); + ep_config.set_contamination_rejection_period(0.25f); + energy_endpointer_.Init(ep_config); +} + +void Endpointer::Reset() { + old_ep_status_ = EP_PRE_SPEECH; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + speech_previously_detected_ = false; + speech_input_complete_ = false; + audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. + speech_end_time_us_ = -1; + speech_start_time_us_ = -1; +} + +void Endpointer::StartSession() { + Reset(); + energy_endpointer_.StartSession(); +} + +void Endpointer::EndSession() { + energy_endpointer_.EndSession(); +} + +void Endpointer::SetEnvironmentEstimationMode() { + Reset(); + energy_endpointer_.SetEnvironmentEstimationMode(); +} + +void Endpointer::SetUserInputMode() { + energy_endpointer_.SetUserInputMode(); +} + +EpStatus Endpointer::Status(int64 *time) { + return energy_endpointer_.Status(time); +} + +EpStatus Endpointer::ProcessAudio(const int16* audio_data, int num_samples, + float* rms_out) { + EpStatus ep_status = EP_PRE_SPEECH; + + // Process the input data in blocks of frame_size_, dropping any incomplete + // frames at the end (which is ok since typically the caller will be recording + // audio in multiples of our frame size). + int sample_index = 0; + while (sample_index + frame_size_ <= num_samples) { + // Have the endpointer process the frame. + energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, + audio_data + sample_index, + frame_size_, + rms_out); + sample_index += frame_size_; + audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) / + sample_rate_; + + // Get the status of the endpointer. + int64 ep_time; + ep_status = energy_endpointer_.Status(&ep_time); + + // Handle state changes. 
+ if ((EP_SPEECH_PRESENT == ep_status) && + (EP_POSSIBLE_ONSET == old_ep_status_)) { + speech_end_time_us_ = -1; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + // Trigger SpeechInputDidStart event on first detection. + if (false == speech_previously_detected_) { + speech_previously_detected_ = true; + speech_start_time_us_ = ep_time; + } + } + if ((EP_PRE_SPEECH == ep_status) && + (EP_POSSIBLE_OFFSET == old_ep_status_)) { + speech_end_time_us_ = ep_time; + waiting_for_speech_possibly_complete_timeout_ = true; + waiting_for_speech_complete_timeout_ = true; + } + if (ep_time > speech_input_minimum_length_us_) { + // Speech possibly complete timeout. + if ((waiting_for_speech_possibly_complete_timeout_) && + (ep_time - speech_end_time_us_ > + speech_input_possibly_complete_silence_length_us_)) { + waiting_for_speech_possibly_complete_timeout_ = false; + } + if (waiting_for_speech_complete_timeout_) { + // The length of the silence timeout period can be held constant, or it + // can be changed after a fixed amount of time from the beginning of + // speech. + bool has_stepped_silence = + (long_speech_length_us_ > 0) && + (long_speech_input_complete_silence_length_us_ > 0); + int64 requested_silence_length; + if (has_stepped_silence && + (ep_time - speech_start_time_us_) > long_speech_length_us_) { + requested_silence_length = + long_speech_input_complete_silence_length_us_; + } else { + requested_silence_length = + speech_input_complete_silence_length_us_; + } + + // Speech complete timeout. + if ((ep_time - speech_end_time_us_) > requested_silence_length) { + waiting_for_speech_complete_timeout_ = false; + speech_input_complete_ = true; + } + } + } + old_ep_status_ = ep_status; + } + return ep_status; +} + +} // namespace speech diff --git a/content/browser/speech/endpointer/endpointer.h b/content/browser/speech/endpointer/endpointer.h new file mode 100644 index 0000000..be4bd65 --- /dev/null +++ b/content/browser/speech/endpointer/endpointer.h @@ -0,0 +1,148 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ + +#include "base/basictypes.h" +#include "content/browser/speech/endpointer/energy_endpointer.h" + +class EpStatus; + +namespace speech_input { + +// A simple interface to the underlying energy-endpointer implementation, this +// class lets callers provide audio as being recorded and let them poll to find +// when the user has stopped speaking. +// +// There are two events that may trigger the end of speech: +// +// speechInputPossiblyComplete event: +// +// Signals that silence/noise has been detected for a *short* amount of +// time after some speech has been detected. It can be used for low latency +// UI feedback. To disable it, set it to a large amount. +// +// speechInputComplete event: +// +// This event is intended to signal end of input and to stop recording. +// The amount of time to wait after speech is set by +// speech_input_complete_silence_length_ and optionally two other +// parameters (see below). +// This time can be held constant, or can change as more speech is detected. +// In the latter case, the time changes after a set amount of time from the +// *beginning* of speech. 
This is motivated by the expectation that there
+// will be two distinct types of inputs: short search queries and longer
+// dictation style input.
+//
+// Three parameters are used to define the piecewise constant timeout function.
+// The timeout length is speech_input_complete_silence_length until
+// long_speech_length, when it changes to
+// long_speech_input_complete_silence_length.
+class Endpointer {
+ public:
+  explicit Endpointer(int sample_rate);
+
+  // Start the endpointer. This should be called at the beginning of a session.
+  void StartSession();
+
+  // Stop the endpointer.
+  void EndSession();
+
+  // Start environment estimation. Audio will be used for environment
+  // estimation, i.e. noise level estimation.
+  void SetEnvironmentEstimationMode();
+
+  // Start user input. This should be called when the user indicates start of
+  // input, e.g. by pressing a button.
+  void SetUserInputMode();
+
+  // Process a segment of audio, which may be more than one frame.
+  // The status of the last frame will be returned.
+  EpStatus ProcessAudio(const int16* audio_data, int num_samples,
+                        float* rms_out);
+
+  // Get the status of the endpointer.
+  EpStatus Status(int64* time_us);
+
+  // Returns true if the endpointer detected reasonable audio levels above
+  // background noise which could be user speech, false if not.
+  bool DidStartReceivingSpeech() const {
+    return speech_previously_detected_;
+  }
+
+  bool IsEstimatingEnvironment() const {
+    return energy_endpointer_.estimating_environment();
+  }
+
+  void set_speech_input_complete_silence_length(int64 time_us) {
+    speech_input_complete_silence_length_us_ = time_us;
+  }
+
+  void set_long_speech_input_complete_silence_length(int64 time_us) {
+    long_speech_input_complete_silence_length_us_ = time_us;
+  }
+
+  void set_speech_input_possibly_complete_silence_length(int64 time_us) {
+    speech_input_possibly_complete_silence_length_us_ = time_us;
+  }
+
+  void set_long_speech_length(int64 time_us) {
+    long_speech_length_us_ = time_us;
+  }
+
+  bool speech_input_complete() const {
+    return speech_input_complete_;
+  }
+
+ private:
+  // Reset internal states. Helper method common to the initial input
+  // utterance and following input utterances.
+  void Reset();
+
+  // Minimum allowable length of speech input.
+  int64 speech_input_minimum_length_us_;
+
+  // The speechInputPossiblyComplete event signals that silence/noise has been
+  // detected for a *short* amount of time after some speech has been detected.
+  // This property specifies the time period.
+  int64 speech_input_possibly_complete_silence_length_us_;
+
+  // The speechInputComplete event signals that silence/noise has been
+  // detected for a *long* amount of time after some speech has been detected.
+  // This property specifies the time period.
+  int64 speech_input_complete_silence_length_us_;
+
+  // Same as above, this specifies the required silence period after speech
+  // detection. This period is used instead of
+  // speech_input_complete_silence_length_ when the utterance is longer than
+  // long_speech_length_. This parameter is optional.
+  int64 long_speech_input_complete_silence_length_us_;
+
+  // The period of time after which the endpointer should consider
+  // long_speech_input_complete_silence_length_ as a valid silence period
+  // instead of speech_input_complete_silence_length_. This parameter is
+  // optional.
+  int64 long_speech_length_us_;
+
+  // First speech onset time, used in determination of speech complete timeout.
+  int64 speech_start_time_us_;
+
+  // Most recent end time, used in determination of speech complete timeout.
+  int64 speech_end_time_us_;
+
+  int64 audio_frame_time_us_;
+  EpStatus old_ep_status_;
+  bool waiting_for_speech_possibly_complete_timeout_;
+  bool waiting_for_speech_complete_timeout_;
+  bool speech_previously_detected_;
+  bool speech_input_complete_;
+  EnergyEndpointer energy_endpointer_;
+  int sample_rate_;
+  int32 frame_size_;
+};
+
+}  // namespace speech_input
+
+#endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
diff --git a/content/browser/speech/endpointer/endpointer_unittest.cc b/content/browser/speech/endpointer/endpointer_unittest.cc
new file mode 100644
index 0000000..3d1583e
--- /dev/null
+++ b/content/browser/speech/endpointer/endpointer_unittest.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/task.h"
+#include "content/browser/speech/endpointer/endpointer.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace {
+const int kFrameRate = 50;  // 20 ms long frames for AMR encoding.
+const int kSampleRate = 8000;  // 8 k samples per second for AMR encoding.
+
+// At 8000 samples per second a 20 ms frame is 160 samples, which corresponds
+// to the AMR codec.
+const int kFrameSize = kSampleRate / kFrameRate;  // 160 samples.
+COMPILE_ASSERT(kFrameSize == 160, invalid_frame_size);
+}
+
+namespace speech_input {
+
+class FrameProcessor {
+ public:
+  // Process a single frame of test audio samples.
+  virtual EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) = 0;
+};
+
+void RunEndpointerEventsTest(FrameProcessor* processor) {
+  int16 samples[kFrameSize];
+
+  // We will create a white noise signal of 150 frames. The frames from 50 to
+  // 100 will have more power, and the endpointer should fire on those frames.
+  const int kNumFrames = 150;
+
+  // Create a random sequence of samples.
+  srand(1);
+  float gain = 0.0;
+  int64 time = 0;
+  for (int frame_count = 0; frame_count < kNumFrames; ++frame_count) {
+    // The frames from 50 to 100 will have more power, and the endpointer
+    // should detect those frames as speech.
+    if ((frame_count >= 50) && (frame_count < 100)) {
+      gain = 2000.0;
+    } else {
+      gain = 1.0;
+    }
+    // Create random samples.
+    for (int i = 0; i < kFrameSize; ++i) {
+      float randNum = static_cast<float>(rand() - (RAND_MAX / 2)) /
+                      static_cast<float>(RAND_MAX);
+      samples[i] = static_cast<int16>(gain * randNum);
+    }
+
+    EpStatus ep_status = processor->ProcessFrame(time, samples, kFrameSize);
+    time += static_cast<int64>(kFrameSize * (1e6 / kSampleRate));
+
+    // Check the status at key frames.
+    if (20 == frame_count)
+      EXPECT_EQ(EP_PRE_SPEECH, ep_status);
+    if (70 == frame_count)
+      EXPECT_EQ(EP_SPEECH_PRESENT, ep_status);
+    if (120 == frame_count)
+      EXPECT_EQ(EP_PRE_SPEECH, ep_status);
+  }
+}
+
+// This test instantiates and initializes a stand-alone endpointer module.
+// The test creates FrameData objects with random noise and sends them
+// to the endpointer module. The energy of the first 50 frames is low,
+// followed by 50 high energy frames, and another 50 low energy frames.
+// We test that the correct start and end frames were detected.
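As a usage note for the Endpointer interface above: the piecewise silence timeout is configured entirely through the setters, and the TestEmbeddedEndpointerEvents test below exercises exactly this path. Here is a minimal sketch; the function name `DetectEndOfSpeech` and every duration are illustrative assumptions, not defaults from the CL:

```cpp
// Sketch only: wiring up Endpointer's silence timeouts. int16/int64 come
// from base/basictypes.h via the endpointer header.
#include "content/browser/speech/endpointer/endpointer.h"

void DetectEndOfSpeech(const int16* audio, int num_samples) {
  const int kSampleRate = 8000;
  speech_input::Endpointer endpointer(kSampleRate);

  const int64 kMicrosecondsPerMillisecond = 1000;
  // Low-latency "possibly complete" feedback after 300 ms of silence.
  endpointer.set_speech_input_possibly_complete_silence_length(
      300 * kMicrosecondsPerMillisecond);
  // Declare input complete after 500 ms of silence for short utterances...
  endpointer.set_speech_input_complete_silence_length(
      500 * kMicrosecondsPerMillisecond);
  // ...but once the utterance runs past 3 s (dictation-style input), wait
  // for a longer 1 s silence before completing.
  endpointer.set_long_speech_length(3000 * kMicrosecondsPerMillisecond);
  endpointer.set_long_speech_input_complete_silence_length(
      1000 * kMicrosecondsPerMillisecond);

  endpointer.StartSession();
  endpointer.SetUserInputMode();  // E.g. the user pressed the mic button.
  endpointer.ProcessAudio(audio, num_samples, NULL);
  if (endpointer.speech_input_complete()) {
    // Stop recording and hand the captured audio to the recognizer.
  }
  endpointer.EndSession();
}
```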
+class EnergyEndpointerFrameProcessor : public FrameProcessor {
+ public:
+  explicit EnergyEndpointerFrameProcessor(EnergyEndpointer* endpointer)
+      : endpointer_(endpointer) {}
+
+  EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) {
+    endpointer_->ProcessAudioFrame(time, samples, kFrameSize, NULL);
+    int64 ep_time;
+    return endpointer_->Status(&ep_time);
+  }
+
+ private:
+  EnergyEndpointer* endpointer_;
+};
+
+TEST(EndpointerTest, TestEnergyEndpointerEvents) {
+  // Initialize endpointer and configure it. We specify the parameters
+  // here for a 20ms window, and a 20ms step size, which corresponds to
+  // the narrow band AMR codec.
+  EnergyEndpointerParams ep_config;
+  ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
+  ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
+  ep_config.set_endpoint_margin(0.2f);
+  ep_config.set_onset_window(0.15f);
+  ep_config.set_speech_on_window(0.4f);
+  ep_config.set_offset_window(0.15f);
+  ep_config.set_onset_detect_dur(0.09f);
+  ep_config.set_onset_confirm_dur(0.075f);
+  ep_config.set_on_maintain_dur(0.10f);
+  ep_config.set_offset_confirm_dur(0.12f);
+  ep_config.set_decision_threshold(100.0f);
+  EnergyEndpointer endpointer;
+  endpointer.Init(ep_config);
+
+  endpointer.StartSession();
+
+  EnergyEndpointerFrameProcessor frame_processor(&endpointer);
+  RunEndpointerEventsTest(&frame_processor);
+
+  endpointer.EndSession();
+}
+
+// Test endpointer wrapper class.
+class EndpointerFrameProcessor : public FrameProcessor {
+ public:
+  explicit EndpointerFrameProcessor(Endpointer* endpointer)
+      : endpointer_(endpointer) {}
+
+  EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) {
+    endpointer_->ProcessAudio(samples, kFrameSize, NULL);
+    int64 ep_time;
+    return endpointer_->Status(&ep_time);
+  }
+
+ private:
+  Endpointer* endpointer_;
+};
+
+TEST(EndpointerTest, TestEmbeddedEndpointerEvents) {
+  const int kSampleRate = 8000;  // 8 k samples per second for AMR encoding.
+
+  Endpointer endpointer(kSampleRate);
+  const int64 kMicrosecondsPerMillisecond = 1000;
+  const int64 short_timeout = 300 * kMicrosecondsPerMillisecond;
+  endpointer.set_speech_input_possibly_complete_silence_length(short_timeout);
+  const int64 long_timeout = 500 * kMicrosecondsPerMillisecond;
+  endpointer.set_speech_input_complete_silence_length(long_timeout);
+  endpointer.StartSession();
+
+  EndpointerFrameProcessor frame_processor(&endpointer);
+  RunEndpointerEventsTest(&frame_processor);
+
+  endpointer.EndSession();
+}
+
+}  // namespace speech_input
diff --git a/content/browser/speech/endpointer/energy_endpointer.cc b/content/browser/speech/endpointer/energy_endpointer.cc
new file mode 100644
index 0000000..c806aed
--- /dev/null
+++ b/content/browser/speech/endpointer/energy_endpointer.cc
@@ -0,0 +1,369 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// To know more about the algorithm used and the original code which this is
+// based on, see
+// https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef
+
+#include "content/browser/speech/endpointer/energy_endpointer.h"
+
+#include <math.h>
+
+#include "base/logging.h"
+
+namespace {
+
+// Returns the RMS (quadratic mean) of the input signal.
+float RMS(const int16* samples, int num_samples) { + int64 ssq_int64 = 0; + int64 sum_int64 = 0; + for (int i = 0; i < num_samples; ++i) { + sum_int64 += samples[i]; + ssq_int64 += samples[i] * samples[i]; + } + // now convert to floats. + double sum = static_cast<double>(sum_int64); + sum /= num_samples; + double ssq = static_cast<double>(ssq_int64); + return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); +} + +int64 Secs2Usecs(float seconds) { + return static_cast<int64>(0.5 + (1.0e6 * seconds)); +} + +} // namespace + +namespace speech_input { + +// Stores threshold-crossing histories for making decisions about the speech +// state. +class EnergyEndpointer::HistoryRing { + public: + HistoryRing() : insertion_index_(0) {} + + // Resets the ring to |size| elements each with state |initial_state| + void SetRing(int size, bool initial_state); + + // Inserts a new entry into the ring and drops the oldest entry. + void Insert(int64 time_us, bool decision); + + // Returns the time in microseconds of the most recently added entry. + int64 EndTime() const; + + // Returns the sum of all intervals during which 'decision' is true within + // the time in seconds specified by 'duration'. The returned interval is + // in seconds. + float RingSum(float duration_sec); + + private: + struct DecisionPoint { + int64 time_us; + bool decision; + }; + + std::vector<DecisionPoint> decision_points_; + int insertion_index_; // Index at which the next item gets added/inserted. + + DISALLOW_COPY_AND_ASSIGN(HistoryRing); +}; + +void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { + insertion_index_ = 0; + decision_points_.clear(); + DecisionPoint init = { -1, initial_state }; + decision_points_.resize(size, init); +} + +void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) { + decision_points_[insertion_index_].time_us = time_us; + decision_points_[insertion_index_].decision = decision; + insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); +} + +int64 EnergyEndpointer::HistoryRing::EndTime() const { + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + return decision_points_[ind].time_us; +} + +float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { + if (!decision_points_.size()) + return 0.0; + + int64 sum_us = 0; + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + int64 end_us = decision_points_[ind].time_us; + bool is_on = decision_points_[ind].decision; + int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec)); + if (start_us < 0) + start_us = 0; + size_t n_summed = 1; // n points ==> (n-1) intervals + while ((decision_points_[ind].time_us > start_us) && + (n_summed < decision_points_.size())) { + --ind; + if (ind < 0) + ind = decision_points_.size() - 1; + if (is_on) + sum_us += end_us - decision_points_[ind].time_us; + is_on = decision_points_[ind].decision; + end_us = decision_points_[ind].time_us; + n_summed++; + } + + return 1.0e-6f * sum_us; // Returns total time that was super threshold. 
+} + +EnergyEndpointer::EnergyEndpointer() + : status_(EP_PRE_SPEECH), + offset_confirm_dur_sec_(0), + endpointer_time_us_(0), + fast_update_frames_(0), + frame_counter_(0), + max_window_dur_(4.0), + sample_rate_(0), + history_(new HistoryRing()), + decision_threshold_(0), + estimating_environment_(false), + noise_level_(0), + rms_adapt_(0), + start_lag_(0), + end_lag_(0), + user_input_start_time_us_(0) { +} + +EnergyEndpointer::~EnergyEndpointer() { +} + +int EnergyEndpointer::TimeToFrame(float time) const { + return static_cast<int32>(0.5 + (time / params_.frame_period())); +} + +void EnergyEndpointer::Restart(bool reset_threshold) { + status_ = EP_PRE_SPEECH; + user_input_start_time_us_ = 0; + + if (reset_threshold) { + decision_threshold_ = params_.decision_threshold(); + rms_adapt_ = decision_threshold_; + noise_level_ = params_.decision_threshold() / 2.0f; + frame_counter_ = 0; // Used for rapid initial update of levels. + } + + // Set up the memories to hold the history windows. + history_->SetRing(TimeToFrame(max_window_dur_), false); + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; +} + +void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { + params_ = params; + + // Find the longest history interval to be used, and make the ring + // large enough to accommodate that number of frames. NOTE: This + // depends upon ep_frame_period being set correctly in the factory + // that did this instantiation. + max_window_dur_ = params_.onset_window(); + if (params_.speech_on_window() > max_window_dur_) + max_window_dur_ = params_.speech_on_window(); + if (params_.offset_window() > max_window_dur_) + max_window_dur_ = params_.offset_window(); + Restart(true); + + offset_confirm_dur_sec_ = params_.offset_window() - + params_.offset_confirm_dur(); + if (offset_confirm_dur_sec_ < 0.0) + offset_confirm_dur_sec_ = 0.0; + + user_input_start_time_us_ = 0; + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; + // The initial value of the noise and speech levels is inconsequential. + // The level of the first frame will overwrite these values. + noise_level_ = params_.decision_threshold() / 2.0f; + fast_update_frames_ = + static_cast<int64>(params_.fast_update_dur() / params_.frame_period()); + + frame_counter_ = 0; // Used for rapid initial update of levels. 
+
+  sample_rate_ = params_.sample_rate();
+  start_lag_ = static_cast<int>(sample_rate_ /
+                                params_.max_fundamental_frequency());
+  end_lag_ = static_cast<int>(sample_rate_ /
+                              params_.min_fundamental_frequency());
+}
+
+void EnergyEndpointer::StartSession() {
+  Restart(true);
+}
+
+void EnergyEndpointer::EndSession() {
+  status_ = EP_POST_SPEECH;
+}
+
+void EnergyEndpointer::SetEnvironmentEstimationMode() {
+  Restart(true);
+  estimating_environment_ = true;
+}
+
+void EnergyEndpointer::SetUserInputMode() {
+  estimating_environment_ = false;
+  user_input_start_time_us_ = endpointer_time_us_;
+}
+
+void EnergyEndpointer::ProcessAudioFrame(int64 time_us,
+                                         const int16* samples,
+                                         int num_samples,
+                                         float* rms_out) {
+  endpointer_time_us_ = time_us;
+  float rms = RMS(samples, num_samples);
+
+  // Check that this is user input audio vs. pre-input adaptation audio.
+  // Input audio starts when the user indicates start of input, by e.g.
+  // pressing push-to-talk. Audio received prior to that is used to update
+  // noise and speech level estimates.
+  if (!estimating_environment_) {
+    bool decision = false;
+    if ((endpointer_time_us_ - user_input_start_time_us_) <
+        Secs2Usecs(params_.contamination_rejection_period())) {
+      decision = false;
+      DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_;
+    } else {
+      decision = (rms > decision_threshold_);
+    }
+
+    history_->Insert(endpointer_time_us_, decision);
+
+    switch (status_) {
+      case EP_PRE_SPEECH:
+        if (history_->RingSum(params_.onset_window()) >
+            params_.onset_detect_dur()) {
+          status_ = EP_POSSIBLE_ONSET;
+        }
+        break;
+
+      case EP_POSSIBLE_ONSET: {
+        float tsum = history_->RingSum(params_.onset_window());
+        if (tsum > params_.onset_confirm_dur()) {
+          status_ = EP_SPEECH_PRESENT;
+        } else {  // If signal is not maintained, drop back to pre-speech.
+          if (tsum <= params_.onset_detect_dur())
+            status_ = EP_PRE_SPEECH;
+        }
+        break;
+      }
+
+      case EP_SPEECH_PRESENT: {
+        // To induce hysteresis in the state residency, we allow a
+        // smaller residency time in the on_ring than was required to
+        // enter the EP_SPEECH_PRESENT state.
+        float on_time = history_->RingSum(params_.speech_on_window());
+        if (on_time < params_.on_maintain_dur())
+          status_ = EP_POSSIBLE_OFFSET;
+        break;
+      }
+
+      case EP_POSSIBLE_OFFSET:
+        if (history_->RingSum(params_.offset_window()) <=
+            offset_confirm_dur_sec_) {
+          // Note that this offset time may be beyond the end
+          // of the input buffer in a real-time system. It will be up
+          // to the RecognizerSession to decide what to do.
+          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
+        } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
+          if (history_->RingSum(params_.speech_on_window()) >=
+              params_.on_maintain_dur())
+            status_ = EP_SPEECH_PRESENT;
+        }
+        break;
+
+      default:
+        LOG(WARNING) << "Invalid case in switch: " << status_;
+        break;
+    }
+
+    // If this is a quiet, non-speech region, slowly adapt the detection
+    // threshold to be about 6dB above the average RMS.
+    if ((!decision) && (status_ == EP_PRE_SPEECH)) {
+      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
+      rms_adapt_ = decision_threshold_;
+    } else {
+      // If this is in a speech region, adapt the decision threshold to
+      // be about 10dB below the average RMS. If the noise level is high,
+      // the threshold is pushed up.
+      // Adaptation up to a higher level is 5 times faster than decay to
+      // a lower level.
+      if ((status_ == EP_SPEECH_PRESENT) && decision) {
+        if (rms_adapt_ > rms) {
+          rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
+        } else {
+          rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
+        }
+        float target_threshold = 0.3f * rms_adapt_ + noise_level_;
+        decision_threshold_ = (0.90f * decision_threshold_) +
+                              (0.10f * target_threshold);
+      }
+    }
+
+    // Set a floor.
+    if (decision_threshold_ < params_.min_decision_threshold())
+      decision_threshold_ = params_.min_decision_threshold();
+  }
+
+  // Update speech and noise levels.
+  UpdateLevels(rms);
+  ++frame_counter_;
+
+  if (rms_out) {
+    *rms_out = -120.0;
+    if ((noise_level_ > 0.0) && ((rms / noise_level_) > 0.000001))
+      *rms_out = static_cast<float>(20.0 * log10(rms / noise_level_));
+  }
+}
+
+void EnergyEndpointer::UpdateLevels(float rms) {
+  // Update quickly initially. We assume this is noise and that
+  // speech is 6dB above the noise.
+  if (frame_counter_ < fast_update_frames_) {
+    // Alpha increases from 0 to (k-1)/k where k is the number of time
+    // steps in the initial adaptation period.
+    float alpha = static_cast<float>(frame_counter_) /
+                  static_cast<float>(fast_update_frames_);
+    noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
+    DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_
+             << ", fast_update_frames_ " << fast_update_frames_;
+  } else {
+    // Update noise level. The noise level adapts quickly downward, but
+    // slowly upward. The noise_level_ parameter is not currently used
+    // for threshold adaptation. It is used for UI feedback.
+    if (noise_level_ < rms)
+      noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
+    else
+      noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
+  }
+  if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
+    decision_threshold_ = noise_level_ * 2;  // 6dB above noise level.
+    // Set a floor.
+    if (decision_threshold_ < params_.min_decision_threshold())
+      decision_threshold_ = params_.min_decision_threshold();
+  }
+}
+
+EpStatus EnergyEndpointer::Status(int64* status_time) const {
+  *status_time = history_->EndTime();
+  return status_;
+}
+
+}  // namespace speech_input
diff --git a/content/browser/speech/endpointer/energy_endpointer.h b/content/browser/speech/endpointer/energy_endpointer.h
new file mode 100644
index 0000000..b10d8b7
--- /dev/null
+++ b/content/browser/speech/endpointer/energy_endpointer.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// The EnergyEndpointer class finds likely speech onset and offset points.
+//
+// The implementation described here is about the simplest possible.
+// It is based on timings of threshold crossings for overall signal
+// RMS. It is suitable for lightweight applications.
+//
+// As written, the basic idea is that one specifies intervals that
+// must be occupied by super- and sub-threshold energy levels, and
+// defers decisions re onset and offset times until these
+// specifications have been met. Three basic intervals are tested: an
+// onset window, a speech-on window, and an offset window. We require
+// super-threshold to exceed some minimum total durations in the onset
+// and speech-on windows before declaring the speech onset time, and
+// we specify a required sub-threshold residency in the offset window
+// before declaring speech offset.
As the various residency requirements are +// met, the EnergyEndpointer instance assumes various states, and can return the +// ID of these states to the client (see EpStatus below). +// +// The levels of the speech and background noise are continuously updated. It is +// important that the background noise level be estimated initially for +// robustness in noisy conditions. The first frames are assumed to be background +// noise and a fast update rate is used for the noise level. The duration for +// fast update is controlled by the fast_update_dur_ paramter. +// +// If used in noisy conditions, the endpointer should be started and run in the +// EnvironmentEstimation mode, for at least 200ms, before switching to +// UserInputMode. +// Audio feedback contamination can appear in the input audio, if not cut +// out or handled by echo cancellation. Audio feedback can trigger a false +// accept. The false accepts can be ignored by setting +// ep_contamination_rejection_period. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ + +#include <vector> + +#include "base/basictypes.h" +#include "base/scoped_ptr.h" +#include "content/browser/speech/endpointer/energy_endpointer_params.h" + +namespace speech_input { + +// Endpointer status codes +enum EpStatus { + EP_PRE_SPEECH = 10, + EP_POSSIBLE_ONSET, + EP_SPEECH_PRESENT, + EP_POSSIBLE_OFFSET, + EP_POST_SPEECH, +}; + +class EnergyEndpointer { + public: + // The default construction MUST be followed by Init(), before any + // other use can be made of the instance. + EnergyEndpointer(); + virtual ~EnergyEndpointer(); + + void Init(const EnergyEndpointerParams& params); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. + void SetEnvironmentEstimationMode(); + + // Start user input. This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Computes the next input frame and modifies EnergyEndpointer status as + // appropriate based on the computation. + void ProcessAudioFrame(int64 time_us, + const int16* samples, int num_samples, + float* rms_out); + + // Returns the current state of the EnergyEndpointer and the time + // corresponding to the most recently computed frame. + EpStatus Status(int64* status_time_us) const; + + bool estimating_environment() const { + return estimating_environment_; + } + + private: + class HistoryRing; + + // Resets the endpointer internal state. If reset_threshold is true, the + // state will be reset completely, including adaptive thresholds and the + // removal of all history information. + void Restart(bool reset_threshold); + + // Update internal speech and noise levels. + void UpdateLevels(float rms); + + // Returns the number of frames (or frame number) corresponding to + // the 'time' (in seconds). + int TimeToFrame(float time) const; + + EpStatus status_; // The current state of this instance. + float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH + int64 endpointer_time_us_; // Time of the most recently received audio frame. + int64 fast_update_frames_; // Number of frames for initial level adaptation. + int64 frame_counter_; // Number of frames seen. Used for initial adaptation. 
+ float max_window_dur_; // Largest search window size (seconds) + float sample_rate_; // Sampling rate. + + // Ring buffers to hold the speech activity history. + scoped_ptr<HistoryRing> history_; + + // Configuration parameters. + EnergyEndpointerParams params_; + + // RMS which must be exceeded to conclude frame is speech. + float decision_threshold_; + + // Flag to indicate that audio should be used to estimate environment, prior + // to receiving user input. + bool estimating_environment_; + + // Estimate of the background noise level. Used externally for UI feedback. + float noise_level_; + + // An adaptive threshold used to update decision_threshold_ when appropriate. + float rms_adapt_; + + // Start lag corresponds to the highest fundamental frequency. + int start_lag_; + + // End lag corresponds to the lowest fundamental frequency. + int end_lag_; + + // Time when mode switched from environment estimation to user input. This + // is used to time forced rejection of audio feedback contamination. + int64 user_input_start_time_us_; + + DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer); +}; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ diff --git a/content/browser/speech/endpointer/energy_endpointer_params.cc b/content/browser/speech/endpointer/energy_endpointer_params.cc new file mode 100644 index 0000000..e110b24 --- /dev/null +++ b/content/browser/speech/endpointer/energy_endpointer_params.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/endpointer/energy_endpointer_params.h" + +namespace speech_input { + +EnergyEndpointerParams::EnergyEndpointerParams() { + SetDefaults(); +} + +void EnergyEndpointerParams::SetDefaults() { + frame_period_ = 0.01f; + frame_duration_ = 0.01f; + endpoint_margin_ = 0.2f; + onset_window_ = 0.15f; + speech_on_window_ = 0.4f; + offset_window_ = 0.15f; + onset_detect_dur_ = 0.09f; + onset_confirm_dur_ = 0.075f; + on_maintain_dur_ = 0.10f; + offset_confirm_dur_ = 0.12f; + decision_threshold_ = 150.0f; + min_decision_threshold_ = 50.0f; + fast_update_dur_ = 0.2f; + sample_rate_ = 8000.0f; + min_fundamental_frequency_ = 57.143f; + max_fundamental_frequency_ = 400.0f; + contamination_rejection_period_ = 0.25f; +} + +void EnergyEndpointerParams::operator=(const EnergyEndpointerParams& source) { + frame_period_ = source.frame_period(); + frame_duration_ = source.frame_duration(); + endpoint_margin_ = source.endpoint_margin(); + onset_window_ = source.onset_window(); + speech_on_window_ = source.speech_on_window(); + offset_window_ = source.offset_window(); + onset_detect_dur_ = source.onset_detect_dur(); + onset_confirm_dur_ = source.onset_confirm_dur(); + on_maintain_dur_ = source.on_maintain_dur(); + offset_confirm_dur_ = source.offset_confirm_dur(); + decision_threshold_ = source.decision_threshold(); + min_decision_threshold_ = source.min_decision_threshold(); + fast_update_dur_ = source.fast_update_dur(); + sample_rate_ = source.sample_rate(); + min_fundamental_frequency_ = source.min_fundamental_frequency(); + max_fundamental_frequency_ = source.max_fundamental_frequency(); + contamination_rejection_period_ = source.contamination_rejection_period(); +} + +} // namespace speech_input diff --git a/content/browser/speech/endpointer/energy_endpointer_params.h b/content/browser/speech/endpointer/energy_endpointer_params.h new file mode 
100644 index 0000000..5fd923d --- /dev/null +++ b/content/browser/speech/endpointer/energy_endpointer_params.h @@ -0,0 +1,137 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ +#define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ + +#include "base/basictypes.h" + +namespace speech_input { + +// Input parameters for the EnergyEndpointer class. +class EnergyEndpointerParams { + public: + EnergyEndpointerParams(); + + void SetDefaults(); + + void operator=(const EnergyEndpointerParams& source); + + // Accessors and mutators + float frame_period() const { return frame_period_; } + void set_frame_period(float frame_period) { + frame_period_ = frame_period; + } + + float frame_duration() const { return frame_duration_; } + void set_frame_duration(float frame_duration) { + frame_duration_ = frame_duration; + } + + float endpoint_margin() const { return endpoint_margin_; } + void set_endpoint_margin(float endpoint_margin) { + endpoint_margin_ = endpoint_margin; + } + + float onset_window() const { return onset_window_; } + void set_onset_window(float onset_window) { onset_window_ = onset_window; } + + float speech_on_window() const { return speech_on_window_; } + void set_speech_on_window(float speech_on_window) { + speech_on_window_ = speech_on_window; + } + + float offset_window() const { return offset_window_; } + void set_offset_window(float offset_window) { + offset_window_ = offset_window; + } + + float onset_detect_dur() const { return onset_detect_dur_; } + void set_onset_detect_dur(float onset_detect_dur) { + onset_detect_dur_ = onset_detect_dur; + } + + float onset_confirm_dur() const { return onset_confirm_dur_; } + void set_onset_confirm_dur(float onset_confirm_dur) { + onset_confirm_dur_ = onset_confirm_dur; + } + + float on_maintain_dur() const { return on_maintain_dur_; } + void set_on_maintain_dur(float on_maintain_dur) { + on_maintain_dur_ = on_maintain_dur; + } + + float offset_confirm_dur() const { return offset_confirm_dur_; } + void set_offset_confirm_dur(float offset_confirm_dur) { + offset_confirm_dur_ = offset_confirm_dur; + } + + float decision_threshold() const { return decision_threshold_; } + void set_decision_threshold(float decision_threshold) { + decision_threshold_ = decision_threshold; + } + + float min_decision_threshold() const { return min_decision_threshold_; } + void set_min_decision_threshold(float min_decision_threshold) { + min_decision_threshold_ = min_decision_threshold; + } + + float fast_update_dur() const { return fast_update_dur_; } + void set_fast_update_dur(float fast_update_dur) { + fast_update_dur_ = fast_update_dur; + } + + float sample_rate() const { return sample_rate_; } + void set_sample_rate(float sample_rate) { sample_rate_ = sample_rate; } + + float min_fundamental_frequency() const { return min_fundamental_frequency_; } + void set_min_fundamental_frequency(float min_fundamental_frequency) { + min_fundamental_frequency_ = min_fundamental_frequency; + } + + float max_fundamental_frequency() const { return max_fundamental_frequency_; } + void set_max_fundamental_frequency(float max_fundamental_frequency) { + max_fundamental_frequency_ = max_fundamental_frequency; + } + + float contamination_rejection_period() const { + return contamination_rejection_period_; + } + void set_contamination_rejection_period( + float 
contamination_rejection_period) { + contamination_rejection_period_ = contamination_rejection_period; + } + + private: + float frame_period_; // Frame period + float frame_duration_; // Window size + float onset_window_; // Interval scanned for onset activity + float speech_on_window_; // Inverval scanned for ongoing speech + float offset_window_; // Interval scanned for offset evidence + float offset_confirm_dur_; // Silence duration required to confirm offset + float decision_threshold_; // Initial rms detection threshold + float min_decision_threshold_; // Minimum rms detection threshold + float fast_update_dur_; // Period for initial estimation of levels. + float sample_rate_; // Expected sample rate. + + // Time to add on either side of endpoint threshold crossings + float endpoint_margin_; + // Total dur within onset_window required to enter ONSET state + float onset_detect_dur_; + // Total on time within onset_window required to enter SPEECH_ON state + float onset_confirm_dur_; + // Minimum dur in SPEECH_ON state required to maintain ON state + float on_maintain_dur_; + // Minimum fundamental frequency for autocorrelation. + float min_fundamental_frequency_; + // Maximum fundamental frequency for autocorrelation. + float max_fundamental_frequency_; + // Period after start of user input that above threshold values are ignored. + // This is to reject audio feedback contamination. + float contamination_rejection_period_; +}; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ diff --git a/content/browser/speech/speech_input_browsertest.cc b/content/browser/speech/speech_input_browsertest.cc new file mode 100644 index 0000000..c827f47 --- /dev/null +++ b/content/browser/speech/speech_input_browsertest.cc @@ -0,0 +1,207 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/command_line.h" +#include "base/file_path.h" +#include "base/string_number_conversions.h" +#include "base/utf_string_conversions.h" +#include "chrome/browser/ui/browser.h" +#include "chrome/common/chrome_switches.h" +#include "chrome/test/in_process_browser_test.h" +#include "chrome/test/ui_test_utils.h" +#include "content/browser/renderer_host/render_view_host.h" +#include "content/browser/speech/speech_input_dispatcher_host.h" +#include "content/browser/speech/speech_input_manager.h" +#include "content/browser/tab_contents/tab_contents.h" +#include "third_party/WebKit/Source/WebKit/chromium/public/WebInputEvent.h" + +namespace speech_input { +class FakeSpeechInputManager; +} + +// This class does not need to be refcounted (typically done by PostTask) since +// it will outlive the test and gets released only when the test shuts down. +// Disabling refcounting here saves a bit of unnecessary code and the factory +// method can return a plain pointer below as required by the real code. +DISABLE_RUNNABLE_METHOD_REFCOUNT(speech_input::FakeSpeechInputManager); + +namespace speech_input { + +const char* kTestResult = "Pictures of the moon"; + +class FakeSpeechInputManager : public SpeechInputManager { + public: + FakeSpeechInputManager() + : caller_id_(0), + delegate_(NULL) { + } + + std::string grammar() { + return grammar_; + } + + // SpeechInputManager methods. 
+ virtual void StartRecognition(Delegate* delegate, + int caller_id, + int render_process_id, + int render_view_id, + const gfx::Rect& element_rect, + const std::string& language, + const std::string& grammar, + const std::string& origin_url) { + VLOG(1) << "StartRecognition invoked."; + EXPECT_EQ(0, caller_id_); + EXPECT_EQ(NULL, delegate_); + caller_id_ = caller_id; + delegate_ = delegate; + grammar_ = grammar; + // Give the fake result in a short while. + MessageLoop::current()->PostTask(FROM_HERE, NewRunnableMethod(this, + &FakeSpeechInputManager::SetFakeRecognitionResult)); + } + virtual void CancelRecognition(int caller_id) { + VLOG(1) << "CancelRecognition invoked."; + EXPECT_EQ(caller_id_, caller_id); + caller_id_ = 0; + delegate_ = NULL; + } + virtual void StopRecording(int caller_id) { + VLOG(1) << "StopRecording invoked."; + EXPECT_EQ(caller_id_, caller_id); + // Nothing to do here since we aren't really recording. + } + virtual void CancelAllRequestsWithDelegate(Delegate* delegate) { + VLOG(1) << "CancelAllRequestsWithDelegate invoked."; + } + + private: + void SetFakeRecognitionResult() { + if (caller_id_) { // Do a check in case we were cancelled.. + VLOG(1) << "Setting fake recognition result."; + delegate_->DidCompleteRecording(caller_id_); + SpeechInputResultArray results; + results.push_back(SpeechInputResultItem(ASCIIToUTF16(kTestResult), 1.0)); + delegate_->SetRecognitionResult(caller_id_, results); + delegate_->DidCompleteRecognition(caller_id_); + caller_id_ = 0; + delegate_ = NULL; + VLOG(1) << "Finished setting fake recognition result."; + } + } + + int caller_id_; + Delegate* delegate_; + std::string grammar_; +}; + +class SpeechInputBrowserTest : public InProcessBrowserTest { + public: + // InProcessBrowserTest methods + GURL testUrl(const FilePath::CharType* filename) { + const FilePath kTestDir(FILE_PATH_LITERAL("speech")); + return ui_test_utils::GetTestUrl(kTestDir, FilePath(filename)); + } + + protected: + void LoadAndRunSpeechInputTest(const FilePath::CharType* filename) { + // The test page calculates the speech button's coordinate in the page on + // load & sets that coordinate in the URL fragment. We send mouse down & up + // events at that coordinate to trigger speech recognition. + GURL test_url = testUrl(filename); + ui_test_utils::NavigateToURL(browser(), test_url); + std::string coords = browser()->GetSelectedTabContents()->GetURL().ref(); + VLOG(1) << "Coordinates given by script: " << coords; + int comma_pos = coords.find(','); + ASSERT_NE(-1, comma_pos); + int x = 0; + ASSERT_TRUE(base::StringToInt(coords.substr(0, comma_pos).c_str(), &x)); + int y = 0; + ASSERT_TRUE(base::StringToInt(coords.substr(comma_pos + 1).c_str(), &y)); + + WebKit::WebMouseEvent mouse_event; + mouse_event.type = WebKit::WebInputEvent::MouseDown; + mouse_event.button = WebKit::WebMouseEvent::ButtonLeft; + mouse_event.x = x; + mouse_event.y = y; + mouse_event.clickCount = 1; + TabContents* tab_contents = browser()->GetSelectedTabContents(); + tab_contents->render_view_host()->ForwardMouseEvent(mouse_event); + mouse_event.type = WebKit::WebInputEvent::MouseUp; + tab_contents->render_view_host()->ForwardMouseEvent(mouse_event); + + // The fake speech input manager would receive the speech input + // request and return the test string as recognition result. The test page + // then sets the URL fragment as 'pass' if it received the expected string. 
+ ui_test_utils::WaitForNavigations(&tab_contents->controller(), 1);
+ EXPECT_EQ("pass", browser()->GetSelectedTabContents()->GetURL().ref());
+ }
+
+ // InProcessBrowserTest methods.
+ virtual void SetUpInProcessBrowserTestFixture() {
+ speech_input_manager_ = &fake_speech_input_manager_;
+
+ // Inject the fake manager factory so that the test result is returned to
+ // the web page.
+ SpeechInputDispatcherHost::set_manager_accessor(&fakeManagerAccessor);
+ }
+
+ virtual void TearDownInProcessBrowserTestFixture() {
+ speech_input_manager_ = NULL;
+ }
+
+ // Factory method.
+ static SpeechInputManager* fakeManagerAccessor() {
+ return speech_input_manager_;
+ }
+
+ FakeSpeechInputManager fake_speech_input_manager_;
+
+ // This is used by the static |fakeManagerAccessor|, and it is a pointer
+ // rather than a direct instance per the style guide.
+ static SpeechInputManager* speech_input_manager_;
+};
+
+SpeechInputManager* SpeechInputBrowserTest::speech_input_manager_ = NULL;
+
+// Marked as FLAKY due to http://crbug.com/51337
+//
+// TODO(satish): Once this flakiness has been fixed, add a second test here to
+// check for sending many clicks in succession to the speech button and verify
+// that it doesn't cause any crashes and works as expected. This should act as
+// the test for http://crbug.com/59173
+//
+// TODO(satish): Similar to above, once this flakiness has been fixed add
+// another test here to check that when speech recognition is in progress and
+// a renderer crashes, we get a call to
+// SpeechInputManager::CancelAllRequestsWithDelegate.
+//
+// Marked as DISABLED due to http://crbug.com/71227
+#if defined(GOOGLE_CHROME_BUILD)
+#define MAYBE_TestBasicRecognition DISABLED_TestBasicRecognition
+#elif defined(OS_WIN)
+#define MAYBE_TestBasicRecognition FLAKY_TestBasicRecognition
+#else
+#define MAYBE_TestBasicRecognition TestBasicRecognition
+#endif
+IN_PROC_BROWSER_TEST_F(SpeechInputBrowserTest, MAYBE_TestBasicRecognition) {
+ LoadAndRunSpeechInputTest(FILE_PATH_LITERAL("basic_recognition.html"));
+ EXPECT_TRUE(fake_speech_input_manager_.grammar().empty());
+}
+
+// Marked as FLAKY due to http://crbug.com/51337
+// Marked as DISABLED due to http://crbug.com/71227
+#if defined(GOOGLE_CHROME_BUILD)
+#define MAYBE_GrammarAttribute DISABLED_GrammarAttribute
+#elif defined(OS_WIN)
+#define MAYBE_GrammarAttribute FLAKY_GrammarAttribute
+#else
+#define MAYBE_GrammarAttribute GrammarAttribute
+#endif
+IN_PROC_BROWSER_TEST_F(SpeechInputBrowserTest, MAYBE_GrammarAttribute) {
+ LoadAndRunSpeechInputTest(FILE_PATH_LITERAL("grammar_attribute.html"));
+ EXPECT_EQ("http://example.com/grammar.xml",
+ fake_speech_input_manager_.grammar());
+}
+
+} // namespace speech_input
diff --git a/content/browser/speech/speech_input_dispatcher_host.cc b/content/browser/speech/speech_input_dispatcher_host.cc
new file mode 100644
index 0000000..84e2a95
--- /dev/null
+++ b/content/browser/speech/speech_input_dispatcher_host.cc
@@ -0,0 +1,225 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
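+
+// Editor's sketch of the injection seam exercised by the browser test above
+// (|g_fake| is a hypothetical fake instance, not part of this file):
+//   static SpeechInputManager* FakeAccessor() { return &g_fake; }
+//   SpeechInputDispatcherHost::set_manager_accessor(&FakeAccessor);
+// With the accessor swapped, manager() below returns the fake instead of the
+// real SpeechInputManager::Get().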
+ +#include "content/browser/speech/speech_input_dispatcher_host.h" + +#include "base/lazy_instance.h" +#include "chrome/common/speech_input_messages.h" + +namespace speech_input { + +//----------------------------- SpeechInputCallers ----------------------------- + +// A singleton class to map the tuple +// (render-process-id, render-view-id, requestid) to a single ID which is passed +// through rest of the speech code. +class SpeechInputDispatcherHost::SpeechInputCallers { + public: + // Creates a new ID for a given tuple. + int CreateId(int render_process_id, int render_view_id, int request_id); + + // Returns the ID for a tuple assuming the ID was created earlier. + int GetId(int render_process_id, int render_view_id, int request_id); + + // Removes the ID and associated tuple from the map. + void RemoveId(int id); + + // Getters for the various tuple elements for the given ID. + int render_process_id(int id); + int render_view_id(int id); + int request_id(int id); + + private: + struct CallerInfo { + int render_process_id; + int render_view_id; + int request_id; + }; + friend struct base::DefaultLazyInstanceTraits<SpeechInputCallers>; + + SpeechInputCallers(); + + std::map<int, CallerInfo> callers_; + int next_id_; +}; + +static base::LazyInstance<SpeechInputDispatcherHost::SpeechInputCallers> + g_speech_input_callers(base::LINKER_INITIALIZED); + +SpeechInputDispatcherHost::SpeechInputCallers::SpeechInputCallers() + : next_id_(1) { +} + +int SpeechInputDispatcherHost::SpeechInputCallers::GetId(int render_process_id, + int render_view_id, + int request_id) { + for (std::map<int, CallerInfo>::iterator it = callers_.begin(); + it != callers_.end(); it++) { + const CallerInfo& item = it->second; + if (item.render_process_id == render_process_id && + item.render_view_id == render_view_id && + item.request_id == request_id) { + return it->first; + } + } + + // Not finding an entry here is valid since a cancel/stop may have been issued + // by the renderer and before it received our response the user may have + // clicked the button to stop again. The caller of this method should take + // care of this case. + return 0; +} + +int SpeechInputDispatcherHost::SpeechInputCallers::CreateId( + int render_process_id, + int render_view_id, + int request_id) { + CallerInfo info; + info.render_process_id = render_process_id; + info.render_view_id = render_view_id; + info.request_id = request_id; + callers_[next_id_] = info; + return next_id_++; +} + +void SpeechInputDispatcherHost::SpeechInputCallers::RemoveId(int id) { + callers_.erase(id); +} + +int SpeechInputDispatcherHost::SpeechInputCallers::render_process_id(int id) { + return callers_[id].render_process_id; +} + +int SpeechInputDispatcherHost::SpeechInputCallers::render_view_id(int id) { + return callers_[id].render_view_id; +} + +int SpeechInputDispatcherHost::SpeechInputCallers::request_id(int id) { + return callers_[id].request_id; +} + +//-------------------------- SpeechInputDispatcherHost ------------------------- + +SpeechInputManager::AccessorMethod* + SpeechInputDispatcherHost::manager_accessor_ = &SpeechInputManager::Get; + +SpeechInputDispatcherHost::SpeechInputDispatcherHost(int render_process_id) + : render_process_id_(render_process_id), + may_have_pending_requests_(false) { + // This is initialized by Browser. Do not add any non-trivial + // initialization here, instead do it lazily when required (e.g. see the + // method |manager()|) or add an Init() method. 
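+ // Illustrative round trip through the caller map above (editor's sketch;
+ // pid/vid/rid stand for the render process, render view and request IDs):
+ //   int id = g_speech_input_callers.Get().CreateId(pid, vid, rid);
+ //   DCHECK_EQ(id, g_speech_input_callers.Get().GetId(pid, vid, rid));
+ //   g_speech_input_callers.Get().RemoveId(id);  // GetId() now returns 0.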
+}
+
+SpeechInputDispatcherHost::~SpeechInputDispatcherHost() {
+ // If the renderer crashed for some reason or if we didn't receive a proper
+ // Cancel/Stop call for an existing session, cancel such active sessions now.
+ // We first check if this dispatcher received any speech IPC request so that
+ // we don't end up creating the speech input manager for web pages that don't
+ // use speech input.
+ if (may_have_pending_requests_)
+ manager()->CancelAllRequestsWithDelegate(this);
+}
+
+SpeechInputManager* SpeechInputDispatcherHost::manager() {
+ return (*manager_accessor_)();
+}
+
+bool SpeechInputDispatcherHost::OnMessageReceived(
+ const IPC::Message& message, bool* message_was_ok) {
+ DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
+
+ uint32 message_type = message.type();
+ if (message_type == SpeechInputHostMsg_StartRecognition::ID ||
+ message_type == SpeechInputHostMsg_CancelRecognition::ID ||
+ message_type == SpeechInputHostMsg_StopRecording::ID) {
+ if (!SpeechInputManager::IsFeatureEnabled()) {
+ *message_was_ok = false;
+ return true;
+ }
+
+ may_have_pending_requests_ = true;
+ IPC_BEGIN_MESSAGE_MAP_EX(SpeechInputDispatcherHost, message,
+ *message_was_ok)
+ IPC_MESSAGE_HANDLER(SpeechInputHostMsg_StartRecognition,
+ OnStartRecognition)
+ IPC_MESSAGE_HANDLER(SpeechInputHostMsg_CancelRecognition,
+ OnCancelRecognition)
+ IPC_MESSAGE_HANDLER(SpeechInputHostMsg_StopRecording,
+ OnStopRecording)
+ IPC_END_MESSAGE_MAP()
+ return true;
+ }
+
+ return false;
+}
+
+void SpeechInputDispatcherHost::OnStartRecognition(
+ const SpeechInputHostMsg_StartRecognition_Params &params) {
+ int caller_id = g_speech_input_callers.Get().CreateId(
+ render_process_id_, params.render_view_id, params.request_id);
+ manager()->StartRecognition(this, caller_id,
+ render_process_id_,
+ params.render_view_id, params.element_rect,
+ params.language, params.grammar,
+ params.origin_url);
+}
+
+void SpeechInputDispatcherHost::OnCancelRecognition(int render_view_id,
+ int request_id) {
+ int caller_id = g_speech_input_callers.Get().GetId(
+ render_process_id_, render_view_id, request_id);
+ if (caller_id) {
+ manager()->CancelRecognition(caller_id);
+ // Request sequence ended, so remove mapping.
+ g_speech_input_callers.Get().RemoveId(caller_id); + } +} + +void SpeechInputDispatcherHost::OnStopRecording(int render_view_id, + int request_id) { + int caller_id = g_speech_input_callers.Get().GetId( + render_process_id_, render_view_id, request_id); + if (caller_id) + manager()->StopRecording(caller_id); +} + +void SpeechInputDispatcherHost::SetRecognitionResult( + int caller_id, const SpeechInputResultArray& result) { + VLOG(1) << "SpeechInputDispatcherHost::SetRecognitionResult enter"; + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + int caller_render_view_id = + g_speech_input_callers.Get().render_view_id(caller_id); + int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); + Send(new SpeechInputMsg_SetRecognitionResult(caller_render_view_id, + caller_request_id, + result)); + VLOG(1) << "SpeechInputDispatcherHost::SetRecognitionResult exit"; +} + +void SpeechInputDispatcherHost::DidCompleteRecording(int caller_id) { + VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecording enter"; + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + int caller_render_view_id = + g_speech_input_callers.Get().render_view_id(caller_id); + int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); + Send(new SpeechInputMsg_RecordingComplete(caller_render_view_id, + caller_request_id)); + VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecording exit"; +} + +void SpeechInputDispatcherHost::DidCompleteRecognition(int caller_id) { + VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecognition enter"; + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + int caller_render_view_id = + g_speech_input_callers.Get().render_view_id(caller_id); + int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); + Send(new SpeechInputMsg_RecognitionComplete(caller_render_view_id, + caller_request_id)); + // Request sequence ended, so remove mapping. + g_speech_input_callers.Get().RemoveId(caller_id); + VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecognition exit"; +} + +} // namespace speech_input diff --git a/content/browser/speech/speech_input_dispatcher_host.h b/content/browser/speech/speech_input_dispatcher_host.h new file mode 100644 index 0000000..abd93da --- /dev/null +++ b/content/browser/speech/speech_input_dispatcher_host.h @@ -0,0 +1,63 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_ +#define CONTENT_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_ + +#include "base/scoped_ptr.h" +#include "content/browser/browser_message_filter.h" +#include "content/browser/speech/speech_input_manager.h" + +struct SpeechInputHostMsg_StartRecognition_Params; + +namespace speech_input { + +// SpeechInputDispatcherHost is a delegate for Speech API messages used by +// RenderMessageFilter. +// It's the complement of SpeechInputDispatcher (owned by RenderView). +class SpeechInputDispatcherHost : public BrowserMessageFilter, + public SpeechInputManager::Delegate { + public: + class SpeechInputCallers; + + explicit SpeechInputDispatcherHost(int render_process_id); + + // SpeechInputManager::Delegate methods. + virtual void SetRecognitionResult(int caller_id, + const SpeechInputResultArray& result); + virtual void DidCompleteRecording(int caller_id); + virtual void DidCompleteRecognition(int caller_id); + + // BrowserMessageFilter implementation. 
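+ // Editor's note (conceptual sketch of the dispatch implemented in the .cc
+ // file): only the three speech messages are claimed by this filter,
+ // in effect
+ //   switch (message.type()) {
+ //     case SpeechInputHostMsg_StartRecognition::ID:   // -> OnStartRecognition
+ //     case SpeechInputHostMsg_CancelRecognition::ID:  // -> OnCancelRecognition
+ //     case SpeechInputHostMsg_StopRecording::ID:      // -> OnStopRecording
+ //       return true;  // handled here
+ //     default:
+ //       return false;  // left for other filters
+ //   }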
+ virtual bool OnMessageReceived(const IPC::Message& message,
+ bool* message_was_ok);
+
+ // Sets the singleton accessor; useful for tests.
+ static void set_manager_accessor(SpeechInputManager::AccessorMethod* method) {
+ manager_accessor_ = method;
+ }
+
+ private:
+ virtual ~SpeechInputDispatcherHost();
+
+ void OnStartRecognition(
+ const SpeechInputHostMsg_StartRecognition_Params &params);
+ void OnCancelRecognition(int render_view_id, int request_id);
+ void OnStopRecording(int render_view_id, int request_id);
+
+ // Returns the speech input manager to forward events to, creating one if
+ // needed.
+ SpeechInputManager* manager();
+
+ int render_process_id_;
+ bool may_have_pending_requests_; // Set if we received any speech IPC request
+
+ static SpeechInputManager::AccessorMethod* manager_accessor_;
+
+ DISALLOW_COPY_AND_ASSIGN(SpeechInputDispatcherHost);
+};
+
+} // namespace speech_input
+
+#endif // CONTENT_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_
diff --git a/content/browser/speech/speech_input_manager.h b/content/browser/speech/speech_input_manager.h
new file mode 100644
index 0000000..a6ba61f
--- /dev/null
+++ b/content/browser/speech/speech_input_manager.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CONTENT_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_
+#define CONTENT_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_
+
+#include "base/basictypes.h"
+#include "chrome/common/speech_input_result.h"
+#include "ui/gfx/rect.h"
+
+namespace speech_input {
+
+// This is the gatekeeper for speech recognition in the browser process. It
+// handles requests received from various render views and makes sure only one
+// of them can use speech recognition at a time. It also sends recognition
+// results and status events to the render views when required.
+// This class is a singleton and accessed via the Get method.
+class SpeechInputManager {
+ public:
+ // Implemented by the dispatcher host to relay events to the render views.
+ class Delegate {
+ public:
+ virtual void SetRecognitionResult(
+ int caller_id,
+ const SpeechInputResultArray& result) = 0;
+ virtual void DidCompleteRecording(int caller_id) = 0;
+ virtual void DidCompleteRecognition(int caller_id) = 0;
+
+ protected:
+ virtual ~Delegate() {}
+ };
+
+ // Whether the speech input feature is enabled, based on the browser channel
+ // information and command line flags.
+ static bool IsFeatureEnabled();
+
+ // Factory method to access the singleton. We have this method here instead of
+ // using Singleton directly in the calling code to aid tests in injecting
+ // mocks.
+ static SpeechInputManager* Get();
+ // Factory method type, useful for tests.
+ typedef SpeechInputManager* (AccessorMethod)();
+
+ virtual ~SpeechInputManager() {}
+
+ // Handlers for requests from render views.
+
+ // |delegate| is a weak pointer and should remain valid until
+ // its |DidCompleteRecognition| method is called or recognition is canceled.
+ // |render_process_id| is the ID of the renderer process initiating the
+ // request.
+ // |element_rect| is the display bounds of the HTML element requesting speech
+ // input (in page coordinates).
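+ // Editor's note on the expected callback order (as implemented by the fake
+ // manager and the recognizer elsewhere in this change): a successful
+ // session runs
+ //   StartRecognition() ... DidCompleteRecording() ->
+ //   SetRecognitionResult() -> DidCompleteRecognition(),
+ // while CancelRecognition() ends a session with no further callbacks.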
+ virtual void StartRecognition(Delegate* delegate, + int caller_id, + int render_process_id, + int render_view_id, + const gfx::Rect& element_rect, + const std::string& language, + const std::string& grammar, + const std::string& origin_url) = 0; + virtual void CancelRecognition(int caller_id) = 0; + virtual void StopRecording(int caller_id) = 0; + + virtual void CancelAllRequestsWithDelegate(Delegate* delegate) = 0; +}; + +// This typedef is to workaround the issue with certain versions of +// Visual Studio where it gets confused between multiple Delegate +// classes and gives a C2500 error. (I saw this error on the try bots - +// the workaround was not needed for my machine). +typedef SpeechInputManager::Delegate SpeechInputManagerDelegate; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_ diff --git a/content/browser/speech/speech_recognition_request.cc b/content/browser/speech/speech_recognition_request.cc new file mode 100644 index 0000000..d127437 --- /dev/null +++ b/content/browser/speech/speech_recognition_request.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/speech_recognition_request.h" + +#include <vector> + +#include "base/json/json_reader.h" +#include "base/string_util.h" +#include "base/values.h" +#include "chrome/common/net/url_request_context_getter.h" +#include "net/base/escape.h" +#include "net/base/load_flags.h" +#include "net/url_request/url_request_context.h" +#include "net/url_request/url_request_status.h" +#include "ui/base/l10n/l10n_util.h" + +namespace { + +const char* const kDefaultSpeechRecognitionUrl = + "https://www.google.com/speech-api/v1/recognize?client=chromium&"; +const char* const kHypothesesString = "hypotheses"; +const char* const kUtteranceString = "utterance"; +const char* const kConfidenceString = "confidence"; + +bool ParseServerResponse(const std::string& response_body, + speech_input::SpeechInputResultArray* result) { + if (response_body.empty()) { + LOG(WARNING) << "ParseServerResponse: Response was empty."; + return false; + } + DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; + + // Parse the response, ignoring comments. 
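+ // A response this parser accepts looks like (sample taken from the unit
+ // tests for this file):
+ //   {"hypotheses":[{"utterance":"hello","confidence":0.9},
+ //                  {"utterance":"123456","confidence":0.5}]}
+ // A missing "confidence" field is not an error; it defaults to 0.0 below.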
+ std::string error_msg; + scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( + response_body, false, NULL, &error_msg)); + if (response_value == NULL) { + LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg; + return false; + } + + if (!response_value->IsType(Value::TYPE_DICTIONARY)) { + VLOG(1) << "ParseServerResponse: Unexpected response type " + << response_value->GetType(); + return false; + } + const DictionaryValue* response_object = + static_cast<DictionaryValue*>(response_value.get()); + + // Get the hypotheses + Value* hypotheses_value = NULL; + if (!response_object->Get(kHypothesesString, &hypotheses_value)) { + VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; + return false; + } + DCHECK(hypotheses_value); + if (!hypotheses_value->IsType(Value::TYPE_LIST)) { + VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " + << hypotheses_value->GetType(); + return false; + } + const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); + if (hypotheses_list->GetSize() == 0) { + VLOG(1) << "ParseServerResponse: hypotheses list is empty."; + return false; + } + + size_t index = 0; + for (; index < hypotheses_list->GetSize(); ++index) { + Value* hypothesis = NULL; + if (!hypotheses_list->Get(index, &hypothesis)) { + LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; + break; + } + DCHECK(hypothesis); + if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { + LOG(WARNING) << "ParseServerResponse: Unexpected value type " + << hypothesis->GetType(); + break; + } + + const DictionaryValue* hypothesis_value = + static_cast<DictionaryValue*>(hypothesis); + string16 utterance; + if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { + LOG(WARNING) << "ParseServerResponse: Missing utterance value."; + break; + } + + // It is not an error if the 'confidence' field is missing. + double confidence = 0.0; + hypothesis_value->GetDouble(kConfidenceString, &confidence); + + result->push_back(speech_input::SpeechInputResultItem(utterance, + confidence)); + } + + if (index < hypotheses_list->GetSize()) { + result->clear(); + return false; + } + + return true; +} + +} // namespace + +namespace speech_input { + +int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0; + +SpeechRecognitionRequest::SpeechRecognitionRequest( + URLRequestContextGetter* context, Delegate* delegate) + : url_context_(context), + delegate_(delegate) { + DCHECK(delegate); +} + +SpeechRecognitionRequest::~SpeechRecognitionRequest() {} + +bool SpeechRecognitionRequest::Send(const std::string& language, + const std::string& grammar, + const std::string& hardware_info, + const std::string& origin_url, + const std::string& content_type, + const std::string& audio_data) { + DCHECK(!url_fetcher_.get()); + + std::vector<std::string> parts; + + std::string lang_param = language; + if (lang_param.empty() && url_context_) { + // If no language is provided then we use the first from the accepted + // language list. If this list is empty then it defaults to "en-US". 
+ // Example of the contents of this list: "es,en-GB;q=0.8", "" + net::URLRequestContext* request_context = + url_context_->GetURLRequestContext(); + DCHECK(request_context); + std::string accepted_language_list = request_context->accept_language(); + size_t separator = accepted_language_list.find_first_of(",;"); + lang_param = accepted_language_list.substr(0, separator); + } + if (lang_param.empty()) + lang_param = "en-US"; + parts.push_back("lang=" + EscapeQueryParamValue(lang_param, true)); + + if (!grammar.empty()) + parts.push_back("lm=" + EscapeQueryParamValue(grammar, true)); + if (!hardware_info.empty()) + parts.push_back("xhw=" + EscapeQueryParamValue(hardware_info, true)); + // TODO(satish): Remove this hardcoded value once the page is allowed to + // set this via an attribute. + parts.push_back("maxresults=3"); + + GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); + + url_fetcher_.reset(URLFetcher::Create(url_fetcher_id_for_tests, + url, + URLFetcher::POST, + this)); + url_fetcher_->set_upload_data(content_type, audio_data); + url_fetcher_->set_request_context(url_context_); + url_fetcher_->set_referrer(origin_url); + + // The speech recognition API does not require user identification as part + // of requests, so we don't send cookies or auth data for these requests to + // prevent any accidental connection between users who are logged into the + // domain for other services (e.g. bookmark sync) with the speech requests. + url_fetcher_->set_load_flags( + net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES | + net::LOAD_DO_NOT_SEND_AUTH_DATA); + url_fetcher_->Start(); + return true; +} + +void SpeechRecognitionRequest::OnURLFetchComplete( + const URLFetcher* source, + const GURL& url, + const net::URLRequestStatus& status, + int response_code, + const ResponseCookies& cookies, + const std::string& data) { + DCHECK_EQ(url_fetcher_.get(), source); + + bool error = !status.is_success() || response_code != 200; + SpeechInputResultArray result; + if (!error) + error = !ParseServerResponse(data, &result); + url_fetcher_.reset(); + + DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result."; + delegate_->SetRecognitionResult(error, result); +} + +} // namespace speech_input diff --git a/content/browser/speech/speech_recognition_request.h b/content/browser/speech/speech_recognition_request.h new file mode 100644 index 0000000..3036d59 --- /dev/null +++ b/content/browser/speech/speech_recognition_request.h @@ -0,0 +1,82 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_ +#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_ +#pragma once + +#include <string> + +#include "base/basictypes.h" +#include "base/ref_counted.h" +#include "base/scoped_ptr.h" +#include "chrome/common/net/url_fetcher.h" +#include "chrome/common/speech_input_result.h" +#include "googleurl/src/gurl.h" + +class URLFetcher; +class URLRequestContextGetter; + +namespace speech_input { + +// Provides a simple interface for sending recorded speech data to the server +// and get back recognition results. +class SpeechRecognitionRequest : public URLFetcher::Delegate { + public: + // ID passed to URLFetcher::Create(). Used for testing. + static int url_fetcher_id_for_tests; + + // Interface for receiving callbacks from this object. 
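+ // A minimal implementation might look like (editor's sketch; the class
+ // name is hypothetical):
+ //   class ResultLogger : public SpeechRecognitionRequest::Delegate {
+ //     virtual void SetRecognitionResult(
+ //         bool error, const SpeechInputResultArray& result) {
+ //       if (!error && !result.empty())
+ //         VLOG(1) << "hypotheses received: " << result.size();
+ //     }
+ //   };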
+ class Delegate {
+ public:
+ virtual void SetRecognitionResult(
+ bool error, const SpeechInputResultArray& result) = 0;
+
+ protected:
+ virtual ~Delegate() {}
+ };
+
+ // |context| is the URL request context through which the request will be
+ // sent to the server.
+ SpeechRecognitionRequest(URLRequestContextGetter* context,
+ Delegate* delegate);
+
+ virtual ~SpeechRecognitionRequest();
+
+ // Sends a new request with the given audio data; returns true if successful.
+ // The same object can be used to send multiple requests but only after the
+ // previous request has completed.
+ bool Send(const std::string& language,
+ const std::string& grammar,
+ const std::string& hardware_info,
+ const std::string& origin_url,
+ const std::string& content_type,
+ const std::string& audio_data);
+
+ bool HasPendingRequest() { return url_fetcher_ != NULL; }
+
+ // URLFetcher::Delegate methods.
+ virtual void OnURLFetchComplete(const URLFetcher* source,
+ const GURL& url,
+ const net::URLRequestStatus& status,
+ int response_code,
+ const ResponseCookies& cookies,
+ const std::string& data);
+
+ private:
+ scoped_refptr<URLRequestContextGetter> url_context_;
+ Delegate* delegate_;
+ scoped_ptr<URLFetcher> url_fetcher_;
+
+ DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionRequest);
+};
+
+// This typedef is to workaround the issue with certain versions of
+// Visual Studio where it gets confused between multiple Delegate
+// classes and gives a C2500 error. (I saw this error on the try bots -
+// the workaround was not needed for my machine).
+typedef SpeechRecognitionRequest::Delegate SpeechRecognitionRequestDelegate;
+
+} // namespace speech_input
+
+#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_
diff --git a/content/browser/speech/speech_recognition_request_unittest.cc b/content/browser/speech/speech_recognition_request_unittest.cc
new file mode 100644
index 0000000..e90f5cd
--- /dev/null
+++ b/content/browser/speech/speech_recognition_request_unittest.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/utf_string_conversions.h"
+#include "chrome/common/net/url_request_context_getter.h"
+#include "chrome/common/net/test_url_fetcher_factory.h"
+#include "content/browser/speech/speech_recognition_request.h"
+#include "net/url_request/url_request_status.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace speech_input {
+
+class SpeechRecognitionRequestTest : public SpeechRecognitionRequestDelegate,
+ public testing::Test {
+ public:
+ SpeechRecognitionRequestTest() : error_(false) { }
+
+ // Creates a speech recognition request and invokes its URL fetcher delegate
+ // with the given test data.
+ void CreateAndTestRequest(bool success, const std::string& http_response);
+
+ // SpeechRecognitionRequestDelegate methods.
+ virtual void SetRecognitionResult(bool error,
+ const SpeechInputResultArray& result) {
+ error_ = error;
+ result_ = result;
+ }
+
+ // testing::Test methods.
+ virtual void SetUp() { + URLFetcher::set_factory(&url_fetcher_factory_); + } + + virtual void TearDown() { + URLFetcher::set_factory(NULL); + } + + protected: + MessageLoop message_loop_; + TestURLFetcherFactory url_fetcher_factory_; + bool error_; + SpeechInputResultArray result_; +}; + +void SpeechRecognitionRequestTest::CreateAndTestRequest( + bool success, const std::string& http_response) { + SpeechRecognitionRequest request(NULL, this); + request.Send(std::string(), std::string(), std::string(), std::string(), + std::string(), std::string()); + TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); + ASSERT_TRUE(fetcher); + net::URLRequestStatus status; + status.set_status(success ? net::URLRequestStatus::SUCCESS : + net::URLRequestStatus::FAILED); + fetcher->delegate()->OnURLFetchComplete(fetcher, fetcher->original_url(), + status, success ? 200 : 500, + ResponseCookies(), + http_response); + // Parsed response will be available in result_. +} + +TEST_F(SpeechRecognitionRequestTest, BasicTest) { + // Normal success case with one result. + CreateAndTestRequest(true, + "{\"hypotheses\":[{\"utterance\":\"123456\",\"confidence\":0.9}]}"); + EXPECT_FALSE(error_); + EXPECT_EQ(1U, result_.size()); + EXPECT_EQ(ASCIIToUTF16("123456"), result_[0].utterance); + EXPECT_EQ(0.9, result_[0].confidence); + + // Normal success case with multiple results. + CreateAndTestRequest(true, + "{\"hypotheses\":[{\"utterance\":\"hello\",\"confidence\":0.9}," + "{\"utterance\":\"123456\",\"confidence\":0.5}]}"); + EXPECT_FALSE(error_); + EXPECT_EQ(2u, result_.size()); + EXPECT_EQ(ASCIIToUTF16("hello"), result_[0].utterance); + EXPECT_EQ(0.9, result_[0].confidence); + EXPECT_EQ(ASCIIToUTF16("123456"), result_[1].utterance); + EXPECT_EQ(0.5, result_[1].confidence); + + // Http failure case. + CreateAndTestRequest(false, ""); + EXPECT_TRUE(error_); + EXPECT_EQ(0U, result_.size()); + + // Malformed JSON case. + CreateAndTestRequest(true, "{\"hypotheses\":[{\"unknownkey\":\"hello\"}]}"); + EXPECT_TRUE(error_); + EXPECT_EQ(0U, result_.size()); +} + +} // namespace speech_input diff --git a/content/browser/speech/speech_recognizer.cc b/content/browser/speech/speech_recognizer.cc new file mode 100644 index 0000000..fdc1a4c --- /dev/null +++ b/content/browser/speech/speech_recognizer.cc @@ -0,0 +1,262 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/speech_recognizer.h" + +#include "base/time.h" +#include "chrome/browser/profiles/profile.h" +#include "chrome/common/net/url_request_context_getter.h" +#include "content/browser/browser_thread.h" + +using media::AudioInputController; +using std::string; + +namespace { + +// The following constants are related to the volume level indicator shown in +// the UI for recorded audio. +// Multiplier used when new volume is greater than previous level. +const float kUpSmoothingFactor = 0.9f; +// Multiplier used when new volume is lesser than previous level. +const float kDownSmoothingFactor = 0.4f; +const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. 
+const float kAudioMeterDbRange = 25.0f; +} // namespace + +namespace speech_input { + +const int SpeechRecognizer::kAudioSampleRate = 16000; +const int SpeechRecognizer::kAudioPacketIntervalMs = 100; +const int SpeechRecognizer::kNumAudioChannels = 1; +const int SpeechRecognizer::kNumBitsPerAudioSample = 16; +const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; +const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; + +SpeechRecognizer::SpeechRecognizer(Delegate* delegate, + int caller_id, + const std::string& language, + const std::string& grammar, + const std::string& hardware_info, + const std::string& origin_url) + : delegate_(delegate), + caller_id_(caller_id), + language_(language), + grammar_(grammar), + hardware_info_(hardware_info), + origin_url_(origin_url), + codec_(AudioEncoder::CODEC_SPEEX), + encoder_(NULL), + endpointer_(kAudioSampleRate), + num_samples_recorded_(0), + audio_level_(0.0f) { + endpointer_.set_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond / 2); + endpointer_.set_long_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond); + endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); + endpointer_.StartSession(); +} + +SpeechRecognizer::~SpeechRecognizer() { + // Recording should have stopped earlier due to the endpointer or + // |StopRecording| being called. + DCHECK(!audio_controller_.get()); + DCHECK(!request_.get() || !request_->HasPendingRequest()); + DCHECK(!encoder_.get()); + endpointer_.EndSession(); +} + +bool SpeechRecognizer::StartRecording() { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + DCHECK(!audio_controller_.get()); + DCHECK(!request_.get() || !request_->HasPendingRequest()); + DCHECK(!encoder_.get()); + + // The endpointer needs to estimate the environment/background noise before + // starting to treat the audio as user input. In |HandleOnData| we wait until + // such time has passed before switching to user input mode. + endpointer_.SetEnvironmentEstimationMode(); + + encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, + kNumBitsPerAudioSample)); + int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; + AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, + kAudioSampleRate, kNumBitsPerAudioSample, + samples_per_packet); + audio_controller_ = AudioInputController::Create(this, params); + DCHECK(audio_controller_.get()); + VLOG(1) << "SpeechRecognizer starting record."; + num_samples_recorded_ = 0; + audio_controller_->Record(); + + return true; +} + +void SpeechRecognizer::CancelRecognition() { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + DCHECK(audio_controller_.get() || request_.get()); + + // Stop recording if required. + if (audio_controller_.get()) { + VLOG(1) << "SpeechRecognizer stopping record."; + audio_controller_->Close(); + audio_controller_ = NULL; // Releases the ref ptr. + } + + VLOG(1) << "SpeechRecognizer canceling recognition."; + encoder_.reset(); + request_.reset(); +} + +void SpeechRecognizer::StopRecording() { + DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); + + // If audio recording has already stopped and we are in recognition phase, + // silently ignore any more calls to stop recording. + if (!audio_controller_.get()) + return; + + VLOG(1) << "SpeechRecognizer stopping record."; + audio_controller_->Close(); + audio_controller_ = NULL; // Releases the ref ptr. 
+ encoder_->Flush();
+
+ delegate_->DidCompleteRecording(caller_id_);
+
+ // Since the HTTP request takes a single string as POST data, allocate
+ // one and copy over bytes from the audio buffers to the string.
+ // If we haven't received any audio yet, end the recognition sequence here.
+ string mime_type = encoder_->mime_type();
+ string data;
+ encoder_->GetEncodedData(&data);
+ encoder_.reset();
+
+ if (data.empty()) {
+ // Guard against the delegate freeing us until we finish our job.
+ scoped_refptr<SpeechRecognizer> me(this);
+ delegate_->DidCompleteRecognition(caller_id_);
+ } else {
+ DCHECK(!request_.get());
+ request_.reset(new SpeechRecognitionRequest(
+ Profile::GetDefaultRequestContext(), this));
+ request_->Send(language_, grammar_, hardware_info_, origin_url_,
+ mime_type, data);
+ }
+}
+
+void SpeechRecognizer::ReleaseAudioBuffers() {
+}
+
+// Invoked in the audio thread.
+void SpeechRecognizer::OnError(AudioInputController* controller,
+ int error_code) {
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+ NewRunnableMethod(this,
+ &SpeechRecognizer::HandleOnError,
+ error_code));
+}
+
+void SpeechRecognizer::HandleOnError(int error_code) {
+ LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
+
+ // Check if we are still recording before canceling recognition, as
+ // recording might have been stopped after this error was posted to the queue
+ // by |OnError|.
+ if (!audio_controller_.get())
+ return;
+
+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
+}
+
+void SpeechRecognizer::OnData(AudioInputController* controller,
+ const uint8* data, uint32 size) {
+ if (size == 0) // This could happen when recording stops and is normal.
+ return;
+
+ string* str_data = new string(reinterpret_cast<const char*>(data), size);
+ BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
+ NewRunnableMethod(this,
+ &SpeechRecognizer::HandleOnData,
+ str_data));
+}
+
+void SpeechRecognizer::HandleOnData(string* data) {
+ // Check if we are still recording and if not discard this buffer, as
+ // recording might have been stopped after this buffer was posted to the queue
+ // by |OnData|.
+ if (!audio_controller_.get()) {
+ delete data;
+ return;
+ }
+
+ const short* samples = reinterpret_cast<const short*>(data->data());
+ DCHECK((data->length() % sizeof(short)) == 0);
+ int num_samples = data->length() / sizeof(short);
+
+ encoder_->Encode(samples, num_samples);
+ float rms;
+ endpointer_.ProcessAudio(samples, num_samples, &rms);
+ delete data;
+ num_samples_recorded_ += num_samples;
+
+ if (endpointer_.IsEstimatingEnvironment()) {
+ // Check if we have gathered enough audio for the endpointer to do
+ // environment estimation and should move on to detect speech/end of speech.
+ if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
+ kAudioSampleRate) / 1000) {
+ endpointer_.SetUserInputMode();
+ delegate_->DidCompleteEnvironmentEstimation(caller_id_);
+ }
+ return; // No more processing since we are still estimating environment.
+ }
+
+ // Check if we have waited too long without hearing any speech.
+ if (!endpointer_.DidStartReceivingSpeech() &&
+ num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
+ InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
+ return;
+ }
+
+ // Calculate the input volume to display in the UI, smoothing towards the
+ // new level.
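+ // Worked example (editor's note; constants defined at the top of this
+ // file): an rms of 20 dB maps to a raw level of (20 - 10) / 25 = 0.4. From
+ // a previous |audio_level_| of 0.1 the meter rises to
+ // 0.1 + (0.4 - 0.1) * 0.9 = 0.37 in one packet (kUpSmoothingFactor), while
+ // falling levels close only 0.4 of the gap (kDownSmoothingFactor).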
+ float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange; + level = std::min(std::max(0.0f, level), 1.0f); + if (level > audio_level_) { + audio_level_ += (level - audio_level_) * kUpSmoothingFactor; + } else { + audio_level_ += (level - audio_level_) * kDownSmoothingFactor; + } + delegate_->SetInputVolume(caller_id_, audio_level_); + + if (endpointer_.speech_input_complete()) { + StopRecording(); + } + + // TODO(satish): Once we have streaming POST, start sending the data received + // here as POST chunks. +} + +void SpeechRecognizer::SetRecognitionResult( + bool error, const SpeechInputResultArray& result) { + if (result.empty()) { + InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS); + return; + } + + delegate_->SetRecognitionResult(caller_id_, error, result); + + // Guard against the delegate freeing us until we finish our job. + scoped_refptr<SpeechRecognizer> me(this); + delegate_->DidCompleteRecognition(caller_id_); +} + +void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { + CancelRecognition(); + + // Guard against the delegate freeing us until we finish our job. + scoped_refptr<SpeechRecognizer> me(this); + delegate_->OnRecognizerError(caller_id_, error); +} + +} // namespace speech_input diff --git a/content/browser/speech/speech_recognizer.h b/content/browser/speech/speech_recognizer.h new file mode 100644 index 0000000..a54a59d --- /dev/null +++ b/content/browser/speech/speech_recognizer.h @@ -0,0 +1,151 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ +#define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ + +#include <list> +#include <string> +#include <utility> + +#include "base/ref_counted.h" +#include "base/scoped_ptr.h" +#include "content/browser/speech/audio_encoder.h" +#include "content/browser/speech/endpointer/endpointer.h" +#include "content/browser/speech/speech_recognition_request.h" +#include "media/audio/audio_input_controller.h" + +namespace speech_input { + +// Records audio, sends recorded audio to server and translates server response +// to recognition result. +class SpeechRecognizer + : public base::RefCountedThreadSafe<SpeechRecognizer>, + public media::AudioInputController::EventHandler, + public SpeechRecognitionRequestDelegate { + public: + enum ErrorCode { + RECOGNIZER_NO_ERROR, + RECOGNIZER_ERROR_CAPTURE, + RECOGNIZER_ERROR_NO_SPEECH, + RECOGNIZER_ERROR_NO_RESULTS, + }; + + // Implemented by the caller to receive recognition events. + class Delegate { + public: + virtual void SetRecognitionResult( + int caller_id, + bool error, + const SpeechInputResultArray& result) = 0; + + // Invoked when audio recording stops, either due to the end pointer + // detecting silence in user input or if |StopRecording| was called. The + // delegate has to wait until |DidCompleteRecognition| is invoked before + // destroying the |SpeechRecognizer| object. + virtual void DidCompleteRecording(int caller_id) = 0; + + // This is guaranteed to be the last method invoked in the recognition + // sequence and the |SpeechRecognizer| object can be freed up if necessary. + virtual void DidCompleteRecognition(int caller_id) = 0; + + // Invoked if there was an error while recording or recognizing audio. The + // session has already been cancelled when this call is made and the DidXxxx + // callbacks will not be issued. 
It is safe to destroy/release the + // |SpeechRecognizer| object while processing this call. + virtual void OnRecognizerError(int caller_id, + SpeechRecognizer::ErrorCode error) = 0; + + // At the start of recognition, a short amount of audio is recorded to + // estimate the environment/background noise and this callback is issued + // after that is complete. Typically the delegate brings up any speech + // recognition UI once this callback is received. + virtual void DidCompleteEnvironmentEstimation(int caller_id) = 0; + + // Informs of a change in the captured audio level, useful if displaying + // a microphone volume indicator while recording. + // The value of |volume| is in the [0.0, 1.0] range. + virtual void SetInputVolume(int caller_id, float volume) = 0; + + protected: + virtual ~Delegate() {} + }; + + SpeechRecognizer(Delegate* delegate, + int caller_id, + const std::string& language, + const std::string& grammar, + const std::string& hardware_info, + const std::string& origin_url); + ~SpeechRecognizer(); + + // Starts audio recording and does recognition after recording ends. The same + // SpeechRecognizer instance can be used multiple times for speech recognition + // though each recognition request can be made only after the previous one + // completes (i.e. after receiving Delegate::DidCompleteRecognition). + bool StartRecording(); + + // Stops recording audio and starts recognition. + void StopRecording(); + + // Stops recording audio and cancels recognition. Any audio recorded so far + // gets discarded. + void CancelRecognition(); + + // AudioInputController::EventHandler methods. + virtual void OnCreated(media::AudioInputController* controller) { } + virtual void OnRecording(media::AudioInputController* controller) { } + virtual void OnError(media::AudioInputController* controller, int error_code); + virtual void OnData(media::AudioInputController* controller, + const uint8* data, + uint32 size); + + // SpeechRecognitionRequest::Delegate methods. + virtual void SetRecognitionResult(bool error, + const SpeechInputResultArray& result); + + static const int kAudioSampleRate; + static const int kAudioPacketIntervalMs; // Duration of each audio packet. + static const int kNumAudioChannels; + static const int kNumBitsPerAudioSample; + static const int kNoSpeechTimeoutSec; + static const int kEndpointerEstimationTimeMs; + + private: + void ReleaseAudioBuffers(); + void InformErrorAndCancelRecognition(ErrorCode error); + void SendRecordedAudioToServer(); + + void HandleOnError(int error_code); // Handles OnError in the IO thread. + + // Handles OnData in the IO thread. Takes ownership of |data|. + void HandleOnData(std::string* data); + + Delegate* delegate_; + int caller_id_; + std::string language_; + std::string grammar_; + std::string hardware_info_; + std::string origin_url_; + + scoped_ptr<SpeechRecognitionRequest> request_; + scoped_refptr<media::AudioInputController> audio_controller_; + AudioEncoder::Codec codec_; + scoped_ptr<AudioEncoder> encoder_; + Endpointer endpointer_; + int num_samples_recorded_; + float audio_level_; + + DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer); +}; + +// This typedef is to workaround the issue with certain versions of +// Visual Studio where it gets confused between multiple Delegate +// classes and gives a C2500 error. (I saw this error on the try bots - +// the workaround was not needed for my machine). 
+typedef SpeechRecognizer::Delegate SpeechRecognizerDelegate; + +} // namespace speech_input + +#endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ diff --git a/content/browser/speech/speech_recognizer_unittest.cc b/content/browser/speech/speech_recognizer_unittest.cc new file mode 100644 index 0000000..8365396 --- /dev/null +++ b/content/browser/speech/speech_recognizer_unittest.cc @@ -0,0 +1,300 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <vector> + +#include "chrome/common/net/test_url_fetcher_factory.h" +#include "content/browser/browser_thread.h" +#include "content/browser/speech/speech_recognizer.h" +#include "media/audio/test_audio_input_controller_factory.h" +#include "net/url_request/url_request_status.h" +#include "testing/gtest/include/gtest/gtest.h" + +using media::AudioInputController; +using media::TestAudioInputController; +using media::TestAudioInputControllerFactory; + +namespace speech_input { + +class SpeechRecognizerTest : public SpeechRecognizerDelegate, + public testing::Test { + public: + SpeechRecognizerTest() + : io_thread_(BrowserThread::IO, &message_loop_), + ALLOW_THIS_IN_INITIALIZER_LIST( + recognizer_(new SpeechRecognizer(this, 1, std::string(), + std::string(), std::string(), + std::string()))), + recording_complete_(false), + recognition_complete_(false), + result_received_(false), + error_(SpeechRecognizer::RECOGNIZER_NO_ERROR), + volume_(-1.0f) { + int audio_packet_length_bytes = + (SpeechRecognizer::kAudioSampleRate * + SpeechRecognizer::kAudioPacketIntervalMs * + SpeechRecognizer::kNumAudioChannels * + SpeechRecognizer::kNumBitsPerAudioSample) / (8 * 1000); + audio_packet_.resize(audio_packet_length_bytes); + } + + // SpeechRecognizer::Delegate methods. + virtual void SetRecognitionResult(int caller_id, + bool error, + const SpeechInputResultArray& result) { + result_received_ = true; + } + + virtual void DidCompleteRecording(int caller_id) { + recording_complete_ = true; + } + + virtual void DidCompleteRecognition(int caller_id) { + recognition_complete_ = true; + } + + virtual void DidCompleteEnvironmentEstimation(int caller_id) { + } + + virtual void OnRecognizerError(int caller_id, + SpeechRecognizer::ErrorCode error) { + error_ = error; + } + + virtual void SetInputVolume(int caller_id, float volume) { + volume_ = volume; + } + + // testing::Test methods. + virtual void SetUp() { + URLFetcher::set_factory(&url_fetcher_factory_); + AudioInputController::set_factory(&audio_input_controller_factory_); + } + + virtual void TearDown() { + URLFetcher::set_factory(NULL); + AudioInputController::set_factory(NULL); + } + + void FillPacketWithTestWaveform() { + // Fill the input with a simple pattern, a 125Hz sawtooth waveform. + for (size_t i = 0; i < audio_packet_.size(); ++i) + audio_packet_[i] = static_cast<uint8>(i); + } + + protected: + MessageLoopForIO message_loop_; + BrowserThread io_thread_; + scoped_refptr<SpeechRecognizer> recognizer_; + bool recording_complete_; + bool recognition_complete_; + bool result_received_; + SpeechRecognizer::ErrorCode error_; + TestURLFetcherFactory url_fetcher_factory_; + TestAudioInputControllerFactory audio_input_controller_factory_; + std::vector<uint8> audio_packet_; + float volume_; +}; + +TEST_F(SpeechRecognizerTest, StopNoData) { + // Check for callbacks when stopping record before any audio gets recorded. 
+ EXPECT_TRUE(recognizer_->StartRecording());
+ recognizer_->StopRecording();
+ EXPECT_TRUE(recording_complete_);
+ EXPECT_TRUE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+}
+
+TEST_F(SpeechRecognizerTest, CancelNoData) {
+ // Check for callbacks when canceling recognition before any audio gets
+ // recorded.
+ EXPECT_TRUE(recognizer_->StartRecording());
+ recognizer_->CancelRecognition();
+ EXPECT_FALSE(recording_complete_);
+ EXPECT_FALSE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+}
+
+TEST_F(SpeechRecognizerTest, StopWithData) {
+ // Start recording, give some data and then stop. This should wait for the
+ // network callback to arrive before completion.
+ EXPECT_TRUE(recognizer_->StartRecording());
+ TestAudioInputController* controller =
+ audio_input_controller_factory_.controller();
+ ASSERT_TRUE(controller);
+ controller = audio_input_controller_factory_.controller();
+ ASSERT_TRUE(controller);
+ controller->event_handler()->OnData(controller, &audio_packet_[0],
+ audio_packet_.size());
+ MessageLoop::current()->RunAllPending();
+ recognizer_->StopRecording();
+ EXPECT_TRUE(recording_complete_);
+ EXPECT_FALSE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+
+ // Issue the network callback to complete the process.
+ TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0);
+ ASSERT_TRUE(fetcher);
+ net::URLRequestStatus status;
+ status.set_status(net::URLRequestStatus::SUCCESS);
+ fetcher->delegate()->OnURLFetchComplete(
+ fetcher, fetcher->original_url(), status, 200, ResponseCookies(),
+ "{\"hypotheses\":[{\"utterance\":\"123\"}]}");
+ EXPECT_TRUE(recognition_complete_);
+ EXPECT_TRUE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+}
+
+TEST_F(SpeechRecognizerTest, CancelWithData) {
+ // Start recording, give some data and then cancel. This should not create
+ // a network request and finish immediately.
+ EXPECT_TRUE(recognizer_->StartRecording());
+ TestAudioInputController* controller =
+ audio_input_controller_factory_.controller();
+ ASSERT_TRUE(controller);
+ controller->event_handler()->OnData(controller, &audio_packet_[0],
+ audio_packet_.size());
+ MessageLoop::current()->RunAllPending();
+ recognizer_->CancelRecognition();
+ EXPECT_EQ(NULL, url_fetcher_factory_.GetFetcherByID(0));
+ EXPECT_FALSE(recording_complete_);
+ EXPECT_FALSE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_);
+}
+
+TEST_F(SpeechRecognizerTest, AudioControllerErrorNoData) {
+ // Check if things tear down properly if AudioInputController threw an error.
+ EXPECT_TRUE(recognizer_->StartRecording());
+ TestAudioInputController* controller =
+ audio_input_controller_factory_.controller();
+ ASSERT_TRUE(controller);
+ controller->event_handler()->OnError(controller, 0);
+ MessageLoop::current()->RunAllPending();
+ EXPECT_FALSE(recording_complete_);
+ EXPECT_FALSE(recognition_complete_);
+ EXPECT_FALSE(result_received_);
+ EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_CAPTURE, error_);
+}
+
+TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) {
+ // Check if things tear down properly if AudioInputController threw an error
+ // after giving some audio data.
+ EXPECT_TRUE(recognizer_->StartRecording()); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + controller->event_handler()->OnError(controller, 0); + MessageLoop::current()->RunAllPending(); + EXPECT_EQ(NULL, url_fetcher_factory_.GetFetcherByID(0)); + EXPECT_FALSE(recording_complete_); + EXPECT_FALSE(recognition_complete_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_CAPTURE, error_); +} + +TEST_F(SpeechRecognizerTest, NoSpeechCallbackIssued) { + // Start recording and give a lot of packets with audio samples set to zero. + // This should trigger the no-speech detector and issue a callback. + EXPECT_TRUE(recognizer_->StartRecording()); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller = audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + + int num_packets = (SpeechRecognizer::kNoSpeechTimeoutSec * 1000) / + SpeechRecognizer::kAudioPacketIntervalMs; + // The vector is already filled with zero value samples on create. + for (int i = 0; i < num_packets; ++i) { + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + } + MessageLoop::current()->RunAllPending(); + EXPECT_FALSE(recording_complete_); + EXPECT_FALSE(recognition_complete_); + EXPECT_FALSE(result_received_); + EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_NO_SPEECH, error_); +} + +TEST_F(SpeechRecognizerTest, NoSpeechCallbackNotIssued) { + // Start recording and give a lot of packets with audio samples set to zero + // and then some more with reasonably loud audio samples. This should be + // treated as normal speech input and the no-speech detector should not get + // triggered. + EXPECT_TRUE(recognizer_->StartRecording()); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller = audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + + int num_packets = (SpeechRecognizer::kNoSpeechTimeoutSec * 1000) / + SpeechRecognizer::kAudioPacketIntervalMs; + + // The vector is already filled with zero value samples on create. + for (int i = 0; i < num_packets / 2; ++i) { + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + } + + FillPacketWithTestWaveform(); + for (int i = 0; i < num_packets / 2; ++i) { + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + } + + MessageLoop::current()->RunAllPending(); + EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); + EXPECT_FALSE(recording_complete_); + EXPECT_FALSE(recognition_complete_); + recognizer_->CancelRecognition(); +} + +TEST_F(SpeechRecognizerTest, SetInputVolumeCallback) { + // Start recording and give a lot of packets with audio samples set to zero + // and then some more with reasonably loud audio samples. Check that we don't + // get the callback during estimation phase, then get zero for the silence + // samples and proper volume for the loud audio. 
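+ // Packet arithmetic for this test (editor's note): environment estimation
+ // lasts kEndpointerEstimationTimeMs / kAudioPacketIntervalMs = 300 / 100 =
+ // 3 packets, so no volume callback is expected until the three packets fed
+ // first below have been processed.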
+ EXPECT_TRUE(recognizer_->StartRecording()); + TestAudioInputController* controller = + audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + controller = audio_input_controller_factory_.controller(); + ASSERT_TRUE(controller); + + // Feed some samples to begin with for the endpointer to do noise estimation. + int num_packets = SpeechRecognizer::kEndpointerEstimationTimeMs / + SpeechRecognizer::kAudioPacketIntervalMs; + for (int i = 0; i < num_packets; ++i) { + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + } + MessageLoop::current()->RunAllPending(); + EXPECT_EQ(-1.0f, volume_); // No audio volume set yet. + + // The vector is already filled with zero value samples on create. + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + MessageLoop::current()->RunAllPending(); + EXPECT_EQ(0, volume_); + + FillPacketWithTestWaveform(); + controller->event_handler()->OnData(controller, &audio_packet_[0], + audio_packet_.size()); + MessageLoop::current()->RunAllPending(); + EXPECT_FLOAT_EQ(0.9f, volume_); + + EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); + EXPECT_FALSE(recording_complete_); + EXPECT_FALSE(recognition_complete_); + recognizer_->CancelRecognition(); +} + +} // namespace speech_input |
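For reference, the unit tests above size each fake audio packet from the recognizer's constants. A minimal standalone sketch of that arithmetic (editor's addition; the constant values are copied from speech_recognizer.cc above, everything else here is illustrative only):

  #include <cassert>

  int main() {
    // Constants as defined in speech_recognizer.cc.
    const int kAudioSampleRate = 16000;      // Hz, mono input
    const int kAudioPacketIntervalMs = 100;  // one OnData() call per 100 ms
    const int kNumAudioChannels = 1;
    const int kNumBitsPerAudioSample = 16;

    // Bytes per packet: samples/s * s/packet * channels * bytes/sample.
    const int packet_bytes =
        (kAudioSampleRate * kAudioPacketIntervalMs * kNumAudioChannels *
         kNumBitsPerAudioSample) / (8 * 1000);
    assert(packet_bytes == 3200);  // 1600 16-bit samples, as in the fixture
    return 0;
  }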