diff options
author | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000 |
---|---|---|
committer | jam@chromium.org <jam@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-02-26 18:46:15 +0000 |
commit | 50fab53bddb2c3cb24d5682c913a03226ccf49ef (patch) | |
tree | bb04af83ca5f2be010e32c2e10cfd245117a4847 /chrome/browser/speech | |
parent | 5c557f37629dc12dfd99e8fb55c235c8c46a8098 (diff) | |
download | chromium_src-50fab53bddb2c3cb24d5682c913a03226ccf49ef.zip chromium_src-50fab53bddb2c3cb24d5682c913a03226ccf49ef.tar.gz chromium_src-50fab53bddb2c3cb24d5682c913a03226ccf49ef.tar.bz2 |
Move core pieces of speech from chrome to content.
TBR=satish
Review URL: http://codereview.chromium.org/6591024
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@76165 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/speech')
20 files changed, 3 insertions, 3097 deletions
diff --git a/chrome/browser/speech/audio_encoder.cc b/chrome/browser/speech/audio_encoder.cc deleted file mode 100644 index fe48639..0000000 --- a/chrome/browser/speech/audio_encoder.cc +++ /dev/null @@ -1,206 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/browser/speech/audio_encoder.h" - -#include "base/basictypes.h" -#include "base/logging.h" -#include "base/scoped_ptr.h" -#include "base/stl_util-inl.h" -#include "base/string_number_conversions.h" -#include "third_party/flac/flac.h" -#include "third_party/speex/speex.h" - -using std::string; - -namespace { - -//-------------------------------- FLACEncoder --------------------------------- - -const char* const kContentTypeFLAC = "audio/x-flac; rate="; -const int kFLACCompressionLevel = 0; // 0 for speed - -class FLACEncoder : public speech_input::AudioEncoder { - public: - FLACEncoder(int sampling_rate, int bits_per_sample); - virtual ~FLACEncoder(); - virtual void Encode(const short* samples, int num_samples); - virtual void Flush(); - - private: - static FLAC__StreamEncoderWriteStatus WriteCallback( - const FLAC__StreamEncoder* encoder, - const FLAC__byte buffer[], - size_t bytes, - unsigned samples, - unsigned current_frame, - void* client_data); - - FLAC__StreamEncoder* encoder_; - bool is_encoder_initialized_; - - DISALLOW_COPY_AND_ASSIGN(FLACEncoder); -}; - -FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback( - const FLAC__StreamEncoder* encoder, - const FLAC__byte buffer[], - size_t bytes, - unsigned samples, - unsigned current_frame, - void* client_data) { - FLACEncoder* me = static_cast<FLACEncoder*>(client_data); - DCHECK(me->encoder_ == encoder); - me->AppendToBuffer(new string(reinterpret_cast<const char*>(buffer), bytes)); - return FLAC__STREAM_ENCODER_WRITE_STATUS_OK; -} - -FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample) - : 
AudioEncoder(std::string(kContentTypeFLAC) + - base::IntToString(sampling_rate)), - encoder_(FLAC__stream_encoder_new()), - is_encoder_initialized_(false) { - FLAC__stream_encoder_set_channels(encoder_, 1); - FLAC__stream_encoder_set_bits_per_sample(encoder_, bits_per_sample); - FLAC__stream_encoder_set_sample_rate(encoder_, sampling_rate); - FLAC__stream_encoder_set_compression_level(encoder_, kFLACCompressionLevel); - - // Initializing the encoder will cause sync bytes to be written to - // its output stream, so we wait until the first call to this method - // before doing so. -} - -FLACEncoder::~FLACEncoder() { - FLAC__stream_encoder_delete(encoder_); -} - -void FLACEncoder::Encode(const short* samples, int num_samples) { - if (!is_encoder_initialized_) { - const FLAC__StreamEncoderInitStatus encoder_status = - FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL, - NULL, this); - DCHECK(encoder_status == FLAC__STREAM_ENCODER_INIT_STATUS_OK); - is_encoder_initialized_ = true; - } - - // FLAC encoder wants samples as int32s. - scoped_ptr<FLAC__int32> flac_samples(new FLAC__int32[num_samples]); - FLAC__int32* flac_samples_ptr = flac_samples.get(); - for (int i = 0; i < num_samples; ++i) - flac_samples_ptr[i] = samples[i]; - - FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples); -} - -void FLACEncoder::Flush() { - FLAC__stream_encoder_finish(encoder_); -} - -//-------------------------------- SpeexEncoder -------------------------------- - -const char* const kContentTypeSpeex = "audio/x-speex-with-header-byte; rate="; -const int kSpeexEncodingQuality = 8; -const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). - -// Since the frame length gets written out as a byte in the encoded packet, -// make sure it is within the byte range. 
-COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); - -class SpeexEncoder : public speech_input::AudioEncoder { - public: - explicit SpeexEncoder(int sampling_rate); - virtual ~SpeexEncoder(); - virtual void Encode(const short* samples, int num_samples); - virtual void Flush() {} - - private: - void* encoder_state_; - SpeexBits bits_; - int samples_per_frame_; - char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. - DISALLOW_COPY_AND_ASSIGN(SpeexEncoder); -}; - -SpeexEncoder::SpeexEncoder(int sampling_rate) - : AudioEncoder(std::string(kContentTypeSpeex) + - base::IntToString(sampling_rate)) { - // speex_bits_init() does not initialize all of the |bits_| struct. - memset(&bits_, 0, sizeof(bits_)); - speex_bits_init(&bits_); - encoder_state_ = speex_encoder_init(&speex_wb_mode); - DCHECK(encoder_state_); - speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); - DCHECK(samples_per_frame_ > 0); - int quality = kSpeexEncodingQuality; - speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); - int vbr = 1; - speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); - memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); -} - -SpeexEncoder::~SpeexEncoder() { - speex_bits_destroy(&bits_); - speex_encoder_destroy(encoder_state_); -} - -void SpeexEncoder::Encode(const short* samples, int num_samples) { - // Drop incomplete frames, typically those which come in when recording stops. - num_samples -= (num_samples % samples_per_frame_); - for (int i = 0; i < num_samples; i += samples_per_frame_) { - speex_bits_reset(&bits_); - speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i), - &bits_); - - // Encode the frame and place the size of the frame as the first byte. This - // is the packet format for MIME type x-speex-with-header-byte. 
- int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, - kMaxSpeexFrameLength); - encoded_frame_data_[0] = static_cast<char>(frame_length); - AppendToBuffer(new string(encoded_frame_data_, frame_length + 1)); - } -} - -} // namespace - -namespace speech_input { - -AudioEncoder* AudioEncoder::Create(Codec codec, - int sampling_rate, - int bits_per_sample) { - if (codec == CODEC_FLAC) - return new FLACEncoder(sampling_rate, bits_per_sample); - return new SpeexEncoder(sampling_rate); -} - -AudioEncoder::AudioEncoder(const std::string& mime_type) - : mime_type_(mime_type) { -} - -AudioEncoder::~AudioEncoder() { - STLDeleteElements(&audio_buffers_); -} - -bool AudioEncoder::GetEncodedData(std::string* encoded_data) { - if (!audio_buffers_.size()) - return false; - - int audio_buffer_length = 0; - for (AudioBufferQueue::iterator it = audio_buffers_.begin(); - it != audio_buffers_.end(); ++it) { - audio_buffer_length += (*it)->length(); - } - encoded_data->reserve(audio_buffer_length); - for (AudioBufferQueue::iterator it = audio_buffers_.begin(); - it != audio_buffers_.end(); ++it) { - encoded_data->append(*(*it)); - } - - return true; -} - -void AudioEncoder::AppendToBuffer(std::string* item) { - audio_buffers_.push_back(item); -} - -} // namespace speech_input diff --git a/chrome/browser/speech/audio_encoder.h b/chrome/browser/speech/audio_encoder.h deleted file mode 100644 index e17a413..0000000 --- a/chrome/browser/speech/audio_encoder.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_ -#define CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_ - -#include <list> -#include <string> - -#include "base/basictypes.h" - -namespace speech_input { - -// Provides a simple interface to encode raw audio using the various speech -// codecs. 
-class AudioEncoder { - public: - enum Codec { - CODEC_FLAC, - CODEC_SPEEX, - }; - - static AudioEncoder* Create(Codec codec, - int sampling_rate, - int bits_per_sample); - - virtual ~AudioEncoder(); - - // Encodes each frame of raw audio in |samples| to the internal buffer. Use - // |GetEncodedData| to read the result after this call or when recording - // completes. - virtual void Encode(const short* samples, int num_samples) = 0; - - // Finish encoding and flush any pending encoded bits out. - virtual void Flush() = 0; - - // Copies the encoded audio to the given string. Returns true if the output - // is not empty. - bool GetEncodedData(std::string* encoded_data); - - const std::string& mime_type() { return mime_type_; } - - protected: - AudioEncoder(const std::string& mime_type); - - void AppendToBuffer(std::string* item); - - private: - // Buffer holding the recorded audio. Owns the strings inside the list. - typedef std::list<std::string*> AudioBufferQueue; - AudioBufferQueue audio_buffers_; - std::string mime_type_; - DISALLOW_COPY_AND_ASSIGN(AudioEncoder); -}; - -} // namespace speech_input - -#endif // CHROME_BROWSER_SPEECH_AUDIO_ENCODER_H_ diff --git a/chrome/browser/speech/endpointer/endpointer.cc b/chrome/browser/speech/endpointer/endpointer.cc deleted file mode 100644 index c30e1f2..0000000 --- a/chrome/browser/speech/endpointer/endpointer.cc +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/browser/speech/endpointer/endpointer.h" -#include "base/time.h" - -using base::Time; - -namespace { -static const int kFrameRate = 50; // 1 frame = 20ms of audio. 
-} - -namespace speech_input { - -Endpointer::Endpointer(int sample_rate) - : speech_input_possibly_complete_silence_length_us_(-1), - speech_input_complete_silence_length_us_(-1), - audio_frame_time_us_(0), - sample_rate_(sample_rate), - frame_size_(0) { - Reset(); - - frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); - - speech_input_minimum_length_us_ = - static_cast<int64>(1.7 * Time::kMicrosecondsPerSecond); - speech_input_complete_silence_length_us_ = - static_cast<int64>(0.5 * Time::kMicrosecondsPerSecond); - long_speech_input_complete_silence_length_us_ = -1; - long_speech_length_us_ = -1; - speech_input_possibly_complete_silence_length_us_ = - 1 * Time::kMicrosecondsPerSecond; - - // Set the default configuration for Push To Talk mode. - EnergyEndpointerParams ep_config; - ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); - ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); - ep_config.set_endpoint_margin(0.2f); - ep_config.set_onset_window(0.15f); - ep_config.set_speech_on_window(0.4f); - ep_config.set_offset_window(0.15f); - ep_config.set_onset_detect_dur(0.09f); - ep_config.set_onset_confirm_dur(0.075f); - ep_config.set_on_maintain_dur(0.10f); - ep_config.set_offset_confirm_dur(0.12f); - ep_config.set_decision_threshold(1000.0f); - ep_config.set_min_decision_threshold(50.0f); - ep_config.set_fast_update_dur(0.2f); - ep_config.set_sample_rate(static_cast<float>(sample_rate)); - ep_config.set_min_fundamental_frequency(57.143f); - ep_config.set_max_fundamental_frequency(400.0f); - ep_config.set_contamination_rejection_period(0.25f); - energy_endpointer_.Init(ep_config); -} - -void Endpointer::Reset() { - old_ep_status_ = EP_PRE_SPEECH; - waiting_for_speech_possibly_complete_timeout_ = false; - waiting_for_speech_complete_timeout_ = false; - speech_previously_detected_ = false; - speech_input_complete_ = false; - audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. 
- speech_end_time_us_ = -1; - speech_start_time_us_ = -1; -} - -void Endpointer::StartSession() { - Reset(); - energy_endpointer_.StartSession(); -} - -void Endpointer::EndSession() { - energy_endpointer_.EndSession(); -} - -void Endpointer::SetEnvironmentEstimationMode() { - Reset(); - energy_endpointer_.SetEnvironmentEstimationMode(); -} - -void Endpointer::SetUserInputMode() { - energy_endpointer_.SetUserInputMode(); -} - -EpStatus Endpointer::Status(int64 *time) { - return energy_endpointer_.Status(time); -} - -EpStatus Endpointer::ProcessAudio(const int16* audio_data, int num_samples, - float* rms_out) { - EpStatus ep_status = EP_PRE_SPEECH; - - // Process the input data in blocks of frame_size_, dropping any incomplete - // frames at the end (which is ok since typically the caller will be recording - // audio in multiples of our frame size). - int sample_index = 0; - while (sample_index + frame_size_ <= num_samples) { - // Have the endpointer process the frame. - energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, - audio_data + sample_index, - frame_size_, - rms_out); - sample_index += frame_size_; - audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) / - sample_rate_; - - // Get the status of the endpointer. - int64 ep_time; - ep_status = energy_endpointer_.Status(&ep_time); - - // Handle state changes. - if ((EP_SPEECH_PRESENT == ep_status) && - (EP_POSSIBLE_ONSET == old_ep_status_)) { - speech_end_time_us_ = -1; - waiting_for_speech_possibly_complete_timeout_ = false; - waiting_for_speech_complete_timeout_ = false; - // Trigger SpeechInputDidStart event on first detection. 
- if (false == speech_previously_detected_) { - speech_previously_detected_ = true; - speech_start_time_us_ = ep_time; - } - } - if ((EP_PRE_SPEECH == ep_status) && - (EP_POSSIBLE_OFFSET == old_ep_status_)) { - speech_end_time_us_ = ep_time; - waiting_for_speech_possibly_complete_timeout_ = true; - waiting_for_speech_complete_timeout_ = true; - } - if (ep_time > speech_input_minimum_length_us_) { - // Speech possibly complete timeout. - if ((waiting_for_speech_possibly_complete_timeout_) && - (ep_time - speech_end_time_us_ > - speech_input_possibly_complete_silence_length_us_)) { - waiting_for_speech_possibly_complete_timeout_ = false; - } - if (waiting_for_speech_complete_timeout_) { - // The length of the silence timeout period can be held constant, or it - // can be changed after a fixed amount of time from the beginning of - // speech. - bool has_stepped_silence = - (long_speech_length_us_ > 0) && - (long_speech_input_complete_silence_length_us_ > 0); - int64 requested_silence_length; - if (has_stepped_silence && - (ep_time - speech_start_time_us_) > long_speech_length_us_) { - requested_silence_length = - long_speech_input_complete_silence_length_us_; - } else { - requested_silence_length = - speech_input_complete_silence_length_us_; - } - - // Speech complete timeout. - if ((ep_time - speech_end_time_us_) > requested_silence_length) { - waiting_for_speech_complete_timeout_ = false; - speech_input_complete_ = true; - } - } - } - old_ep_status_ = ep_status; - } - return ep_status; -} - -} // namespace speech diff --git a/chrome/browser/speech/endpointer/endpointer.h b/chrome/browser/speech/endpointer/endpointer.h deleted file mode 100644 index 8af6016..0000000 --- a/chrome/browser/speech/endpointer/endpointer.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ -#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ - -#include "base/basictypes.h" -#include "chrome/browser/speech/endpointer/energy_endpointer.h" - -class EpStatus; - -namespace speech_input { - -// A simple interface to the underlying energy-endpointer implementation, this -// class lets callers provide audio as being recorded and let them poll to find -// when the user has stopped speaking. -// -// There are two events that may trigger the end of speech: -// -// speechInputPossiblyComplete event: -// -// Signals that silence/noise has been detected for a *short* amount of -// time after some speech has been detected. It can be used for low latency -// UI feedback. To disable it, set it to a large amount. -// -// speechInputComplete event: -// -// This event is intended to signal end of input and to stop recording. -// The amount of time to wait after speech is set by -// speech_input_complete_silence_length_ and optionally two other -// parameters (see below). -// This time can be held constant, or can change as more speech is detected. -// In the latter case, the time changes after a set amount of time from the -// *beginning* of speech. This is motivated by the expectation that there -// will be two distinct types of inputs: short search queries and longer -// dictation style input. -// -// Three parameters are used to define the piecewise constant timeout function. -// The timeout length is speech_input_complete_silence_length until -// long_speech_length, when it changes to -// long_speech_input_complete_silence_length. -class Endpointer { - public: - explicit Endpointer(int sample_rate); - - // Start the endpointer. This should be called at the beginning of a session. - void StartSession(); - - // Stop the endpointer. - void EndSession(); - - // Start environment estimation. Audio will be used for environment estimation - // i.e. noise level estimation. 
- void SetEnvironmentEstimationMode(); - - // Start user input. This should be called when the user indicates start of - // input, e.g. by pressing a button. - void SetUserInputMode(); - - // Process a segment of audio, which may be more than one frame. - // The status of the last frame will be returned. - EpStatus ProcessAudio(const int16* audio_data, int num_samples, - float* rms_out); - - // Get the status of the endpointer. - EpStatus Status(int64 *time_us); - - // Returns true if the endpointer detected reasonable audio levels above - // background noise which could be user speech, false if not. - bool DidStartReceivingSpeech() const { - return speech_previously_detected_; - } - - bool IsEstimatingEnvironment() const { - return energy_endpointer_.estimating_environment(); - } - - void set_speech_input_complete_silence_length(int64 time_us) { - speech_input_complete_silence_length_us_ = time_us; - } - - void set_long_speech_input_complete_silence_length(int64 time_us) { - long_speech_input_complete_silence_length_us_ = time_us; - } - - void set_speech_input_possibly_complete_silence_length(int64 time_us) { - speech_input_possibly_complete_silence_length_us_ = time_us; - } - - void set_long_speech_length(int64 time_us) { - long_speech_length_us_ = time_us; - } - - bool speech_input_complete() const { - return speech_input_complete_; - } - - private: - // Reset internal states. Helper method common to initial input utterance - // and following input utternaces. - void Reset(); - - // Minimum allowable length of speech input. - int64 speech_input_minimum_length_us_; - - // The speechInputPossiblyComplete event signals that silence/noise has been - // detected for a *short* amount of time after some speech has been detected. - // This proporty specifies the time period. 
- int64 speech_input_possibly_complete_silence_length_us_; - - // The speechInputComplete event signals that silence/noise has been - // detected for a *long* amount of time after some speech has been detected. - // This property specifies the time period. - int64 speech_input_complete_silence_length_us_; - - // Same as above, this specifies the required silence period after speech - // detection. This period is used instead of - // speech_input_complete_silence_length_ when the utterance is longer than - // long_speech_length_. This parameter is optional. - int64 long_speech_input_complete_silence_length_us_; - - // The period of time after which the endpointer should consider - // long_speech_input_complete_silence_length_ as a valid silence period - // instead of speech_input_complete_silence_length_. This parameter is - // optional. - int64 long_speech_length_us_; - - // First speech onset time, used in determination of speech complete timeout. - int64 speech_start_time_us_; - - // Most recent end time, used in determination of speech complete timeout. - int64 speech_end_time_us_; - - int64 audio_frame_time_us_; - EpStatus old_ep_status_; - bool waiting_for_speech_possibly_complete_timeout_; - bool waiting_for_speech_complete_timeout_; - bool speech_previously_detected_; - bool speech_input_complete_; - EnergyEndpointer energy_endpointer_; - int sample_rate_; - int32 frame_size_; -}; - -} // namespace speech_input - -#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ diff --git a/chrome/browser/speech/endpointer/endpointer_unittest.cc b/chrome/browser/speech/endpointer/endpointer_unittest.cc deleted file mode 100644 index bbdc572..0000000 --- a/chrome/browser/speech/endpointer/endpointer_unittest.cc +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#include "base/task.h" -#include "chrome/browser/speech/endpointer/endpointer.h" -#include "testing/gtest/include/gtest/gtest.h" - -namespace { -const int kFrameRate = 50; // 20 ms long frames for AMR encoding. -const int kSampleRate = 8000; // 8 k samples per second for AMR encoding. - -// At 8 sample per second a 20 ms frame is 160 samples, which corrsponds -// to the AMR codec. -const int kFrameSize = kSampleRate / kFrameRate; // 160 samples. -COMPILE_ASSERT(kFrameSize == 160, invalid_frame_size); -} - -namespace speech_input { - -class FrameProcessor { - public: - // Process a single frame of test audio samples. - virtual EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) = 0; -}; - -void RunEndpointerEventsTest(FrameProcessor* processor) { - int16 samples[kFrameSize]; - - // We will create a white noise signal of 150 frames. The frames from 50 to - // 100 will have more power, and the endpointer should fire on those frames. - const int kNumFrames = 150; - - // Create a random sequence of samples. - srand(1); - float gain = 0.0; - int64 time = 0; - for (int frame_count = 0; frame_count < kNumFrames; ++frame_count) { - // The frames from 50 to 100 will have more power, and the endpointer - // should detect those frames as speech. - if ((frame_count >= 50) && (frame_count < 100)) { - gain = 2000.0; - } else { - gain = 1.0; - } - // Create random samples. - for (int i = 0; i < kFrameSize; ++i) { - float randNum = static_cast<float>(rand() - (RAND_MAX / 2)) / - static_cast<float>(RAND_MAX); - samples[i] = static_cast<int16>(gain * randNum); - } - - EpStatus ep_status = processor->ProcessFrame(time, samples, kFrameSize); - time += static_cast<int64>(kFrameSize * (1e6 / kSampleRate)); - - // Log the status. 
- if (20 == frame_count) - EXPECT_EQ(EP_PRE_SPEECH, ep_status); - if (70 == frame_count) - EXPECT_EQ(EP_SPEECH_PRESENT, ep_status); - if (120 == frame_count) - EXPECT_EQ(EP_PRE_SPEECH, ep_status); - } -} - -// This test instantiates and initializes a stand alone endpointer module. -// The test creates FrameData objects with random noise and send them -// to the endointer module. The energy of the first 50 frames is low, -// followed by 500 high energy frames, and another 50 low energy frames. -// We test that the correct start and end frames were detected. -class EnergyEndpointerFrameProcessor : public FrameProcessor { - public: - explicit EnergyEndpointerFrameProcessor(EnergyEndpointer* endpointer) - : endpointer_(endpointer) {} - - EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) { - endpointer_->ProcessAudioFrame(time, samples, kFrameSize, NULL); - int64 ep_time; - return endpointer_->Status(&ep_time); - } - - private: - EnergyEndpointer* endpointer_; -}; - -TEST(EndpointerTest, TestEnergyEndpointerEvents) { - // Initialize endpointer and configure it. We specify the parameters - // here for a 20ms window, and a 20ms step size, which corrsponds to - // the narrow band AMR codec. 
- EnergyEndpointerParams ep_config; - ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); - ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); - ep_config.set_endpoint_margin(0.2f); - ep_config.set_onset_window(0.15f); - ep_config.set_speech_on_window(0.4f); - ep_config.set_offset_window(0.15f); - ep_config.set_onset_detect_dur(0.09f); - ep_config.set_onset_confirm_dur(0.075f); - ep_config.set_on_maintain_dur(0.10f); - ep_config.set_offset_confirm_dur(0.12f); - ep_config.set_decision_threshold(100.0f); - EnergyEndpointer endpointer; - endpointer.Init(ep_config); - - endpointer.StartSession(); - - EnergyEndpointerFrameProcessor frame_processor(&endpointer); - RunEndpointerEventsTest(&frame_processor); - - endpointer.EndSession(); -}; - -// Test endpointer wrapper class. -class EndpointerFrameProcessor : public FrameProcessor { - public: - explicit EndpointerFrameProcessor(Endpointer* endpointer) - : endpointer_(endpointer) {} - - EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) { - endpointer_->ProcessAudio(samples, kFrameSize, NULL); - int64 ep_time; - return endpointer_->Status(&ep_time); - } - - private: - Endpointer* endpointer_; -}; - -TEST(EndpointerTest, TestEmbeddedEndpointerEvents) { - const int kSampleRate = 8000; // 8 k samples per second for AMR encoding. 
- - Endpointer endpointer(kSampleRate); - const int64 kMillisecondsPerMicrosecond = 1000; - const int64 short_timeout = 300 * kMillisecondsPerMicrosecond; - endpointer.set_speech_input_possibly_complete_silence_length(short_timeout); - const int64 long_timeout = 500 * kMillisecondsPerMicrosecond; - endpointer.set_speech_input_complete_silence_length(long_timeout); - endpointer.StartSession(); - - EndpointerFrameProcessor frame_processor(&endpointer); - RunEndpointerEventsTest(&frame_processor); - - endpointer.EndSession(); -} - -} // namespace speech_input diff --git a/chrome/browser/speech/endpointer/energy_endpointer.cc b/chrome/browser/speech/endpointer/energy_endpointer.cc deleted file mode 100644 index 85d4a29..0000000 --- a/chrome/browser/speech/endpointer/energy_endpointer.cc +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -// -// To know more about the algorithm used and the original code which this is -// based of, see -// https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef - -#include "chrome/browser/speech/endpointer/energy_endpointer.h" - -#include "base/logging.h" -#include <math.h> -#include <vector> - -namespace { - -// Returns the RMS (quadratic mean) of the input signal. -float RMS(const int16* samples, int num_samples) { - int64 ssq_int64 = 0; - int64 sum_int64 = 0; - for (int i = 0; i < num_samples; ++i) { - sum_int64 += samples[i]; - ssq_int64 += samples[i] * samples[i]; - } - // now convert to floats. 
- double sum = static_cast<double>(sum_int64); - sum /= num_samples; - double ssq = static_cast<double>(ssq_int64); - return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); -} - -int64 Secs2Usecs(float seconds) { - return static_cast<int64>(0.5 + (1.0e6 * seconds)); -} - -} // namespace - -namespace speech_input { - -// Stores threshold-crossing histories for making decisions about the speech -// state. -class EnergyEndpointer::HistoryRing { - public: - HistoryRing() : insertion_index_(0) {} - - // Resets the ring to |size| elements each with state |initial_state| - void SetRing(int size, bool initial_state); - - // Inserts a new entry into the ring and drops the oldest entry. - void Insert(int64 time_us, bool decision); - - // Returns the time in microseconds of the most recently added entry. - int64 EndTime() const; - - // Returns the sum of all intervals during which 'decision' is true within - // the time in seconds specified by 'duration'. The returned interval is - // in seconds. - float RingSum(float duration_sec); - - private: - struct DecisionPoint { - int64 time_us; - bool decision; - }; - - std::vector<DecisionPoint> decision_points_; - int insertion_index_; // Index at which the next item gets added/inserted. 
- - DISALLOW_COPY_AND_ASSIGN(HistoryRing); -}; - -void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { - insertion_index_ = 0; - decision_points_.clear(); - DecisionPoint init = { -1, initial_state }; - decision_points_.resize(size, init); -} - -void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) { - decision_points_[insertion_index_].time_us = time_us; - decision_points_[insertion_index_].decision = decision; - insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); -} - -int64 EnergyEndpointer::HistoryRing::EndTime() const { - int ind = insertion_index_ - 1; - if (ind < 0) - ind = decision_points_.size() - 1; - return decision_points_[ind].time_us; -} - -float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { - if (!decision_points_.size()) - return 0.0; - - int64 sum_us = 0; - int ind = insertion_index_ - 1; - if (ind < 0) - ind = decision_points_.size() - 1; - int64 end_us = decision_points_[ind].time_us; - bool is_on = decision_points_[ind].decision; - int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec)); - if (start_us < 0) - start_us = 0; - size_t n_summed = 1; // n points ==> (n-1) intervals - while ((decision_points_[ind].time_us > start_us) && - (n_summed < decision_points_.size())) { - --ind; - if (ind < 0) - ind = decision_points_.size() - 1; - if (is_on) - sum_us += end_us - decision_points_[ind].time_us; - is_on = decision_points_[ind].decision; - end_us = decision_points_[ind].time_us; - n_summed++; - } - - return 1.0e-6f * sum_us; // Returns total time that was super threshold. 
-} - -EnergyEndpointer::EnergyEndpointer() - : status_(EP_PRE_SPEECH), - offset_confirm_dur_sec_(0), - endpointer_time_us_(0), - fast_update_frames_(0), - frame_counter_(0), - max_window_dur_(4.0), - sample_rate_(0), - history_(new HistoryRing()), - decision_threshold_(0), - estimating_environment_(false), - noise_level_(0), - rms_adapt_(0), - start_lag_(0), - end_lag_(0), - user_input_start_time_us_(0) { -} - -EnergyEndpointer::~EnergyEndpointer() { -} - -int EnergyEndpointer::TimeToFrame(float time) const { - return static_cast<int32>(0.5 + (time / params_.frame_period())); -} - -void EnergyEndpointer::Restart(bool reset_threshold) { - status_ = EP_PRE_SPEECH; - user_input_start_time_us_ = 0; - - if (reset_threshold) { - decision_threshold_ = params_.decision_threshold(); - rms_adapt_ = decision_threshold_; - noise_level_ = params_.decision_threshold() / 2.0f; - frame_counter_ = 0; // Used for rapid initial update of levels. - } - - // Set up the memories to hold the history windows. - history_->SetRing(TimeToFrame(max_window_dur_), false); - - // Flag that indicates that current input should be used for - // estimating the environment. The user has not yet started input - // by e.g. pressed the push-to-talk button. By default, this is - // false for backward compatibility. - estimating_environment_ = false; -} - -void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { - params_ = params; - - // Find the longest history interval to be used, and make the ring - // large enough to accommodate that number of frames. NOTE: This - // depends upon ep_frame_period being set correctly in the factory - // that did this instantiation. 
- max_window_dur_ = params_.onset_window(); - if (params_.speech_on_window() > max_window_dur_) - max_window_dur_ = params_.speech_on_window(); - if (params_.offset_window() > max_window_dur_) - max_window_dur_ = params_.offset_window(); - Restart(true); - - offset_confirm_dur_sec_ = params_.offset_window() - - params_.offset_confirm_dur(); - if (offset_confirm_dur_sec_ < 0.0) - offset_confirm_dur_sec_ = 0.0; - - user_input_start_time_us_ = 0; - - // Flag that indicates that current input should be used for - // estimating the environment. The user has not yet started input - // by e.g. pressed the push-to-talk button. By default, this is - // false for backward compatibility. - estimating_environment_ = false; - // The initial value of the noise and speech levels is inconsequential. - // The level of the first frame will overwrite these values. - noise_level_ = params_.decision_threshold() / 2.0f; - fast_update_frames_ = - static_cast<int64>(params_.fast_update_dur() / params_.frame_period()); - - frame_counter_ = 0; // Used for rapid initial update of levels. - - sample_rate_ = params_.sample_rate(); - start_lag_ = static_cast<int>(sample_rate_ / - params_.max_fundamental_frequency()); - end_lag_ = static_cast<int>(sample_rate_ / - params_.min_fundamental_frequency()); -} - -void EnergyEndpointer::StartSession() { - Restart(true); -} - -void EnergyEndpointer::EndSession() { - status_ = EP_POST_SPEECH; -} - -void EnergyEndpointer::SetEnvironmentEstimationMode() { - Restart(true); - estimating_environment_ = true; -} - -void EnergyEndpointer::SetUserInputMode() { - estimating_environment_ = false; - user_input_start_time_us_ = endpointer_time_us_; -} - -void EnergyEndpointer::ProcessAudioFrame(int64 time_us, - const int16* samples, - int num_samples, - float* rms_out) { - endpointer_time_us_ = time_us; - float rms = RMS(samples, num_samples); - - // Check that this is user input audio vs. pre-input adaptation audio. 
- // Input audio starts when the user indicates start of input, by e.g. - // pressing push-to-talk. Audio recieved prior to that is used to update - // noise and speech level estimates. - if (!estimating_environment_) { - bool decision = false; - if ((endpointer_time_us_ - user_input_start_time_us_) < - Secs2Usecs(params_.contamination_rejection_period())) { - decision = false; - DVLOG(1) << "decision: forced to false, time: " << endpointer_time_us_; - } else { - decision = (rms > decision_threshold_); - } - - history_->Insert(endpointer_time_us_, decision); - - switch (status_) { - case EP_PRE_SPEECH: - if (history_->RingSum(params_.onset_window()) > - params_.onset_detect_dur()) { - status_ = EP_POSSIBLE_ONSET; - } - break; - - case EP_POSSIBLE_ONSET: { - float tsum = history_->RingSum(params_.onset_window()); - if (tsum > params_.onset_confirm_dur()) { - status_ = EP_SPEECH_PRESENT; - } else { // If signal is not maintained, drop back to pre-speech. - if (tsum <= params_.onset_detect_dur()) - status_ = EP_PRE_SPEECH; - } - break; - } - - case EP_SPEECH_PRESENT: { - // To induce hysteresis in the state residency, we allow a - // smaller residency time in the on_ring, than was required to - // enter the SPEECH_PERSENT state. - float on_time = history_->RingSum(params_.speech_on_window()); - if (on_time < params_.on_maintain_dur()) - status_ = EP_POSSIBLE_OFFSET; - break; - } - - case EP_POSSIBLE_OFFSET: - if (history_->RingSum(params_.offset_window()) <= - offset_confirm_dur_sec_) { - // Note that this offset time may be beyond the end - // of the input buffer in a real-time system. It will be up - // to the RecognizerSession to decide what to do. - status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. - } else { // If speech picks up again we allow return to SPEECH_PRESENT. 
- if (history_->RingSum(params_.speech_on_window()) >= - params_.on_maintain_dur()) - status_ = EP_SPEECH_PRESENT; - } - break; - - default: - LOG(WARNING) << "Invalid case in switch: " << status_; - break; - } - - // If this is a quiet, non-speech region, slowly adapt the detection - // threshold to be about 6dB above the average RMS. - if ((!decision) && (status_ == EP_PRE_SPEECH)) { - decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); - rms_adapt_ = decision_threshold_; - } else { - // If this is in a speech region, adapt the decision threshold to - // be about 10dB below the average RMS. If the noise level is high, - // the threshold is pushed up. - // Adaptation up to a higher level is 5 times faster than decay to - // a lower level. - if ((status_ == EP_SPEECH_PRESENT) && decision) { - if (rms_adapt_ > rms) { - rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); - } else { - rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); - } - float target_threshold = 0.3f * rms_adapt_ + noise_level_; - decision_threshold_ = (.90f * decision_threshold_) + - (0.10f * target_threshold); - } - } - - // Set a floor - if (decision_threshold_ < params_.min_decision_threshold()) - decision_threshold_ = params_.min_decision_threshold(); - } - - // Update speech and noise levels. - UpdateLevels(rms); - ++frame_counter_; - - if (rms_out) { - *rms_out = -120.0; - if ((noise_level_ > 0.0) && ((rms / noise_level_ ) > 0.000001)) - *rms_out = static_cast<float>(20.0 * log10(rms / noise_level_)); - } -} - -void EnergyEndpointer::UpdateLevels(float rms) { - // Update quickly initially. We assume this is noise and that - // speech is 6dB above the noise. - if (frame_counter_ < fast_update_frames_) { - // Alpha increases from 0 to (k-1)/k where k is the number of time - // steps in the initial adaptation period. 
- float alpha = static_cast<float>(frame_counter_) / - static_cast<float>(fast_update_frames_); - noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); - DVLOG(1) << "FAST UPDATE, frame_counter_ " << frame_counter_ - << ", fast_update_frames_ " << fast_update_frames_; - } else { - // Update Noise level. The noise level adapts quickly downward, but - // slowly upward. The noise_level_ parameter is not currently used - // for threshold adaptation. It is used for UI feedback. - if (noise_level_ < rms) - noise_level_ = (0.999f * noise_level_) + (0.001f * rms); - else - noise_level_ = (0.95f * noise_level_) + (0.05f * rms); - } - if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { - decision_threshold_ = noise_level_ * 2; // 6dB above noise level. - // Set a floor - if (decision_threshold_ < params_.min_decision_threshold()) - decision_threshold_ = params_.min_decision_threshold(); - } -} - -EpStatus EnergyEndpointer::Status(int64* status_time) const { - *status_time = history_->EndTime(); - return status_; -} - -} // namespace speech diff --git a/chrome/browser/speech/endpointer/energy_endpointer.h b/chrome/browser/speech/endpointer/energy_endpointer.h deleted file mode 100644 index 20476e7..0000000 --- a/chrome/browser/speech/endpointer/energy_endpointer.h +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// The EnergyEndpointer class finds likely speech onset and offset points. -// -// The implementation described here is about the simplest possible. -// It is based on timings of threshold crossings for overall signal -// RMS. It is suitable for light weight applications. 
-// -// As written, the basic idea is that one specifies intervals that -// must be occupied by super- and sub-threshold energy levels, and -// defers decisions re onset and offset times until these -// specifications have been met. Three basic intervals are tested: an -// onset window, a speech-on window, and an offset window. We require -// super-threshold to exceed some minimum total durations in the onset -// and speech-on windows before declaring the speech onset time, and -// we specify a required sub-threshold residency in the offset window -// before declaring speech offset. As the various residency requirements are -// met, the EnergyEndpointer instance assumes various states, and can return the -// ID of these states to the client (see EpStatus below). -// -// The levels of the speech and background noise are continuously updated. It is -// important that the background noise level be estimated initially for -// robustness in noisy conditions. The first frames are assumed to be background -// noise and a fast update rate is used for the noise level. The duration for -// fast update is controlled by the fast_update_dur_ parameter. -// -// If used in noisy conditions, the endpointer should be started and run in the -// EnvironmentEstimation mode, for at least 200ms, before switching to -// UserInputMode. -// Audio feedback contamination can appear in the input audio, if not cut -// out or handled by echo cancellation. Audio feedback can trigger a false -// accept. The false accepts can be ignored by setting -// ep_contamination_rejection_period. 
- -#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ -#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ - -#include "base/basictypes.h" -#include "base/scoped_ptr.h" -#include "chrome/browser/speech/endpointer/energy_endpointer_params.h" -#include <vector> - -namespace speech_input { - -// Endpointer status codes -enum EpStatus { - EP_PRE_SPEECH = 10, - EP_POSSIBLE_ONSET, - EP_SPEECH_PRESENT, - EP_POSSIBLE_OFFSET, - EP_POST_SPEECH, -}; - -class EnergyEndpointer { - public: - // The default construction MUST be followed by Init(), before any - // other use can be made of the instance. - EnergyEndpointer(); - virtual ~EnergyEndpointer(); - - void Init(const EnergyEndpointerParams& params); - - // Start the endpointer. This should be called at the beginning of a session. - void StartSession(); - - // Stop the endpointer. - void EndSession(); - - // Start environment estimation. Audio will be used for environment estimation - // i.e. noise level estimation. - void SetEnvironmentEstimationMode(); - - // Start user input. This should be called when the user indicates start of - // input, e.g. by pressing a button. - void SetUserInputMode(); - - // Computes the next input frame and modifies EnergyEndpointer status as - // appropriate based on the computation. - void ProcessAudioFrame(int64 time_us, - const int16* samples, int num_samples, - float* rms_out); - - // Returns the current state of the EnergyEndpointer and the time - // corresponding to the most recently computed frame. - EpStatus Status(int64* status_time_us) const; - - bool estimating_environment() const { - return estimating_environment_; - } - - private: - class HistoryRing; - - // Resets the endpointer internal state. If reset_threshold is true, the - // state will be reset completely, including adaptive thresholds and the - // removal of all history information. - void Restart(bool reset_threshold); - - // Update internal speech and noise levels. 
- void UpdateLevels(float rms); - - // Returns the number of frames (or frame number) corresponding to - // the 'time' (in seconds). - int TimeToFrame(float time) const; - - EpStatus status_; // The current state of this instance. - float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH - int64 endpointer_time_us_; // Time of the most recently received audio frame. - int64 fast_update_frames_; // Number of frames for initial level adaptation. - int64 frame_counter_; // Number of frames seen. Used for initial adaptation. - float max_window_dur_; // Largest search window size (seconds) - float sample_rate_; // Sampling rate. - - // Ring buffers to hold the speech activity history. - scoped_ptr<HistoryRing> history_; - - // Configuration parameters. - EnergyEndpointerParams params_; - - // RMS which must be exceeded to conclude frame is speech. - float decision_threshold_; - - // Flag to indicate that audio should be used to estimate environment, prior - // to receiving user input. - bool estimating_environment_; - - // Estimate of the background noise level. Used externally for UI feedback. - float noise_level_; - - // An adaptive threshold used to update decision_threshold_ when appropriate. - float rms_adapt_; - - // Start lag corresponds to the highest fundamental frequency. - int start_lag_; - - // End lag corresponds to the lowest fundamental frequency. - int end_lag_; - - // Time when mode switched from environment estimation to user input. This - // is used to time forced rejection of audio feedback contamination. 
- int64 user_input_start_time_us_; - - DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer); -}; - -} // namespace speech_input - -#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ diff --git a/chrome/browser/speech/endpointer/energy_endpointer_params.cc b/chrome/browser/speech/endpointer/energy_endpointer_params.cc deleted file mode 100644 index 1ab044a..0000000 --- a/chrome/browser/speech/endpointer/energy_endpointer_params.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/browser/speech/endpointer/energy_endpointer_params.h" - -namespace speech_input { - -EnergyEndpointerParams::EnergyEndpointerParams() { - SetDefaults(); -} - -void EnergyEndpointerParams::SetDefaults() { - frame_period_ = 0.01f; - frame_duration_ = 0.01f; - endpoint_margin_ = 0.2f; - onset_window_ = 0.15f; - speech_on_window_ = 0.4f; - offset_window_ = 0.15f; - onset_detect_dur_ = 0.09f; - onset_confirm_dur_ = 0.075f; - on_maintain_dur_ = 0.10f; - offset_confirm_dur_ = 0.12f; - decision_threshold_ = 150.0f; - min_decision_threshold_ = 50.0f; - fast_update_dur_ = 0.2f; - sample_rate_ = 8000.0f; - min_fundamental_frequency_ = 57.143f; - max_fundamental_frequency_ = 400.0f; - contamination_rejection_period_ = 0.25f; -} - -void EnergyEndpointerParams::operator=(const EnergyEndpointerParams& source) { - frame_period_ = source.frame_period(); - frame_duration_ = source.frame_duration(); - endpoint_margin_ = source.endpoint_margin(); - onset_window_ = source.onset_window(); - speech_on_window_ = source.speech_on_window(); - offset_window_ = source.offset_window(); - onset_detect_dur_ = source.onset_detect_dur(); - onset_confirm_dur_ = source.onset_confirm_dur(); - on_maintain_dur_ = source.on_maintain_dur(); - offset_confirm_dur_ = source.offset_confirm_dur(); - decision_threshold_ = source.decision_threshold(); - 
min_decision_threshold_ = source.min_decision_threshold(); - fast_update_dur_ = source.fast_update_dur(); - sample_rate_ = source.sample_rate(); - min_fundamental_frequency_ = source.min_fundamental_frequency(); - max_fundamental_frequency_ = source.max_fundamental_frequency(); - contamination_rejection_period_ = source.contamination_rejection_period(); -} - -} // namespace speech_input diff --git a/chrome/browser/speech/endpointer/energy_endpointer_params.h b/chrome/browser/speech/endpointer/energy_endpointer_params.h deleted file mode 100644 index 86e44c9..0000000 --- a/chrome/browser/speech/endpointer/energy_endpointer_params.h +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ -#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ - -#include "base/basictypes.h" - -namespace speech_input { - -// Input parameters for the EnergyEndpointer class. 
-class EnergyEndpointerParams { - public: - EnergyEndpointerParams(); - - void SetDefaults(); - - void operator=(const EnergyEndpointerParams& source); - - // Accessors and mutators - float frame_period() const { return frame_period_; } - void set_frame_period(float frame_period) { - frame_period_ = frame_period; - } - - float frame_duration() const { return frame_duration_; } - void set_frame_duration(float frame_duration) { - frame_duration_ = frame_duration; - } - - float endpoint_margin() const { return endpoint_margin_; } - void set_endpoint_margin(float endpoint_margin) { - endpoint_margin_ = endpoint_margin; - } - - float onset_window() const { return onset_window_; } - void set_onset_window(float onset_window) { onset_window_ = onset_window; } - - float speech_on_window() const { return speech_on_window_; } - void set_speech_on_window(float speech_on_window) { - speech_on_window_ = speech_on_window; - } - - float offset_window() const { return offset_window_; } - void set_offset_window(float offset_window) { - offset_window_ = offset_window; - } - - float onset_detect_dur() const { return onset_detect_dur_; } - void set_onset_detect_dur(float onset_detect_dur) { - onset_detect_dur_ = onset_detect_dur; - } - - float onset_confirm_dur() const { return onset_confirm_dur_; } - void set_onset_confirm_dur(float onset_confirm_dur) { - onset_confirm_dur_ = onset_confirm_dur; - } - - float on_maintain_dur() const { return on_maintain_dur_; } - void set_on_maintain_dur(float on_maintain_dur) { - on_maintain_dur_ = on_maintain_dur; - } - - float offset_confirm_dur() const { return offset_confirm_dur_; } - void set_offset_confirm_dur(float offset_confirm_dur) { - offset_confirm_dur_ = offset_confirm_dur; - } - - float decision_threshold() const { return decision_threshold_; } - void set_decision_threshold(float decision_threshold) { - decision_threshold_ = decision_threshold; - } - - float min_decision_threshold() const { return min_decision_threshold_; } - void 
set_min_decision_threshold(float min_decision_threshold) { - min_decision_threshold_ = min_decision_threshold; - } - - float fast_update_dur() const { return fast_update_dur_; } - void set_fast_update_dur(float fast_update_dur) { - fast_update_dur_ = fast_update_dur; - } - - float sample_rate() const { return sample_rate_; } - void set_sample_rate(float sample_rate) { sample_rate_ = sample_rate; } - - float min_fundamental_frequency() const { return min_fundamental_frequency_; } - void set_min_fundamental_frequency(float min_fundamental_frequency) { - min_fundamental_frequency_ = min_fundamental_frequency; - } - - float max_fundamental_frequency() const { return max_fundamental_frequency_; } - void set_max_fundamental_frequency(float max_fundamental_frequency) { - max_fundamental_frequency_ = max_fundamental_frequency; - } - - float contamination_rejection_period() const { - return contamination_rejection_period_; - } - void set_contamination_rejection_period( - float contamination_rejection_period) { - contamination_rejection_period_ = contamination_rejection_period; - } - - private: - float frame_period_; // Frame period - float frame_duration_; // Window size - float onset_window_; // Interval scanned for onset activity - float speech_on_window_; // Interval scanned for ongoing speech - float offset_window_; // Interval scanned for offset evidence - float offset_confirm_dur_; // Silence duration required to confirm offset - float decision_threshold_; // Initial rms detection threshold - float min_decision_threshold_; // Minimum rms detection threshold - float fast_update_dur_; // Period for initial estimation of levels. - float sample_rate_; // Expected sample rate. 
- - // Time to add on either side of endpoint threshold crossings - float endpoint_margin_; - // Total dur within onset_window required to enter ONSET state - float onset_detect_dur_; - // Total on time within onset_window required to enter SPEECH_ON state - float onset_confirm_dur_; - // Minimum dur in SPEECH_ON state required to maintain ON state - float on_maintain_dur_; - // Minimum fundamental frequency for autocorrelation. - float min_fundamental_frequency_; - // Maximum fundamental frequency for autocorrelation. - float max_fundamental_frequency_; - // Period after start of user input that above threshold values are ignored. - // This is to reject audio feedback contamination. - float contamination_rejection_period_; -}; - -} // namespace speech_input - -#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ diff --git a/chrome/browser/speech/speech_input_browsertest.cc b/chrome/browser/speech/speech_input_browsertest.cc deleted file mode 100644 index 0b8c904..0000000 --- a/chrome/browser/speech/speech_input_browsertest.cc +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#include "base/command_line.h" -#include "base/file_path.h" -#include "base/string_number_conversions.h" -#include "base/utf_string_conversions.h" -#include "chrome/browser/renderer_host/render_view_host.h" -#include "chrome/browser/speech/speech_input_dispatcher_host.h" -#include "chrome/browser/speech/speech_input_manager.h" -#include "chrome/browser/tab_contents/tab_contents.h" -#include "chrome/browser/ui/browser.h" -#include "chrome/common/chrome_switches.h" -#include "chrome/test/in_process_browser_test.h" -#include "chrome/test/ui_test_utils.h" -#include "third_party/WebKit/Source/WebKit/chromium/public/WebInputEvent.h" - -namespace speech_input { -class FakeSpeechInputManager; -} - -// This class does not need to be refcounted (typically done by PostTask) since -// it will outlive the test and gets released only when the test shuts down. -// Disabling refcounting here saves a bit of unnecessary code and the factory -// method can return a plain pointer below as required by the real code. -DISABLE_RUNNABLE_METHOD_REFCOUNT(speech_input::FakeSpeechInputManager); - -namespace speech_input { - -const char* kTestResult = "Pictures of the moon"; - -class FakeSpeechInputManager : public SpeechInputManager { - public: - FakeSpeechInputManager() - : caller_id_(0), - delegate_(NULL) { - } - - std::string grammar() { - return grammar_; - } - - // SpeechInputManager methods. - virtual void StartRecognition(Delegate* delegate, - int caller_id, - int render_process_id, - int render_view_id, - const gfx::Rect& element_rect, - const std::string& language, - const std::string& grammar, - const std::string& origin_url) { - VLOG(1) << "StartRecognition invoked."; - EXPECT_EQ(0, caller_id_); - EXPECT_EQ(NULL, delegate_); - caller_id_ = caller_id; - delegate_ = delegate; - grammar_ = grammar; - // Give the fake result in a short while. 
- MessageLoop::current()->PostTask(FROM_HERE, NewRunnableMethod(this, - &FakeSpeechInputManager::SetFakeRecognitionResult)); - } - virtual void CancelRecognition(int caller_id) { - VLOG(1) << "CancelRecognition invoked."; - EXPECT_EQ(caller_id_, caller_id); - caller_id_ = 0; - delegate_ = NULL; - } - virtual void StopRecording(int caller_id) { - VLOG(1) << "StopRecording invoked."; - EXPECT_EQ(caller_id_, caller_id); - // Nothing to do here since we aren't really recording. - } - virtual void CancelAllRequestsWithDelegate(Delegate* delegate) { - VLOG(1) << "CancelAllRequestsWithDelegate invoked."; - } - - private: - void SetFakeRecognitionResult() { - if (caller_id_) { // Do a check in case we were cancelled.. - VLOG(1) << "Setting fake recognition result."; - delegate_->DidCompleteRecording(caller_id_); - SpeechInputResultArray results; - results.push_back(SpeechInputResultItem(ASCIIToUTF16(kTestResult), 1.0)); - delegate_->SetRecognitionResult(caller_id_, results); - delegate_->DidCompleteRecognition(caller_id_); - caller_id_ = 0; - delegate_ = NULL; - VLOG(1) << "Finished setting fake recognition result."; - } - } - - int caller_id_; - Delegate* delegate_; - std::string grammar_; -}; - -class SpeechInputBrowserTest : public InProcessBrowserTest { - public: - // InProcessBrowserTest methods - GURL testUrl(const FilePath::CharType* filename) { - const FilePath kTestDir(FILE_PATH_LITERAL("speech")); - return ui_test_utils::GetTestUrl(kTestDir, FilePath(filename)); - } - - protected: - void LoadAndRunSpeechInputTest(const FilePath::CharType* filename) { - // The test page calculates the speech button's coordinate in the page on - // load & sets that coordinate in the URL fragment. We send mouse down & up - // events at that coordinate to trigger speech recognition. 
- GURL test_url = testUrl(filename); - ui_test_utils::NavigateToURL(browser(), test_url); - std::string coords = browser()->GetSelectedTabContents()->GetURL().ref(); - VLOG(1) << "Coordinates given by script: " << coords; - int comma_pos = coords.find(','); - ASSERT_NE(-1, comma_pos); - int x = 0; - ASSERT_TRUE(base::StringToInt(coords.substr(0, comma_pos).c_str(), &x)); - int y = 0; - ASSERT_TRUE(base::StringToInt(coords.substr(comma_pos + 1).c_str(), &y)); - - WebKit::WebMouseEvent mouse_event; - mouse_event.type = WebKit::WebInputEvent::MouseDown; - mouse_event.button = WebKit::WebMouseEvent::ButtonLeft; - mouse_event.x = x; - mouse_event.y = y; - mouse_event.clickCount = 1; - TabContents* tab_contents = browser()->GetSelectedTabContents(); - tab_contents->render_view_host()->ForwardMouseEvent(mouse_event); - mouse_event.type = WebKit::WebInputEvent::MouseUp; - tab_contents->render_view_host()->ForwardMouseEvent(mouse_event); - - // The fake speech input manager would receive the speech input - // request and return the test string as recognition result. The test page - // then sets the URL fragment as 'pass' if it received the expected string. - ui_test_utils::WaitForNavigations(&tab_contents->controller(), 1); - EXPECT_EQ("pass", browser()->GetSelectedTabContents()->GetURL().ref()); - } - - // InProcessBrowserTest methods. - virtual void SetUpInProcessBrowserTestFixture() { - speech_input_manager_ = &fake_speech_input_manager_; - - // Inject the fake manager factory so that the test result is returned to - // the web page. - SpeechInputDispatcherHost::set_manager_accessor(&fakeManagerAccessor); - } - - virtual void TearDownInProcessBrowserTestFixture() { - speech_input_manager_ = NULL; - } - - // Factory method. 
- static SpeechInputManager* fakeManagerAccessor() { - return speech_input_manager_; - } - - FakeSpeechInputManager fake_speech_input_manager_; - - // This is used by the static |fakeManagerAccessor|, and it is a pointer - // rather than a direct instance per the style guide. - static SpeechInputManager* speech_input_manager_; -}; - -SpeechInputManager* SpeechInputBrowserTest::speech_input_manager_ = NULL; - -// Marked as FLAKY due to http://crbug.com/51337 -// -// TODO(satish): Once this flakiness has been fixed, add a second test here to -// check for sending many clicks in succession to the speech button and verify -// that it doesn't cause any crash but works as expected. This should act as the -// test for http://crbug.com/59173 -// -// TODO(satish): Similar to above, once this flakiness has been fixed add -// another test here to check that when speech recognition is in progress and -// a renderer crashes, we get a call to -// SpeechInputManager::CancelAllRequestsWithDelegate. -// -// Marked as DISABLED due to http://crbug.com/71227 -#if defined(GOOGLE_CHROME_BUILD) -#define MAYBE_TestBasicRecognition DISABLED_TestBasicRecognition -#elif defined(OS_WIN) -#define MAYBE_TestBasicRecognition FLAKY_TestBasicRecognition -#else -#define MAYBE_TestBasicRecognition TestBasicRecognition -#endif -IN_PROC_BROWSER_TEST_F(SpeechInputBrowserTest, MAYBE_TestBasicRecognition) { - LoadAndRunSpeechInputTest(FILE_PATH_LITERAL("basic_recognition.html")); - EXPECT_TRUE(fake_speech_input_manager_.grammar().empty()); -} - -// Marked as FLAKY due to http://crbug.com/51337 -// Marked as DISALBED due to http://crbug.com/71227 -#if defined(GOOGLE_CHROME_BUILD) -#define MAYBE_GrammarAttribute DISABLED_GrammarAttribute -#elif defined(OS_WIN) -#define MAYBE_GrammarAttribute FLAKY_GrammarAttribute -#else -#define MAYBE_GrammarAttribute GrammarAttribute -#endif -IN_PROC_BROWSER_TEST_F(SpeechInputBrowserTest, MAYBE_GrammarAttribute) { - 
LoadAndRunSpeechInputTest(FILE_PATH_LITERAL("grammar_attribute.html")); - EXPECT_EQ("http://example.com/grammar.xml", - fake_speech_input_manager_.grammar()); -} - -} // namespace speech_input diff --git a/chrome/browser/speech/speech_input_dispatcher_host.cc b/chrome/browser/speech/speech_input_dispatcher_host.cc deleted file mode 100644 index dc993d5..0000000 --- a/chrome/browser/speech/speech_input_dispatcher_host.cc +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/browser/speech/speech_input_dispatcher_host.h" - -#include "base/lazy_instance.h" -#include "chrome/common/speech_input_messages.h" - -namespace speech_input { - -//----------------------------- SpeechInputCallers ----------------------------- - -// A singleton class to map the tuple -// (render-process-id, render-view-id, requestid) to a single ID which is passed -// through rest of the speech code. -class SpeechInputDispatcherHost::SpeechInputCallers { - public: - // Creates a new ID for a given tuple. - int CreateId(int render_process_id, int render_view_id, int request_id); - - // Returns the ID for a tuple assuming the ID was created earlier. - int GetId(int render_process_id, int render_view_id, int request_id); - - // Removes the ID and associated tuple from the map. - void RemoveId(int id); - - // Getters for the various tuple elements for the given ID. 
- int render_process_id(int id); - int render_view_id(int id); - int request_id(int id); - - private: - struct CallerInfo { - int render_process_id; - int render_view_id; - int request_id; - }; - friend struct base::DefaultLazyInstanceTraits<SpeechInputCallers>; - - SpeechInputCallers(); - - std::map<int, CallerInfo> callers_; - int next_id_; -}; - -static base::LazyInstance<SpeechInputDispatcherHost::SpeechInputCallers> - g_speech_input_callers(base::LINKER_INITIALIZED); - -SpeechInputDispatcherHost::SpeechInputCallers::SpeechInputCallers() - : next_id_(1) { -} - -int SpeechInputDispatcherHost::SpeechInputCallers::GetId(int render_process_id, - int render_view_id, - int request_id) { - for (std::map<int, CallerInfo>::iterator it = callers_.begin(); - it != callers_.end(); it++) { - const CallerInfo& item = it->second; - if (item.render_process_id == render_process_id && - item.render_view_id == render_view_id && - item.request_id == request_id) { - return it->first; - } - } - - // Not finding an entry here is valid since a cancel/stop may have been issued - // by the renderer and before it received our response the user may have - // clicked the button to stop again. The caller of this method should take - // care of this case. 
- return 0; -} - -int SpeechInputDispatcherHost::SpeechInputCallers::CreateId( - int render_process_id, - int render_view_id, - int request_id) { - CallerInfo info; - info.render_process_id = render_process_id; - info.render_view_id = render_view_id; - info.request_id = request_id; - callers_[next_id_] = info; - return next_id_++; -} - -void SpeechInputDispatcherHost::SpeechInputCallers::RemoveId(int id) { - callers_.erase(id); -} - -int SpeechInputDispatcherHost::SpeechInputCallers::render_process_id(int id) { - return callers_[id].render_process_id; -} - -int SpeechInputDispatcherHost::SpeechInputCallers::render_view_id(int id) { - return callers_[id].render_view_id; -} - -int SpeechInputDispatcherHost::SpeechInputCallers::request_id(int id) { - return callers_[id].request_id; -} - -//-------------------------- SpeechInputDispatcherHost ------------------------- - -SpeechInputManager::AccessorMethod* - SpeechInputDispatcherHost::manager_accessor_ = &SpeechInputManager::Get; - -SpeechInputDispatcherHost::SpeechInputDispatcherHost(int render_process_id) - : render_process_id_(render_process_id), - may_have_pending_requests_(false) { - // This is initialized by Browser. Do not add any non-trivial - // initialization here, instead do it lazily when required (e.g. see the - // method |manager()|) or add an Init() method. -} - -SpeechInputDispatcherHost::~SpeechInputDispatcherHost() { - // If the renderer crashed for some reason or if we didn't receive a proper - // Cancel/Stop call for an existing session, cancel such active sessions now. - // We first check if this dispatcher received any speech IPC requst so that - // we don't end up creating the speech input manager for web pages which don't - // use speech input. 
- if (may_have_pending_requests_) - manager()->CancelAllRequestsWithDelegate(this); -} - -SpeechInputManager* SpeechInputDispatcherHost::manager() { - return (*manager_accessor_)(); -} - -bool SpeechInputDispatcherHost::OnMessageReceived( - const IPC::Message& message, bool* message_was_ok) { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - - uint32 message_type = message.type(); - if (message_type == SpeechInputHostMsg_StartRecognition::ID || - message_type == SpeechInputHostMsg_CancelRecognition::ID || - message_type == SpeechInputHostMsg_StopRecording::ID) { - if (!SpeechInputManager::IsFeatureEnabled()) { - *message_was_ok = false; - return true; - } - - may_have_pending_requests_ = true; - IPC_BEGIN_MESSAGE_MAP_EX(SpeechInputDispatcherHost, message, - *message_was_ok) - IPC_MESSAGE_HANDLER(SpeechInputHostMsg_StartRecognition, - OnStartRecognition) - IPC_MESSAGE_HANDLER(SpeechInputHostMsg_CancelRecognition, - OnCancelRecognition) - IPC_MESSAGE_HANDLER(SpeechInputHostMsg_StopRecording, - OnStopRecording) - IPC_END_MESSAGE_MAP() - return true; - } - - return false; -} - -void SpeechInputDispatcherHost::OnStartRecognition( - const SpeechInputHostMsg_StartRecognition_Params ¶ms) { - int caller_id = g_speech_input_callers.Get().CreateId( - render_process_id_, params.render_view_id, params.request_id); - manager()->StartRecognition(this, caller_id, - render_process_id_, - params.render_view_id, params.element_rect, - params.language, params.grammar, - params.origin_url); -} - -void SpeechInputDispatcherHost::OnCancelRecognition(int render_view_id, - int request_id) { - int caller_id = g_speech_input_callers.Get().GetId( - render_process_id_, render_view_id, request_id); - if (caller_id) { - manager()->CancelRecognition(caller_id); - // Request sequence ended so remove mapping. 
- g_speech_input_callers.Get().RemoveId(caller_id); - } -} - -void SpeechInputDispatcherHost::OnStopRecording(int render_view_id, - int request_id) { - int caller_id = g_speech_input_callers.Get().GetId( - render_process_id_, render_view_id, request_id); - if (caller_id) - manager()->StopRecording(caller_id); -} - -void SpeechInputDispatcherHost::SetRecognitionResult( - int caller_id, const SpeechInputResultArray& result) { - VLOG(1) << "SpeechInputDispatcherHost::SetRecognitionResult enter"; - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - int caller_render_view_id = - g_speech_input_callers.Get().render_view_id(caller_id); - int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); - Send(new SpeechInputMsg_SetRecognitionResult(caller_render_view_id, - caller_request_id, - result)); - VLOG(1) << "SpeechInputDispatcherHost::SetRecognitionResult exit"; -} - -void SpeechInputDispatcherHost::DidCompleteRecording(int caller_id) { - VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecording enter"; - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - int caller_render_view_id = - g_speech_input_callers.Get().render_view_id(caller_id); - int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); - Send(new SpeechInputMsg_RecordingComplete(caller_render_view_id, - caller_request_id)); - VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecording exit"; -} - -void SpeechInputDispatcherHost::DidCompleteRecognition(int caller_id) { - VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecognition enter"; - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - int caller_render_view_id = - g_speech_input_callers.Get().render_view_id(caller_id); - int caller_request_id = g_speech_input_callers.Get().request_id(caller_id); - Send(new SpeechInputMsg_RecognitionComplete(caller_render_view_id, - caller_request_id)); - // Request sequence ended, so remove mapping. 
- g_speech_input_callers.Get().RemoveId(caller_id); - VLOG(1) << "SpeechInputDispatcherHost::DidCompleteRecognition exit"; -} - -} // namespace speech_input diff --git a/chrome/browser/speech/speech_input_dispatcher_host.h b/chrome/browser/speech/speech_input_dispatcher_host.h deleted file mode 100644 index 23a1f23..0000000 --- a/chrome/browser/speech/speech_input_dispatcher_host.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_ -#define CHROME_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_ - -#include "base/scoped_ptr.h" -#include "chrome/browser/browser_message_filter.h" -#include "chrome/browser/speech/speech_input_manager.h" - -struct SpeechInputHostMsg_StartRecognition_Params; - -namespace speech_input { - -// SpeechInputDispatcherHost is a delegate for Speech API messages used by -// RenderMessageFilter. -// It's the complement of SpeechInputDispatcher (owned by RenderView). -class SpeechInputDispatcherHost : public BrowserMessageFilter, - public SpeechInputManager::Delegate { - public: - class SpeechInputCallers; - - explicit SpeechInputDispatcherHost(int render_process_id); - - // SpeechInputManager::Delegate methods. - virtual void SetRecognitionResult(int caller_id, - const SpeechInputResultArray& result); - virtual void DidCompleteRecording(int caller_id); - virtual void DidCompleteRecognition(int caller_id); - - // BrowserMessageFilter implementation. - virtual bool OnMessageReceived(const IPC::Message& message, - bool* message_was_ok); - - // Singleton accessor setter useful for tests. 
- static void set_manager_accessor(SpeechInputManager::AccessorMethod* method) { - manager_accessor_ = method; - } - - private: - virtual ~SpeechInputDispatcherHost(); - - void OnStartRecognition( - const SpeechInputHostMsg_StartRecognition_Params ¶ms); - void OnCancelRecognition(int render_view_id, int request_id); - void OnStopRecording(int render_view_id, int request_id); - - // Returns the speech input manager to forward events to, creating one if - // needed. - SpeechInputManager* manager(); - - int render_process_id_; - bool may_have_pending_requests_; // Set if we received any speech IPC request - - static SpeechInputManager::AccessorMethod* manager_accessor_; - - DISALLOW_COPY_AND_ASSIGN(SpeechInputDispatcherHost); -}; - -} // namespace speech_input - -#endif // CHROME_BROWSER_SPEECH_SPEECH_INPUT_DISPATCHER_HOST_H_ diff --git a/chrome/browser/speech/speech_input_manager.cc b/chrome/browser/speech/speech_input_manager.cc index 7a07543..626bf4f 100644 --- a/chrome/browser/speech/speech_input_manager.cc +++ b/chrome/browser/speech/speech_input_manager.cc @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
-#include "chrome/browser/speech/speech_input_manager.h" +#include "content/browser/speech/speech_input_manager.h" #include <map> #include <string> @@ -14,14 +14,14 @@ #include "base/threading/thread_restrictions.h" #include "base/utf_string_conversions.h" #include "chrome/browser/browser_process.h" -#include "chrome/browser/browser_thread.h" #include "chrome/browser/platform_util.h" #include "chrome/browser/prefs/pref_service.h" #include "chrome/browser/speech/speech_input_bubble_controller.h" -#include "chrome/browser/speech/speech_recognizer.h" #include "chrome/browser/tab_contents/tab_util.h" #include "chrome/common/chrome_switches.h" #include "chrome/common/pref_names.h" +#include "content/browser/browser_thread.h" +#include "content/browser/speech/speech_recognizer.h" #include "grit/generated_resources.h" #include "media/audio/audio_manager.h" #include "ui/base/l10n/l10n_util.h" diff --git a/chrome/browser/speech/speech_input_manager.h b/chrome/browser/speech/speech_input_manager.h deleted file mode 100644 index 3646f4f..0000000 --- a/chrome/browser/speech/speech_input_manager.h +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_ -#define CHROME_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_ - -#include "base/basictypes.h" -#include "chrome/common/speech_input_result.h" -#include "ipc/ipc_message.h" -#include "ui/gfx/rect.h" - -namespace speech_input { - -// This is the gatekeeper for speech recognition in the browser process. It -// handles requests received from various render views and makes sure only one -// of them can use speech recognition at a time. It also sends recognition -// results and status events to the render views when required. -// This class is a singleton and accessed via the Get method. 
-class SpeechInputManager { - public: - // Implemented by the dispatcher host to relay events to the render views. - class Delegate { - public: - virtual void SetRecognitionResult( - int caller_id, - const SpeechInputResultArray& result) = 0; - virtual void DidCompleteRecording(int caller_id) = 0; - virtual void DidCompleteRecognition(int caller_id) = 0; - - protected: - virtual ~Delegate() {} - }; - - // Whether the speech input feature is enabled, based on the browser channel - // information and command line flags. - static bool IsFeatureEnabled(); - - // Factory method to access the singleton. We have this method here instead of - // using Singleton directly in the calling code to aid tests in injection - // mocks. - static SpeechInputManager* Get(); - // Factory method definition useful for tests. - typedef SpeechInputManager* (AccessorMethod)(); - - virtual ~SpeechInputManager() {} - - // Handlers for requests from render views. - - // |delegate| is a weak pointer and should remain valid until - // its |DidCompleteRecognition| method is called or recognition is cancelled. - // |render_process_id| is the ID of the renderer process initiating the - // request. - // |element_rect| is the display bounds of the html element requesting speech - // input (in page coordinates). - virtual void StartRecognition(Delegate* delegate, - int caller_id, - int render_process_id, - int render_view_id, - const gfx::Rect& element_rect, - const std::string& language, - const std::string& grammar, - const std::string& origin_url) = 0; - virtual void CancelRecognition(int caller_id) = 0; - virtual void StopRecording(int caller_id) = 0; - - virtual void CancelAllRequestsWithDelegate(Delegate* delegate) = 0; -}; - -// This typedef is to workaround the issue with certain versions of -// Visual Studio where it gets confused between multiple Delegate -// classes and gives a C2500 error. (I saw this error on the try bots - -// the workaround was not needed for my machine). 
-typedef SpeechInputManager::Delegate SpeechInputManagerDelegate; - -} // namespace speech_input - -#endif // CHROME_BROWSER_SPEECH_SPEECH_INPUT_MANAGER_H_ diff --git a/chrome/browser/speech/speech_recognition_request.cc b/chrome/browser/speech/speech_recognition_request.cc deleted file mode 100644 index dc8dc27..0000000 --- a/chrome/browser/speech/speech_recognition_request.cc +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/browser/speech/speech_recognition_request.h" - -#include <vector> - -#include "base/json/json_reader.h" -#include "base/string_util.h" -#include "base/values.h" -#include "chrome/common/net/url_request_context_getter.h" -#include "net/base/escape.h" -#include "net/base/load_flags.h" -#include "net/url_request/url_request_context.h" -#include "net/url_request/url_request_status.h" -#include "ui/base/l10n/l10n_util.h" - -namespace { - -const char* const kDefaultSpeechRecognitionUrl = - "https://www.google.com/speech-api/v1/recognize?client=chromium&"; -const char* const kHypothesesString = "hypotheses"; -const char* const kUtteranceString = "utterance"; -const char* const kConfidenceString = "confidence"; - -bool ParseServerResponse(const std::string& response_body, - speech_input::SpeechInputResultArray* result) { - if (response_body.empty()) { - LOG(WARNING) << "ParseServerResponse: Response was empty."; - return false; - } - DVLOG(1) << "ParseServerResponse: Parsing response " << response_body; - - // Parse the response, ignoring comments. 
- std::string error_msg; - scoped_ptr<Value> response_value(base::JSONReader::ReadAndReturnError( - response_body, false, NULL, &error_msg)); - if (response_value == NULL) { - LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg; - return false; - } - - if (!response_value->IsType(Value::TYPE_DICTIONARY)) { - VLOG(1) << "ParseServerResponse: Unexpected response type " - << response_value->GetType(); - return false; - } - const DictionaryValue* response_object = - static_cast<DictionaryValue*>(response_value.get()); - - // Get the hypotheses - Value* hypotheses_value = NULL; - if (!response_object->Get(kHypothesesString, &hypotheses_value)) { - VLOG(1) << "ParseServerResponse: Missing hypotheses attribute."; - return false; - } - DCHECK(hypotheses_value); - if (!hypotheses_value->IsType(Value::TYPE_LIST)) { - VLOG(1) << "ParseServerResponse: Unexpected hypotheses type " - << hypotheses_value->GetType(); - return false; - } - const ListValue* hypotheses_list = static_cast<ListValue*>(hypotheses_value); - if (hypotheses_list->GetSize() == 0) { - VLOG(1) << "ParseServerResponse: hypotheses list is empty."; - return false; - } - - size_t index = 0; - for (; index < hypotheses_list->GetSize(); ++index) { - Value* hypothesis = NULL; - if (!hypotheses_list->Get(index, &hypothesis)) { - LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value."; - break; - } - DCHECK(hypothesis); - if (!hypothesis->IsType(Value::TYPE_DICTIONARY)) { - LOG(WARNING) << "ParseServerResponse: Unexpected value type " - << hypothesis->GetType(); - break; - } - - const DictionaryValue* hypothesis_value = - static_cast<DictionaryValue*>(hypothesis); - string16 utterance; - if (!hypothesis_value->GetString(kUtteranceString, &utterance)) { - LOG(WARNING) << "ParseServerResponse: Missing utterance value."; - break; - } - - // It is not an error if the 'confidence' field is missing. 
- double confidence = 0.0; - hypothesis_value->GetDouble(kConfidenceString, &confidence); - - result->push_back(speech_input::SpeechInputResultItem(utterance, - confidence)); - } - - if (index < hypotheses_list->GetSize()) { - result->clear(); - return false; - } - - return true; -} - -} // namespace - -namespace speech_input { - -int SpeechRecognitionRequest::url_fetcher_id_for_tests = 0; - -SpeechRecognitionRequest::SpeechRecognitionRequest( - URLRequestContextGetter* context, Delegate* delegate) - : url_context_(context), - delegate_(delegate) { - DCHECK(delegate); -} - -SpeechRecognitionRequest::~SpeechRecognitionRequest() {} - -bool SpeechRecognitionRequest::Send(const std::string& language, - const std::string& grammar, - const std::string& hardware_info, - const std::string& origin_url, - const std::string& content_type, - const std::string& audio_data) { - DCHECK(!url_fetcher_.get()); - - std::vector<std::string> parts; - - std::string lang_param = language; - if (lang_param.empty() && url_context_) { - // If no language is provided then we use the first from the accepted - // language list. If this list is empty then it defaults to "en-US". - // Example of the contents of this list: "es,en-GB;q=0.8", "" - net::URLRequestContext* request_context = - url_context_->GetURLRequestContext(); - DCHECK(request_context); - std::string accepted_language_list = request_context->accept_language(); - size_t separator = accepted_language_list.find_first_of(",;"); - lang_param = accepted_language_list.substr(0, separator); - } - if (lang_param.empty()) - lang_param = "en-US"; - parts.push_back("lang=" + EscapeQueryParamValue(lang_param, true)); - - if (!grammar.empty()) - parts.push_back("lm=" + EscapeQueryParamValue(grammar, true)); - if (!hardware_info.empty()) - parts.push_back("xhw=" + EscapeQueryParamValue(hardware_info, true)); - // TODO(satish): Remove this hardcoded value once the page is allowed to - // set this via an attribute. 
- parts.push_back("maxresults=3"); - - GURL url(std::string(kDefaultSpeechRecognitionUrl) + JoinString(parts, '&')); - - url_fetcher_.reset(URLFetcher::Create(url_fetcher_id_for_tests, - url, - URLFetcher::POST, - this)); - url_fetcher_->set_upload_data(content_type, audio_data); - url_fetcher_->set_request_context(url_context_); - url_fetcher_->set_referrer(origin_url); - - // The speech recognition API does not require user identification as part - // of requests, so we don't send cookies or auth data for these requests to - // prevent any accidental connection between users who are logged into the - // domain for other services (e.g. bookmark sync) with the speech requests. - url_fetcher_->set_load_flags( - net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES | - net::LOAD_DO_NOT_SEND_AUTH_DATA); - url_fetcher_->Start(); - return true; -} - -void SpeechRecognitionRequest::OnURLFetchComplete( - const URLFetcher* source, - const GURL& url, - const net::URLRequestStatus& status, - int response_code, - const ResponseCookies& cookies, - const std::string& data) { - DCHECK_EQ(url_fetcher_.get(), source); - - bool error = !status.is_success() || response_code != 200; - SpeechInputResultArray result; - if (!error) - error = !ParseServerResponse(data, &result); - url_fetcher_.reset(); - - DVLOG(1) << "SpeechRecognitionRequest: Invoking delegate with result."; - delegate_->SetRecognitionResult(error, result); -} - -} // namespace speech_input diff --git a/chrome/browser/speech/speech_recognition_request.h b/chrome/browser/speech/speech_recognition_request.h deleted file mode 100644 index 9b022cf..0000000 --- a/chrome/browser/speech/speech_recognition_request.h +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#ifndef CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_ -#define CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_ -#pragma once - -#include <string> -#include "base/basictypes.h" -#include "base/ref_counted.h" -#include "base/scoped_ptr.h" -#include "chrome/common/net/url_fetcher.h" -#include "chrome/common/speech_input_result.h" -#include "googleurl/src/gurl.h" - -class URLFetcher; -class URLRequestContextGetter; - -namespace speech_input { - -// Provides a simple interface for sending recorded speech data to the server -// and get back recognition results. -class SpeechRecognitionRequest : public URLFetcher::Delegate { - public: - // ID passed to URLFetcher::Create(). Used for testing. - static int url_fetcher_id_for_tests; - - // Interface for receiving callbacks from this object. - class Delegate { - public: - virtual void SetRecognitionResult( - bool error, const SpeechInputResultArray& result) = 0; - - protected: - virtual ~Delegate() {} - }; - - // |url| is the server address to which the request wil be sent. - SpeechRecognitionRequest(URLRequestContextGetter* context, - Delegate* delegate); - - virtual ~SpeechRecognitionRequest(); - - // Sends a new request with the given audio data, returns true if successful. - // The same object can be used to send multiple requests but only after the - // previous request has completed. - bool Send(const std::string& language, - const std::string& grammar, - const std::string& hardware_info, - const std::string& origin_url, - const std::string& content_type, - const std::string& audio_data); - - bool HasPendingRequest() { return url_fetcher_ != NULL; } - - // URLFetcher::Delegate methods. 
- virtual void OnURLFetchComplete(const URLFetcher* source, - const GURL& url, - const net::URLRequestStatus& status, - int response_code, - const ResponseCookies& cookies, - const std::string& data); - - private: - scoped_refptr<URLRequestContextGetter> url_context_; - Delegate* delegate_; - scoped_ptr<URLFetcher> url_fetcher_; - - DISALLOW_COPY_AND_ASSIGN(SpeechRecognitionRequest); -}; - -// This typedef is to workaround the issue with certain versions of -// Visual Studio where it gets confused between multiple Delegate -// classes and gives a C2500 error. (I saw this error on the try bots - -// the workaround was not needed for my machine). -typedef SpeechRecognitionRequest::Delegate SpeechRecognitionRequestDelegate; - -} // namespace speech_input - -#endif // CHROME_BROWSER_SPEECH_SPEECH_RECOGNITION_REQUEST_H_ diff --git a/chrome/browser/speech/speech_recognition_request_unittest.cc b/chrome/browser/speech/speech_recognition_request_unittest.cc deleted file mode 100644 index bd2a26e..0000000 --- a/chrome/browser/speech/speech_recognition_request_unittest.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "base/utf_string_conversions.h" -#include "chrome/browser/speech/speech_recognition_request.h" -#include "chrome/common/net/url_request_context_getter.h" -#include "chrome/common/net/test_url_fetcher_factory.h" -#include "net/url_request/url_request_status.h" -#include "testing/gtest/include/gtest/gtest.h" - -namespace speech_input { - -class SpeechRecognitionRequestTest : public SpeechRecognitionRequestDelegate, - public testing::Test { - public: - SpeechRecognitionRequestTest() : error_(false) { } - - // Creates a speech recognition request and invokes it's URL fetcher delegate - // with the given test data. 
- void CreateAndTestRequest(bool success, const std::string& http_response); - - // SpeechRecognitionRequestDelegate methods. - virtual void SetRecognitionResult(bool error, - const SpeechInputResultArray& result) { - error_ = error; - result_ = result; - } - - // testing::Test methods. - virtual void SetUp() { - URLFetcher::set_factory(&url_fetcher_factory_); - } - - virtual void TearDown() { - URLFetcher::set_factory(NULL); - } - - protected: - MessageLoop message_loop_; - TestURLFetcherFactory url_fetcher_factory_; - bool error_; - SpeechInputResultArray result_; -}; - -void SpeechRecognitionRequestTest::CreateAndTestRequest( - bool success, const std::string& http_response) { - SpeechRecognitionRequest request(NULL, this); - request.Send(std::string(), std::string(), std::string(), std::string(), - std::string(), std::string()); - TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); - ASSERT_TRUE(fetcher); - net::URLRequestStatus status; - status.set_status(success ? net::URLRequestStatus::SUCCESS : - net::URLRequestStatus::FAILED); - fetcher->delegate()->OnURLFetchComplete(fetcher, fetcher->original_url(), - status, success ? 200 : 500, - ResponseCookies(), - http_response); - // Parsed response will be available in result_. -} - -TEST_F(SpeechRecognitionRequestTest, BasicTest) { - // Normal success case with one result. - CreateAndTestRequest(true, - "{\"hypotheses\":[{\"utterance\":\"123456\",\"confidence\":0.9}]}"); - EXPECT_FALSE(error_); - EXPECT_EQ(1U, result_.size()); - EXPECT_EQ(ASCIIToUTF16("123456"), result_[0].utterance); - EXPECT_EQ(0.9, result_[0].confidence); - - // Normal success case with multiple results. 
- CreateAndTestRequest(true, - "{\"hypotheses\":[{\"utterance\":\"hello\",\"confidence\":0.9}," - "{\"utterance\":\"123456\",\"confidence\":0.5}]}"); - EXPECT_FALSE(error_); - EXPECT_EQ(2u, result_.size()); - EXPECT_EQ(ASCIIToUTF16("hello"), result_[0].utterance); - EXPECT_EQ(0.9, result_[0].confidence); - EXPECT_EQ(ASCIIToUTF16("123456"), result_[1].utterance); - EXPECT_EQ(0.5, result_[1].confidence); - - // Http failure case. - CreateAndTestRequest(false, ""); - EXPECT_TRUE(error_); - EXPECT_EQ(0U, result_.size()); - - // Malformed JSON case. - CreateAndTestRequest(true, "{\"hypotheses\":[{\"unknownkey\":\"hello\"}]}"); - EXPECT_TRUE(error_); - EXPECT_EQ(0U, result_.size()); -} - -} // namespace speech_input diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc deleted file mode 100644 index 113600b..0000000 --- a/chrome/browser/speech/speech_recognizer.cc +++ /dev/null @@ -1,264 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "chrome/browser/speech/speech_recognizer.h" - -#include "base/ref_counted.h" -#include "base/scoped_ptr.h" -#include "base/time.h" -#include "chrome/browser/browser_thread.h" -#include "chrome/browser/profiles/profile.h" -#include "chrome/common/net/url_request_context_getter.h" - -using media::AudioInputController; -using std::string; - -namespace { - -// The following constants are related to the volume level indicator shown in -// the UI for recorded audio. -// Multiplier used when new volume is greater than previous level. -const float kUpSmoothingFactor = 0.9f; -// Multiplier used when new volume is lesser than previous level. -const float kDownSmoothingFactor = 0.4f; -const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter. 
-const float kAudioMeterDbRange = 25.0f; -} // namespace - -namespace speech_input { - -const int SpeechRecognizer::kAudioSampleRate = 16000; -const int SpeechRecognizer::kAudioPacketIntervalMs = 100; -const int SpeechRecognizer::kNumAudioChannels = 1; -const int SpeechRecognizer::kNumBitsPerAudioSample = 16; -const int SpeechRecognizer::kNoSpeechTimeoutSec = 8; -const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300; - -SpeechRecognizer::SpeechRecognizer(Delegate* delegate, - int caller_id, - const std::string& language, - const std::string& grammar, - const std::string& hardware_info, - const std::string& origin_url) - : delegate_(delegate), - caller_id_(caller_id), - language_(language), - grammar_(grammar), - hardware_info_(hardware_info), - origin_url_(origin_url), - codec_(AudioEncoder::CODEC_SPEEX), - encoder_(NULL), - endpointer_(kAudioSampleRate), - num_samples_recorded_(0), - audio_level_(0.0f) { - endpointer_.set_speech_input_complete_silence_length( - base::Time::kMicrosecondsPerSecond / 2); - endpointer_.set_long_speech_input_complete_silence_length( - base::Time::kMicrosecondsPerSecond); - endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); - endpointer_.StartSession(); -} - -SpeechRecognizer::~SpeechRecognizer() { - // Recording should have stopped earlier due to the endpointer or - // |StopRecording| being called. - DCHECK(!audio_controller_.get()); - DCHECK(!request_.get() || !request_->HasPendingRequest()); - DCHECK(!encoder_.get()); - endpointer_.EndSession(); -} - -bool SpeechRecognizer::StartRecording() { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - DCHECK(!audio_controller_.get()); - DCHECK(!request_.get() || !request_->HasPendingRequest()); - DCHECK(!encoder_.get()); - - // The endpointer needs to estimate the environment/background noise before - // starting to treat the audio as user input. In |HandleOnData| we wait until - // such time has passed before switching to user input mode. 
- endpointer_.SetEnvironmentEstimationMode(); - - encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate, - kNumBitsPerAudioSample)); - int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; - AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels, - kAudioSampleRate, kNumBitsPerAudioSample, - samples_per_packet); - audio_controller_ = AudioInputController::Create(this, params); - DCHECK(audio_controller_.get()); - VLOG(1) << "SpeechRecognizer starting record."; - num_samples_recorded_ = 0; - audio_controller_->Record(); - - return true; -} - -void SpeechRecognizer::CancelRecognition() { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - DCHECK(audio_controller_.get() || request_.get()); - - // Stop recording if required. - if (audio_controller_.get()) { - VLOG(1) << "SpeechRecognizer stopping record."; - audio_controller_->Close(); - audio_controller_ = NULL; // Releases the ref ptr. - } - - VLOG(1) << "SpeechRecognizer canceling recognition."; - encoder_.reset(); - request_.reset(); -} - -void SpeechRecognizer::StopRecording() { - DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO)); - - // If audio recording has already stopped and we are in recognition phase, - // silently ignore any more calls to stop recording. - if (!audio_controller_.get()) - return; - - VLOG(1) << "SpeechRecognizer stopping record."; - audio_controller_->Close(); - audio_controller_ = NULL; // Releases the ref ptr. - encoder_->Flush(); - - delegate_->DidCompleteRecording(caller_id_); - - // Since the http request takes a single string as POST data, allocate - // one and copy over bytes from the audio buffers to the string. - // And If we haven't got any audio yet end the recognition sequence here. - string mime_type = encoder_->mime_type(); - string data; - encoder_->GetEncodedData(&data); - encoder_.reset(); - - if (data.empty()) { - // Guard against the delegate freeing us until we finish our job. 
- scoped_refptr<SpeechRecognizer> me(this); - delegate_->DidCompleteRecognition(caller_id_); - } else { - DCHECK(!request_.get()); - request_.reset(new SpeechRecognitionRequest( - Profile::GetDefaultRequestContext(), this)); - request_->Send(language_, grammar_, hardware_info_, origin_url_, - mime_type, data); - } -} - -void SpeechRecognizer::ReleaseAudioBuffers() { -} - -// Invoked in the audio thread. -void SpeechRecognizer::OnError(AudioInputController* controller, - int error_code) { - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - NewRunnableMethod(this, - &SpeechRecognizer::HandleOnError, - error_code)); -} - -void SpeechRecognizer::HandleOnError(int error_code) { - LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code; - - // Check if we are still recording before canceling recognition, as - // recording might have been stopped after this error was posted to the queue - // by |OnError|. - if (!audio_controller_.get()) - return; - - InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE); -} - -void SpeechRecognizer::OnData(AudioInputController* controller, - const uint8* data, uint32 size) { - if (size == 0) // This could happen when recording stops and is normal. - return; - - string* str_data = new string(reinterpret_cast<const char*>(data), size); - BrowserThread::PostTask(BrowserThread::IO, FROM_HERE, - NewRunnableMethod(this, - &SpeechRecognizer::HandleOnData, - str_data)); -} - -void SpeechRecognizer::HandleOnData(string* data) { - // Check if we are still recording and if not discard this buffer, as - // recording might have been stopped after this buffer was posted to the queue - // by |OnData|. 
- if (!audio_controller_.get()) { - delete data; - return; - } - - const short* samples = reinterpret_cast<const short*>(data->data()); - DCHECK((data->length() % sizeof(short)) == 0); - int num_samples = data->length() / sizeof(short); - - encoder_->Encode(samples, num_samples); - float rms; - endpointer_.ProcessAudio(samples, num_samples, &rms); - delete data; - num_samples_recorded_ += num_samples; - - if (endpointer_.IsEstimatingEnvironment()) { - // Check if we have gathered enough audio for the endpointer to do - // environment estimation and should move on to detect speech/end of speech. - if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs * - kAudioSampleRate) / 1000) { - endpointer_.SetUserInputMode(); - delegate_->DidCompleteEnvironmentEstimation(caller_id_); - } - return; // No more processing since we are still estimating environment. - } - - // Check if we have waited too long without hearing any speech. - if (!endpointer_.DidStartReceivingSpeech() && - num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) { - InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH); - return; - } - - // Calculate the input volume to display in the UI, smoothing towards the - // new level. - float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange; - level = std::min(std::max(0.0f, level), 1.0f); - if (level > audio_level_) { - audio_level_ += (level - audio_level_) * kUpSmoothingFactor; - } else { - audio_level_ += (level - audio_level_) * kDownSmoothingFactor; - } - delegate_->SetInputVolume(caller_id_, audio_level_); - - if (endpointer_.speech_input_complete()) { - StopRecording(); - } - - // TODO(satish): Once we have streaming POST, start sending the data received - // here as POST chunks. 
-} - -void SpeechRecognizer::SetRecognitionResult( - bool error, const SpeechInputResultArray& result) { - if (result.empty()) { - InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS); - return; - } - - delegate_->SetRecognitionResult(caller_id_, error, result); - - // Guard against the delegate freeing us until we finish our job. - scoped_refptr<SpeechRecognizer> me(this); - delegate_->DidCompleteRecognition(caller_id_); -} - -void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) { - CancelRecognition(); - - // Guard against the delegate freeing us until we finish our job. - scoped_refptr<SpeechRecognizer> me(this); - delegate_->OnRecognizerError(caller_id_, error); -} - -} // namespace speech_input diff --git a/chrome/browser/speech/speech_recognizer.h b/chrome/browser/speech/speech_recognizer.h deleted file mode 100644 index 2570fba..0000000 --- a/chrome/browser/speech/speech_recognizer.h +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2010 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ -#define CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ - -#include <list> -#include <string> -#include <utility> - -#include "base/ref_counted.h" -#include "base/scoped_ptr.h" -#include "chrome/browser/speech/audio_encoder.h" -#include "chrome/browser/speech/endpointer/endpointer.h" -#include "chrome/browser/speech/speech_recognition_request.h" -#include "media/audio/audio_input_controller.h" - -namespace speech_input { - -// Records audio, sends recorded audio to server and translates server response -// to recognition result. 
-class SpeechRecognizer - : public base::RefCountedThreadSafe<SpeechRecognizer>, - public media::AudioInputController::EventHandler, - public SpeechRecognitionRequestDelegate { - public: - enum ErrorCode { - RECOGNIZER_NO_ERROR, - RECOGNIZER_ERROR_CAPTURE, - RECOGNIZER_ERROR_NO_SPEECH, - RECOGNIZER_ERROR_NO_RESULTS, - }; - - // Implemented by the caller to receive recognition events. - class Delegate { - public: - virtual void SetRecognitionResult( - int caller_id, - bool error, - const SpeechInputResultArray& result) = 0; - - // Invoked when audio recording stops, either due to the end pointer - // detecting silence in user input or if |StopRecording| was called. The - // delegate has to wait until |DidCompleteRecognition| is invoked before - // destroying the |SpeechRecognizer| object. - virtual void DidCompleteRecording(int caller_id) = 0; - - // This is guaranteed to be the last method invoked in the recognition - // sequence and the |SpeechRecognizer| object can be freed up if necessary. - virtual void DidCompleteRecognition(int caller_id) = 0; - - // Invoked if there was an error while recording or recognizing audio. The - // session has already been cancelled when this call is made and the DidXxxx - // callbacks will not be issued. It is safe to destroy/release the - // |SpeechRecognizer| object while processing this call. - virtual void OnRecognizerError(int caller_id, - SpeechRecognizer::ErrorCode error) = 0; - - // At the start of recognition, a short amount of audio is recorded to - // estimate the environment/background noise and this callback is issued - // after that is complete. Typically the delegate brings up any speech - // recognition UI once this callback is received. - virtual void DidCompleteEnvironmentEstimation(int caller_id) = 0; - - // Informs of a change in the captured audio level, useful if displaying - // a microphone volume indicator while recording. - // The value of |volume| is in the [0.0, 1.0] range. 
- virtual void SetInputVolume(int caller_id, float volume) = 0; - - protected: - virtual ~Delegate() {} - }; - - SpeechRecognizer(Delegate* delegate, - int caller_id, - const std::string& language, - const std::string& grammar, - const std::string& hardware_info, - const std::string& origin_url); - ~SpeechRecognizer(); - - // Starts audio recording and does recognition after recording ends. The same - // SpeechRecognizer instance can be used multiple times for speech recognition - // though each recognition request can be made only after the previous one - // completes (i.e. after receiving Delegate::DidCompleteRecognition). - bool StartRecording(); - - // Stops recording audio and starts recognition. - void StopRecording(); - - // Stops recording audio and cancels recognition. Any audio recorded so far - // gets discarded. - void CancelRecognition(); - - // AudioInputController::EventHandler methods. - virtual void OnCreated(media::AudioInputController* controller) { } - virtual void OnRecording(media::AudioInputController* controller) { } - virtual void OnError(media::AudioInputController* controller, int error_code); - virtual void OnData(media::AudioInputController* controller, - const uint8* data, - uint32 size); - - // SpeechRecognitionRequest::Delegate methods. - virtual void SetRecognitionResult(bool error, - const SpeechInputResultArray& result); - - static const int kAudioSampleRate; - static const int kAudioPacketIntervalMs; // Duration of each audio packet. - static const int kNumAudioChannels; - static const int kNumBitsPerAudioSample; - static const int kNoSpeechTimeoutSec; - static const int kEndpointerEstimationTimeMs; - - private: - void ReleaseAudioBuffers(); - void InformErrorAndCancelRecognition(ErrorCode error); - void SendRecordedAudioToServer(); - - void HandleOnError(int error_code); // Handles OnError in the IO thread. - - // Handles OnData in the IO thread. Takes ownership of |data|. 
- void HandleOnData(std::string* data); - - Delegate* delegate_; - int caller_id_; - std::string language_; - std::string grammar_; - std::string hardware_info_; - std::string origin_url_; - - scoped_ptr<SpeechRecognitionRequest> request_; - scoped_refptr<media::AudioInputController> audio_controller_; - AudioEncoder::Codec codec_; - scoped_ptr<AudioEncoder> encoder_; - Endpointer endpointer_; - int num_samples_recorded_; - float audio_level_; - - DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer); -}; - -// This typedef is to workaround the issue with certain versions of -// Visual Studio where it gets confused between multiple Delegate -// classes and gives a C2500 error. (I saw this error on the try bots - -// the workaround was not needed for my machine). -typedef SpeechRecognizer::Delegate SpeechRecognizerDelegate; - -} // namespace speech_input - -#endif // CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_ diff --git a/chrome/browser/speech/speech_recognizer_unittest.cc b/chrome/browser/speech/speech_recognizer_unittest.cc deleted file mode 100644 index 855f35a..0000000 --- a/chrome/browser/speech/speech_recognizer_unittest.cc +++ /dev/null @@ -1,300 +0,0 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#include <vector> - -#include "chrome/browser/browser_thread.h" -#include "chrome/browser/speech/speech_recognizer.h" -#include "chrome/common/net/test_url_fetcher_factory.h" -#include "media/audio/test_audio_input_controller_factory.h" -#include "net/url_request/url_request_status.h" -#include "testing/gtest/include/gtest/gtest.h" - -using media::AudioInputController; -using media::TestAudioInputController; -using media::TestAudioInputControllerFactory; - -namespace speech_input { - -class SpeechRecognizerTest : public SpeechRecognizerDelegate, - public testing::Test { - public: - SpeechRecognizerTest() - : io_thread_(BrowserThread::IO, &message_loop_), - ALLOW_THIS_IN_INITIALIZER_LIST( - recognizer_(new SpeechRecognizer(this, 1, std::string(), - std::string(), std::string(), - std::string()))), - recording_complete_(false), - recognition_complete_(false), - result_received_(false), - error_(SpeechRecognizer::RECOGNIZER_NO_ERROR), - volume_(-1.0f) { - int audio_packet_length_bytes = - (SpeechRecognizer::kAudioSampleRate * - SpeechRecognizer::kAudioPacketIntervalMs * - SpeechRecognizer::kNumAudioChannels * - SpeechRecognizer::kNumBitsPerAudioSample) / (8 * 1000); - audio_packet_.resize(audio_packet_length_bytes); - } - - // SpeechRecognizer::Delegate methods. - virtual void SetRecognitionResult(int caller_id, - bool error, - const SpeechInputResultArray& result) { - result_received_ = true; - } - - virtual void DidCompleteRecording(int caller_id) { - recording_complete_ = true; - } - - virtual void DidCompleteRecognition(int caller_id) { - recognition_complete_ = true; - } - - virtual void DidCompleteEnvironmentEstimation(int caller_id) { - } - - virtual void OnRecognizerError(int caller_id, - SpeechRecognizer::ErrorCode error) { - error_ = error; - } - - virtual void SetInputVolume(int caller_id, float volume) { - volume_ = volume; - } - - // testing::Test methods. 
- virtual void SetUp() { - URLFetcher::set_factory(&url_fetcher_factory_); - AudioInputController::set_factory(&audio_input_controller_factory_); - } - - virtual void TearDown() { - URLFetcher::set_factory(NULL); - AudioInputController::set_factory(NULL); - } - - void FillPacketWithTestWaveform() { - // Fill the input with a simple pattern, a 125Hz sawtooth waveform. - for (size_t i = 0; i < audio_packet_.size(); ++i) - audio_packet_[i] = static_cast<uint8>(i); - } - - protected: - MessageLoopForIO message_loop_; - BrowserThread io_thread_; - scoped_refptr<SpeechRecognizer> recognizer_; - bool recording_complete_; - bool recognition_complete_; - bool result_received_; - SpeechRecognizer::ErrorCode error_; - TestURLFetcherFactory url_fetcher_factory_; - TestAudioInputControllerFactory audio_input_controller_factory_; - std::vector<uint8> audio_packet_; - float volume_; -}; - -TEST_F(SpeechRecognizerTest, StopNoData) { - // Check for callbacks when stopping record before any audio gets recorded. - EXPECT_TRUE(recognizer_->StartRecording()); - recognizer_->CancelRecognition(); - EXPECT_FALSE(recording_complete_); - EXPECT_FALSE(recognition_complete_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); -} - -TEST_F(SpeechRecognizerTest, CancelNoData) { - // Check for callbacks when canceling recognition before any audio gets - // recorded. - EXPECT_TRUE(recognizer_->StartRecording()); - recognizer_->StopRecording(); - EXPECT_TRUE(recording_complete_); - EXPECT_TRUE(recognition_complete_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); -} - -TEST_F(SpeechRecognizerTest, StopWithData) { - // Start recording, give some data and then stop. This should wait for the - // network callback to arrive before completion. 
- EXPECT_TRUE(recognizer_->StartRecording()); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller = audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - MessageLoop::current()->RunAllPending(); - recognizer_->StopRecording(); - EXPECT_TRUE(recording_complete_); - EXPECT_FALSE(recognition_complete_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); - - // Issue the network callback to complete the process. - TestURLFetcher* fetcher = url_fetcher_factory_.GetFetcherByID(0); - ASSERT_TRUE(fetcher); - net::URLRequestStatus status; - status.set_status(net::URLRequestStatus::SUCCESS); - fetcher->delegate()->OnURLFetchComplete( - fetcher, fetcher->original_url(), status, 200, ResponseCookies(), - "{\"hypotheses\":[{\"utterance\":\"123\"}]}"); - EXPECT_TRUE(recognition_complete_); - EXPECT_TRUE(result_received_); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); -} - -TEST_F(SpeechRecognizerTest, CancelWithData) { - // Start recording, give some data and then cancel. This should not create - // a network request and finish immediately. 
- EXPECT_TRUE(recognizer_->StartRecording()); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - MessageLoop::current()->RunAllPending(); - recognizer_->CancelRecognition(); - EXPECT_EQ(NULL, url_fetcher_factory_.GetFetcherByID(0)); - EXPECT_FALSE(recording_complete_); - EXPECT_FALSE(recognition_complete_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); -} - -TEST_F(SpeechRecognizerTest, AudioControllerErrorNoData) { - // Check if things tear down properly if AudioInputController threw an error. - EXPECT_TRUE(recognizer_->StartRecording()); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnError(controller, 0); - MessageLoop::current()->RunAllPending(); - EXPECT_FALSE(recording_complete_); - EXPECT_FALSE(recognition_complete_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_CAPTURE, error_); -} - -TEST_F(SpeechRecognizerTest, AudioControllerErrorWithData) { - // Check if things tear down properly if AudioInputController threw an error - // after giving some audio data. 
- EXPECT_TRUE(recognizer_->StartRecording()); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - controller->event_handler()->OnError(controller, 0); - MessageLoop::current()->RunAllPending(); - EXPECT_EQ(NULL, url_fetcher_factory_.GetFetcherByID(0)); - EXPECT_FALSE(recording_complete_); - EXPECT_FALSE(recognition_complete_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_CAPTURE, error_); -} - -TEST_F(SpeechRecognizerTest, NoSpeechCallbackIssued) { - // Start recording and give a lot of packets with audio samples set to zero. - // This should trigger the no-speech detector and issue a callback. - EXPECT_TRUE(recognizer_->StartRecording()); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller = audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - - int num_packets = (SpeechRecognizer::kNoSpeechTimeoutSec * 1000) / - SpeechRecognizer::kAudioPacketIntervalMs; - // The vector is already filled with zero value samples on create. - for (int i = 0; i < num_packets; ++i) { - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - } - MessageLoop::current()->RunAllPending(); - EXPECT_FALSE(recording_complete_); - EXPECT_FALSE(recognition_complete_); - EXPECT_FALSE(result_received_); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_ERROR_NO_SPEECH, error_); -} - -TEST_F(SpeechRecognizerTest, NoSpeechCallbackNotIssued) { - // Start recording and give a lot of packets with audio samples set to zero - // and then some more with reasonably loud audio samples. This should be - // treated as normal speech input and the no-speech detector should not get - // triggered. 
- EXPECT_TRUE(recognizer_->StartRecording()); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller = audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - - int num_packets = (SpeechRecognizer::kNoSpeechTimeoutSec * 1000) / - SpeechRecognizer::kAudioPacketIntervalMs; - - // The vector is already filled with zero value samples on create. - for (int i = 0; i < num_packets / 2; ++i) { - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - } - - FillPacketWithTestWaveform(); - for (int i = 0; i < num_packets / 2; ++i) { - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - } - - MessageLoop::current()->RunAllPending(); - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); - EXPECT_FALSE(recording_complete_); - EXPECT_FALSE(recognition_complete_); - recognizer_->CancelRecognition(); -} - -TEST_F(SpeechRecognizerTest, SetInputVolumeCallback) { - // Start recording and give a lot of packets with audio samples set to zero - // and then some more with reasonably loud audio samples. Check that we don't - // get the callback during estimation phase, then get zero for the silence - // samples and proper volume for the loud audio. - EXPECT_TRUE(recognizer_->StartRecording()); - TestAudioInputController* controller = - audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - controller = audio_input_controller_factory_.controller(); - ASSERT_TRUE(controller); - - // Feed some samples to begin with for the endpointer to do noise estimation. 
- int num_packets = SpeechRecognizer::kEndpointerEstimationTimeMs / - SpeechRecognizer::kAudioPacketIntervalMs; - for (int i = 0; i < num_packets; ++i) { - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - } - MessageLoop::current()->RunAllPending(); - EXPECT_EQ(-1.0f, volume_); // No audio volume set yet. - - // The vector is already filled with zero value samples on create. - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - MessageLoop::current()->RunAllPending(); - EXPECT_EQ(0, volume_); - - FillPacketWithTestWaveform(); - controller->event_handler()->OnData(controller, &audio_packet_[0], - audio_packet_.size()); - MessageLoop::current()->RunAllPending(); - EXPECT_FLOAT_EQ(0.9f, volume_); - - EXPECT_EQ(SpeechRecognizer::RECOGNIZER_NO_ERROR, error_); - EXPECT_FALSE(recording_complete_); - EXPECT_FALSE(recognition_complete_); - recognizer_->CancelRecognition(); -} - -} // namespace speech_input |