From b7d43842a01ec20cb2183bcb015f7d470234f6f1 Mon Sep 17 00:00:00 2001 From: "satish@chromium.org" Date: Tue, 24 Aug 2010 20:22:02 +0000 Subject: Add an endpointer for detecting end of speech. This is based on existing code/math. I have removed all the unused code for our usage and adapted to the chromium coding style. TEST=unit_tests --gtest_filter=EndpointerTest.* BUG=none Review URL: http://codereview.chromium.org/3117026 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@57226 0039d316-1c4b-4281-b951-d872f2087c98 --- chrome/browser/speech/endpointer/endpointer.cc | 164 ++++++++++ chrome/browser/speech/endpointer/endpointer.h | 137 ++++++++ .../speech/endpointer/endpointer_unittest.cc | 146 +++++++++ .../browser/speech/endpointer/energy_endpointer.cc | 355 +++++++++++++++++++++ .../browser/speech/endpointer/energy_endpointer.h | 144 +++++++++ .../speech/endpointer/energy_endpointer_params.h | 175 ++++++++++ chrome/browser/speech/speech_recognizer.cc | 45 ++- chrome/browser/speech/speech_recognizer.h | 2 + 8 files changed, 1159 insertions(+), 9 deletions(-) create mode 100644 chrome/browser/speech/endpointer/endpointer.cc create mode 100644 chrome/browser/speech/endpointer/endpointer.h create mode 100644 chrome/browser/speech/endpointer/endpointer_unittest.cc create mode 100644 chrome/browser/speech/endpointer/energy_endpointer.cc create mode 100644 chrome/browser/speech/endpointer/energy_endpointer.h create mode 100644 chrome/browser/speech/endpointer/energy_endpointer_params.h (limited to 'chrome/browser/speech') diff --git a/chrome/browser/speech/endpointer/endpointer.cc b/chrome/browser/speech/endpointer/endpointer.cc new file mode 100644 index 0000000..57a4f65 --- /dev/null +++ b/chrome/browser/speech/endpointer/endpointer.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "chrome/browser/speech/endpointer/endpointer.h" +#include "base/time.h" + +using base::Time; + +namespace { +static const int kFrameRate = 50; // 1 frame = 20ms of audio. +} + +namespace speech_input { + +Endpointer::Endpointer(int sample_rate) + : speech_input_possibly_complete_silence_length_us_(-1), + speech_input_complete_silence_length_us_(-1), + audio_frame_time_us_(0), + sample_rate_(sample_rate), + frame_size_(0) { + Reset(); + + frame_size_ = static_cast(sample_rate / static_cast(kFrameRate)); + + speech_input_minimum_length_us_ = + static_cast(1.7 * Time::kMicrosecondsPerSecond); + speech_input_complete_silence_length_us_ = + static_cast(0.5 * Time::kMicrosecondsPerSecond); + long_speech_input_complete_silence_length_us_ = -1; + long_speech_length_us_ = -1; + speech_input_possibly_complete_silence_length_us_ = + 1 * Time::kMicrosecondsPerSecond; + + // Set the default configuration for Push To Talk mode. + EnergyEndpointerParams ep_config; + ep_config.set_frame_period(1.0f / static_cast(kFrameRate)); + ep_config.set_frame_duration(1.0f / static_cast(kFrameRate)); + ep_config.set_endpoint_margin(0.2f); + ep_config.set_onset_window(0.15f); + ep_config.set_speech_on_window(0.4f); + ep_config.set_offset_window(0.15f); + ep_config.set_onset_detect_dur(0.09f); + ep_config.set_onset_confirm_dur(0.075f); + ep_config.set_on_maintain_dur(0.10f); + ep_config.set_offset_confirm_dur(0.12f); + ep_config.set_decision_threshold(1000.0f); + ep_config.set_min_decision_threshold(50.0f); + ep_config.set_fast_update_dur(0.2f); + ep_config.set_sample_rate(static_cast(sample_rate)); + ep_config.set_min_fundamental_frequency(57.143f); + ep_config.set_max_fundamental_frequency(400.0f); + ep_config.set_contamination_rejection_period(0.25f); + energy_endpointer_.Init(ep_config); +} + +void Endpointer::Reset() { + old_ep_status_ = EP_PRE_SPEECH; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + 
speech_previously_detected_ = false; + speech_input_complete_ = false; + audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. + speech_end_time_us_ = -1; + speech_start_time_us_ = -1; +} + +void Endpointer::StartSession() { + Reset(); + energy_endpointer_.StartSession(); +} + +void Endpointer::EndSession() { + energy_endpointer_.EndSession(); +} + +void Endpointer::SetEnvironmentEstimationMode() { + Reset(); + energy_endpointer_.SetEnvironmentEstimationMode(); +} + +void Endpointer::SetUserInputMode() { + energy_endpointer_.SetUserInputMode(); +} + +EpStatus Endpointer::Status(int64 *time) { + return energy_endpointer_.Status(time); +} + +EpStatus Endpointer::ProcessAudio(const int16* audio_data, int num_samples) { + EpStatus ep_status = EP_PRE_SPEECH; + + // Process the input data in blocks of frame_size_, dropping any incomplete + // frames at the end (which is ok since typically the caller will be recording + // audio in multiples of our frame size). + int sample_index = 0; + while (sample_index + frame_size_ <= num_samples) { + // Have the endpointer process the frame. + energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, + audio_data + sample_index, + frame_size_); + sample_index += frame_size_; + audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) / + sample_rate_; + + // Get the status of the endpointer. + int64 ep_time; + ep_status = energy_endpointer_.Status(&ep_time); + + // Handle state changes. + if ((EP_SPEECH_PRESENT == ep_status) && + (EP_POSSIBLE_ONSET == old_ep_status_)) { + speech_end_time_us_ = -1; + waiting_for_speech_possibly_complete_timeout_ = false; + waiting_for_speech_complete_timeout_ = false; + // Trigger SpeechInputDidStart event on first detection. 
+ if (false == speech_previously_detected_) { + speech_previously_detected_ = true; + speech_start_time_us_ = ep_time; + } + } + if ((EP_PRE_SPEECH == ep_status) && + (EP_POSSIBLE_OFFSET == old_ep_status_)) { + speech_end_time_us_ = ep_time; + waiting_for_speech_possibly_complete_timeout_ = true; + waiting_for_speech_complete_timeout_ = true; + } + if (ep_time > speech_input_minimum_length_us_) { + // Speech possibly complete timeout. + if ((waiting_for_speech_possibly_complete_timeout_) && + (ep_time - speech_end_time_us_ > + speech_input_possibly_complete_silence_length_us_)) { + waiting_for_speech_possibly_complete_timeout_ = false; + } + if (waiting_for_speech_complete_timeout_) { + // The length of the silence timeout period can be held constant, or it + // can be changed after a fixed amount of time from the beginning of + // speech. + bool has_stepped_silence = + (long_speech_length_us_ > 0) && + (long_speech_input_complete_silence_length_us_ > 0); + int64 requested_silence_length; + if (has_stepped_silence && + (ep_time - speech_start_time_us_) > long_speech_length_us_) { + requested_silence_length = + long_speech_input_complete_silence_length_us_; + } else { + requested_silence_length = + speech_input_complete_silence_length_us_; + } + + // Speech complete timeout. + if ((ep_time - speech_end_time_us_) > requested_silence_length) { + waiting_for_speech_complete_timeout_ = false; + speech_input_complete_ = true; + } + } + } + old_ep_status_ = ep_status; + } + return ep_status; +} + +} // namespace speech diff --git a/chrome/browser/speech/endpointer/endpointer.h b/chrome/browser/speech/endpointer/endpointer.h new file mode 100644 index 0000000..79b316d --- /dev/null +++ b/chrome/browser/speech/endpointer/endpointer.h @@ -0,0 +1,137 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ +#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ + +#include "base/basictypes.h" +#include "chrome/browser/speech/endpointer/energy_endpointer.h" + +class EpStatus; + +namespace speech_input { + +// A simple interface to the underlying energy-endpointer implementation, this +// class lets callers provide audio as being recorded and let them poll to find +// when the user has stopped speaking. +// +// There are two events that may trigger the end of speech: +// +// speechInputPossiblyComplete event: +// +// Signals that silence/noise has been detected for a *short* amount of +// time after some speech has been detected. It can be used for low latency +// UI feedback. To disable it, set it to a large amount. +// +// speechInputComplete event: +// +// This event is intended to signal end of input and to stop recording. +// The amount of time to wait after speech is set by +// speech_input_complete_silence_length_ and optionally two other +// parameters (see below). +// This time can be held constant, or can change as more speech is detected. +// In the latter case, the time changes after a set amount of time from the +// *beginning* of speech. This is motivated by the expectation that there +// will be two distinct types of inputs: short search queries and longer +// dictation style input. +// +// Three parameters are used to define the piecewise constant timeout function. +// The timeout length is speech_input_complete_silence_length until +// long_speech_length, when it changes to +// long_speech_input_complete_silence_length. +class Endpointer { + public: + explicit Endpointer(int sample_rate); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. 
+ void SetEnvironmentEstimationMode(); + + // Start user input. This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Process a segment of audio, which may be more than one frame. + // The status of the last frame will be returned. + EpStatus ProcessAudio(const int16* audio_data, int num_samples); + + // Get the status of the endpointer. + EpStatus Status(int64 *time_us); + + void set_speech_input_complete_silence_length(int64 time_us) { + speech_input_complete_silence_length_us_ = time_us; + } + + void set_long_speech_input_complete_silence_length(int64 time_us) { + long_speech_input_complete_silence_length_us_ = time_us; + } + + void set_speech_input_possibly_complete_silence_length(int64 time_us) { + speech_input_possibly_complete_silence_length_us_ = time_us; + } + + void set_long_speech_length(int64 time_us) { + long_speech_length_us_ = time_us; + } + + bool speech_input_complete() const { + return speech_input_complete_; + } + + private: + // Reset internal states. Helper method common to initial input utterance + // and following input utternaces. + void Reset(); + + // Minimum allowable length of speech input. + int64 speech_input_minimum_length_us_; + + // The speechInputPossiblyComplete event signals that silence/noise has been + // detected for a *short* amount of time after some speech has been detected. + // This proporty specifies the time period. + int64 speech_input_possibly_complete_silence_length_us_; + + // The speechInputComplete event signals that silence/noise has been + // detected for a *long* amount of time after some speech has been detected. + // This property specifies the time period. + int64 speech_input_complete_silence_length_us_; + + // Same as above, this specifies the required silence period after speech + // detection. This period is used instead of + // speech_input_complete_silence_length_ when the utterance is longer than + // long_speech_length_. 
This parameter is optional. + int64 long_speech_input_complete_silence_length_us_; + + // The period of time after which the endpointer should consider + // long_speech_input_complete_silence_length_ as a valid silence period + // instead of speech_input_complete_silence_length_. This parameter is + // optional. + int64 long_speech_length_us_; + + // First speech onset time, used in determination of speech complete timeout. + int64 speech_start_time_us_; + + // Most recent end time, used in determination of speech complete timeout. + int64 speech_end_time_us_; + + int64 audio_frame_time_us_; + EpStatus old_ep_status_; + bool waiting_for_speech_possibly_complete_timeout_; + bool waiting_for_speech_complete_timeout_; + bool speech_previously_detected_; + bool speech_input_complete_; + EnergyEndpointer energy_endpointer_; + int sample_rate_; + int32 frame_size_; +}; + +} // namespace speech_input + +#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ diff --git a/chrome/browser/speech/endpointer/endpointer_unittest.cc b/chrome/browser/speech/endpointer/endpointer_unittest.cc new file mode 100644 index 0000000..b49a6a6 --- /dev/null +++ b/chrome/browser/speech/endpointer/endpointer_unittest.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/task.h" +#include "chrome/browser/speech/endpointer/endpointer.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace { +const int kFrameRate = 50; // 20 ms long frames for AMR encoding. +const int kSampleRate = 8000; // 8 k samples per second for AMR encoding. + +// At 8 sample per second a 20 ms frame is 160 samples, which corrsponds +// to the AMR codec. +const int kFrameSize = kSampleRate / kFrameRate; // 160 samples. 
+COMPILE_ASSERT(kFrameSize == 160, invalid_frame_size); +} + +namespace speech_input { + +class FrameProcessor { + public: + // Process a single frame of test audio samples. + virtual EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) = 0; +}; + +void RunEndpointerEventsTest(FrameProcessor* processor) { + int16 samples[kFrameSize]; + + // We will create a white noise signal of 150 frames. The frames from 50 to + // 100 will have more power, and the endpointer should fire on those frames. + const int kNumFrames = 150; + + // Create a random sequence of samples. + srand(1); + float gain = 0.0; + int64 time = 0; + for (int frame_count = 0; frame_count < kNumFrames; ++frame_count) { + // The frames from 50 to 100 will have more power, and the endpointer + // should detect those frames as speech. + if ((frame_count >= 50) && (frame_count < 100)) { + gain = 2000.0; + } else { + gain = 1.0; + } + // Create random samples. + for (int i = 0; i < kFrameSize; ++i) { + float randNum = static_cast(rand() - (RAND_MAX / 2)) / + static_cast(RAND_MAX); + samples[i] = static_cast(gain * randNum); + } + + EpStatus ep_status = processor->ProcessFrame(time, samples, kFrameSize); + time += static_cast(kFrameSize * (1e6 / kSampleRate)); + + // Log the status. + if (20 == frame_count) + EXPECT_EQ(EP_PRE_SPEECH, ep_status); + if (70 == frame_count) + EXPECT_EQ(EP_SPEECH_PRESENT, ep_status); + if (120 == frame_count) + EXPECT_EQ(EP_PRE_SPEECH, ep_status); + } +} + +// This test instantiates and initializes a stand alone endpointer module. +// The test creates FrameData objects with random noise and send them +// to the endointer module. The energy of the first 50 frames is low, +// followed by 500 high energy frames, and another 50 low energy frames. +// We test that the correct start and end frames were detected. 
+class EnergyEndpointerFrameProcessor : public FrameProcessor { + public: + explicit EnergyEndpointerFrameProcessor(EnergyEndpointer* endpointer) + : endpointer_(endpointer) {} + + EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) { + endpointer_->ProcessAudioFrame(time, samples, kFrameSize); + int64 ep_time; + return endpointer_->Status(&ep_time); + } + + private: + EnergyEndpointer* endpointer_; +}; + +TEST(EndpointerTest, TestEnergyEndpointerEvents) { + // Initialize endpointer and configure it. We specify the parameters + // here for a 20ms window, and a 20ms step size, which corrsponds to + // the narrow band AMR codec. + EnergyEndpointerParams ep_config; + ep_config.set_frame_period(1.0f / static_cast(kFrameRate)); + ep_config.set_frame_duration(1.0f / static_cast(kFrameRate)); + ep_config.set_endpoint_margin(0.2f); + ep_config.set_onset_window(0.15f); + ep_config.set_speech_on_window(0.4f); + ep_config.set_offset_window(0.15f); + ep_config.set_onset_detect_dur(0.09f); + ep_config.set_onset_confirm_dur(0.075f); + ep_config.set_on_maintain_dur(0.10f); + ep_config.set_offset_confirm_dur(0.12f); + ep_config.set_decision_threshold(100.0f); + EnergyEndpointer endpointer; + endpointer.Init(ep_config); + + endpointer.StartSession(); + + EnergyEndpointerFrameProcessor frame_processor(&endpointer); + RunEndpointerEventsTest(&frame_processor); + + endpointer.EndSession(); +}; + +// Test endpointer wrapper class. +class EndpointerFrameProcessor : public FrameProcessor { + public: + explicit EndpointerFrameProcessor(Endpointer* endpointer) + : endpointer_(endpointer) {} + + EpStatus ProcessFrame(int64 time, int16* samples, int frame_size) { + endpointer_->ProcessAudio(samples, kFrameSize); + int64 ep_time; + return endpointer_->Status(&ep_time); + } + + private: + Endpointer* endpointer_; +}; + +TEST(EndpointerTest, TestEmbeddedEndpointerEvents) { + const int kSampleRate = 8000; // 8 k samples per second for AMR encoding. 
+ + Endpointer endpointer(kSampleRate); + const int64 kMillisecondsPerMicrosecond = 1000; + const int64 short_timeout = 300 * kMillisecondsPerMicrosecond; + endpointer.set_speech_input_possibly_complete_silence_length(short_timeout); + const int64 long_timeout = 500 * kMillisecondsPerMicrosecond; + endpointer.set_speech_input_complete_silence_length(long_timeout); + endpointer.StartSession(); + + EndpointerFrameProcessor frame_processor(&endpointer); + RunEndpointerEventsTest(&frame_processor); + + endpointer.EndSession(); +} + +} // namespace speech_input diff --git a/chrome/browser/speech/endpointer/energy_endpointer.cc b/chrome/browser/speech/endpointer/energy_endpointer.cc new file mode 100644 index 0000000..44ca4dd --- /dev/null +++ b/chrome/browser/speech/endpointer/energy_endpointer.cc @@ -0,0 +1,355 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// To know more about the algorithm used and the original code which this is +// based of, see +// https://wiki.corp.google.com/twiki/bin/view/Main/ChromeGoogleCodeXRef + +#include "chrome/browser/speech/endpointer/energy_endpointer.h" + +#include "base/logging.h" +#include +#include + +namespace { + +// Returns the RMS (quadratic mean) of the input signal. +float RMS(const int16* samples, int num_samples) { + int64 ssq_int64 = 0; + int64 sum_int64 = 0; + for (int i = 0; i < num_samples; ++i) { + sum_int64 += samples[i]; + ssq_int64 += samples[i] * samples[i]; + } + // now convert to floats. + double sum = static_cast(sum_int64); + sum /= num_samples; + double ssq = static_cast(ssq_int64); + return static_cast(sqrt((ssq / num_samples) - (sum * sum))); +} + +int64 Secs2Usecs(float seconds) { + return static_cast(0.5 + (1.0e6 * seconds)); +} + +} // namespace + +namespace speech_input { + +// Stores threshold-crossing histories for making decisions about the speech +// state. 
+class EnergyEndpointer::HistoryRing { + public: + HistoryRing() {} + + // Resets the ring to |size| elements each with state |initial_state| + void SetRing(int size, bool initial_state); + + // Inserts a new entry into the ring and drops the oldest entry. + void Insert(int64 time_us, bool decision); + + // Returns the time in microseconds of the most recently added entry. + int64 EndTime() const; + + // Returns the sum of all intervals during which 'decision' is true within + // the time in seconds specified by 'duration'. The returned interval is + // in seconds. + float RingSum(float duration_sec); + + private: + struct DecisionPoint { + int64 time_us; + bool decision; + }; + + std::vector<DecisionPoint> decision_points_; + int insertion_index_; // Index at which the next item gets added/inserted. + + DISALLOW_COPY_AND_ASSIGN(HistoryRing); +}; + +void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { + insertion_index_ = 0; + decision_points_.clear(); + DecisionPoint init = { -1, initial_state }; + decision_points_.resize(size, init); +} + +void EnergyEndpointer::HistoryRing::Insert(int64 time_us, bool decision) { + decision_points_[insertion_index_].time_us = time_us; + decision_points_[insertion_index_].decision = decision; + insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); +} + +int64 EnergyEndpointer::HistoryRing::EndTime() const { + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + return decision_points_[ind].time_us; +} + +float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { + if (!decision_points_.size()) + return 0.0; + + int64 sum_us = 0; + int ind = insertion_index_ - 1; + if (ind < 0) + ind = decision_points_.size() - 1; + int64 end_us = decision_points_[ind].time_us; + bool is_on = decision_points_[ind].decision; + int64 start_us = end_us - static_cast<int64>(0.5 + (1.0e6 * duration_sec)); + if (start_us < 0) + start_us = 0; + size_t n_summed = 1; // n points ==> (n-1) intervals + 
while ((decision_points_[ind].time_us > start_us) && + (n_summed < decision_points_.size())) { + --ind; + if (ind < 0) + ind = decision_points_.size() - 1; + if (is_on) + sum_us += end_us - decision_points_[ind].time_us; + is_on = decision_points_[ind].decision; + end_us = decision_points_[ind].time_us; + n_summed++; + } + + return 1.0e-6f * sum_us; // Returns total time that was super threshold. +} + +EnergyEndpointer::EnergyEndpointer() + : endpointer_time_us_(0), + max_window_dur_(4.0), + history_(new HistoryRing()) { +} + +EnergyEndpointer::~EnergyEndpointer() { +} + +int EnergyEndpointer::TimeToFrame(float time) const { + return static_cast(0.5 + (time / params_.frame_period())); +} + +void EnergyEndpointer::Restart(bool reset_threshold) { + status_ = EP_PRE_SPEECH; + user_input_start_time_us_ = 0; + + if (reset_threshold) { + decision_threshold_ = params_.decision_threshold(); + rms_adapt_ = decision_threshold_; + noise_level_ = params_.decision_threshold() / 2.0f; + frame_counter_ = 0; // Used for rapid initial update of levels. + } + + // Set up the memories to hold the history windows. + history_->SetRing(TimeToFrame(max_window_dur_), false); + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressed the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; +} + +void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { + params_ = params; + + // Find the longest history interval to be used, and make the ring + // large enough to accommodate that number of frames. NOTE: This + // depends upon ep_frame_period being set correctly in the factory + // that did this instantiation. 
 + max_window_dur_ = params_.onset_window(); + if (params_.speech_on_window() > max_window_dur_) + max_window_dur_ = params_.speech_on_window(); + if (params_.offset_window() > max_window_dur_) + max_window_dur_ = params_.offset_window(); + Restart(true); + + offset_confirm_dur_sec_ = params_.offset_window() - + params_.offset_confirm_dur(); + if (offset_confirm_dur_sec_ < 0.0) + offset_confirm_dur_sec_ = 0.0; + + user_input_start_time_us_ = 0; + + // Flag that indicates that current input should be used for + // estimating the environment. The user has not yet started input + // by e.g. pressing the push-to-talk button. By default, this is + // false for backward compatibility. + estimating_environment_ = false; + // The initial value of the noise and speech levels is inconsequential. + // The level of the first frame will overwrite these values. + noise_level_ = params_.decision_threshold() / 2.0f; + fast_update_frames_ = + static_cast<int64>(params_.fast_update_dur() / params_.frame_period()); + + frame_counter_ = 0; // Used for rapid initial update of levels. + + sample_rate_ = params_.sample_rate(); + start_lag_ = static_cast<int>(sample_rate_ / + params_.max_fundamental_frequency()); + end_lag_ = static_cast<int>(sample_rate_ / + params_.min_fundamental_frequency()); +} + +void EnergyEndpointer::StartSession() { + Restart(true); +} + +void EnergyEndpointer::EndSession() { + status_ = EP_POST_SPEECH; +} + +void EnergyEndpointer::SetEnvironmentEstimationMode() { + Restart(true); + estimating_environment_ = true; +} + +void EnergyEndpointer::SetUserInputMode() { + estimating_environment_ = false; + user_input_start_time_us_ = endpointer_time_us_; +} + +void EnergyEndpointer::ProcessAudioFrame(int64 time_us, + const int16* samples, + int num_samples) { + endpointer_time_us_ = time_us; + float rms = RMS(samples, num_samples); + + // Check that this is user input audio vs. pre-input adaptation audio. + // Input audio starts when the user indicates start of input, by e.g. 
+ // pressing push-to-talk. Audio recieved prior to that is used to update + // noise and speech level estimates. + if (!estimating_environment_) { + bool decision = false; + if ((endpointer_time_us_ - user_input_start_time_us_) < + Secs2Usecs(params_.contamination_rejection_period())) { + decision = false; + DLOG(INFO) << "decision: forced to false, time: " << endpointer_time_us_; + } else { + decision = (rms > decision_threshold_); + } + DLOG(INFO) << "endpointer_time: " << endpointer_time_us_ + << " user_input_start_time: " << user_input_start_time_us_ + << " FA reject period " + << Secs2Usecs(params_.contamination_rejection_period()) + << " decision: " << (decision ? "SPEECH +++" : "SIL ------"); + + history_->Insert(endpointer_time_us_, decision); + + switch (status_) { + case EP_PRE_SPEECH: + if (history_->RingSum(params_.onset_window()) > + params_.onset_detect_dur()) { + status_ = EP_POSSIBLE_ONSET; + } + break; + + case EP_POSSIBLE_ONSET: { + float tsum = history_->RingSum(params_.onset_window()); + if (tsum > params_.onset_confirm_dur()) { + status_ = EP_SPEECH_PRESENT; + } else { // If signal is not maintained, drop back to pre-speech. + if (tsum <= params_.onset_detect_dur()) + status_ = EP_PRE_SPEECH; + } + break; + } + + case EP_SPEECH_PRESENT: { + // To induce hysteresis in the state residency, we allow a + // smaller residency time in the on_ring, than was required to + // enter the SPEECH_PERSENT state. + float on_time = history_->RingSum(params_.speech_on_window()); + if (on_time < params_.on_maintain_dur()) + status_ = EP_POSSIBLE_OFFSET; + break; + } + + case EP_POSSIBLE_OFFSET: + if (history_->RingSum(params_.offset_window()) <= + offset_confirm_dur_sec_) { + // Note that this offset time may be beyond the end + // of the input buffer in a real-time system. It will be up + // to the RecognizerSession to decide what to do. + status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. 
 + } else { // If speech picks up again we allow return to SPEECH_PRESENT. + if (history_->RingSum(params_.speech_on_window()) >= + params_.on_maintain_dur()) + status_ = EP_SPEECH_PRESENT; + } + break; + + default: + LOG(WARNING) << "Invalid case in switch: " << status_; + break; + } + + // If this is a quiet, non-speech region, slowly adapt the detection + // threshold to be about 6dB above the average RMS. + if ((!decision) && (status_ == EP_PRE_SPEECH)) { + decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); + rms_adapt_ = decision_threshold_; + } else { + // If this is in a speech region, adapt the decision threshold to + // be about 10dB below the average RMS. If the noise level is high, + // the threshold is pushed up. + // Adaptation up to a higher level is 5 times faster than decay to + // a lower level. + if ((status_ == EP_SPEECH_PRESENT) && decision) { + if (rms_adapt_ > rms) { + rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); + } else { + rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); + } + float target_threshold = 0.3f * rms_adapt_ + noise_level_; + decision_threshold_ = (.90f * decision_threshold_) + + (0.10f * target_threshold); + } + } + + // Set a floor + if (decision_threshold_ < params_.min_decision_threshold()) + decision_threshold_ = params_.min_decision_threshold(); + } + + // Update speech and noise levels. + UpdateLevels(rms); + ++frame_counter_; +} + +void EnergyEndpointer::UpdateLevels(float rms) { + // Update quickly initially. We assume this is noise and that + // speech is 6dB above the noise. + if (frame_counter_ < fast_update_frames_) { + // Alpha increases from 0 to (k-1)/k where k is the number of time + // steps in the initial adaptation period. + float alpha = static_cast<float>(frame_counter_) / + static_cast<float>(fast_update_frames_); + noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); + DLOG(INFO) << "FAST UPDATE, frame_counter_ " << frame_counter_ + << "fast_update_frames_ " << fast_update_frames_; + } else { + // Update Noise level. The noise level adapts quickly downward, but + // slowly upward. The noise_level_ parameter is not currently used + // for threshold adaptation. It is used for UI feedback. + if (noise_level_ < rms) + noise_level_ = (0.999f * noise_level_) + (0.001f * rms); + else + noise_level_ = (0.95f * noise_level_) + (0.05f * rms); + } + if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { + decision_threshold_ = noise_level_ * 2; // 6dB above noise level. 
+ // Set a floor + if (decision_threshold_ < params_.min_decision_threshold()) + decision_threshold_ = params_.min_decision_threshold(); + } +} + +EpStatus EnergyEndpointer::Status(int64* status_time) const { + *status_time = history_->EndTime(); + return status_; +} + +} // namespace speech diff --git a/chrome/browser/speech/endpointer/energy_endpointer.h b/chrome/browser/speech/endpointer/energy_endpointer.h new file mode 100644 index 0000000..5a5c76f --- /dev/null +++ b/chrome/browser/speech/endpointer/energy_endpointer.h @@ -0,0 +1,144 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// The EnergyEndpointer class finds likely speech onset and offset points. +// +// The implementation described here is about the simplest possible. +// It is based on timings of threshold crossings for overall signal +// RMS. It is suitable for light weight applications. +// +// As written, the basic idea is that one specifies intervals that +// must be occupied by super- and sub-threshold energy levels, and +// defers decisions re onset and offset times until these +// specifications have been met. Three basic intervals are tested: an +// onset window, a speech-on window, and an offset window. We require +// super-threshold to exceed some mimimum total durations in the onset +// and speech-on windows before declaring the speech onset time, and +// we specify a required sub-threshold residency in the offset window +// before declaring speech offset. As the various residency requirements are +// met, the EnergyEndpointer instance assumes various states, and can return the +// ID of these states to the client (see EpStatus below). +// +// The levels of the speech and background noise are continuously updated. It is +// important that the background noise level be estimated initially for +// robustness in noisy conditions. 
The first frames are assumed to be background +// noise and a fast update rate is used for the noise level. The duration for +// fast update is controlled by the fast_update_dur_ paramter. +// +// If used in noisy conditions, the endpointer should be started and run in the +// EnvironmentEstimation mode, for at least 200ms, before switching to +// UserInputMode. +// Audio feedback contamination can appear in the input audio, if not cut +// out or handled by echo cancellation. Audio feedback can trigger a false +// accept. The false accepts can be ignored by setting +// ep_contamination_rejection_period. + +#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ +#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ + +#include "base/basictypes.h" +#include "base/scoped_ptr.h" +#include "chrome/browser/speech/endpointer/energy_endpointer_params.h" +#include + +namespace speech_input { + +// Endpointer status codes +enum EpStatus { + EP_PRE_SPEECH = 10, + EP_POSSIBLE_ONSET, + EP_SPEECH_PRESENT, + EP_POSSIBLE_OFFSET, + EP_POST_SPEECH, +}; + +class EnergyEndpointer { + public: + // The default construction MUST be followed by Init(), before any + // other use can be made of the instance. + EnergyEndpointer(); + virtual ~EnergyEndpointer(); + + void Init(const EnergyEndpointerParams& params); + + // Start the endpointer. This should be called at the beginning of a session. + void StartSession(); + + // Stop the endpointer. + void EndSession(); + + // Start environment estimation. Audio will be used for environment estimation + // i.e. noise level estimation. + void SetEnvironmentEstimationMode(); + + // Start user input. This should be called when the user indicates start of + // input, e.g. by pressing a button. + void SetUserInputMode(); + + // Computes the next input frame and modifies EnergyEndpointer status as + // appropriate based on the computation. 
+  void ProcessAudioFrame(int64 time_us, const int16* samples, int num_samples);
+
+  // Returns the current state of the EnergyEndpointer and the time
+  // corresponding to the most recently computed frame.
+  EpStatus Status(int64* status_time_us) const;
+
+ private:
+  class HistoryRing;
+
+  // Resets the endpointer internal state. If reset_threshold is true, the
+  // state will be reset completely, including adaptive thresholds and the
+  // removal of all history information.
+  void Restart(bool reset_threshold);
+
+  // Update internal speech and noise levels.
+  void UpdateLevels(float rms);
+
+  // Returns the number of frames (or frame number) corresponding to
+  // the 'time' (in seconds).
+  int TimeToFrame(float time) const;
+
+  EpStatus status_;  // The current state of this instance.
+  float offset_confirm_dur_sec_;  // max on time allowed to confirm POST_SPEECH
+  int64 endpointer_time_us_;  // Time of the most recently received audio frame.
+  int64 fast_update_frames_;  // Number of frames for initial level adaptation.
+  int64 frame_counter_;  // Number of frames seen. Used for initial adaptation.
+  float max_window_dur_;  // Largest search window size (seconds)
+  float sample_rate_;  // Sampling rate.
+
+  // Ring buffers to hold the speech activity history.
+  scoped_ptr<HistoryRing> history_;
+
+  // Configuration parameters.
+  EnergyEndpointerParams params_;
+
+  // RMS which must be exceeded to conclude frame is speech.
+  float decision_threshold_;
+
+  // Flag to indicate that audio should be used to estimate environment, prior
+  // to receiving user input.
+  bool estimating_environment_;
+
+  // Estimate of the background noise level. Used externally for UI feedback.
+  float noise_level_;
+
+  // An adaptive threshold used to update decision_threshold_ when appropriate.
+  float rms_adapt_;
+
+  // Start lag corresponds to the highest fundamental frequency.
+  int start_lag_;
+
+  // End lag corresponds to the lowest fundamental frequency.
+ int end_lag_; + + // Time when mode switched from environment estimation to user input. This + // is used to time forced rejection of audio feedback contamination. + int64 user_input_start_time_us_; + + DISALLOW_COPY_AND_ASSIGN(EnergyEndpointer); +}; + +} // namespace speech_input + +#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ diff --git a/chrome/browser/speech/endpointer/energy_endpointer_params.h b/chrome/browser/speech/endpointer/energy_endpointer_params.h new file mode 100644 index 0000000..c99ff99 --- /dev/null +++ b/chrome/browser/speech/endpointer/energy_endpointer_params.h @@ -0,0 +1,175 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ +#define CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ + +#include "base/basictypes.h" + +namespace speech_input { + +// Input parameters for the EnergyEndpointer class. 
+class EnergyEndpointerParams { + public: + EnergyEndpointerParams() { + SetDefaults(); + } + + void SetDefaults() { + frame_period_ = 0.01f; + frame_duration_ = 0.01f; + endpoint_margin_ = 0.2f; + onset_window_ = 0.15f; + speech_on_window_ = 0.4f; + offset_window_ = 0.15f; + onset_detect_dur_ = 0.09f; + onset_confirm_dur_ = 0.075f; + on_maintain_dur_ = 0.10f; + offset_confirm_dur_ = 0.12f; + decision_threshold_ = 150.0f; + min_decision_threshold_ = 50.0f; + fast_update_dur_ = 0.2f; + sample_rate_ = 8000.0f; + min_fundamental_frequency_ = 57.143f; + max_fundamental_frequency_ = 400.0f; + contamination_rejection_period_ = 0.25f; + } + + void operator=(const EnergyEndpointerParams& source) { + frame_period_ = source.frame_period(); + frame_duration_ = source.frame_duration(); + endpoint_margin_ = source.endpoint_margin(); + onset_window_ = source.onset_window(); + speech_on_window_ = source.speech_on_window(); + offset_window_ = source.offset_window(); + onset_detect_dur_ = source.onset_detect_dur(); + onset_confirm_dur_ = source.onset_confirm_dur(); + on_maintain_dur_ = source.on_maintain_dur(); + offset_confirm_dur_ = source.offset_confirm_dur(); + decision_threshold_ = source.decision_threshold(); + min_decision_threshold_ = source.min_decision_threshold(); + fast_update_dur_ = source.fast_update_dur(); + sample_rate_ = source.sample_rate(); + min_fundamental_frequency_ = source.min_fundamental_frequency(); + max_fundamental_frequency_ = source.max_fundamental_frequency(); + contamination_rejection_period_ = source.contamination_rejection_period(); + } + + // Accessors and mutators + float frame_period() const { return frame_period_; } + void set_frame_period(float frame_period) { + frame_period_ = frame_period; + } + + float frame_duration() const { return frame_duration_; } + void set_frame_duration(float frame_duration) { + frame_duration_ = frame_duration; + } + + float endpoint_margin() const { return endpoint_margin_; } + void set_endpoint_margin(float 
endpoint_margin) { + endpoint_margin_ = endpoint_margin; + } + + float onset_window() const { return onset_window_; } + void set_onset_window(float onset_window) { onset_window_ = onset_window; } + + float speech_on_window() const { return speech_on_window_; } + void set_speech_on_window(float speech_on_window) { + speech_on_window_ = speech_on_window; + } + + float offset_window() const { return offset_window_; } + void set_offset_window(float offset_window) { + offset_window_ = offset_window; + } + + float onset_detect_dur() const { return onset_detect_dur_; } + void set_onset_detect_dur(float onset_detect_dur) { + onset_detect_dur_ = onset_detect_dur; + } + + float onset_confirm_dur() const { return onset_confirm_dur_; } + void set_onset_confirm_dur(float onset_confirm_dur) { + onset_confirm_dur_ = onset_confirm_dur; + } + + float on_maintain_dur() const { return on_maintain_dur_; } + void set_on_maintain_dur(float on_maintain_dur) { + on_maintain_dur_ = on_maintain_dur; + } + + float offset_confirm_dur() const { return offset_confirm_dur_; } + void set_offset_confirm_dur(float offset_confirm_dur) { + offset_confirm_dur_ = offset_confirm_dur; + } + + float decision_threshold() const { return decision_threshold_; } + void set_decision_threshold(float decision_threshold) { + decision_threshold_ = decision_threshold; + } + + float min_decision_threshold() const { return min_decision_threshold_; } + void set_min_decision_threshold(float min_decision_threshold) { + min_decision_threshold_ = min_decision_threshold; + } + + float fast_update_dur() const { return fast_update_dur_; } + void set_fast_update_dur(float fast_update_dur) { + fast_update_dur_ = fast_update_dur; + } + + float sample_rate() const { return sample_rate_; } + void set_sample_rate(float sample_rate) { sample_rate_ = sample_rate; } + + float min_fundamental_frequency() const { return min_fundamental_frequency_; } + void set_min_fundamental_frequency(float min_fundamental_frequency) { + 
min_fundamental_frequency_ = min_fundamental_frequency;
+  }
+
+  float max_fundamental_frequency() const { return max_fundamental_frequency_; }
+  void set_max_fundamental_frequency(float max_fundamental_frequency) {
+    max_fundamental_frequency_ = max_fundamental_frequency;
+  }
+
+  float contamination_rejection_period() const {
+    return contamination_rejection_period_;
+  }
+  void set_contamination_rejection_period(
+      float contamination_rejection_period) {
+    contamination_rejection_period_ = contamination_rejection_period;
+  }
+
+ private:
+  float frame_period_;  // Frame period
+  float frame_duration_;  // Window size
+  float onset_window_;  // Interval scanned for onset activity
+  float speech_on_window_;  // Interval scanned for ongoing speech
+  float offset_window_;  // Interval scanned for offset evidence
+  float offset_confirm_dur_;  // Silence duration required to confirm offset
+  float decision_threshold_;  // Initial rms detection threshold
+  float min_decision_threshold_;  // Minimum rms detection threshold
+  float fast_update_dur_;  // Period for initial estimation of levels.
+  float sample_rate_;  // Expected sample rate.
+
+  // Time to add on either side of endpoint threshold crossings
+  float endpoint_margin_;
+  // Total dur within onset_window required to enter ONSET state
+  float onset_detect_dur_;
+  // Total on time within onset_window required to enter SPEECH_ON state
+  float onset_confirm_dur_;
+  // Minimum dur in SPEECH_ON state required to maintain ON state
+  float on_maintain_dur_;
+  // Minimum fundamental frequency for autocorrelation.
+  float min_fundamental_frequency_;
+  // Maximum fundamental frequency for autocorrelation.
+  float max_fundamental_frequency_;
+  // Period after start of user input that above threshold values are ignored.
+  // This is to reject audio feedback contamination.
+ float contamination_rejection_period_; +}; + +} // namespace speech_input + +#endif // CHROME_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_PARAMS_H_ diff --git a/chrome/browser/speech/speech_recognizer.cc b/chrome/browser/speech/speech_recognizer.cc index 2e8f353..0df671c 100644 --- a/chrome/browser/speech/speech_recognizer.cc +++ b/chrome/browser/speech/speech_recognizer.cc @@ -6,6 +6,7 @@ #include "base/ref_counted.h" #include "base/scoped_ptr.h" +#include "base/time.h" #include "chrome/browser/chrome_thread.h" #include "chrome/browser/profile.h" #include "chrome/common/net/url_request_context_getter.h" @@ -43,10 +44,11 @@ class SpeexEncoder { int samples_per_frame() const { return samples_per_frame_; } - // Encodes each frame of raw audio in |raw_samples| and adds the + // Encodes each frame of raw audio in |samples| and adds the // encoded frames as a set of strings to the |encoded_frames| list. // Ownership of the newly added strings is transferred to the caller. - void Encode(const string& raw_samples, + void Encode(const short* samples, + int num_samples, std::list* encoded_frames); private: @@ -73,12 +75,9 @@ SpeexEncoder::~SpeexEncoder() { speex_encoder_destroy(encoder_state_); } -void SpeexEncoder::Encode(const string& raw_samples, +void SpeexEncoder::Encode(const short* samples, + int num_samples, std::list* encoded_frames) { - const short* samples = reinterpret_cast(raw_samples.data()); - DCHECK((raw_samples.length() % sizeof(short)) == 0); - int num_samples = raw_samples.length() / sizeof(short); - // Drop incomplete frames, typically those which come in when recording stops. 
num_samples -= (num_samples % samples_per_frame_); for (int i = 0; i < num_samples; i += samples_per_frame_) { @@ -100,7 +99,14 @@ SpeechRecognizer::SpeechRecognizer(Delegate* delegate, const SpeechInputCallerId& caller_id) : delegate_(delegate), caller_id_(caller_id), - encoder_(new SpeexEncoder()) { + encoder_(new SpeexEncoder()), + endpointer_(kAudioSampleRate) { + endpointer_.set_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond / 2); + endpointer_.set_long_speech_input_complete_silence_length( + base::Time::kMicrosecondsPerSecond); + endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond); + endpointer_.StartSession(); } SpeechRecognizer::~SpeechRecognizer() { @@ -109,6 +115,7 @@ SpeechRecognizer::~SpeechRecognizer() { DCHECK(!audio_controller_.get()); DCHECK(!request_.get() || !request_->HasPendingRequest()); DCHECK(audio_buffers_.empty()); + endpointer_.EndSession(); } bool SpeechRecognizer::StartRecording() { @@ -116,6 +123,16 @@ bool SpeechRecognizer::StartRecording() { DCHECK(!audio_controller_.get()); DCHECK(!request_.get() || !request_->HasPendingRequest()); + // TODO(satish): Normally for a short time (even 0.5s) the endpointer needs to + // estimate the environment/background noise before starting to treat the + // audio as user input. Once we have implemented a popup UI to notify the user + // that recording has started, we should perhaps have a short interval where + // we record background audio and then show the popup UI so that the user can + // start speaking after that. For now we just do these together so there isn't + // any background noise for the end pointer (still works ok). 
+ endpointer_.SetEnvironmentEstimationMode(); + endpointer_.SetUserInputMode(); + int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000; DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0); audio_controller_ = AudioInputController::Create(this, @@ -156,6 +173,7 @@ void SpeechRecognizer::StopRecording() { LOG(INFO) << "SpeechRecognizer stopping record."; audio_controller_->Close(); audio_controller_ = NULL; // Releases the ref ptr. + delegate_->DidCompleteRecording(caller_id_); // If we haven't got any audio yet end the recognition sequence here. @@ -240,9 +258,18 @@ void SpeechRecognizer::HandleOnData(string* data) { return; } - encoder_->Encode(*data, &audio_buffers_); + const short* samples = reinterpret_cast(data->data()); + DCHECK((data->length() % sizeof(short)) == 0); + int num_samples = data->length() / sizeof(short); + + encoder_->Encode(samples, num_samples, &audio_buffers_); + endpointer_.ProcessAudio(samples, num_samples); delete data; + if (endpointer_.speech_input_complete()) { + StopRecording(); + } + // TODO(satish): Once we have streaming POST, start sending the data received // here as POST chunks. } diff --git a/chrome/browser/speech/speech_recognizer.h b/chrome/browser/speech/speech_recognizer.h index 5d51b02..2a298f3 100644 --- a/chrome/browser/speech/speech_recognizer.h +++ b/chrome/browser/speech/speech_recognizer.h @@ -9,6 +9,7 @@ #include "base/scoped_ptr.h" #include "media/audio/audio_input_controller.h" #include "chrome/browser/speech/speech_recognition_request.h" +#include "chrome/browser/speech/endpointer/endpointer.h" #include #include #include @@ -95,6 +96,7 @@ class SpeechRecognizer scoped_ptr request_; scoped_refptr audio_controller_; scoped_ptr encoder_; + Endpointer endpointer_; DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer); }; -- cgit v1.1