// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/speech/speech_recognizer.h"

#include <string.h>

#include <algorithm>
#include <list>
#include <string>

#include "base/ref_counted.h"
#include "base/scoped_ptr.h"
#include "base/time.h"
#include "chrome/browser/browser_thread.h"
#include "chrome/browser/profiles/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
#include "third_party/speex/include/speex/speex.h"

using media::AudioInputController;
using std::list;
using std::string;

namespace {

const char* const kContentTypeSpeex =
    "audio/x-speex-with-header-byte; rate=16000";
const int kSpeexEncodingQuality = 8;
const int kMaxSpeexFrameLength = 110;  // (44kbps rate sampled at 32kHz).

// Since the frame length gets written out as a byte in the encoded packet,
// make sure it is within the byte range.
COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when the new volume is greater than the previous level.
const float kUpSmoothingFactor = 0.9f;
// Multiplier used when the new volume is less than the previous level.
const float kDownSmoothingFactor = 0.4f;
const float kAudioMeterMinDb = 10.0f;  // Lower bar for the volume meter.
const float kAudioMeterDbRange = 25.0f;

}  // namespace

namespace speech_input {

const int SpeechRecognizer::kAudioSampleRate = 16000;
const int SpeechRecognizer::kAudioPacketIntervalMs = 100;
const int SpeechRecognizer::kNumAudioChannels = 1;
const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;

// Provides a simple interface to encode raw audio using the Speex codec.
class SpeexEncoder {
 public:
  SpeexEncoder();
  ~SpeexEncoder();

  int samples_per_frame() const { return samples_per_frame_; }

  // Encodes each frame of raw audio in |samples| and adds the
  // encoded frames as a set of strings to the |encoded_frames| list.
  // Ownership of the newly added strings is transferred to the caller.
  void Encode(const short* samples,
              int num_samples,
              std::list<std::string*>* encoded_frames);

 private:
  SpeexBits bits_;
  void* encoder_state_;
  int samples_per_frame_;
  char encoded_frame_data_[kMaxSpeexFrameLength + 1];  // +1 for the frame size.
};

SpeexEncoder::SpeexEncoder() {
  // speex_bits_init() does not initialize all of the |bits_| struct.
  memset(&bits_, 0, sizeof(bits_));
  speex_bits_init(&bits_);
  encoder_state_ = speex_encoder_init(&speex_wb_mode);
  DCHECK(encoder_state_);
  speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
  DCHECK(samples_per_frame_ > 0);
  int quality = kSpeexEncodingQuality;
  speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
  int vbr = 1;
  speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
  memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
}

SpeexEncoder::~SpeexEncoder() {
  speex_bits_destroy(&bits_);
  speex_encoder_destroy(encoder_state_);
}
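
// For the wideband mode selected above, Speex reports a frame size of 320
// samples (20 ms at 16 kHz), so each 100 ms packet of 1600 samples delivered
// by the audio controller encodes to exactly five frames. StartRecording()
// below DCHECKs this alignment rather than assuming it.
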
void SpeexEncoder::Encode(const short* samples,
                          int num_samples,
                          std::list<std::string*>* encoded_frames) {
  // Drop incomplete frames, typically those which come in when recording
  // stops.
  num_samples -= (num_samples % samples_per_frame_);
  for (int i = 0; i < num_samples; i += samples_per_frame_) {
    speex_bits_reset(&bits_);
    speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
                     &bits_);

    // Encode the frame and place the size of the frame as the first byte.
    // This is the packet format for MIME type x-speex-with-header-byte.
    int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
                                        kMaxSpeexFrameLength);
    encoded_frame_data_[0] = static_cast<char>(frame_length);
    encoded_frames->push_back(new string(encoded_frame_data_,
                                         frame_length + 1));
  }
}

SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
                                   int caller_id,
                                   const std::string& language,
                                   const std::string& grammar,
                                   const std::string& hardware_info)
    : delegate_(delegate),
      caller_id_(caller_id),
      language_(language),
      grammar_(grammar),
      hardware_info_(hardware_info),
      encoder_(new SpeexEncoder()),
      endpointer_(kAudioSampleRate),
      num_samples_recorded_(0),
      audio_level_(0.0f) {
  // Silence lengths that the endpointer treats as end-of-input: 0.5 s for
  // short utterances, 1 s once the utterance has run longer than 3 s.
  endpointer_.set_speech_input_complete_silence_length(
      base::Time::kMicrosecondsPerSecond / 2);
  endpointer_.set_long_speech_input_complete_silence_length(
      base::Time::kMicrosecondsPerSecond);
  endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  endpointer_.StartSession();
}

SpeechRecognizer::~SpeechRecognizer() {
  // Recording should have stopped earlier due to the endpointer or
  // |StopRecording| being called.
  DCHECK(!audio_controller_.get());
  DCHECK(!request_.get() || !request_->HasPendingRequest());
  DCHECK(audio_buffers_.empty());
  endpointer_.EndSession();
}

bool SpeechRecognizer::StartRecording() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK(!audio_controller_.get());
  DCHECK(!request_.get() || !request_->HasPendingRequest());

  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. In |HandleOnData| we wait
  // until such time has passed before switching to user input mode.
  endpointer_.SetEnvironmentEstimationMode();

  int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
  DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
  AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
                         kAudioSampleRate, kNumBitsPerAudioSample,
                         samples_per_packet);
  audio_controller_ = AudioInputController::Create(this, params);
  DCHECK(audio_controller_.get());
  VLOG(1) << "SpeechRecognizer starting record.";
  num_samples_recorded_ = 0;
  audio_controller_->Record();
  return true;
}

void SpeechRecognizer::CancelRecognition() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK(audio_controller_.get() || request_.get());

  // Stop recording if required.
  if (audio_controller_.get()) {
    VLOG(1) << "SpeechRecognizer stopping record.";
    audio_controller_->Close();
    audio_controller_ = NULL;  // Releases the ref ptr.
  }

  VLOG(1) << "SpeechRecognizer canceling recognition.";
  ReleaseAudioBuffers();
  request_.reset();
}
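
// StopRecording() below concatenates every encoded packet into one POST body.
// A minimal sketch of how a receiver could walk that body, given the
// x-speex-with-header-byte framing produced by SpeexEncoder::Encode()
// (illustrative only; the actual server-side parsing is assumed, not part of
// this file):
//
//   const char* p = data.data();
//   const char* end = p + data.size();
//   while (p < end) {
//     const int frame_length = static_cast<unsigned char>(*p++);
//     // |p| now points at |frame_length| bytes of Speex data for one frame.
//     p += frame_length;
//   }
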
void SpeechRecognizer::StopRecording() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

  // If audio recording has already stopped and we are in the recognition
  // phase, silently ignore any more calls to stop recording.
  if (!audio_controller_.get())
    return;

  VLOG(1) << "SpeechRecognizer stopping record.";
  audio_controller_->Close();
  audio_controller_ = NULL;  // Releases the ref ptr.
  delegate_->DidCompleteRecording(caller_id_);

  // If we haven't got any audio yet, end the recognition sequence here.
  if (audio_buffers_.empty()) {
    // Guard against the delegate freeing us until we finish our job.
    scoped_refptr<SpeechRecognizer> me(this);
    delegate_->DidCompleteRecognition(caller_id_);
    return;
  }

  // We now have recorded audio in our buffers, so start a recognition
  // request. Since the http request takes a single string as POST data,
  // allocate one and copy over bytes from the audio buffers to the string.
  int audio_buffer_length = 0;
  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
       it != audio_buffers_.end(); ++it) {
    audio_buffer_length += (*it)->length();
  }
  string data;
  data.reserve(audio_buffer_length);
  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
       it != audio_buffers_.end(); ++it) {
    data.append(*(*it));
  }

  DCHECK(!request_.get());
  request_.reset(new SpeechRecognitionRequest(
      Profile::GetDefaultRequestContext(), this));
  request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data);
  ReleaseAudioBuffers();  // No need to keep the audio anymore.
}

void SpeechRecognizer::ReleaseAudioBuffers() {
  for (AudioBufferQueue::iterator it = audio_buffers_.begin();
       it != audio_buffers_.end(); ++it)
    delete *it;
  audio_buffers_.clear();
}

// Invoked on the audio thread.
void SpeechRecognizer::OnError(AudioInputController* controller,
                               int error_code) {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          NewRunnableMethod(this,
                                            &SpeechRecognizer::HandleOnError,
                                            error_code));
}

void SpeechRecognizer::HandleOnError(int error_code) {
  LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;

  // Check if we are still recording before canceling recognition, as
  // recording might have been stopped after this error was posted to the
  // queue by |OnError|.
  if (!audio_controller_.get())
    return;
  InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
}

void SpeechRecognizer::OnData(AudioInputController* controller,
                              const uint8* data, uint32 size) {
  if (size == 0)  // This could happen when recording stops and is normal.
    return;

  string* str_data = new string(reinterpret_cast<const char*>(data), size);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          NewRunnableMethod(this,
                                            &SpeechRecognizer::HandleOnData,
                                            str_data));
}
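
// With the constants defined above, HandleOnData() leaves environment
// estimation after 300 ms of audio (300 * 16000 / 1000 = 4800 samples) and
// reports RECOGNIZER_ERROR_NO_SPEECH if no speech has been detected within
// the first 8 s (8 * 16000 = 128000 samples).
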
void SpeechRecognizer::HandleOnData(string* data) {
  // Check if we are still recording and if not, discard this buffer, as
  // recording might have been stopped after this buffer was posted to the
  // queue by |OnData|.
  if (!audio_controller_.get()) {
    delete data;
    return;
  }

  const short* samples = reinterpret_cast<const short*>(data->data());
  DCHECK((data->length() % sizeof(short)) == 0);
  int num_samples = data->length() / sizeof(short);

  encoder_->Encode(samples, num_samples, &audio_buffers_);
  float rms;
  endpointer_.ProcessAudio(samples, num_samples, &rms);
  delete data;
  num_samples_recorded_ += num_samples;

  if (endpointer_.IsEstimatingEnvironment()) {
    // Check if we have gathered enough audio for the endpointer to do
    // environment estimation and should move on to detecting speech/end of
    // speech.
    if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
                                  kAudioSampleRate) / 1000) {
      endpointer_.SetUserInputMode();
      delegate_->DidCompleteEnvironmentEstimation(caller_id_);
    }
    return;  // No more processing since we are still estimating environment.
  }

  // Check if we have waited too long without hearing any speech.
  if (!endpointer_.DidStartReceivingSpeech() &&
      num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
    return;
  }

  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange;
  level = std::min(std::max(0.0f, level), 1.0f);
  if (level > audio_level_) {
    audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
  } else {
    audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
  }
  delegate_->SetInputVolume(caller_id_, audio_level_);

  if (endpointer_.speech_input_complete()) {
    StopRecording();
  }

  // TODO(satish): Once we have streaming POST, start sending the data
  // received here as POST chunks.
}

void SpeechRecognizer::SetRecognitionResult(
    bool error, const SpeechInputResultArray& result) {
  if (result.empty()) {
    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);
    return;
  }
  delegate_->SetRecognitionResult(caller_id_, error, result);

  // Guard against the delegate freeing us until we finish our job.
  scoped_refptr<SpeechRecognizer> me(this);
  delegate_->DidCompleteRecognition(caller_id_);
}

void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
  CancelRecognition();

  // Guard against the delegate freeing us until we finish our job.
  scoped_refptr<SpeechRecognizer> me(this);
  delegate_->OnRecognizerError(caller_id_, error);
}

}  // namespace speech_input
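
// A minimal sketch of how an owner might drive this class, assuming a
// Delegate implementation |delegate| (hypothetical; the real caller is the
// speech input manager):
//
//   scoped_refptr<speech_input::SpeechRecognizer> recognizer(
//       new speech_input::SpeechRecognizer(delegate, caller_id, language,
//                                          grammar, hardware_info));
//   recognizer->StartRecording();
//   // Audio arrives via OnData() on the audio thread and is handled on the
//   // IO thread; recording ends via StopRecording() or the endpointer, and
//   // results arrive through Delegate::SetRecognitionResult().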