17 files changed, 1304 insertions, 36 deletions
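
The core of this patch is the result streaming protocol: the downstream carries a sequence of protobuf (HttpStreamingResult) messages, each preceded by a 4-byte big-endian length prefix, transported over HTTP chunked transfer. The ChunkedByteBuffer class used below reassembles those messages across HTTP chunk boundaries. As a rough orientation sketch of the framing only (illustrative names, not code from this patch):

#include <cstdint>
#include <string>

// Pops one length-prefixed message out of |buffer|, if one is fully
// available, erasing it from the buffer. Returns false if more bytes are
// needed. A sketch of the framing; in the patch the real work is done by
// ChunkedByteBuffer.
bool PopFramedMessage(std::string* buffer, std::string* message) {
  if (buffer->size() < 4)
    return false;  // The 4-byte length prefix has not arrived yet.
  const uint8_t* p = reinterpret_cast<const uint8_t*>(buffer->data());
  const uint32_t length = (static_cast<uint32_t>(p[0]) << 24) |
                          (static_cast<uint32_t>(p[1]) << 16) |
                          (static_cast<uint32_t>(p[2]) << 8) |
                          static_cast<uint32_t>(p[3]);
  const size_t total = 4 + static_cast<size_t>(length);
  if (buffer->size() < total)
    return false;  // The message body has not fully arrived yet.
  message->assign(*buffer, 4, length);
  buffer->erase(0, total);
  return true;
}
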
diff --git a/content/browser/speech/chunked_byte_buffer.cc b/content/browser/speech/chunked_byte_buffer.cc index 123755a..115fa82 100644 --- a/content/browser/speech/chunked_byte_buffer.cc +++ b/content/browser/speech/chunked_byte_buffer.cc @@ -38,7 +38,6 @@ ChunkedByteBuffer::~ChunkedByteBuffer() { } void ChunkedByteBuffer::Append(const uint8* start, size_t length) { - DCHECK(length > 0); size_t remaining_bytes = length; const uint8* next_data = start; diff --git a/content/browser/speech/google_streaming_remote_engine.cc b/content/browser/speech/google_streaming_remote_engine.cc new file mode 100644 index 0000000..34ff853 --- /dev/null +++ b/content/browser/speech/google_streaming_remote_engine.cc @@ -0,0 +1,586 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "content/browser/speech/google_streaming_remote_engine.h" + +#include <vector> + +#include "base/bind.h" +#include "base/command_line.h" +#include "base/rand_util.h" +#include "base/string_number_conversions.h" +#include "base/string_util.h" +#include "base/time.h" +#include "base/utf_string_conversions.h" +#include "content/browser/speech/audio_buffer.h" +#include "content/browser/speech/proto/google_streaming_api.pb.h" +#include "content/public/browser/browser_thread.h" +#include "content/public/common/content_switches.h" +#include "content/public/common/speech_recognition_error.h" +#include "content/public/common/speech_recognition_result.h" +#include "net/base/escape.h" +#include "net/base/load_flags.h" +#include "net/url_request/url_fetcher.h" +#include "net/url_request/url_request_context.h" +#include "net/url_request/url_request_context_getter.h" +#include "net/url_request/url_request_status.h" + +using content::BrowserThread; +using content::SpeechRecognitionError; +using content::SpeechRecognitionErrorCode; +using content::SpeechRecognitionHypothesis; +using content::SpeechRecognitionResult; +using net::URLFetcher; + +namespace { + +// TODO(primiano): This shouldn't be a const, rather it should be taken from +// maxNBest property (which is not yet implemented in WebKit). +const int kMaxResults = 5; +const char kDownstreamUrl[] = "/down?"; +const char kUpstreamUrl[] = "/up?"; +const int kAudioPacketIntervalMs = 100; +const speech::AudioEncoder::Codec kDefaultAudioCodec = + speech::AudioEncoder::CODEC_FLAC; + +// TODO(primiano): /////////// Remove this after debug stage. 
///////////////// +void DumpResponse(const std::string& response) { + bool parse_ok; + speech::HttpStreamingResult res; + parse_ok = res.ParseFromString(response); + DVLOG(1) << "------------"; + if(!parse_ok) { + DVLOG(1) << "Parse failed!"; + return; + } + if (res.has_id()) + DVLOG(1) << "ID\t" << res.id(); + if (res.has_status()) + DVLOG(1) << "STATUS\t" << res.status(); + if (res.has_upstream_connected()) + DVLOG(1) << "UPCON\t" << res.upstream_connected(); + if (res.hypotheses_size() > 0) + DVLOG(1) << "HYPS\t" << res.hypotheses_size(); + if (res.has_provisional()) + DVLOG(1) << "PROV\t" << res.provisional(); + if (res.has_ephemeral()) + DVLOG(1) << "EPHM\t" << res.ephemeral(); + DVLOG(1) << "------------\n"; +} + +std::string GetWebserviceBaseURL() { + const CommandLine& command_line = *CommandLine::ForCurrentProcess(); + return command_line.GetSwitchValueASCII( + switches::kSpeechRecognitionWebserviceURL); +} + +std::string GetWebserviceKey() { + const CommandLine& command_line = *CommandLine::ForCurrentProcess(); + return command_line.GetSwitchValueASCII( + switches::kSpeechRecognitionWebserviceKey); +} + +} // namespace + +namespace speech { + +const int GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTests = 0; +const int GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTests = 1; +const int GoogleStreamingRemoteEngine::kWebserviceStatusNoError = 0; +const int GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch = 5; + +GoogleStreamingRemoteEngine::GoogleStreamingRemoteEngine( + net::URLRequestContextGetter* context) + : url_context_(context), + encoder_(NULL), + previous_response_length_(0), + got_last_definitive_result_(false), + is_dispatching_event_(false), + state_(STATE_IDLE) { +} + +GoogleStreamingRemoteEngine::~GoogleStreamingRemoteEngine() {} + +void GoogleStreamingRemoteEngine::SetConfig( + const SpeechRecognitionEngineConfig& config) { + config_ = config; +} + +void GoogleStreamingRemoteEngine::StartRecognition() { + FSMEventArgs event_args(EVENT_START_RECOGNITION); + DispatchEvent(event_args); +} + +void GoogleStreamingRemoteEngine::EndRecognition() { + FSMEventArgs event_args(EVENT_END_RECOGNITION); + DispatchEvent(event_args); +} + +void GoogleStreamingRemoteEngine::TakeAudioChunk(const AudioChunk& data) { + FSMEventArgs event_args(EVENT_AUDIO_CHUNK); + event_args.audio_data = &data; + DispatchEvent(event_args); +} + +void GoogleStreamingRemoteEngine::AudioChunksEnded() { + FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED); + DispatchEvent(event_args); +} + +void GoogleStreamingRemoteEngine::OnURLFetchComplete(const URLFetcher* source) { + const bool kResponseComplete = true; + DispatchHTTPResponse(source, kResponseComplete); +} + +void GoogleStreamingRemoteEngine::OnURLFetchDownloadProgress( + const URLFetcher* source, int64 current, int64 total) { + const bool kPartialResponse = false; + DispatchHTTPResponse(source, kPartialResponse); +} + +void GoogleStreamingRemoteEngine::DispatchHTTPResponse(const URLFetcher* source, + bool end_of_response) { + DCHECK(CalledOnValidThread()); + DCHECK(source); + const bool response_is_good = source->GetStatus().is_success() && + source->GetResponseCode() == 200; + std::string response; + if (response_is_good) + source->GetResponseAsString(&response); + const size_t current_response_length = response.size(); + + // TODO(primiano): /////////// Remove this after debug stage. //////////////// + DVLOG(1) << (source == downstream_fetcher_.get() ? 
"Downstream" : "Upstream") + << "HTTP, code: " << source->GetResponseCode() + << " length: " << current_response_length + << " eor: " << end_of_response; + + // URLFetcher provides always the entire response buffer, but we are only + // interested in the fresh data introduced by the last chunk. Therefore, we + // drop the previous content we have already processed. + if (current_response_length != 0) { + DCHECK_GE(current_response_length, previous_response_length_); + response.erase(0, previous_response_length_); + previous_response_length_ = current_response_length; + } + + if (!response_is_good && source == downstream_fetcher_.get()) { + // TODO(primiano): /////////// Remove this after debug stage. ///////////// + DVLOG(1) << "Downstream error " << source->GetResponseCode(); + FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR); + DispatchEvent(event_args); + return; + } + if (!response_is_good && source == upstream_fetcher_.get()) { + // TODO(primiano): /////////// Remove this after debug stage. ///////////// + DVLOG(1) << "Upstream error " << source->GetResponseCode() + << " EOR " << end_of_response; + FSMEventArgs event_args(EVENT_UPSTREAM_ERROR); + DispatchEvent(event_args); + return; + } + if (source == upstream_fetcher_.get()) + return; + + DCHECK(response_is_good && source == downstream_fetcher_.get()); + + // The downstream response is organized in chunks, whose size is determined + // by a 4 bytes prefix, transparently handled by the ChunkedByteBuffer class. + // Such chunks are sent by the speech recognition webservice over the HTTP + // downstream channel using HTTP chunked transfer (unrelated to our chunks). + // This function is called every time an HTTP chunk is received by the + // url fetcher. However there isn't any particular matching beween our + // protocol chunks and HTTP chunks, in the sense that a single HTTP chunk can + // contain a portion of one chunk or even more chunks together. + chunked_byte_buffer_.Append(response); + + // A single HTTP chunk can contain more than one data chunk, thus the while. + while (chunked_byte_buffer_.HasChunks()) { + FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE); + event_args.response = chunked_byte_buffer_.PopChunk(); + DCHECK(event_args.response.get()); + // TODO(primiano): /////////// Remove this after debug stage. ///////////// + DumpResponse(std::string((char*)&(*event_args.response->begin()), + event_args.response->size())); + DispatchEvent(event_args); + } + if (end_of_response) { + FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED); + DispatchEvent(event_args); + } +} + +bool GoogleStreamingRemoteEngine::IsRecognitionPending() const { + DCHECK(CalledOnValidThread()); + return state_ != STATE_IDLE; +} + +int GoogleStreamingRemoteEngine::GetDesiredAudioChunkDurationMs() const { + return kAudioPacketIntervalMs; +} + +// ----------------------- Core FSM implementation --------------------------- + +void GoogleStreamingRemoteEngine::DispatchEvent( + const FSMEventArgs& event_args) { + DCHECK(CalledOnValidThread()); + DCHECK_LE(event_args.event, EVENT_MAX_VALUE); + DCHECK_LE(state_, STATE_MAX_VALUE); + + // Event dispatching must be sequential, otherwise it will break all the rules + // and the assumptions of the finite state automata model. + DCHECK(!is_dispatching_event_); + is_dispatching_event_ = true; + + // TODO(primiano): /////////// Remove this after debug stage. 
//////////////// + //DVLOG(1) << "State " << state_ << ", Event " << event_args.event; + state_ = ExecuteTransitionAndGetNextState(event_args); + + is_dispatching_event_ = false; +} + +GoogleStreamingRemoteEngine::FSMState +GoogleStreamingRemoteEngine::ExecuteTransitionAndGetNextState( + const FSMEventArgs& event_args) { + const FSMEvent event = event_args.event; + switch (state_) { + case STATE_IDLE: + switch (event) { + case EVENT_START_RECOGNITION: + return ConnectBothStreams(event_args); + case EVENT_END_RECOGNITION: + // Note AUDIO_CHUNK and AUDIO_END events can remain enqueued in case of + // abort, so we just silently drop them here. + case EVENT_AUDIO_CHUNK: + case EVENT_AUDIO_CHUNKS_ENDED: + // DOWNSTREAM_CLOSED can be received if we end up here due to an error. + case EVENT_DOWNSTREAM_CLOSED: + return DoNothing(event_args); + case EVENT_UPSTREAM_ERROR: + case EVENT_DOWNSTREAM_ERROR: + case EVENT_DOWNSTREAM_RESPONSE: + return NotFeasible(event_args); + } + break; + case STATE_BOTH_STREAMS_CONNECTED: + switch (event) { + case EVENT_AUDIO_CHUNK: + return TransmitAudioUpstream(event_args); + case EVENT_DOWNSTREAM_RESPONSE: + return ProcessDownstreamResponse(event_args); + case EVENT_AUDIO_CHUNKS_ENDED: + return CloseUpstreamAndWaitForResults(event_args); + case EVENT_END_RECOGNITION: + return AbortSilently(event_args); + case EVENT_UPSTREAM_ERROR: + case EVENT_DOWNSTREAM_ERROR: + case EVENT_DOWNSTREAM_CLOSED: + return AbortWithError(event_args); + case EVENT_START_RECOGNITION: + return NotFeasible(event_args); + } + break; + case STATE_WAITING_DOWNSTREAM_RESULTS: + switch (event) { + case EVENT_DOWNSTREAM_RESPONSE: + return ProcessDownstreamResponse(event_args); + case EVENT_DOWNSTREAM_CLOSED: + return RaiseNoMatchErrorIfGotNoResults(event_args); + case EVENT_END_RECOGNITION: + return AbortSilently(event_args); + case EVENT_UPSTREAM_ERROR: + case EVENT_DOWNSTREAM_ERROR: + return AbortWithError(event_args); + case EVENT_START_RECOGNITION: + case EVENT_AUDIO_CHUNK: + case EVENT_AUDIO_CHUNKS_ENDED: + return NotFeasible(event_args); + } + break; + } + return NotFeasible(event_args); +} + +// ----------- Contract for all the FSM evolution functions below ------------- +// - Are guaranteed to be executed in the same thread (IO, except for tests); +// - Are guaranteed to be not reentrant (themselves and each other); +// - event_args members are guaranteed to be stable during the call; + +GoogleStreamingRemoteEngine::FSMState +GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) { + DCHECK(!upstream_fetcher_.get()); + DCHECK(!downstream_fetcher_.get()); + + encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec, + config_.audio_sample_rate, + config_.audio_num_bits_per_sample)); + DCHECK(encoder_.get()); + const std::string request_key = GenerateRequestKey(); + + // Setup downstream fetcher. + std::vector<std::string> downstream_args; + downstream_args.push_back("sky=" + GetWebserviceKey()); + downstream_args.push_back("pair=" + request_key); + downstream_args.push_back("maxresults=" + base::IntToString(kMaxResults)); + + GURL downstream_url(GetWebserviceBaseURL() + std::string(kDownstreamUrl) + + JoinString(downstream_args, '&')); + // TODO(primiano): /////////// Remove this after debug stage. 
///////////// + DVLOG(1) << "Opening downstream: " + downstream_url.PathForRequest(); + + downstream_fetcher_.reset(URLFetcher::Create( + kDownstreamUrlFetcherIdForTests, downstream_url, URLFetcher::GET, this)); + downstream_fetcher_->SetRequestContext(url_context_); + downstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | + net::LOAD_DO_NOT_SEND_COOKIES | + net::LOAD_DO_NOT_SEND_AUTH_DATA); + downstream_fetcher_->Start(); + + // Setup upstream fetcher. + // TODO(primiano): Handle config_.grammar array when it will be implemented by + // the speech recognition webservice. + std::vector<std::string> upstream_args; + upstream_args.push_back("sky=" + GetWebserviceKey()); + upstream_args.push_back("pair=" + request_key); + upstream_args.push_back( + "lang=" + net::EscapeQueryParamValue(GetAcceptedLanguages(), true)); + upstream_args.push_back( + config_.filter_profanities ? "pfilter=2" : "pfilter=0"); + upstream_args.push_back("maxresults=" + base::IntToString(kMaxResults)); + upstream_args.push_back("client=myapp.mycompany.com"); + // TODO(primiano): Can we remove this feature sending audio HW information? + if (!config_.hardware_info.empty()) { + upstream_args.push_back( + "xhw=" + net::EscapeQueryParamValue(config_.hardware_info, true)); + } + + GURL upstream_url(GetWebserviceBaseURL() + std::string(kUpstreamUrl) + + JoinString(upstream_args, '&')); + + // TODO(primiano): /////////// Remove this after debug stage. //////////////// + DVLOG(1) << "Opening upstream: " + upstream_url.PathForRequest(); + + upstream_fetcher_.reset(URLFetcher::Create( + kUpstreamUrlFetcherIdForTests, upstream_url, URLFetcher::POST, this)); + upstream_fetcher_->SetChunkedUpload(encoder_->mime_type()); + upstream_fetcher_->SetRequestContext(url_context_); + upstream_fetcher_->SetReferrer(config_.origin_url); + upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES | + net::LOAD_DO_NOT_SEND_COOKIES | + net::LOAD_DO_NOT_SEND_AUTH_DATA); + upstream_fetcher_->Start(); + previous_response_length_ = 0; + return STATE_BOTH_STREAMS_CONNECTED; +} + +GoogleStreamingRemoteEngine::FSMState +GoogleStreamingRemoteEngine::TransmitAudioUpstream( + const FSMEventArgs& event_args) { + DCHECK(upstream_fetcher_.get()); + DCHECK(event_args.audio_data.get()); + const AudioChunk& audio = *(event_args.audio_data); + + DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8); + encoder_->Encode(audio); + scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear()); + upstream_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false); + return state_; +} + +GoogleStreamingRemoteEngine::FSMState +GoogleStreamingRemoteEngine::ProcessDownstreamResponse( + const FSMEventArgs& event_args) { + DCHECK(event_args.response.get()); + bool is_definitive_result = false; + + HttpStreamingResult ws_result; + const bool protobuf_parse_successful = ws_result.ParseFromArray( + &(*event_args.response->begin()), + event_args.response->size()); + if (!protobuf_parse_successful) + return AbortWithError(event_args); + + // TODO(primiano): The code below sounds to me like a hack. Discuss the + // possibility of having a simpler and clearer protobuf grammar, in order to + // distinguish the different type of results and errors in a civilized way. + + // Skip the upstream connected notification, since we're not interested in it. 
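+  // (Context: the webservice multiplexes several kinds of messages over the
+  // single HttpStreamingResult protobuf: an upstream_connected marker, a
+  // status-bearing definitive result carrying hypotheses, and provisional /
+  // ephemeral partial transcripts. The chain of has_*() checks below is what
+  // tells them apart.)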
+ if (ws_result.has_upstream_connected()) + return state_; + + if (ws_result.has_status()) { + switch (ws_result.status()) { + case kWebserviceStatusNoError: + is_definitive_result = true; + break; + case kWebserviceStatusErrorNoMatch: + // TODO(primiano): Contact gshires@, in case of no results, instead of + // the expected kWebserviceStatusErrorNoMatch status code we receive a + // provisional-like result with no provisional nor ephemeral strings. + return Abort(content::SPEECH_RECOGNITION_ERROR_NO_MATCH); + default: + VLOG(1) << "Received an unknown status code from the speech recognition" + "webservice (" << ws_result.status() << ")"; + return Abort(content::SPEECH_RECOGNITION_ERROR_NETWORK); + } + } + + SpeechRecognitionResult result; + if (is_definitive_result) { + got_last_definitive_result_ = true; + result.is_provisional = false; + for (int i = 0; i < ws_result.hypotheses_size(); ++i) { + const HttpStreamingHypothesis& ws_hypothesis = ws_result.hypotheses(i); + SpeechRecognitionHypothesis hypothesis; + DCHECK(ws_hypothesis.has_confidence()); + hypothesis.confidence = ws_hypothesis.confidence(); + DCHECK(ws_hypothesis.has_utterance()); + hypothesis.utterance = UTF8ToUTF16(ws_hypothesis.utterance()); + result.hypotheses.push_back(hypothesis); + } + } else { + result.is_provisional = true; + string16 transcript; + if (ws_result.has_provisional()) + transcript.append(UTF8ToUTF16(ws_result.provisional())); + if (ws_result.has_ephemeral()) + transcript.append(UTF8ToUTF16(ws_result.ephemeral())); + DCHECK(!transcript.empty()); + result.hypotheses.push_back(SpeechRecognitionHypothesis(transcript, 0.0f)); + } + + // Propagate just actual results. + if (result.hypotheses.size() > 0) + delegate()->OnSpeechRecognitionEngineResult(result); + + return state_; +} + +GoogleStreamingRemoteEngine::FSMState +GoogleStreamingRemoteEngine::RaiseNoMatchErrorIfGotNoResults( + const FSMEventArgs& event_args) { + if (!got_last_definitive_result_) { + // Provide an empty result to notify that recognition is ended with no + // errors, yet neither any further results. + delegate()->OnSpeechRecognitionEngineResult(SpeechRecognitionResult()); + } + return AbortSilently(event_args); +} + +GoogleStreamingRemoteEngine::FSMState +GoogleStreamingRemoteEngine::CloseUpstreamAndWaitForResults( + const FSMEventArgs&) { + DCHECK(upstream_fetcher_.get()); + DCHECK(encoder_.get()); + + // TODO(primiano): /////////// Remove this after debug stage. //////////////// + DVLOG(1) << "Closing upstream"; + + // The encoder requires a non-empty final buffer. So we encode a packet + // of silence in case encoder had no data already. + std::vector<short> samples( + config_.audio_sample_rate * kAudioPacketIntervalMs / 1000); + scoped_refptr<AudioChunk> dummy_chunk = + new AudioChunk(reinterpret_cast<uint8*>(&samples[0]), + samples.size() * sizeof(short), + encoder_->bits_per_sample() / 8); + encoder_->Encode(*dummy_chunk); + encoder_->Flush(); + scoped_refptr<AudioChunk> encoded_dummy_data = + encoder_->GetEncodedDataAndClear(); + DCHECK(!encoded_dummy_data->IsEmpty()); + encoder_.reset(); + + upstream_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true); + got_last_definitive_result_ = false; + return STATE_WAITING_DOWNSTREAM_RESULTS; +} + +GoogleStreamingRemoteEngine::FSMState +GoogleStreamingRemoteEngine::CloseDownstream(const FSMEventArgs&) { + DCHECK(!upstream_fetcher_.get()); + DCHECK(downstream_fetcher_.get()); + + // TODO(primiano): /////////// Remove this after debug stage. 
////////////////
+  DVLOG(1) << "Closing downstream";
+  downstream_fetcher_.reset();
+  return STATE_IDLE;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::AbortSilently(const FSMEventArgs&) {
+  return Abort(content::SPEECH_RECOGNITION_ERROR_NONE);
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::AbortWithError(const FSMEventArgs&) {
+  return Abort(content::SPEECH_RECOGNITION_ERROR_NETWORK);
+}
+
+GoogleStreamingRemoteEngine::FSMState GoogleStreamingRemoteEngine::Abort(
+    SpeechRecognitionErrorCode error_code) {
+  // TODO(primiano): /////////// Remove this after debug stage. ////////////////
+  DVLOG(1) << "Aborting with error " << error_code;
+
+  if (error_code != content::SPEECH_RECOGNITION_ERROR_NONE) {
+    delegate()->OnSpeechRecognitionEngineError(
+        SpeechRecognitionError(error_code));
+  }
+  downstream_fetcher_.reset();
+  upstream_fetcher_.reset();
+  encoder_.reset();
+  return STATE_IDLE;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::DoNothing(const FSMEventArgs&) {
+  return state_;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::NotFeasible(const FSMEventArgs& event_args) {
+  NOTREACHED() << "Unfeasible event " << event_args.event
+               << " in state " << state_;
+  return state_;
+}
+
+std::string GoogleStreamingRemoteEngine::GetAcceptedLanguages() const {
+  std::string langs = config_.language;
+  if (langs.empty() && url_context_) {
+    // If no language is provided then we use the first from the accepted
+    // language list. If this list is empty then it defaults to "en-US".
+    // Example of the contents of this list: "es,en-GB;q=0.8", ""
+    net::URLRequestContext* request_context =
+        url_context_->GetURLRequestContext();
+    DCHECK(request_context);
+    std::string accepted_language_list = request_context->accept_language();
+    size_t separator = accepted_language_list.find_first_of(",;");
+    if (separator != std::string::npos)
+      langs = accepted_language_list.substr(0, separator);
+  }
+  if (langs.empty())
+    langs = "en-US";
+  return langs;
+}
+
+// TODO(primiano): Is there any utility in the codebase that already does this?
+std::string GoogleStreamingRemoteEngine::GenerateRequestKey() const {
+  const int64 kKeepLowBytes = GG_LONGLONG(0x00000000FFFFFFFF);
+  const int64 kKeepHighBytes = GG_LONGLONG(0xFFFFFFFF00000000);
+
+  // Just keep the least significant bits of the timestamp, in order to reduce
+  // the probability of collisions.
+  int64 key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) |
+              (base::RandUint64() & kKeepHighBytes);
+  return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
+}
+
+GoogleStreamingRemoteEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
+    : event(event_value) {
+}
+
+GoogleStreamingRemoteEngine::FSMEventArgs::~FSMEventArgs() {
+}
+
+}  // namespace speech
diff --git a/content/browser/speech/google_streaming_remote_engine.h b/content/browser/speech/google_streaming_remote_engine.h
new file mode 100644
index 0000000..fde4957
--- /dev/null
+++ b/content/browser/speech/google_streaming_remote_engine.h
@@ -0,0 +1,165 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
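+//
+// (Side sketch, not part of the patch: the pairing key produced by
+// GenerateRequestKey() above combines the low 32 bits of the timestamp with
+// the high 32 bits of a random number, then hex-encodes the 8 raw bytes. A
+// standalone equivalent with illustrative names follows; note that
+// base::HexEncode writes the in-memory byte order, so on little-endian
+// hardware its digit order differs from this big-endian rendering.)
+//
+//   #include <cstdint>
+//   #include <cstdio>
+//   #include <string>
+//
+//   std::string MakePairingKey(int64_t now_usec, uint64_t random_bits) {
+//     const uint64_t key =
+//         (static_cast<uint64_t>(now_usec) & 0x00000000FFFFFFFFull) |
+//         (random_bits & 0xFFFFFFFF00000000ull);
+//     char hex[17];
+//     std::snprintf(hex, sizeof(hex), "%016llX",
+//                   static_cast<unsigned long long>(key));
+//     return std::string(hex);
+//   }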
+
+#ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
+#define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "base/basictypes.h"
+#include "base/memory/ref_counted.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/threading/non_thread_safe.h"
+#include "content/browser/speech/audio_encoder.h"
+#include "content/browser/speech/chunked_byte_buffer.h"
+#include "content/browser/speech/speech_recognition_engine.h"
+#include "content/common/content_export.h"
+#include "content/public/common/speech_recognition_error.h"
+#include "googleurl/src/gurl.h"
+#include "net/url_request/url_fetcher_delegate.h"
+
+namespace content {
+struct SpeechRecognitionError;
+struct SpeechRecognitionResult;
+}
+
+namespace net {
+class URLRequestContextGetter;
+}
+
+namespace speech {
+
+class AudioChunk;
+
+// Implements a SpeechRecognitionEngine supporting continuous recognition by
+// means of interaction with the Google streaming speech recognition
+// webservice. In more detail, this class establishes, for each session, two
+// HTTP(S) connections with the webservice, herein called "upstream" and
+// "downstream". Audio chunks are sent on the upstream by means of a chunked
+// HTTP POST upload. Recognition results are retrieved in a full-duplex
+// fashion (i.e. while pushing audio on the upstream) on the downstream by
+// means of a chunked HTTP GET request. Pairing between the two streams is
+// handled through a randomly generated key, unique for each request, which is
+// passed in the &pair= arg to both stream request URLs.
+// In the case of a regular session, the upstream is closed when the audio
+// capture ends (notified through an |AudioChunksEnded| call) and the
+// downstream waits for a corresponding server closure (some late results can
+// still arrive after the upstream is closed).
+// Both streams are guaranteed to be closed when the |EndRecognition| call is
+// issued.
+class CONTENT_EXPORT GoogleStreamingRemoteEngine
+    : public NON_EXPORTED_BASE(SpeechRecognitionEngine),
+      public net::URLFetcherDelegate,
+      public NON_EXPORTED_BASE(base::NonThreadSafe) {
+ public:
+  explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter* context);
+  virtual ~GoogleStreamingRemoteEngine();
+
+  // SpeechRecognitionEngine methods.
+  virtual void SetConfig(const SpeechRecognitionEngineConfig& config) OVERRIDE;
+  virtual void StartRecognition() OVERRIDE;
+  virtual void EndRecognition() OVERRIDE;
+  virtual void TakeAudioChunk(const AudioChunk& data) OVERRIDE;
+  virtual void AudioChunksEnded() OVERRIDE;
+  virtual bool IsRecognitionPending() const OVERRIDE;
+  virtual int GetDesiredAudioChunkDurationMs() const OVERRIDE;
+
+  // net::URLFetcherDelegate methods.
+  virtual void OnURLFetchComplete(const net::URLFetcher* source) OVERRIDE;
+  virtual void OnURLFetchDownloadProgress(const net::URLFetcher* source,
+                                          int64 current, int64 total) OVERRIDE;
+
+ private:
+  friend class GoogleStreamingRemoteEngineTest;
+
+  // IDs passed to URLFetcher::Create(). Used for testing.
+  static const int kUpstreamUrlFetcherIdForTests;
+  static const int kDownstreamUrlFetcherIdForTests;
+
+  // Response status codes from the speech recognition webservice.
+  static const int kWebserviceStatusNoError;
+  static const int kWebserviceStatusErrorNoMatch;
+
+  // Data types for the internal Finite State Machine (FSM).
+ enum FSMState { + STATE_IDLE = 0, + STATE_BOTH_STREAMS_CONNECTED, + STATE_WAITING_DOWNSTREAM_RESULTS, + STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS + }; + + enum FSMEvent { + EVENT_END_RECOGNITION = 0, + EVENT_START_RECOGNITION, + EVENT_AUDIO_CHUNK, + EVENT_AUDIO_CHUNKS_ENDED, + EVENT_UPSTREAM_ERROR, + EVENT_DOWNSTREAM_ERROR, + EVENT_DOWNSTREAM_RESPONSE, + EVENT_DOWNSTREAM_CLOSED, + EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED + }; + + struct FSMEventArgs { + explicit FSMEventArgs(FSMEvent event_value); + ~FSMEventArgs(); + + FSMEvent event; + + // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|. + scoped_refptr<const AudioChunk> audio_data; + + // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes. + scoped_ptr<std::vector<uint8> > response; + + private: + DISALLOW_COPY_AND_ASSIGN(FSMEventArgs); + }; + + // Invoked by both upstream and downstream URLFetcher callbacks to handle + // new chunk data, connection closed or errors notifications. + void DispatchHTTPResponse(const net::URLFetcher* source, + bool end_of_response); + + // Entry point for pushing any new external event into the recognizer FSM. + void DispatchEvent(const FSMEventArgs& event_args); + + // Defines the behavior of the recognizer FSM, selecting the appropriate + // transition according to the current state and event. + FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args); + + // The methods below handle transitions of the recognizer FSM. + FSMState ConnectBothStreams(const FSMEventArgs& event_args); + FSMState TransmitAudioUpstream(const FSMEventArgs& event_args); + FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args); + FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args); + FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args); + FSMState CloseDownstream(const FSMEventArgs& event_args); + FSMState AbortSilently(const FSMEventArgs& event_args); + FSMState AbortWithError(const FSMEventArgs& event_args); + FSMState Abort(content::SpeechRecognitionErrorCode error); + FSMState DoNothing(const FSMEventArgs& event_args); + FSMState NotFeasible(const FSMEventArgs& event_args); + + std::string GetAcceptedLanguages() const; + std::string GenerateRequestKey() const; + + SpeechRecognitionEngineConfig config_; + scoped_ptr<net::URLFetcher> upstream_fetcher_; + scoped_ptr<net::URLFetcher> downstream_fetcher_; + scoped_refptr<net::URLRequestContextGetter> url_context_; + scoped_ptr<AudioEncoder> encoder_; + ChunkedByteBuffer chunked_byte_buffer_; + size_t previous_response_length_; + bool got_last_definitive_result_; + bool is_dispatching_event_; + FSMState state_; + + DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine); +}; + +} // namespace speech + +#endif // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_ diff --git a/content/browser/speech/google_streaming_remote_engine_unittest.cc b/content/browser/speech/google_streaming_remote_engine_unittest.cc new file mode 100644 index 0000000..9de06b6 --- /dev/null +++ b/content/browser/speech/google_streaming_remote_engine_unittest.cc @@ -0,0 +1,440 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
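+
+// For orientation: the tests below drive the engine through its public API in
+// the order a real client would (an outline inferred from the header above,
+// not literal test code):
+//
+//   engine.set_delegate(client);   // results and errors are delivered here
+//   engine.SetConfig(config);      // language, sample rate, bits per sample
+//   engine.StartRecognition();     // opens both upstream and downstream
+//   engine.TakeAudioChunk(chunk);  // repeated, one chunk every ~100 ms
+//   engine.AudioChunksEnded();     // audio capture done; upstream is closed
+//   // ...the delegate keeps receiving results until the downstream closes...
+//   engine.EndRecognition();       // both streams guaranteed to be closed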
+ +#include <queue> + +#include "base/memory/scoped_ptr.h" +#include "base/message_loop.h" +#include "base/utf_string_conversions.h" +#include "content/browser/speech/audio_buffer.h" +#include "content/browser/speech/google_streaming_remote_engine.h" +#include "content/browser/speech/proto/google_streaming_api.pb.h" +#include "content/public/common/speech_recognition_error.h" +#include "content/public/common/speech_recognition_result.h" +#include "net/url_request/test_url_fetcher_factory.h" +#include "net/url_request/url_request_context_getter.h" +#include "net/url_request/url_request_status.h" +#include "testing/gtest/include/gtest/gtest.h" + +using content::SpeechRecognitionHypothesis; +using content::SpeechRecognitionResult; +using net::URLRequestStatus; +using net::TestURLFetcher; +using net::TestURLFetcherFactory; + +namespace speech { + +// Note: the terms upstream and downstream are herein referring to the client +// (engine_under_test_) viewpoint. + +class GoogleStreamingRemoteEngineTest + : public SpeechRecognitionEngineDelegate, + public testing::Test { + public: + GoogleStreamingRemoteEngineTest() + : last_number_of_upstream_chunks_seen_(0U), + error_(content::SPEECH_RECOGNITION_ERROR_NONE) { } + + // Creates a speech recognition request and invokes its URL fetcher delegate + // with the given test data. + void CreateAndTestRequest(bool success, const std::string& http_response); + + // SpeechRecognitionRequestDelegate methods. + virtual void OnSpeechRecognitionEngineResult( + const SpeechRecognitionResult& result) OVERRIDE { + results_.push(result); + } + virtual void OnSpeechRecognitionEngineError( + const content::SpeechRecognitionError& error) OVERRIDE { + error_ = error.code; + } + + // testing::Test methods. + virtual void SetUp() OVERRIDE; + virtual void TearDown() OVERRIDE; + + protected: + enum DownstreamError { + DOWNSTREAM_ERROR_NONE, + DOWNSTREAM_ERROR_HTTP500, + DOWNSTREAM_ERROR_NETWORK, + DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH + }; + static bool ResultsAreEqual(const SpeechRecognitionResult& a, + const SpeechRecognitionResult& b); + static std::string SerializeProtobufResponse(const HttpStreamingResult& msg); + static std::string ToBigEndian32(uint32 value); + + TestURLFetcher* GetUpstreamFetcher(); + TestURLFetcher* GetDownstreamFetcher(); + void StartMockRecognition(); + void EndMockRecognition(); + void InjectDummyAudioChunk(); + size_t UpstreamChunksUploadedFromLastCall(); + void ProvideMockResultDownstream(const SpeechRecognitionResult& result); + void ExpectResultReceived(const SpeechRecognitionResult& result); + void CloseMockDownstream(DownstreamError error); + + scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_; + TestURLFetcherFactory url_fetcher_factory_; + size_t last_number_of_upstream_chunks_seen_; + MessageLoop message_loop_; + std::string response_buffer_; + content::SpeechRecognitionErrorCode error_; + std::queue<SpeechRecognitionResult> results_; +}; + +TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) { + StartMockRecognition(); + ASSERT_TRUE(GetUpstreamFetcher()); + ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); + + // Inject some dummy audio chunks and check a corresponding chunked upload + // is performed every time on the server. + for (int i = 0; i < 3; ++i) { + InjectDummyAudioChunk(); + ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); + } + + // Ensure that a final (empty) audio chunk is uploaded on chunks end. 
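+  // (The engine cannot simply close the upload here: the encoder requires a
+  // non-empty final buffer, so CloseUpstreamAndWaitForResults() encodes one
+  // packet of silence and flushes it as the last upload chunk, which is the
+  // extra chunk this assertion observes.)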
+ engine_under_test_->AudioChunksEnded(); + ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + + // Simulate a protobuf message streamed from the server containing a single + // result with two hypotheses. + SpeechRecognitionResult result; + result.is_provisional = false; + result.hypotheses.push_back( + SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 1"), 0.1F)); + result.hypotheses.push_back( + SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 2"), 0.2F)); + + ProvideMockResultDownstream(result); + ExpectResultReceived(result); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + + // Ensure everything is closed cleanly after the downstream is closed. + CloseMockDownstream(DOWNSTREAM_ERROR_NONE); + ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); + EndMockRecognition(); + ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); + ASSERT_EQ(0U, results_.size()); +} + +TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) { + StartMockRecognition(); + ASSERT_TRUE(GetUpstreamFetcher()); + ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); + + for (int i = 0; i < 4; ++i) { + InjectDummyAudioChunk(); + ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); + + SpeechRecognitionResult result; + result.is_provisional = (i % 2 == 0); // Alternate result types. + float confidence = result.is_provisional ? 0.0F : (i * 0.1F); + result.hypotheses.push_back( + SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), confidence)); + + ProvideMockResultDownstream(result); + ExpectResultReceived(result); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + } + + // Ensure that a final (empty) audio chunk is uploaded on chunks end. + engine_under_test_->AudioChunksEnded(); + ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + + // Simulate a final definitive result. + SpeechRecognitionResult result; + result.is_provisional = false; + result.hypotheses.push_back( + SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 1.0F)); + ProvideMockResultDownstream(result); + ExpectResultReceived(result); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + + // Ensure everything is closed cleanly after the downstream is closed. + CloseMockDownstream(DOWNSTREAM_ERROR_NONE); + ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); + EndMockRecognition(); + ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); + ASSERT_EQ(0U, results_.size()); +} + +TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) { + StartMockRecognition(); + ASSERT_TRUE(GetUpstreamFetcher()); + ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); + + // Simulate one pushed audio chunk. + InjectDummyAudioChunk(); + ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); + + // Simulate the corresponding definitive result. + SpeechRecognitionResult result; + result.hypotheses.push_back( + SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), 1.0F)); + ProvideMockResultDownstream(result); + ExpectResultReceived(result); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + + // Simulate a silent downstream closure after |AudioChunksEnded|. 
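+  // (Since no definitive result arrives after this point, the engine is
+  // expected to emit a single empty result, via
+  // RaiseNoMatchErrorIfGotNoResults(), signalling that recognition ended
+  // without errors but also without further results.)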
+ engine_under_test_->AudioChunksEnded(); + ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + CloseMockDownstream(DOWNSTREAM_ERROR_NONE); + + // Expect an empty result, aimed at notifying recognition ended with no + // actual results nor errors. + SpeechRecognitionResult empty_result; + ExpectResultReceived(empty_result); + + // Ensure everything is closed cleanly after the downstream is closed. + ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); + EndMockRecognition(); + ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); + ASSERT_EQ(0U, results_.size()); +} + +TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) { + StartMockRecognition(); + ASSERT_TRUE(GetUpstreamFetcher()); + ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); + + for (int i = 0; i < 3; ++i) + InjectDummyAudioChunk(); + engine_under_test_->AudioChunksEnded(); + ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall()); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + + // Simulate only a provisional result. + SpeechRecognitionResult result; + result.is_provisional = true; + result.hypotheses.push_back( + SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 0.0F)); + ProvideMockResultDownstream(result); + ExpectResultReceived(result); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + + CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH); + + // Expect a SPEECH_RECOGNITION_ERROR_NO_MATCH error to be raised. + ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); + EndMockRecognition(); + ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NO_MATCH, error_); + ASSERT_EQ(0U, results_.size()); +} + +TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) { + StartMockRecognition(); + ASSERT_TRUE(GetUpstreamFetcher()); + ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); + + InjectDummyAudioChunk(); + ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); + + // Close the downstream with a HTTP 500 error. + CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500); + + // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised. + ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); + EndMockRecognition(); + ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_); + ASSERT_EQ(0U, results_.size()); +} + +TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) { + StartMockRecognition(); + ASSERT_TRUE(GetUpstreamFetcher()); + ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); + + InjectDummyAudioChunk(); + ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); + + // Close the downstream fetcher simulating a network failure. + CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK); + + // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised. 
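+  // (Both HTTP and network failures funnel through the same FSM path: a
+  // non-200 response code or a failed URLRequestStatus raises
+  // EVENT_DOWNSTREAM_ERROR, which AbortWithError() maps to
+  // SPEECH_RECOGNITION_ERROR_NETWORK.)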
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); + EndMockRecognition(); + ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_); + ASSERT_EQ(0U, results_.size()); +} + +void GoogleStreamingRemoteEngineTest::SetUp() { + engine_under_test_.reset( + new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/)); + engine_under_test_->set_delegate(this); +} + +void GoogleStreamingRemoteEngineTest::TearDown() { + engine_under_test_.reset(); +} + +TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() { + return url_fetcher_factory_.GetFetcherByID( + GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTests); +} + +TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() { + return url_fetcher_factory_.GetFetcherByID( + GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTests); +} + +// Starts recognition on the engine, ensuring that both stream fetchers are +// created. +void GoogleStreamingRemoteEngineTest::StartMockRecognition() { + DCHECK(engine_under_test_.get()); + + ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); + + engine_under_test_->StartRecognition(); + ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); + + TestURLFetcher* upstream_fetcher = GetUpstreamFetcher(); + ASSERT_TRUE(upstream_fetcher); + upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL()); + + TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); + ASSERT_TRUE(downstream_fetcher); + downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL()); +} + +void GoogleStreamingRemoteEngineTest::EndMockRecognition() { + DCHECK(engine_under_test_.get()); + engine_under_test_->EndRecognition(); + ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); + + // TODO(primiano): In order to be very pedantic we should check that both + // the upstream and downstream URL fetchers have been disposed at this time. + // Unfortunately it seems that there is no direct way to detect (in tests) + // if a url_fetcher has been freed or not, since they are not automatically + // de-registered from the TestURLFetcherFactory on destruction. 
+ // ASSERT_FALSE(GetUpstreamFetcher()); + // ASSERT_FALSE(GetDownstreamFetcher()); +} + +void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() { + unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'}; + scoped_refptr<AudioChunk> dummy_audio_chunk( + new AudioChunk(&dummy_audio_buffer_data[0], + sizeof(dummy_audio_buffer_data), + 2 /* bytes per sample */)); + DCHECK(engine_under_test_.get()); + engine_under_test_->TakeAudioChunk(*dummy_audio_chunk); +} + +size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() { + TestURLFetcher* upstream_fetcher = GetUpstreamFetcher(); + DCHECK(upstream_fetcher); + const size_t number_of_chunks = upstream_fetcher->upload_chunks().size(); + DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_); + const size_t new_chunks = number_of_chunks - + last_number_of_upstream_chunks_seen_; + last_number_of_upstream_chunks_seen_ = number_of_chunks; + return new_chunks; +} + +void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream( + const SpeechRecognitionResult& result) { + TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); + + ASSERT_TRUE(downstream_fetcher); + downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */)); + downstream_fetcher->set_response_code(200); + + HttpStreamingResult response; + if (result.is_provisional) { + DCHECK_EQ(result.hypotheses.size(), 1U); + const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[0]; + response.set_provisional(UTF16ToUTF8(hypothesis.utterance)); + } else { + response.set_status(GoogleStreamingRemoteEngine::kWebserviceStatusNoError); + for (size_t i = 0; i < result.hypotheses.size(); ++i) { + const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i]; + HttpStreamingHypothesis* ws_hypothesis = response.add_hypotheses(); + ws_hypothesis->set_confidence(hypothesis.confidence); + ws_hypothesis->set_utterance(UTF16ToUTF8(hypothesis.utterance)); + } + } + + std::string response_string = SerializeProtobufResponse(response); + response_buffer_.append(response_string); + downstream_fetcher->SetResponseString(response_buffer_); + downstream_fetcher->delegate()->OnURLFetchDownloadProgress( + downstream_fetcher, + response_buffer_.size(), + -1 /* total response length not used */); +} + +void GoogleStreamingRemoteEngineTest::CloseMockDownstream( + DownstreamError error) { + TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); + ASSERT_TRUE(downstream_fetcher); + + const URLRequestStatus::Status fetcher_status = + (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED : + URLRequestStatus::SUCCESS; + downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0)); + downstream_fetcher->set_response_code( + (error == DOWNSTREAM_ERROR_HTTP500) ? 
500 : 200); + +if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) { + HttpStreamingResult response; + response.set_status( + GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch); + response_buffer_.append(SerializeProtobufResponse(response)); + } + downstream_fetcher->SetResponseString(response_buffer_); + downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher); +} + +void GoogleStreamingRemoteEngineTest::ExpectResultReceived( + const SpeechRecognitionResult& result) { + ASSERT_GE(1U, results_.size()); + ASSERT_TRUE(ResultsAreEqual(result, results_.front())); + results_.pop(); +} + +bool GoogleStreamingRemoteEngineTest::ResultsAreEqual( + const SpeechRecognitionResult& a, const SpeechRecognitionResult& b) { + if (a.is_provisional != b.is_provisional || + a.hypotheses.size() != b.hypotheses.size()) { + return false; + } + for (size_t i = 0; i < a.hypotheses.size(); ++i) { + const SpeechRecognitionHypothesis& hyp_a = a.hypotheses[i]; + const SpeechRecognitionHypothesis& hyp_b = b.hypotheses[i]; + if (hyp_a.utterance != hyp_b.utterance || + hyp_a.confidence != hyp_b.confidence) { + return false; + } + } + return true; +} + +std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse( + const HttpStreamingResult& msg) { + std::string response_string; + msg.SerializeToString(&response_string); + + // Append 4 byte prefix length indication to the protobuf message as envisaged + // by the google streaming recognition webservice protocol. + response_string.insert(0, ToBigEndian32(response_string.size())); + return response_string; +} + +std::string GoogleStreamingRemoteEngineTest::ToBigEndian32(uint32 value) { + char raw_data[4]; + raw_data[0] = static_cast<uint8>((value >> 24) & 0xFF); + raw_data[1] = static_cast<uint8>((value >> 16) & 0xFF); + raw_data[2] = static_cast<uint8>((value >> 8) & 0xFF); + raw_data[3] = static_cast<uint8>(value & 0xFF); + return std::string(raw_data, sizeof(raw_data)); +} + +} // namespace speech diff --git a/content/browser/speech/speech_recognition_manager_impl.cc b/content/browser/speech/speech_recognition_manager_impl.cc index 47b5420..65351c1 100644 --- a/content/browser/speech/speech_recognition_manager_impl.cc +++ b/content/browser/speech/speech_recognition_manager_impl.cc @@ -7,6 +7,7 @@ #include "base/bind.h" #include "content/browser/browser_main_loop.h" #include "content/browser/speech/google_one_shot_remote_engine.h" +#include "content/browser/speech/google_streaming_remote_engine.h" #include "content/browser/speech/speech_recognition_engine.h" #include "content/browser/speech/speech_recognizer_impl.h" #include "content/public/browser/browser_thread.h" @@ -101,12 +102,20 @@ int SpeechRecognitionManagerImpl::CreateSession( remote_engine_config.hardware_info = hardware_info; remote_engine_config.origin_url = can_report_metrics ? 
config.origin_url : ""; - SpeechRecognitionEngine* google_remote_engine = + SpeechRecognitionEngine* google_remote_engine; + if (config.is_one_shot) { + google_remote_engine = new GoogleOneShotRemoteEngine(config.url_request_context_getter); + } else { + google_remote_engine = + new GoogleStreamingRemoteEngine(config.url_request_context_getter); + } + google_remote_engine->SetConfig(remote_engine_config); session.recognizer = new SpeechRecognizerImpl(this, session_id, + config.is_one_shot, google_remote_engine); return session_id; } diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc index 18c6c9f..1141d9e 100644 --- a/content/browser/speech/speech_recognizer_impl.cc +++ b/content/browser/speech/speech_recognizer_impl.cc @@ -80,6 +80,7 @@ COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0, SpeechRecognizerImpl::SpeechRecognizerImpl( SpeechRecognitionEventListener* listener, int session_id, + bool is_single_shot, SpeechRecognitionEngine* engine) : listener_(listener), testing_audio_manager_(NULL), @@ -87,6 +88,7 @@ SpeechRecognizerImpl::SpeechRecognizerImpl( endpointer_(kAudioSampleRate), session_id_(session_id), is_dispatching_event_(false), + is_single_shot_(is_single_shot), state_(STATE_IDLE) { DCHECK(listener_ != NULL); DCHECK(recognition_engine_ != NULL); @@ -197,7 +199,7 @@ void SpeechRecognizerImpl::OnSpeechRecognitionEngineError( } // ----------------------- Core FSM implementation --------------------------- -// TODO(primiano) After the changes in the media package (r129173), this class +// TODO(primiano): After the changes in the media package (r129173), this class // slightly violates the SpeechRecognitionEventListener interface contract. In // particular, it is not true anymore that this class can be freed after the // OnRecognitionEnd event, since the audio_controller_.Close() asynchronous @@ -240,7 +242,7 @@ SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( switch (state_) { case STATE_IDLE: switch (event) { - // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and + // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and // EVENT_STOP_CAPTURE below once speech input extensions are fixed. case EVENT_ABORT: return DoNothing(event_args); @@ -349,7 +351,7 @@ SpeechRecognizerImpl::ExecuteTransitionAndGetNextState( // - The class won't be freed in the meanwhile due to callbacks; // - IsCapturingAudio() returns true if and only if audio_controller_ != NULL. -// TODO(primiano) the audio pipeline is currently serial. However, the +// TODO(primiano): the audio pipeline is currently serial. However, the // clipper->endpointer->vumeter chain and the sr_engine could be parallelized. // We should profile the execution to see if it would be worth or not. void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) { @@ -464,9 +466,10 @@ SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) { SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) { - if (endpointer_.speech_input_complete()) { + // End-of-speech detection is performed only in one-shot mode. + // TODO(primiano): What about introducing a longer timeout for continuous rec? 
+ if (is_single_shot_ && endpointer_.speech_input_complete()) return StopCaptureAndWaitForResult(event_args); - } return STATE_RECOGNIZING; } @@ -532,21 +535,43 @@ SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort( return STATE_IDLE; } -SpeechRecognizerImpl::FSMState -SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) { - // This is in preparation for future speech recognition functions. - NOTREACHED(); +SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult( + const FSMEventArgs& event_args) { + // Provisional results can occur only during continuous (non one-shot) mode. + // If this check is reached it means that a continuous speech recognition + // engine is being used for a one shot recognition. + DCHECK_EQ(false, is_single_shot_); + const SpeechRecognitionResult& result = event_args.engine_result; + listener_->OnRecognitionResult(session_id_, result); return state_; } SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) { const SpeechRecognitionResult& result = event_args.engine_result; - DVLOG(1) << "Got valid result"; - recognition_engine_->EndRecognition(); - listener_->OnRecognitionResult(session_id_, result); - listener_->OnRecognitionEnd(session_id_); - return STATE_IDLE; + if (result.is_provisional) { + DCHECK(!is_single_shot_); + listener_->OnRecognitionResult(session_id_, result); + // We don't end the recognition if a provisional result is received in + // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will + // end the recognition. + return state_; + } else { + recognition_engine_->EndRecognition(); + // We could receive an empty result (which we won't propagate further) + // in the following (continuous) scenario: + // 1. The caller start pushing audio and receives some results; + // 2. A |StopAudioCapture| is issued later; + // 3. The final audio frames captured in the interval ]1,2] do not lead to + // any result (nor any error); + // 4. The speech recognition engine, therefore, emits an empty result to + // notify that the recognition is ended with no error, yet neither any + // further result. + if (result.hypotheses.size() > 0) + listener_->OnRecognitionResult(session_id_, result); + listener_->OnRecognitionEnd(session_id_); + return STATE_IDLE; + } } SpeechRecognizerImpl::FSMState @@ -563,7 +588,7 @@ SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) { void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() { DCHECK(IsCapturingAudio()); - DVLOG(1) << "SpeechRecognizerImpl stopping audio capture."; + DVLOG(1) << "SpeechRecognizerImpl closing audio controller."; // Issues a Close on the audio controller, passing an empty callback. The only // purpose of such callback is to keep the audio controller refcounted until // Close has completed (in the audio thread) and automatically destroy it @@ -581,7 +606,7 @@ void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected) { // Calculate the input volume to display in the UI, smoothing towards the // new level. - // TODO(primiano) Do we really need all this floating point arith here? + // TODO(primiano): Do we really need all this floating point arith here? // Perhaps it might be quite expensive on mobile. 
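+  // (For reference: the expression below linearly rescales |rms|, measured in
+  // dB relative to kAudioMeterMinDb, onto the unclipped portion of the UI
+  // meter range; the kAudioMeter* constants are defined elsewhere in this
+  // file.)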
float level = (rms - kAudioMeterMinDb) / (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped); diff --git a/content/browser/speech/speech_recognizer_impl.h b/content/browser/speech/speech_recognizer_impl.h index 8179e2743..89e41e6 100644 --- a/content/browser/speech/speech_recognizer_impl.h +++ b/content/browser/speech/speech_recognizer_impl.h @@ -45,6 +45,7 @@ class CONTENT_EXPORT SpeechRecognizerImpl SpeechRecognizerImpl( content::SpeechRecognitionEventListener* listener, int session_id, + bool is_single_shot, SpeechRecognitionEngine* engine); void StartRecognition(); @@ -153,6 +154,7 @@ class CONTENT_EXPORT SpeechRecognizerImpl int num_samples_recorded_; float audio_level_; bool is_dispatching_event_; + bool is_single_shot_; FSMState state_; DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl); diff --git a/content/browser/speech/speech_recognizer_impl_unittest.cc b/content/browser/speech/speech_recognizer_impl_unittest.cc index e73a894b..4a487c3 100644 --- a/content/browser/speech/speech_recognizer_impl_unittest.cc +++ b/content/browser/speech/speech_recognizer_impl_unittest.cc @@ -117,7 +117,10 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener, config.filter_profanities = false; sr_engine->SetConfig(config); - recognizer_ = new SpeechRecognizerImpl(this, 1, sr_engine); + const int kTestingSessionId = 1; + const bool kOneShotMode = true; + recognizer_ = new SpeechRecognizerImpl( + this, kTestingSessionId, kOneShotMode, sr_engine); recognizer_->SetAudioManagerForTesting(audio_manager_.get()); int audio_packet_length_bytes = diff --git a/content/common/speech_recognition_messages.h b/content/common/speech_recognition_messages.h index 864b442..d004b0a 100644 --- a/content/common/speech_recognition_messages.h +++ b/content/common/speech_recognition_messages.h @@ -29,7 +29,7 @@ IPC_STRUCT_TRAITS_BEGIN(content::SpeechRecognitionHypothesis) IPC_STRUCT_TRAITS_END() IPC_STRUCT_TRAITS_BEGIN(content::SpeechRecognitionResult) - IPC_STRUCT_TRAITS_MEMBER(provisional) + IPC_STRUCT_TRAITS_MEMBER(is_provisional) IPC_STRUCT_TRAITS_MEMBER(hypotheses) IPC_STRUCT_TRAITS_END() diff --git a/content/content_browser.gypi b/content/content_browser.gypi index e5489bd..4a56f74 100644 --- a/content/content_browser.gypi +++ b/content/content_browser.gypi @@ -669,6 +669,8 @@ 'browser/speech/endpointer/energy_endpointer_params.h', 'browser/speech/google_one_shot_remote_engine.cc', 'browser/speech/google_one_shot_remote_engine.h', + 'browser/speech/google_streaming_remote_engine.cc', + 'browser/speech/google_streaming_remote_engine.h', 'browser/speech/input_tag_speech_dispatcher_host.cc', 'browser/speech/input_tag_speech_dispatcher_host.h', 'browser/speech/speech_recognition_dispatcher_host.cc', diff --git a/content/content_tests.gypi b/content/content_tests.gypi index 3694b30..2ff3620f 100644 --- a/content/content_tests.gypi +++ b/content/content_tests.gypi @@ -192,6 +192,7 @@ 'content_plugin', 'content_renderer', 'test_support_content', + 'browser/speech/proto/speech_proto.gyp:speech_proto', 'content_resources.gyp:content_resources', '../base/base.gyp:test_support_base', '../base/third_party/dynamic_annotations/dynamic_annotations.gyp:dynamic_annotations', @@ -281,6 +282,7 @@ 'browser/speech/chunked_byte_buffer_unittest.cc', 'browser/speech/endpointer/endpointer_unittest.cc', 'browser/speech/google_one_shot_remote_engine_unittest.cc', + 'browser/speech/google_streaming_remote_engine_unittest.cc', 'browser/speech/speech_recognizer_impl_unittest.cc', 'browser/ssl/ssl_host_state_unittest.cc', 
        'browser/system_message_window_win_unittest.cc',
diff --git a/content/public/common/content_switches.cc b/content/public/common/content_switches.cc
index b0dadc7..f2a4034 100644
--- a/content/public/common/content_switches.cc
+++ b/content/public/common/content_switches.cc
@@ -201,6 +201,15 @@ const char kDisableSpeechInput[] = "disable-speech-input";
 // Enables scripted speech api.
 const char kEnableScriptedSpeech[] = "enable-scripted-speech";
 
+// TODO(primiano): Remove the two switches below when the URL becomes public.
+// Specifies the webservice URL for continuous speech recognition.
+const char kSpeechRecognitionWebserviceURL[] =
+    "speech-service";
+
+// Specifies the request key for the continuous speech recognition webservice.
+const char kSpeechRecognitionWebserviceKey[] =
+    "speech-service-key";
+
 // Disables animation on the compositor thread.
 const char kDisableThreadedAnimation[] = "disable-threaded-animation";
 
diff --git a/content/public/common/content_switches.h b/content/public/common/content_switches.h
index a876146..2c8ab43 100644
--- a/content/public/common/content_switches.h
+++ b/content/public/common/content_switches.h
@@ -74,6 +74,8 @@ extern const char kDisableSharedWorkers[];
 extern const char kDisableSiteSpecificQuirks[];
 CONTENT_EXPORT extern const char kDisableSpeechInput[];
 CONTENT_EXPORT extern const char kEnableScriptedSpeech[];
+extern const char kSpeechRecognitionWebserviceURL[];
+extern const char kSpeechRecognitionWebserviceKey[];
 CONTENT_EXPORT extern const char kDisableThreadedAnimation[];
 CONTENT_EXPORT extern const char kDisableWebAudio[];
 extern const char kDisableWebSecurity[];
diff --git a/content/public/common/speech_recognition_result.cc b/content/public/common/speech_recognition_result.cc
index 9a89bab..af05fc7 100644
--- a/content/public/common/speech_recognition_result.cc
+++ b/content/public/common/speech_recognition_result.cc
@@ -7,10 +7,11 @@
 namespace content {
 
 SpeechRecognitionResult::SpeechRecognitionResult()
-    : provisional(false) {
+    : is_provisional(false) {
 }
 
 SpeechRecognitionResult::~SpeechRecognitionResult() {
 }
 
 }  // namespace content
+
diff --git a/content/public/common/speech_recognition_result.h b/content/public/common/speech_recognition_result.h
index f227e0d..1a912fd 100644
--- a/content/public/common/speech_recognition_result.h
+++ b/content/public/common/speech_recognition_result.h
@@ -32,7 +32,7 @@ typedef std::vector<SpeechRecognitionHypothesis>
 
 struct CONTENT_EXPORT SpeechRecognitionResult {
   SpeechRecognitionHypothesisArray hypotheses;
-  bool provisional;
+  bool is_provisional;
 
   SpeechRecognitionResult();
   ~SpeechRecognitionResult();
diff --git a/content/renderer/speech_recognition_dispatcher.cc b/content/renderer/speech_recognition_dispatcher.cc
index 210a66f..43ca409 100644
--- a/content/renderer/speech_recognition_dispatcher.cc
+++ b/content/renderer/speech_recognition_dispatcher.cc
@@ -56,10 +56,14 @@ void SpeechRecognitionDispatcher::start(
     const WebSpeechRecognitionHandle& handle,
     const WebSpeechRecognitionParams& params,
     WebSpeechRecognizerClient* recognizer_client) {
-  //TODO(primiano) What to do if a start is issued to an already started object?
   DCHECK(!recognizer_client_ || recognizer_client_ == recognizer_client);
   recognizer_client_ = recognizer_client;
 
+  // TODO(primiano): Should return false in order to communicate the failure
+  // and let WebKit throw an exception (need a hans@ patch to do that).
+  if (HandleExists(handle))
+    return;
+
   SpeechRecognitionHostMsg_StartRequest_Params msg_params;
   for (size_t i = 0; i < params.grammars().size(); ++i) {
     const WebSpeechGrammar& grammar = params.grammars()[i];
@@ -71,23 +75,29 @@ void SpeechRecognitionDispatcher::start(
   msg_params.is_one_shot = !params.continuous();
   msg_params.origin_url = params.origin().toString().utf8();
   msg_params.render_view_id = routing_id();
-  msg_params.request_id = GetIDForHandle(handle);
+  msg_params.request_id = GetOrCreateIDForHandle(handle);
+  // The handle mapping will be removed in |OnRecognitionEnd|.
   Send(new SpeechRecognitionHostMsg_StartRequest(msg_params));
 }
 
 void SpeechRecognitionDispatcher::stop(
     const WebSpeechRecognitionHandle& handle,
     WebSpeechRecognizerClient* recognizer_client) {
-  DCHECK(recognizer_client_ == recognizer_client);
-  Send(new SpeechRecognitionHostMsg_StopCaptureRequest(routing_id(),
-      GetIDForHandle(handle)));
+  // Ignore a |stop| issued without a matching |start|.
+  if (recognizer_client_ != recognizer_client || !HandleExists(handle))
+    return;
+  Send(new SpeechRecognitionHostMsg_StopCaptureRequest(
+      routing_id(), GetOrCreateIDForHandle(handle)));
 }
 
 void SpeechRecognitionDispatcher::abort(
     const WebSpeechRecognitionHandle& handle,
     WebSpeechRecognizerClient* recognizer_client) {
-  Send(new SpeechRecognitionHostMsg_AbortRequest(routing_id(),
-      GetIDForHandle(handle)));
+  // Ignore an |abort| issued without a matching |start|.
+  if (recognizer_client_ != recognizer_client || !HandleExists(handle))
+    return;
+  Send(new SpeechRecognitionHostMsg_AbortRequest(
+      routing_id(), GetOrCreateIDForHandle(handle)));
 }
 
 void SpeechRecognitionDispatcher::OnRecognitionStarted(int request_id) {
@@ -116,8 +126,8 @@ void SpeechRecognitionDispatcher::OnErrorOccurred(
     recognizer_client_->didReceiveNoMatch(GetHandleFromID(request_id),
                                           WebSpeechRecognitionResult());
   } else {
-    // TODO(primiano) speech_recognition_error.h must be updated with the new
-    // API specs soon.
+    // TODO(primiano): speech_recognition_error.h must be updated to match the
+    // new enums defined in the API specs (thus removing the code below).
     WebSpeechRecognizerClient::ErrorCode wk_error_code;
     switch (error.code) {
       case content::SPEECH_RECOGNITION_ERROR_ABORTED:
@@ -140,7 +150,7 @@ void SpeechRecognitionDispatcher::OnErrorOccurred(
         wk_error_code = WebSpeechRecognizerClient::OtherError;
     }
     recognizer_client_->didReceiveError(GetHandleFromID(request_id),
-                                        WebString(), // TODO(primiano) message?
+                                        WebString(), // TODO(primiano): message?
                                         wk_error_code);
   }
 }
@@ -160,8 +170,8 @@ void SpeechRecognitionDispatcher::OnResultRetrieved(
     transcripts[i] = result.hypotheses[i].utterance;
     confidences[i] = static_cast<float>(result.hypotheses[i].confidence);
   }
-  webkit_result.assign(transcripts, confidences, !result.provisional);
-  // TODO(primiano) Handle history, currently empty.
+  webkit_result.assign(transcripts, confidences, !result.is_provisional);
+  // TODO(primiano): Handle history, currently empty.
   WebVector<WebSpeechRecognitionResult> empty_history;
   recognizer_client_->didReceiveResult(GetHandleFromID(request_id),
                                        webkit_result,
@@ -169,7 +179,8 @@
                                        empty_history);
 }
 
-int SpeechRecognitionDispatcher::GetIDForHandle(
+
+int SpeechRecognitionDispatcher::GetOrCreateIDForHandle(
     const WebSpeechRecognitionHandle& handle) {
   // Search first for an existing mapping.
   for (HandleMap::iterator iter = handle_map_.begin();
@@ -185,6 +196,17 @@ int SpeechRecognitionDispatcher::GetIDForHandle(
   return new_id;
 }
 
+bool SpeechRecognitionDispatcher::HandleExists(
+    const WebSpeechRecognitionHandle& handle) {
+  for (HandleMap::iterator iter = handle_map_.begin();
+       iter != handle_map_.end();
+       ++iter) {
+    if (iter->second.equals(handle))
+      return true;
+  }
+  return false;
+}
+
 const WebSpeechRecognitionHandle& SpeechRecognitionDispatcher::GetHandleFromID(
     int request_id) {
   HandleMap::iterator iter = handle_map_.find(request_id);
diff --git a/content/renderer/speech_recognition_dispatcher.h b/content/renderer/speech_recognition_dispatcher.h
index 3e3b141..bd643ede 100644
--- a/content/renderer/speech_recognition_dispatcher.h
+++ b/content/renderer/speech_recognition_dispatcher.h
@@ -54,7 +54,8 @@ class SpeechRecognitionDispatcher : public content::RenderViewObserver,
   void OnResultRetrieved(int request_id,
                          const content::SpeechRecognitionResult& result);
 
-  int GetIDForHandle(const WebKit::WebSpeechRecognitionHandle& handle);
+  int GetOrCreateIDForHandle(const WebKit::WebSpeechRecognitionHandle& handle);
+  bool HandleExists(const WebKit::WebSpeechRecognitionHandle& handle);
   const WebKit::WebSpeechRecognitionHandle& GetHandleFromID(int handle_id);
 
   // The WebKit client class that we use to send events back to the JS world.
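
For context on the dispatcher changes above: |GetIDForHandle| becomes
|GetOrCreateIDForHandle| and gains a companion |HandleExists|, so that a
|stop| or |abort| arriving without a matching |start| is ignored instead of
allocating a fresh request id as a side effect of the lookup. What follows is
a minimal, self-contained sketch of that handle <-> request-id mapping
pattern, not part of the patch; |FakeHandle| and |HandleRegistry| are
hypothetical stand-ins for WebKit::WebSpeechRecognitionHandle and the
dispatcher, and the details are assumptions for illustration only.

// Illustrative sketch only; |FakeHandle| and |HandleRegistry| are
// hypothetical stand-ins, not Chromium types.
#include <map>

struct FakeHandle {  // Stand-in for WebKit::WebSpeechRecognitionHandle.
  int token;
  bool equals(const FakeHandle& other) const { return token == other.token; }
};

class HandleRegistry {
 public:
  typedef std::map<int, FakeHandle> HandleMap;

  HandleRegistry() : next_id_(1) {}

  // Returns the id already mapped to |handle|, or allocates a new one.
  int GetOrCreateIDForHandle(const FakeHandle& handle) {
    for (HandleMap::iterator it = handle_map_.begin();
         it != handle_map_.end(); ++it) {
      if (it->second.equals(handle))
        return it->first;
    }
    const int new_id = next_id_++;
    handle_map_[new_id] = handle;  // Removed again when recognition ends.
    return new_id;
  }

  // True if |handle| was registered by a previous |start|; lets |stop| and
  // |abort| bail out early instead of minting a new id.
  bool HandleExists(const FakeHandle& handle) const {
    for (HandleMap::const_iterator it = handle_map_.begin();
         it != handle_map_.end(); ++it) {
      if (it->second.equals(handle))
        return true;
    }
    return false;
  }

 private:
  HandleMap handle_map_;
  int next_id_;
};

The map stays keyed by request id so that incoming IPC messages can be
resolved with a plain |find| (as |GetHandleFromID| does in the patch), while
handle lookups walk the values; with only a handful of concurrent sessions
per renderer, the linear scan is simpler than maintaining a reverse map.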