summaryrefslogtreecommitdiffstats
path: root/content/browser
diff options
context:
space:
mode:
authorprimiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-06-22 16:57:14 +0000
committerprimiano@chromium.org <primiano@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-06-22 16:57:14 +0000
commitc766aa9d160a4da403ff48e62c7f1a5dccdb3421 (patch)
treee3deca24d0149086b59a549bf4e87d229f03b88e /content/browser
parent03a94ccc9e67b37d2871290b73609c615e95f61f (diff)
downloadchromium_src-c766aa9d160a4da403ff48e62c7f1a5dccdb3421.zip
chromium_src-c766aa9d160a4da403ff48e62c7f1a5dccdb3421.tar.gz
chromium_src-c766aa9d160a4da403ff48e62c7f1a5dccdb3421.tar.bz2
Introduced experimental support for interacting with the google remote streaming speech recognition webservice (Speech CL2.3).
The support is very experimental by now and has a lot of debugging code for helping the development. BUG=116954 TEST=content_unittests:GoogleStreamingRemoteEngineTest Review URL: https://chromiumcodereview.appspot.com/10546020 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@143616 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'content/browser')
-rw-r--r--content/browser/speech/chunked_byte_buffer.cc1
-rw-r--r--content/browser/speech/google_streaming_remote_engine.cc586
-rw-r--r--content/browser/speech/google_streaming_remote_engine.h165
-rw-r--r--content/browser/speech/google_streaming_remote_engine_unittest.cc440
-rw-r--r--content/browser/speech/speech_recognition_manager_impl.cc11
-rw-r--r--content/browser/speech/speech_recognizer_impl.cc57
-rw-r--r--content/browser/speech/speech_recognizer_impl.h2
-rw-r--r--content/browser/speech/speech_recognizer_impl_unittest.cc5
8 files changed, 1248 insertions, 19 deletions
diff --git a/content/browser/speech/chunked_byte_buffer.cc b/content/browser/speech/chunked_byte_buffer.cc
index 123755a..115fa82 100644
--- a/content/browser/speech/chunked_byte_buffer.cc
+++ b/content/browser/speech/chunked_byte_buffer.cc
@@ -38,7 +38,6 @@ ChunkedByteBuffer::~ChunkedByteBuffer() {
}
void ChunkedByteBuffer::Append(const uint8* start, size_t length) {
- DCHECK(length > 0);
size_t remaining_bytes = length;
const uint8* next_data = start;
diff --git a/content/browser/speech/google_streaming_remote_engine.cc b/content/browser/speech/google_streaming_remote_engine.cc
new file mode 100644
index 0000000..34ff853
--- /dev/null
+++ b/content/browser/speech/google_streaming_remote_engine.cc
@@ -0,0 +1,586 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "content/browser/speech/google_streaming_remote_engine.h"
+
+#include <vector>
+
+#include "base/bind.h"
+#include "base/command_line.h"
+#include "base/rand_util.h"
+#include "base/string_number_conversions.h"
+#include "base/string_util.h"
+#include "base/time.h"
+#include "base/utf_string_conversions.h"
+#include "content/browser/speech/audio_buffer.h"
+#include "content/browser/speech/proto/google_streaming_api.pb.h"
+#include "content/public/browser/browser_thread.h"
+#include "content/public/common/content_switches.h"
+#include "content/public/common/speech_recognition_error.h"
+#include "content/public/common/speech_recognition_result.h"
+#include "net/base/escape.h"
+#include "net/base/load_flags.h"
+#include "net/url_request/url_fetcher.h"
+#include "net/url_request/url_request_context.h"
+#include "net/url_request/url_request_context_getter.h"
+#include "net/url_request/url_request_status.h"
+
+using content::BrowserThread;
+using content::SpeechRecognitionError;
+using content::SpeechRecognitionErrorCode;
+using content::SpeechRecognitionHypothesis;
+using content::SpeechRecognitionResult;
+using net::URLFetcher;
+
+namespace {
+
+// TODO(primiano): This shouldn't be a const, rather it should be taken from
+// maxNBest property (which is not yet implemented in WebKit).
+const int kMaxResults = 5;
+const char kDownstreamUrl[] = "/down?";
+const char kUpstreamUrl[] = "/up?";
+const int kAudioPacketIntervalMs = 100;
+const speech::AudioEncoder::Codec kDefaultAudioCodec =
+ speech::AudioEncoder::CODEC_FLAC;
+
+// TODO(primiano): /////////// Remove this after debug stage. /////////////////
+void DumpResponse(const std::string& response) {
+ bool parse_ok;
+ speech::HttpStreamingResult res;
+ parse_ok = res.ParseFromString(response);
+ DVLOG(1) << "------------";
+ if(!parse_ok) {
+ DVLOG(1) << "Parse failed!";
+ return;
+ }
+ if (res.has_id())
+ DVLOG(1) << "ID\t" << res.id();
+ if (res.has_status())
+ DVLOG(1) << "STATUS\t" << res.status();
+ if (res.has_upstream_connected())
+ DVLOG(1) << "UPCON\t" << res.upstream_connected();
+ if (res.hypotheses_size() > 0)
+ DVLOG(1) << "HYPS\t" << res.hypotheses_size();
+ if (res.has_provisional())
+ DVLOG(1) << "PROV\t" << res.provisional();
+ if (res.has_ephemeral())
+ DVLOG(1) << "EPHM\t" << res.ephemeral();
+ DVLOG(1) << "------------\n";
+}
+
+std::string GetWebserviceBaseURL() {
+ const CommandLine& command_line = *CommandLine::ForCurrentProcess();
+ return command_line.GetSwitchValueASCII(
+ switches::kSpeechRecognitionWebserviceURL);
+}
+
+std::string GetWebserviceKey() {
+ const CommandLine& command_line = *CommandLine::ForCurrentProcess();
+ return command_line.GetSwitchValueASCII(
+ switches::kSpeechRecognitionWebserviceKey);
+}
+
+} // namespace
+
+namespace speech {
+
+const int GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTests = 0;
+const int GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTests = 1;
+const int GoogleStreamingRemoteEngine::kWebserviceStatusNoError = 0;
+const int GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch = 5;
+
+GoogleStreamingRemoteEngine::GoogleStreamingRemoteEngine(
+ net::URLRequestContextGetter* context)
+ : url_context_(context),
+ encoder_(NULL),
+ previous_response_length_(0),
+ got_last_definitive_result_(false),
+ is_dispatching_event_(false),
+ state_(STATE_IDLE) {
+}
+
+GoogleStreamingRemoteEngine::~GoogleStreamingRemoteEngine() {}
+
+void GoogleStreamingRemoteEngine::SetConfig(
+ const SpeechRecognitionEngineConfig& config) {
+ config_ = config;
+}
+
+void GoogleStreamingRemoteEngine::StartRecognition() {
+ FSMEventArgs event_args(EVENT_START_RECOGNITION);
+ DispatchEvent(event_args);
+}
+
+void GoogleStreamingRemoteEngine::EndRecognition() {
+ FSMEventArgs event_args(EVENT_END_RECOGNITION);
+ DispatchEvent(event_args);
+}
+
+void GoogleStreamingRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
+ FSMEventArgs event_args(EVENT_AUDIO_CHUNK);
+ event_args.audio_data = &data;
+ DispatchEvent(event_args);
+}
+
+void GoogleStreamingRemoteEngine::AudioChunksEnded() {
+ FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED);
+ DispatchEvent(event_args);
+}
+
+void GoogleStreamingRemoteEngine::OnURLFetchComplete(const URLFetcher* source) {
+ const bool kResponseComplete = true;
+ DispatchHTTPResponse(source, kResponseComplete);
+}
+
+void GoogleStreamingRemoteEngine::OnURLFetchDownloadProgress(
+ const URLFetcher* source, int64 current, int64 total) {
+ const bool kPartialResponse = false;
+ DispatchHTTPResponse(source, kPartialResponse);
+}
+
+void GoogleStreamingRemoteEngine::DispatchHTTPResponse(const URLFetcher* source,
+ bool end_of_response) {
+ DCHECK(CalledOnValidThread());
+ DCHECK(source);
+ const bool response_is_good = source->GetStatus().is_success() &&
+ source->GetResponseCode() == 200;
+ std::string response;
+ if (response_is_good)
+ source->GetResponseAsString(&response);
+ const size_t current_response_length = response.size();
+
+ // TODO(primiano): /////////// Remove this after debug stage. ////////////////
+ DVLOG(1) << (source == downstream_fetcher_.get() ? "Downstream" : "Upstream")
+ << "HTTP, code: " << source->GetResponseCode()
+ << " length: " << current_response_length
+ << " eor: " << end_of_response;
+
+ // URLFetcher provides always the entire response buffer, but we are only
+ // interested in the fresh data introduced by the last chunk. Therefore, we
+ // drop the previous content we have already processed.
+ if (current_response_length != 0) {
+ DCHECK_GE(current_response_length, previous_response_length_);
+ response.erase(0, previous_response_length_);
+ previous_response_length_ = current_response_length;
+ }
+
+ if (!response_is_good && source == downstream_fetcher_.get()) {
+ // TODO(primiano): /////////// Remove this after debug stage. /////////////
+ DVLOG(1) << "Downstream error " << source->GetResponseCode();
+ FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR);
+ DispatchEvent(event_args);
+ return;
+ }
+ if (!response_is_good && source == upstream_fetcher_.get()) {
+ // TODO(primiano): /////////// Remove this after debug stage. /////////////
+ DVLOG(1) << "Upstream error " << source->GetResponseCode()
+ << " EOR " << end_of_response;
+ FSMEventArgs event_args(EVENT_UPSTREAM_ERROR);
+ DispatchEvent(event_args);
+ return;
+ }
+ if (source == upstream_fetcher_.get())
+ return;
+
+ DCHECK(response_is_good && source == downstream_fetcher_.get());
+
+ // The downstream response is organized in chunks, whose size is determined
+ // by a 4 bytes prefix, transparently handled by the ChunkedByteBuffer class.
+ // Such chunks are sent by the speech recognition webservice over the HTTP
+ // downstream channel using HTTP chunked transfer (unrelated to our chunks).
+ // This function is called every time an HTTP chunk is received by the
+ // url fetcher. However there isn't any particular matching beween our
+ // protocol chunks and HTTP chunks, in the sense that a single HTTP chunk can
+ // contain a portion of one chunk or even more chunks together.
+ chunked_byte_buffer_.Append(response);
+
+ // A single HTTP chunk can contain more than one data chunk, thus the while.
+ while (chunked_byte_buffer_.HasChunks()) {
+ FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE);
+ event_args.response = chunked_byte_buffer_.PopChunk();
+ DCHECK(event_args.response.get());
+ // TODO(primiano): /////////// Remove this after debug stage. /////////////
+ DumpResponse(std::string((char*)&(*event_args.response->begin()),
+ event_args.response->size()));
+ DispatchEvent(event_args);
+ }
+ if (end_of_response) {
+ FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED);
+ DispatchEvent(event_args);
+ }
+}
+
+bool GoogleStreamingRemoteEngine::IsRecognitionPending() const {
+ DCHECK(CalledOnValidThread());
+ return state_ != STATE_IDLE;
+}
+
+int GoogleStreamingRemoteEngine::GetDesiredAudioChunkDurationMs() const {
+ return kAudioPacketIntervalMs;
+}
+
+// ----------------------- Core FSM implementation ---------------------------
+
+void GoogleStreamingRemoteEngine::DispatchEvent(
+ const FSMEventArgs& event_args) {
+ DCHECK(CalledOnValidThread());
+ DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
+ DCHECK_LE(state_, STATE_MAX_VALUE);
+
+ // Event dispatching must be sequential, otherwise it will break all the rules
+ // and the assumptions of the finite state automata model.
+ DCHECK(!is_dispatching_event_);
+ is_dispatching_event_ = true;
+
+ // TODO(primiano): /////////// Remove this after debug stage. ////////////////
+ //DVLOG(1) << "State " << state_ << ", Event " << event_args.event;
+ state_ = ExecuteTransitionAndGetNextState(event_args);
+
+ is_dispatching_event_ = false;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::ExecuteTransitionAndGetNextState(
+ const FSMEventArgs& event_args) {
+ const FSMEvent event = event_args.event;
+ switch (state_) {
+ case STATE_IDLE:
+ switch (event) {
+ case EVENT_START_RECOGNITION:
+ return ConnectBothStreams(event_args);
+ case EVENT_END_RECOGNITION:
+ // Note AUDIO_CHUNK and AUDIO_END events can remain enqueued in case of
+ // abort, so we just silently drop them here.
+ case EVENT_AUDIO_CHUNK:
+ case EVENT_AUDIO_CHUNKS_ENDED:
+ // DOWNSTREAM_CLOSED can be received if we end up here due to an error.
+ case EVENT_DOWNSTREAM_CLOSED:
+ return DoNothing(event_args);
+ case EVENT_UPSTREAM_ERROR:
+ case EVENT_DOWNSTREAM_ERROR:
+ case EVENT_DOWNSTREAM_RESPONSE:
+ return NotFeasible(event_args);
+ }
+ break;
+ case STATE_BOTH_STREAMS_CONNECTED:
+ switch (event) {
+ case EVENT_AUDIO_CHUNK:
+ return TransmitAudioUpstream(event_args);
+ case EVENT_DOWNSTREAM_RESPONSE:
+ return ProcessDownstreamResponse(event_args);
+ case EVENT_AUDIO_CHUNKS_ENDED:
+ return CloseUpstreamAndWaitForResults(event_args);
+ case EVENT_END_RECOGNITION:
+ return AbortSilently(event_args);
+ case EVENT_UPSTREAM_ERROR:
+ case EVENT_DOWNSTREAM_ERROR:
+ case EVENT_DOWNSTREAM_CLOSED:
+ return AbortWithError(event_args);
+ case EVENT_START_RECOGNITION:
+ return NotFeasible(event_args);
+ }
+ break;
+ case STATE_WAITING_DOWNSTREAM_RESULTS:
+ switch (event) {
+ case EVENT_DOWNSTREAM_RESPONSE:
+ return ProcessDownstreamResponse(event_args);
+ case EVENT_DOWNSTREAM_CLOSED:
+ return RaiseNoMatchErrorIfGotNoResults(event_args);
+ case EVENT_END_RECOGNITION:
+ return AbortSilently(event_args);
+ case EVENT_UPSTREAM_ERROR:
+ case EVENT_DOWNSTREAM_ERROR:
+ return AbortWithError(event_args);
+ case EVENT_START_RECOGNITION:
+ case EVENT_AUDIO_CHUNK:
+ case EVENT_AUDIO_CHUNKS_ENDED:
+ return NotFeasible(event_args);
+ }
+ break;
+ }
+ return NotFeasible(event_args);
+}
+
+// ----------- Contract for all the FSM evolution functions below -------------
+// - Are guaranteed to be executed in the same thread (IO, except for tests);
+// - Are guaranteed to be not reentrant (themselves and each other);
+// - event_args members are guaranteed to be stable during the call;
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
+ DCHECK(!upstream_fetcher_.get());
+ DCHECK(!downstream_fetcher_.get());
+
+ encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
+ config_.audio_sample_rate,
+ config_.audio_num_bits_per_sample));
+ DCHECK(encoder_.get());
+ const std::string request_key = GenerateRequestKey();
+
+ // Setup downstream fetcher.
+ std::vector<std::string> downstream_args;
+ downstream_args.push_back("sky=" + GetWebserviceKey());
+ downstream_args.push_back("pair=" + request_key);
+ downstream_args.push_back("maxresults=" + base::IntToString(kMaxResults));
+
+ GURL downstream_url(GetWebserviceBaseURL() + std::string(kDownstreamUrl) +
+ JoinString(downstream_args, '&'));
+ // TODO(primiano): /////////// Remove this after debug stage. /////////////
+ DVLOG(1) << "Opening downstream: " + downstream_url.PathForRequest();
+
+ downstream_fetcher_.reset(URLFetcher::Create(
+ kDownstreamUrlFetcherIdForTests, downstream_url, URLFetcher::GET, this));
+ downstream_fetcher_->SetRequestContext(url_context_);
+ downstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
+ net::LOAD_DO_NOT_SEND_COOKIES |
+ net::LOAD_DO_NOT_SEND_AUTH_DATA);
+ downstream_fetcher_->Start();
+
+ // Setup upstream fetcher.
+ // TODO(primiano): Handle config_.grammar array when it will be implemented by
+ // the speech recognition webservice.
+ std::vector<std::string> upstream_args;
+ upstream_args.push_back("sky=" + GetWebserviceKey());
+ upstream_args.push_back("pair=" + request_key);
+ upstream_args.push_back(
+ "lang=" + net::EscapeQueryParamValue(GetAcceptedLanguages(), true));
+ upstream_args.push_back(
+ config_.filter_profanities ? "pfilter=2" : "pfilter=0");
+ upstream_args.push_back("maxresults=" + base::IntToString(kMaxResults));
+ upstream_args.push_back("client=myapp.mycompany.com");
+ // TODO(primiano): Can we remove this feature sending audio HW information?
+ if (!config_.hardware_info.empty()) {
+ upstream_args.push_back(
+ "xhw=" + net::EscapeQueryParamValue(config_.hardware_info, true));
+ }
+
+ GURL upstream_url(GetWebserviceBaseURL() + std::string(kUpstreamUrl) +
+ JoinString(upstream_args, '&'));
+
+ // TODO(primiano): /////////// Remove this after debug stage. ////////////////
+ DVLOG(1) << "Opening upstream: " + upstream_url.PathForRequest();
+
+ upstream_fetcher_.reset(URLFetcher::Create(
+ kUpstreamUrlFetcherIdForTests, upstream_url, URLFetcher::POST, this));
+ upstream_fetcher_->SetChunkedUpload(encoder_->mime_type());
+ upstream_fetcher_->SetRequestContext(url_context_);
+ upstream_fetcher_->SetReferrer(config_.origin_url);
+ upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
+ net::LOAD_DO_NOT_SEND_COOKIES |
+ net::LOAD_DO_NOT_SEND_AUTH_DATA);
+ upstream_fetcher_->Start();
+ previous_response_length_ = 0;
+ return STATE_BOTH_STREAMS_CONNECTED;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::TransmitAudioUpstream(
+ const FSMEventArgs& event_args) {
+ DCHECK(upstream_fetcher_.get());
+ DCHECK(event_args.audio_data.get());
+ const AudioChunk& audio = *(event_args.audio_data);
+
+ DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
+ encoder_->Encode(audio);
+ scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
+ upstream_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
+ return state_;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::ProcessDownstreamResponse(
+ const FSMEventArgs& event_args) {
+ DCHECK(event_args.response.get());
+ bool is_definitive_result = false;
+
+ HttpStreamingResult ws_result;
+ const bool protobuf_parse_successful = ws_result.ParseFromArray(
+ &(*event_args.response->begin()),
+ event_args.response->size());
+ if (!protobuf_parse_successful)
+ return AbortWithError(event_args);
+
+ // TODO(primiano): The code below sounds to me like a hack. Discuss the
+ // possibility of having a simpler and clearer protobuf grammar, in order to
+ // distinguish the different type of results and errors in a civilized way.
+
+ // Skip the upstream connected notification, since we're not interested in it.
+ if (ws_result.has_upstream_connected())
+ return state_;
+
+ if (ws_result.has_status()) {
+ switch (ws_result.status()) {
+ case kWebserviceStatusNoError:
+ is_definitive_result = true;
+ break;
+ case kWebserviceStatusErrorNoMatch:
+ // TODO(primiano): Contact gshires@, in case of no results, instead of
+ // the expected kWebserviceStatusErrorNoMatch status code we receive a
+ // provisional-like result with no provisional nor ephemeral strings.
+ return Abort(content::SPEECH_RECOGNITION_ERROR_NO_MATCH);
+ default:
+ VLOG(1) << "Received an unknown status code from the speech recognition"
+ "webservice (" << ws_result.status() << ")";
+ return Abort(content::SPEECH_RECOGNITION_ERROR_NETWORK);
+ }
+ }
+
+ SpeechRecognitionResult result;
+ if (is_definitive_result) {
+ got_last_definitive_result_ = true;
+ result.is_provisional = false;
+ for (int i = 0; i < ws_result.hypotheses_size(); ++i) {
+ const HttpStreamingHypothesis& ws_hypothesis = ws_result.hypotheses(i);
+ SpeechRecognitionHypothesis hypothesis;
+ DCHECK(ws_hypothesis.has_confidence());
+ hypothesis.confidence = ws_hypothesis.confidence();
+ DCHECK(ws_hypothesis.has_utterance());
+ hypothesis.utterance = UTF8ToUTF16(ws_hypothesis.utterance());
+ result.hypotheses.push_back(hypothesis);
+ }
+ } else {
+ result.is_provisional = true;
+ string16 transcript;
+ if (ws_result.has_provisional())
+ transcript.append(UTF8ToUTF16(ws_result.provisional()));
+ if (ws_result.has_ephemeral())
+ transcript.append(UTF8ToUTF16(ws_result.ephemeral()));
+ DCHECK(!transcript.empty());
+ result.hypotheses.push_back(SpeechRecognitionHypothesis(transcript, 0.0f));
+ }
+
+ // Propagate just actual results.
+ if (result.hypotheses.size() > 0)
+ delegate()->OnSpeechRecognitionEngineResult(result);
+
+ return state_;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::RaiseNoMatchErrorIfGotNoResults(
+ const FSMEventArgs& event_args) {
+ if (!got_last_definitive_result_) {
+ // Provide an empty result to notify that recognition is ended with no
+ // errors, yet neither any further results.
+ delegate()->OnSpeechRecognitionEngineResult(SpeechRecognitionResult());
+ }
+ return AbortSilently(event_args);
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::CloseUpstreamAndWaitForResults(
+ const FSMEventArgs&) {
+ DCHECK(upstream_fetcher_.get());
+ DCHECK(encoder_.get());
+
+ // TODO(primiano): /////////// Remove this after debug stage. ////////////////
+ DVLOG(1) << "Closing upstream";
+
+ // The encoder requires a non-empty final buffer. So we encode a packet
+ // of silence in case encoder had no data already.
+ std::vector<short> samples(
+ config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
+ scoped_refptr<AudioChunk> dummy_chunk =
+ new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
+ samples.size() * sizeof(short),
+ encoder_->bits_per_sample() / 8);
+ encoder_->Encode(*dummy_chunk);
+ encoder_->Flush();
+ scoped_refptr<AudioChunk> encoded_dummy_data =
+ encoder_->GetEncodedDataAndClear();
+ DCHECK(!encoded_dummy_data->IsEmpty());
+ encoder_.reset();
+
+ upstream_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
+ got_last_definitive_result_ = false;
+ return STATE_WAITING_DOWNSTREAM_RESULTS;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::CloseDownstream(const FSMEventArgs&) {
+ DCHECK(!upstream_fetcher_.get());
+ DCHECK(downstream_fetcher_.get());
+
+ // TODO(primiano): /////////// Remove this after debug stage. ////////////////
+ DVLOG(1) << "Closing downstream";
+ downstream_fetcher_.reset();
+ return STATE_IDLE;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::AbortSilently(const FSMEventArgs&) {
+ return Abort(content::SPEECH_RECOGNITION_ERROR_NONE);
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::AbortWithError(const FSMEventArgs&) {
+ return Abort(content::SPEECH_RECOGNITION_ERROR_NETWORK);
+}
+
+GoogleStreamingRemoteEngine::FSMState GoogleStreamingRemoteEngine::Abort(
+ SpeechRecognitionErrorCode error_code) {
+ // TODO(primiano): /////////// Remove this after debug stage. ////////////////
+ DVLOG(1) << "Aborting with error " << error_code;
+
+ if (error_code != content::SPEECH_RECOGNITION_ERROR_NONE) {
+ delegate()->OnSpeechRecognitionEngineError(
+ SpeechRecognitionError(error_code));
+ }
+ downstream_fetcher_.reset();
+ upstream_fetcher_.reset();
+ encoder_.reset();
+ return STATE_IDLE;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::DoNothing(const FSMEventArgs&) {
+ return state_;
+}
+
+GoogleStreamingRemoteEngine::FSMState
+GoogleStreamingRemoteEngine::NotFeasible(const FSMEventArgs& event_args) {
+ NOTREACHED() << "Unfeasible event " << event_args.event
+ << " in state " << state_;
+ return state_;
+}
+
+std::string GoogleStreamingRemoteEngine::GetAcceptedLanguages() const {
+ std::string langs = config_.language;
+ if (langs.empty() && url_context_) {
+ // If no language is provided then we use the first from the accepted
+ // language list. If this list is empty then it defaults to "en-US".
+ // Example of the contents of this list: "es,en-GB;q=0.8", ""
+ net::URLRequestContext* request_context =
+ url_context_->GetURLRequestContext();
+ DCHECK(request_context);
+ std::string accepted_language_list = request_context->accept_language();
+ size_t separator = accepted_language_list.find_first_of(",;");
+ if (separator > std::string::npos)
+ langs = accepted_language_list.substr(0, separator);
+ }
+ if (langs.empty())
+ langs = "en-US";
+ return langs;
+}
+
+// TODO(primiano): Is there any utility in the codebase that already does this?
+std::string GoogleStreamingRemoteEngine::GenerateRequestKey() const {
+ const int64 kKeepLowBytes = GG_LONGLONG(0x00000000FFFFFFFF);
+ const int64 kKeepHighBytes = GG_LONGLONG(0xFFFFFFFF00000000);
+
+ // Just keep the least significant bits of timestamp, in order to reduce
+ // probability of collisions.
+ int64 key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) |
+ (base::RandUint64() & kKeepHighBytes);
+ return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
+}
+
+GoogleStreamingRemoteEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
+ : event(event_value) {
+}
+
+GoogleStreamingRemoteEngine::FSMEventArgs::~FSMEventArgs() {
+}
+
+} // namespace speech
diff --git a/content/browser/speech/google_streaming_remote_engine.h b/content/browser/speech/google_streaming_remote_engine.h
new file mode 100644
index 0000000..fde4957
--- /dev/null
+++ b/content/browser/speech/google_streaming_remote_engine.h
@@ -0,0 +1,165 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
+#define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "base/basictypes.h"
+#include "base/memory/ref_counted.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/threading/non_thread_safe.h"
+#include "content/browser/speech/audio_encoder.h"
+#include "content/browser/speech/chunked_byte_buffer.h"
+#include "content/browser/speech/speech_recognition_engine.h"
+#include "content/common/content_export.h"
+#include "content/public/common/speech_recognition_error.h"
+#include "googleurl/src/gurl.h"
+#include "net/url_request/url_fetcher_delegate.h"
+
+namespace content {
+struct SpeechRecognitionError;
+struct SpeechRecognitionResult;
+}
+
+namespace net {
+class URLRequestContextGetter;
+}
+
+namespace speech {
+
+class AudioChunk;
+
+// Implements a SpeechRecognitionEngine supporting continuous recognition by
+// means of interaction with Google streaming speech recognition webservice.
+// More in details, this class establishes two HTTP(S) connections with the
+// webservice, for each session, herein called "upstream" and "downstream".
+// Audio chunks are sent on the upstream by means of a chunked HTTP POST upload.
+// Recognition results are retrieved in a full-duplex fashion (i.e. while
+// pushing audio on the upstream) on the downstream by means of a chunked
+// HTTP GET request. Pairing between the two stream is handled through a
+// randomly generated key, unique for each request, which is passed in the
+// &pair= arg to both stream request URLs.
+// In the case of a regular session, the upstream is closed when the audio
+// capture ends (notified through a |AudioChunksEnded| call) and the downstream
+// waits for a corresponding server closure (eventually some late results can
+// come after closing the upstream).
+// Both stream are guaranteed to be closed when |EndRecognition| call is issued.
+class CONTENT_EXPORT GoogleStreamingRemoteEngine
+ : public NON_EXPORTED_BASE(SpeechRecognitionEngine),
+ public net::URLFetcherDelegate,
+ public NON_EXPORTED_BASE(base::NonThreadSafe) {
+ public:
+ explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter* context);
+ virtual ~GoogleStreamingRemoteEngine();
+
+ // SpeechRecognitionEngine methods.
+ virtual void SetConfig(const SpeechRecognitionEngineConfig& config) OVERRIDE;
+ virtual void StartRecognition() OVERRIDE;
+ virtual void EndRecognition() OVERRIDE;
+ virtual void TakeAudioChunk(const AudioChunk& data) OVERRIDE;
+ virtual void AudioChunksEnded() OVERRIDE;
+ virtual bool IsRecognitionPending() const OVERRIDE;
+ virtual int GetDesiredAudioChunkDurationMs() const OVERRIDE;
+
+ // net::URLFetcherDelegate methods.
+ virtual void OnURLFetchComplete(const net::URLFetcher* source) OVERRIDE;
+ virtual void OnURLFetchDownloadProgress(const net::URLFetcher* source,
+ int64 current, int64 total) OVERRIDE;
+
+ private:
+ friend class GoogleStreamingRemoteEngineTest;
+
+ // IDs passed to URLFetcher::Create(). Used for testing.
+ static const int kUpstreamUrlFetcherIdForTests;
+ static const int kDownstreamUrlFetcherIdForTests;
+
+ // Response status codes from the speech recognition webservice.
+ static const int kWebserviceStatusNoError;
+ static const int kWebserviceStatusErrorNoMatch;
+
+ // Data types for the internal Finite State Machine (FSM).
+ enum FSMState {
+ STATE_IDLE = 0,
+ STATE_BOTH_STREAMS_CONNECTED,
+ STATE_WAITING_DOWNSTREAM_RESULTS,
+ STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
+ };
+
+ enum FSMEvent {
+ EVENT_END_RECOGNITION = 0,
+ EVENT_START_RECOGNITION,
+ EVENT_AUDIO_CHUNK,
+ EVENT_AUDIO_CHUNKS_ENDED,
+ EVENT_UPSTREAM_ERROR,
+ EVENT_DOWNSTREAM_ERROR,
+ EVENT_DOWNSTREAM_RESPONSE,
+ EVENT_DOWNSTREAM_CLOSED,
+ EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
+ };
+
+ struct FSMEventArgs {
+ explicit FSMEventArgs(FSMEvent event_value);
+ ~FSMEventArgs();
+
+ FSMEvent event;
+
+ // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
+ scoped_refptr<const AudioChunk> audio_data;
+
+ // In case of EVENT_DOWNSTREAM_RESPONSE, hold the current chunk bytes.
+ scoped_ptr<std::vector<uint8> > response;
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);
+ };
+
+ // Invoked by both upstream and downstream URLFetcher callbacks to handle
+ // new chunk data, connection closed or errors notifications.
+ void DispatchHTTPResponse(const net::URLFetcher* source,
+ bool end_of_response);
+
+ // Entry point for pushing any new external event into the recognizer FSM.
+ void DispatchEvent(const FSMEventArgs& event_args);
+
+ // Defines the behavior of the recognizer FSM, selecting the appropriate
+ // transition according to the current state and event.
+ FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);
+
+ // The methods below handle transitions of the recognizer FSM.
+ FSMState ConnectBothStreams(const FSMEventArgs& event_args);
+ FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
+ FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
+ FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
+ FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
+ FSMState CloseDownstream(const FSMEventArgs& event_args);
+ FSMState AbortSilently(const FSMEventArgs& event_args);
+ FSMState AbortWithError(const FSMEventArgs& event_args);
+ FSMState Abort(content::SpeechRecognitionErrorCode error);
+ FSMState DoNothing(const FSMEventArgs& event_args);
+ FSMState NotFeasible(const FSMEventArgs& event_args);
+
+ std::string GetAcceptedLanguages() const;
+ std::string GenerateRequestKey() const;
+
+ SpeechRecognitionEngineConfig config_;
+ scoped_ptr<net::URLFetcher> upstream_fetcher_;
+ scoped_ptr<net::URLFetcher> downstream_fetcher_;
+ scoped_refptr<net::URLRequestContextGetter> url_context_;
+ scoped_ptr<AudioEncoder> encoder_;
+ ChunkedByteBuffer chunked_byte_buffer_;
+ size_t previous_response_length_;
+ bool got_last_definitive_result_;
+ bool is_dispatching_event_;
+ FSMState state_;
+
+ DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine);
+};
+
+} // namespace speech
+
+#endif // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
diff --git a/content/browser/speech/google_streaming_remote_engine_unittest.cc b/content/browser/speech/google_streaming_remote_engine_unittest.cc
new file mode 100644
index 0000000..9de06b6
--- /dev/null
+++ b/content/browser/speech/google_streaming_remote_engine_unittest.cc
@@ -0,0 +1,440 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <queue>
+
+#include "base/memory/scoped_ptr.h"
+#include "base/message_loop.h"
+#include "base/utf_string_conversions.h"
+#include "content/browser/speech/audio_buffer.h"
+#include "content/browser/speech/google_streaming_remote_engine.h"
+#include "content/browser/speech/proto/google_streaming_api.pb.h"
+#include "content/public/common/speech_recognition_error.h"
+#include "content/public/common/speech_recognition_result.h"
+#include "net/url_request/test_url_fetcher_factory.h"
+#include "net/url_request/url_request_context_getter.h"
+#include "net/url_request/url_request_status.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using content::SpeechRecognitionHypothesis;
+using content::SpeechRecognitionResult;
+using net::URLRequestStatus;
+using net::TestURLFetcher;
+using net::TestURLFetcherFactory;
+
+namespace speech {
+
+// Note: the terms upstream and downstream are herein referring to the client
+// (engine_under_test_) viewpoint.
+
+class GoogleStreamingRemoteEngineTest
+ : public SpeechRecognitionEngineDelegate,
+ public testing::Test {
+ public:
+ GoogleStreamingRemoteEngineTest()
+ : last_number_of_upstream_chunks_seen_(0U),
+ error_(content::SPEECH_RECOGNITION_ERROR_NONE) { }
+
+ // Creates a speech recognition request and invokes its URL fetcher delegate
+ // with the given test data.
+ void CreateAndTestRequest(bool success, const std::string& http_response);
+
+ // SpeechRecognitionRequestDelegate methods.
+ virtual void OnSpeechRecognitionEngineResult(
+ const SpeechRecognitionResult& result) OVERRIDE {
+ results_.push(result);
+ }
+ virtual void OnSpeechRecognitionEngineError(
+ const content::SpeechRecognitionError& error) OVERRIDE {
+ error_ = error.code;
+ }
+
+ // testing::Test methods.
+ virtual void SetUp() OVERRIDE;
+ virtual void TearDown() OVERRIDE;
+
+ protected:
+ enum DownstreamError {
+ DOWNSTREAM_ERROR_NONE,
+ DOWNSTREAM_ERROR_HTTP500,
+ DOWNSTREAM_ERROR_NETWORK,
+ DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH
+ };
+ static bool ResultsAreEqual(const SpeechRecognitionResult& a,
+ const SpeechRecognitionResult& b);
+ static std::string SerializeProtobufResponse(const HttpStreamingResult& msg);
+ static std::string ToBigEndian32(uint32 value);
+
+ TestURLFetcher* GetUpstreamFetcher();
+ TestURLFetcher* GetDownstreamFetcher();
+ void StartMockRecognition();
+ void EndMockRecognition();
+ void InjectDummyAudioChunk();
+ size_t UpstreamChunksUploadedFromLastCall();
+ void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
+ void ExpectResultReceived(const SpeechRecognitionResult& result);
+ void CloseMockDownstream(DownstreamError error);
+
+ scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
+ TestURLFetcherFactory url_fetcher_factory_;
+ size_t last_number_of_upstream_chunks_seen_;
+ MessageLoop message_loop_;
+ std::string response_buffer_;
+ content::SpeechRecognitionErrorCode error_;
+ std::queue<SpeechRecognitionResult> results_;
+};
+
+TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) {
+ StartMockRecognition();
+ ASSERT_TRUE(GetUpstreamFetcher());
+ ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
+
+ // Inject some dummy audio chunks and check a corresponding chunked upload
+ // is performed every time on the server.
+ for (int i = 0; i < 3; ++i) {
+ InjectDummyAudioChunk();
+ ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+ }
+
+ // Ensure that a final (empty) audio chunk is uploaded on chunks end.
+ engine_under_test_->AudioChunksEnded();
+ ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+ // Simulate a protobuf message streamed from the server containing a single
+ // result with two hypotheses.
+ SpeechRecognitionResult result;
+ result.is_provisional = false;
+ result.hypotheses.push_back(
+ SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 1"), 0.1F));
+ result.hypotheses.push_back(
+ SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 2"), 0.2F));
+
+ ProvideMockResultDownstream(result);
+ ExpectResultReceived(result);
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+ // Ensure everything is closed cleanly after the downstream is closed.
+ CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+ EndMockRecognition();
+ ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_);
+ ASSERT_EQ(0U, results_.size());
+}
+
+TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) {
+ StartMockRecognition();
+ ASSERT_TRUE(GetUpstreamFetcher());
+ ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
+
+ for (int i = 0; i < 4; ++i) {
+ InjectDummyAudioChunk();
+ ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+
+ SpeechRecognitionResult result;
+ result.is_provisional = (i % 2 == 0); // Alternate result types.
+ float confidence = result.is_provisional ? 0.0F : (i * 0.1F);
+ result.hypotheses.push_back(
+ SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), confidence));
+
+ ProvideMockResultDownstream(result);
+ ExpectResultReceived(result);
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+ }
+
+ // Ensure that a final (empty) audio chunk is uploaded on chunks end.
+ engine_under_test_->AudioChunksEnded();
+ ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+ // Simulate a final definitive result.
+ SpeechRecognitionResult result;
+ result.is_provisional = false;
+ result.hypotheses.push_back(
+ SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 1.0F));
+ ProvideMockResultDownstream(result);
+ ExpectResultReceived(result);
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+ // Ensure everything is closed cleanly after the downstream is closed.
+ CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+ EndMockRecognition();
+ ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_);
+ ASSERT_EQ(0U, results_.size());
+}
+
+TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) {
+ StartMockRecognition();
+ ASSERT_TRUE(GetUpstreamFetcher());
+ ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
+
+ // Simulate one pushed audio chunk.
+ InjectDummyAudioChunk();
+ ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+
+ // Simulate the corresponding definitive result.
+ SpeechRecognitionResult result;
+ result.hypotheses.push_back(
+ SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), 1.0F));
+ ProvideMockResultDownstream(result);
+ ExpectResultReceived(result);
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+ // Simulate a silent downstream closure after |AudioChunksEnded|.
+ engine_under_test_->AudioChunksEnded();
+ ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+ CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
+
+ // Expect an empty result, aimed at notifying recognition ended with no
+ // actual results nor errors.
+ SpeechRecognitionResult empty_result;
+ ExpectResultReceived(empty_result);
+
+ // Ensure everything is closed cleanly after the downstream is closed.
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+ EndMockRecognition();
+ ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_);
+ ASSERT_EQ(0U, results_.size());
+}
+
+TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) {
+ StartMockRecognition();
+ ASSERT_TRUE(GetUpstreamFetcher());
+ ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
+
+ for (int i = 0; i < 3; ++i)
+ InjectDummyAudioChunk();
+ engine_under_test_->AudioChunksEnded();
+ ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall());
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+ // Simulate only a provisional result.
+ SpeechRecognitionResult result;
+ result.is_provisional = true;
+ result.hypotheses.push_back(
+ SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 0.0F));
+ ProvideMockResultDownstream(result);
+ ExpectResultReceived(result);
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+ CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH);
+
+ // Expect a SPEECH_RECOGNITION_ERROR_NO_MATCH error to be raised.
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+ EndMockRecognition();
+ ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NO_MATCH, error_);
+ ASSERT_EQ(0U, results_.size());
+}
+
+TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) {
+ StartMockRecognition();
+ ASSERT_TRUE(GetUpstreamFetcher());
+ ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
+
+ InjectDummyAudioChunk();
+ ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+
+ // Close the downstream with a HTTP 500 error.
+ CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500);
+
+ // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+ EndMockRecognition();
+ ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_);
+ ASSERT_EQ(0U, results_.size());
+}
+
+TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) {
+ StartMockRecognition();
+ ASSERT_TRUE(GetUpstreamFetcher());
+ ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
+
+ InjectDummyAudioChunk();
+ ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+
+ // Close the downstream fetcher simulating a network failure.
+ CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK);
+
+ // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+ EndMockRecognition();
+ ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_);
+ ASSERT_EQ(0U, results_.size());
+}
+
+void GoogleStreamingRemoteEngineTest::SetUp() {
+ engine_under_test_.reset(
+ new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
+ engine_under_test_->set_delegate(this);
+}
+
+void GoogleStreamingRemoteEngineTest::TearDown() {
+ engine_under_test_.reset();
+}
+
+TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() {
+ return url_fetcher_factory_.GetFetcherByID(
+ GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTests);
+}
+
+TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() {
+ return url_fetcher_factory_.GetFetcherByID(
+ GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTests);
+}
+
+// Starts recognition on the engine, ensuring that both stream fetchers are
+// created.
+void GoogleStreamingRemoteEngineTest::StartMockRecognition() {
+ DCHECK(engine_under_test_.get());
+
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+
+ engine_under_test_->StartRecognition();
+ ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+ TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
+ ASSERT_TRUE(upstream_fetcher);
+ upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL());
+
+ TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
+ ASSERT_TRUE(downstream_fetcher);
+ downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL());
+}
+
+void GoogleStreamingRemoteEngineTest::EndMockRecognition() {
+ DCHECK(engine_under_test_.get());
+ engine_under_test_->EndRecognition();
+ ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+
+ // TODO(primiano): In order to be very pedantic we should check that both
+ // the upstream and downstream URL fetchers have been disposed at this time.
+ // Unfortunately it seems that there is no direct way to detect (in tests)
+ // if a url_fetcher has been freed or not, since they are not automatically
+ // de-registered from the TestURLFetcherFactory on destruction.
+ // ASSERT_FALSE(GetUpstreamFetcher());
+ // ASSERT_FALSE(GetDownstreamFetcher());
+}
+
+void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() {
+ unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'};
+ scoped_refptr<AudioChunk> dummy_audio_chunk(
+ new AudioChunk(&dummy_audio_buffer_data[0],
+ sizeof(dummy_audio_buffer_data),
+ 2 /* bytes per sample */));
+ DCHECK(engine_under_test_.get());
+ engine_under_test_->TakeAudioChunk(*dummy_audio_chunk);
+}
+
+size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
+ TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
+ DCHECK(upstream_fetcher);
+ const size_t number_of_chunks = upstream_fetcher->upload_chunks().size();
+ DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_);
+ const size_t new_chunks = number_of_chunks -
+ last_number_of_upstream_chunks_seen_;
+ last_number_of_upstream_chunks_seen_ = number_of_chunks;
+ return new_chunks;
+}
+
+void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream(
+ const SpeechRecognitionResult& result) {
+ TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
+
+ ASSERT_TRUE(downstream_fetcher);
+ downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */));
+ downstream_fetcher->set_response_code(200);
+
+ HttpStreamingResult response;
+ if (result.is_provisional) {
+ DCHECK_EQ(result.hypotheses.size(), 1U);
+ const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[0];
+ response.set_provisional(UTF16ToUTF8(hypothesis.utterance));
+ } else {
+ response.set_status(GoogleStreamingRemoteEngine::kWebserviceStatusNoError);
+ for (size_t i = 0; i < result.hypotheses.size(); ++i) {
+ const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i];
+ HttpStreamingHypothesis* ws_hypothesis = response.add_hypotheses();
+ ws_hypothesis->set_confidence(hypothesis.confidence);
+ ws_hypothesis->set_utterance(UTF16ToUTF8(hypothesis.utterance));
+ }
+ }
+
+ std::string response_string = SerializeProtobufResponse(response);
+ response_buffer_.append(response_string);
+ downstream_fetcher->SetResponseString(response_buffer_);
+ downstream_fetcher->delegate()->OnURLFetchDownloadProgress(
+ downstream_fetcher,
+ response_buffer_.size(),
+ -1 /* total response length not used */);
+}
+
+void GoogleStreamingRemoteEngineTest::CloseMockDownstream(
+ DownstreamError error) {
+ TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
+ ASSERT_TRUE(downstream_fetcher);
+
+ const URLRequestStatus::Status fetcher_status =
+ (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED :
+ URLRequestStatus::SUCCESS;
+ downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0));
+ downstream_fetcher->set_response_code(
+ (error == DOWNSTREAM_ERROR_HTTP500) ? 500 : 200);
+
+if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) {
+ HttpStreamingResult response;
+ response.set_status(
+ GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch);
+ response_buffer_.append(SerializeProtobufResponse(response));
+ }
+ downstream_fetcher->SetResponseString(response_buffer_);
+ downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher);
+}
+
+void GoogleStreamingRemoteEngineTest::ExpectResultReceived(
+ const SpeechRecognitionResult& result) {
+ ASSERT_GE(1U, results_.size());
+ ASSERT_TRUE(ResultsAreEqual(result, results_.front()));
+ results_.pop();
+}
+
+bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
+ const SpeechRecognitionResult& a, const SpeechRecognitionResult& b) {
+ if (a.is_provisional != b.is_provisional ||
+ a.hypotheses.size() != b.hypotheses.size()) {
+ return false;
+ }
+ for (size_t i = 0; i < a.hypotheses.size(); ++i) {
+ const SpeechRecognitionHypothesis& hyp_a = a.hypotheses[i];
+ const SpeechRecognitionHypothesis& hyp_b = b.hypotheses[i];
+ if (hyp_a.utterance != hyp_b.utterance ||
+ hyp_a.confidence != hyp_b.confidence) {
+ return false;
+ }
+ }
+ return true;
+}
+
+std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
+ const HttpStreamingResult& msg) {
+ std::string response_string;
+ msg.SerializeToString(&response_string);
+
+ // Append 4 byte prefix length indication to the protobuf message as envisaged
+ // by the google streaming recognition webservice protocol.
+ response_string.insert(0, ToBigEndian32(response_string.size()));
+ return response_string;
+}
+
+std::string GoogleStreamingRemoteEngineTest::ToBigEndian32(uint32 value) {
+ char raw_data[4];
+ raw_data[0] = static_cast<uint8>((value >> 24) & 0xFF);
+ raw_data[1] = static_cast<uint8>((value >> 16) & 0xFF);
+ raw_data[2] = static_cast<uint8>((value >> 8) & 0xFF);
+ raw_data[3] = static_cast<uint8>(value & 0xFF);
+ return std::string(raw_data, sizeof(raw_data));
+}
+
+} // namespace speech
diff --git a/content/browser/speech/speech_recognition_manager_impl.cc b/content/browser/speech/speech_recognition_manager_impl.cc
index 47b5420..65351c1 100644
--- a/content/browser/speech/speech_recognition_manager_impl.cc
+++ b/content/browser/speech/speech_recognition_manager_impl.cc
@@ -7,6 +7,7 @@
#include "base/bind.h"
#include "content/browser/browser_main_loop.h"
#include "content/browser/speech/google_one_shot_remote_engine.h"
+#include "content/browser/speech/google_streaming_remote_engine.h"
#include "content/browser/speech/speech_recognition_engine.h"
#include "content/browser/speech/speech_recognizer_impl.h"
#include "content/public/browser/browser_thread.h"
@@ -101,12 +102,20 @@ int SpeechRecognitionManagerImpl::CreateSession(
remote_engine_config.hardware_info = hardware_info;
remote_engine_config.origin_url = can_report_metrics ? config.origin_url : "";
- SpeechRecognitionEngine* google_remote_engine =
+ SpeechRecognitionEngine* google_remote_engine;
+ if (config.is_one_shot) {
+ google_remote_engine =
new GoogleOneShotRemoteEngine(config.url_request_context_getter);
+ } else {
+ google_remote_engine =
+ new GoogleStreamingRemoteEngine(config.url_request_context_getter);
+ }
+
google_remote_engine->SetConfig(remote_engine_config);
session.recognizer = new SpeechRecognizerImpl(this,
session_id,
+ config.is_one_shot,
google_remote_engine);
return session_id;
}
diff --git a/content/browser/speech/speech_recognizer_impl.cc b/content/browser/speech/speech_recognizer_impl.cc
index 18c6c9f..1141d9e 100644
--- a/content/browser/speech/speech_recognizer_impl.cc
+++ b/content/browser/speech/speech_recognizer_impl.cc
@@ -80,6 +80,7 @@ COMPILE_ASSERT(SpeechRecognizerImpl::kNumBitsPerAudioSample % 8 == 0,
SpeechRecognizerImpl::SpeechRecognizerImpl(
SpeechRecognitionEventListener* listener,
int session_id,
+ bool is_single_shot,
SpeechRecognitionEngine* engine)
: listener_(listener),
testing_audio_manager_(NULL),
@@ -87,6 +88,7 @@ SpeechRecognizerImpl::SpeechRecognizerImpl(
endpointer_(kAudioSampleRate),
session_id_(session_id),
is_dispatching_event_(false),
+ is_single_shot_(is_single_shot),
state_(STATE_IDLE) {
DCHECK(listener_ != NULL);
DCHECK(recognition_engine_ != NULL);
@@ -197,7 +199,7 @@ void SpeechRecognizerImpl::OnSpeechRecognitionEngineError(
}
// ----------------------- Core FSM implementation ---------------------------
-// TODO(primiano) After the changes in the media package (r129173), this class
+// TODO(primiano): After the changes in the media package (r129173), this class
// slightly violates the SpeechRecognitionEventListener interface contract. In
// particular, it is not true anymore that this class can be freed after the
// OnRecognitionEnd event, since the audio_controller_.Close() asynchronous
@@ -240,7 +242,7 @@ SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
switch (state_) {
case STATE_IDLE:
switch (event) {
- // TODO(primiano) restore UNREACHABLE_CONDITION on EVENT_ABORT and
+ // TODO(primiano): restore UNREACHABLE_CONDITION on EVENT_ABORT and
// EVENT_STOP_CAPTURE below once speech input extensions are fixed.
case EVENT_ABORT:
return DoNothing(event_args);
@@ -349,7 +351,7 @@ SpeechRecognizerImpl::ExecuteTransitionAndGetNextState(
// - The class won't be freed in the meanwhile due to callbacks;
// - IsCapturingAudio() returns true if and only if audio_controller_ != NULL.
-// TODO(primiano) the audio pipeline is currently serial. However, the
+// TODO(primiano): the audio pipeline is currently serial. However, the
// clipper->endpointer->vumeter chain and the sr_engine could be parallelized.
// We should profile the execution to see if it would be worth or not.
void SpeechRecognizerImpl::ProcessAudioPipeline(const AudioChunk& raw_audio) {
@@ -464,9 +466,10 @@ SpeechRecognizerImpl::DetectUserSpeechOrTimeout(const FSMEventArgs&) {
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::DetectEndOfSpeech(const FSMEventArgs& event_args) {
- if (endpointer_.speech_input_complete()) {
+ // End-of-speech detection is performed only in one-shot mode.
+ // TODO(primiano): What about introducing a longer timeout for continuous rec?
+ if (is_single_shot_ && endpointer_.speech_input_complete())
return StopCaptureAndWaitForResult(event_args);
- }
return STATE_RECOGNIZING;
}
@@ -532,21 +535,43 @@ SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::Abort(
return STATE_IDLE;
}
-SpeechRecognizerImpl::FSMState
-SpeechRecognizerImpl::ProcessIntermediateResult(const FSMEventArgs&) {
- // This is in preparation for future speech recognition functions.
- NOTREACHED();
+SpeechRecognizerImpl::FSMState SpeechRecognizerImpl::ProcessIntermediateResult(
+ const FSMEventArgs& event_args) {
+ // Provisional results can occur only during continuous (non one-shot) mode.
+ // If this check is reached it means that a continuous speech recognition
+ // engine is being used for a one shot recognition.
+ DCHECK_EQ(false, is_single_shot_);
+ const SpeechRecognitionResult& result = event_args.engine_result;
+ listener_->OnRecognitionResult(session_id_, result);
return state_;
}
SpeechRecognizerImpl::FSMState
SpeechRecognizerImpl::ProcessFinalResult(const FSMEventArgs& event_args) {
const SpeechRecognitionResult& result = event_args.engine_result;
- DVLOG(1) << "Got valid result";
- recognition_engine_->EndRecognition();
- listener_->OnRecognitionResult(session_id_, result);
- listener_->OnRecognitionEnd(session_id_);
- return STATE_IDLE;
+ if (result.is_provisional) {
+ DCHECK(!is_single_shot_);
+ listener_->OnRecognitionResult(session_id_, result);
+ // We don't end the recognition if a provisional result is received in
+ // STATE_WAITING_FINAL_RESULT. A definitive result will come next and will
+ // end the recognition.
+ return state_;
+ } else {
+ recognition_engine_->EndRecognition();
+ // We could receive an empty result (which we won't propagate further)
+ // in the following (continuous) scenario:
+ // 1. The caller start pushing audio and receives some results;
+ // 2. A |StopAudioCapture| is issued later;
+ // 3. The final audio frames captured in the interval ]1,2] do not lead to
+ // any result (nor any error);
+ // 4. The speech recognition engine, therefore, emits an empty result to
+ // notify that the recognition is ended with no error, yet neither any
+ // further result.
+ if (result.hypotheses.size() > 0)
+ listener_->OnRecognitionResult(session_id_, result);
+ listener_->OnRecognitionEnd(session_id_);
+ return STATE_IDLE;
+ }
}
SpeechRecognizerImpl::FSMState
@@ -563,7 +588,7 @@ SpeechRecognizerImpl::NotFeasible(const FSMEventArgs& event_args) {
void SpeechRecognizerImpl::CloseAudioControllerAsynchronously() {
DCHECK(IsCapturingAudio());
- DVLOG(1) << "SpeechRecognizerImpl stopping audio capture.";
+ DVLOG(1) << "SpeechRecognizerImpl closing audio controller.";
// Issues a Close on the audio controller, passing an empty callback. The only
// purpose of such callback is to keep the audio controller refcounted until
// Close has completed (in the audio thread) and automatically destroy it
@@ -581,7 +606,7 @@ void SpeechRecognizerImpl::UpdateSignalAndNoiseLevels(const float& rms,
bool clip_detected) {
// Calculate the input volume to display in the UI, smoothing towards the
// new level.
- // TODO(primiano) Do we really need all this floating point arith here?
+ // TODO(primiano): Do we really need all this floating point arith here?
// Perhaps it might be quite expensive on mobile.
float level = (rms - kAudioMeterMinDb) /
(kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
diff --git a/content/browser/speech/speech_recognizer_impl.h b/content/browser/speech/speech_recognizer_impl.h
index 8179e2743..89e41e6 100644
--- a/content/browser/speech/speech_recognizer_impl.h
+++ b/content/browser/speech/speech_recognizer_impl.h
@@ -45,6 +45,7 @@ class CONTENT_EXPORT SpeechRecognizerImpl
SpeechRecognizerImpl(
content::SpeechRecognitionEventListener* listener,
int session_id,
+ bool is_single_shot,
SpeechRecognitionEngine* engine);
void StartRecognition();
@@ -153,6 +154,7 @@ class CONTENT_EXPORT SpeechRecognizerImpl
int num_samples_recorded_;
float audio_level_;
bool is_dispatching_event_;
+ bool is_single_shot_;
FSMState state_;
DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl);
diff --git a/content/browser/speech/speech_recognizer_impl_unittest.cc b/content/browser/speech/speech_recognizer_impl_unittest.cc
index e73a894b..4a487c3 100644
--- a/content/browser/speech/speech_recognizer_impl_unittest.cc
+++ b/content/browser/speech/speech_recognizer_impl_unittest.cc
@@ -117,7 +117,10 @@ class SpeechRecognizerImplTest : public content::SpeechRecognitionEventListener,
config.filter_profanities = false;
sr_engine->SetConfig(config);
- recognizer_ = new SpeechRecognizerImpl(this, 1, sr_engine);
+ const int kTestingSessionId = 1;
+ const bool kOneShotMode = true;
+ recognizer_ = new SpeechRecognizerImpl(
+ this, kTestingSessionId, kOneShotMode, sr_engine);
recognizer_->SetAudioManagerForTesting(audio_manager_.get());
int audio_packet_length_bytes =