author     amistry <amistry@chromium.org>          2014-12-15 18:10:00 -0800
committer  Commit bot <commit-bot@chromium.org>    2014-12-16 02:10:31 +0000
commit     77b87626cc6b2a9f4d9b2b5b4f23eab33793fb65
tree       b3a93e1fec64ed5914c8ed3749be7a379e4ae9c7 /content/browser/speech
parent     388679724f9abba959ba70a600cb5946b7be9249
Implement support for uploading preamble audio in the voice recognition service.
The preamble is the few seconds of audio before starting voice recognition. In
particular, it contains the 'Ok Google' hotword used to start a voice query.
The 'Ok Google' hotword is the only scenario where this is recorded, and only
if the user has audio history enabled.
To support this, a new framed data format has been introduced for the /up POST
request. This change switches to that new format only if the recognition
request contains a preamble. The preamble is only uploaded when authentication
parameters are available.
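
Concretely, each chunk in the framed format is an 8-byte header followed by the
encoded audio payload: a 4-byte big-endian payload length and a 4-byte
big-endian frame type (0 = preamble audio, 1 = recognition audio), as
implemented by UploadAudioChunk() in the diff below. A minimal standalone
sketch of the framing (plain C++; the helper names WriteBigEndian32 and
FrameAudioChunk are illustrative only, the real code uses base::WriteBigEndian):

  #include <cstdint>
  #include <string>

  // Frame type values, matching the FrameType enum added to
  // GoogleStreamingRemoteEngine in this change.
  const uint32_t kFramePreambleAudio = 0;     // preamble ('Ok Google') audio
  const uint32_t kFrameRecognitionAudio = 1;  // regular recognition audio

  // Appends |value| to |out| in big-endian byte order.
  void WriteBigEndian32(uint32_t value, std::string* out) {
    out->push_back(static_cast<char>((value >> 24) & 0xFF));
    out->push_back(static_cast<char>((value >> 16) & 0xFF));
    out->push_back(static_cast<char>((value >> 8) & 0xFF));
    out->push_back(static_cast<char>(value & 0xFF));
  }

  // Wraps one encoded audio chunk in the framed POST format:
  // [4-byte payload size][4-byte frame type][payload bytes].
  std::string FrameAudioChunk(const std::string& payload, uint32_t type) {
    std::string frame;
    frame.reserve(payload.size() + 8);
    WriteBigEndian32(static_cast<uint32_t>(payload.size()), &frame);
    WriteBigEndian32(type, &frame);
    frame += payload;
    return frame;
  }

When no preamble is present, the request falls back to the existing unframed
upload, appending encoded audio chunks to the POST body unchanged.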
BUG=397019
Review URL: https://codereview.chromium.org/797913002
Cr-Commit-Position: refs/heads/master@{#308501}
Diffstat (limited to 'content/browser/speech')
5 files changed, 146 insertions, 3 deletions
diff --git a/content/browser/speech/google_streaming_remote_engine.cc b/content/browser/speech/google_streaming_remote_engine.cc
index ee84444..ca8e906 100644
--- a/content/browser/speech/google_streaming_remote_engine.cc
+++ b/content/browser/speech/google_streaming_remote_engine.cc
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <vector>
 
+#include "base/big_endian.h"
 #include "base/bind.h"
 #include "base/rand_util.h"
 #include "base/strings/string_number_conversions.h"
@@ -82,6 +83,7 @@ GoogleStreamingRemoteEngine::GoogleStreamingRemoteEngine(
       previous_response_length_(0),
       got_last_definitive_result_(false),
       is_dispatching_event_(false),
+      use_framed_post_data_(false),
       state_(STATE_IDLE) {}
 
 GoogleStreamingRemoteEngine::~GoogleStreamingRemoteEngine() {}
@@ -298,6 +300,18 @@ GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
   DCHECK(encoder_.get());
   const std::string request_key = GenerateRequestKey();
 
+  // Only use the framed post data format when a preamble needs to be logged.
+  use_framed_post_data_ = (config_.preamble &&
+                           !config_.preamble->sample_data.empty() &&
+                           !config_.auth_token.empty() &&
+                           !config_.auth_scope.empty());
+  if (use_framed_post_data_) {
+    preamble_encoder_.reset(AudioEncoder::Create(
+        kDefaultAudioCodec,
+        config_.preamble->sample_rate,
+        config_.preamble->sample_depth * 8));
+  }
+
   // Setup downstream fetcher.
   std::vector<std::string> downstream_args;
   downstream_args.push_back(
@@ -349,13 +363,24 @@ GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
     upstream_args.push_back(
         "authToken=" + net::EscapeQueryParamValue(config_.auth_token, true));
   }
+  if (use_framed_post_data_) {
+    std::string audio_format;
+    if (preamble_encoder_)
+      audio_format = preamble_encoder_->mime_type() + ",";
+    audio_format += encoder_->mime_type();
+    upstream_args.push_back(
+        "audioFormat=" + net::EscapeQueryParamValue(audio_format, true));
+  }
   GURL upstream_url(std::string(kWebServiceBaseUrl) +
                     std::string(kUpstreamUrl) +
                     JoinString(upstream_args, '&'));
 
   upstream_fetcher_.reset(URLFetcher::Create(
       kUpstreamUrlFetcherIdForTesting, upstream_url, URLFetcher::POST, this));
-  upstream_fetcher_->SetChunkedUpload(encoder_->mime_type());
+  if (use_framed_post_data_)
+    upstream_fetcher_->SetChunkedUpload("application/octet-stream");
+  else
+    upstream_fetcher_->SetChunkedUpload(encoder_->mime_type());
   upstream_fetcher_->SetRequestContext(url_context_.get());
   upstream_fetcher_->SetReferrer(config_.origin_url);
   upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
@@ -363,6 +388,19 @@ GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
                                   net::LOAD_DO_NOT_SEND_AUTH_DATA);
   upstream_fetcher_->Start();
   previous_response_length_ = 0;
+
+  if (preamble_encoder_) {
+    // Encode and send preamble right away.
+    scoped_refptr<AudioChunk> chunk = new AudioChunk(
+        reinterpret_cast<const uint8*>(config_.preamble->sample_data.data()),
+        config_.preamble->sample_data.size(),
+        config_.preamble->sample_depth);
+    preamble_encoder_->Encode(*chunk);
+    preamble_encoder_->Flush();
+    scoped_refptr<AudioChunk> encoded_data(
+        preamble_encoder_->GetEncodedDataAndClear());
+    UploadAudioChunk(encoded_data->AsString(), FRAME_PREAMBLE_AUDIO, false);
+  }
   return STATE_BOTH_STREAMS_CONNECTED;
 }
 
@@ -376,7 +414,7 @@ GoogleStreamingRemoteEngine::TransmitAudioUpstream(
   DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
   encoder_->Encode(audio);
   scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
-  upstream_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
+  UploadAudioChunk(encoded_data->AsString(), FRAME_RECOGNITION_AUDIO, false);
   return state_;
 }
 
@@ -488,7 +526,9 @@ GoogleStreamingRemoteEngine::CloseUpstreamAndWaitForResults(
   DCHECK(!encoded_dummy_data->IsEmpty());
   encoder_.reset();
 
-  upstream_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
+  UploadAudioChunk(encoded_dummy_data->AsString(),
+                   FRAME_RECOGNITION_AUDIO,
+                   true);
   got_last_definitive_result_ = false;
   return STATE_WAITING_DOWNSTREAM_RESULTS;
 }
@@ -576,6 +616,20 @@ std::string GoogleStreamingRemoteEngine::GenerateRequestKey() const {
   return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
 }
 
+void GoogleStreamingRemoteEngine::UploadAudioChunk(const std::string& data,
+                                                   FrameType type,
+                                                   bool is_final) {
+  if (use_framed_post_data_) {
+    std::string frame(data.size() + 8, 0);
+    base::WriteBigEndian(&frame[0], static_cast<uint32_t>(data.size()));
+    base::WriteBigEndian(&frame[4], static_cast<uint32_t>(type));
+    frame.replace(8, data.size(), data);
+    upstream_fetcher_->AppendChunkToUpload(frame, is_final);
+  } else {
+    upstream_fetcher_->AppendChunkToUpload(data, is_final);
+  }
+}
+
 GoogleStreamingRemoteEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
     : event(event_value) {
 }
diff --git a/content/browser/speech/google_streaming_remote_engine.h b/content/browser/speech/google_streaming_remote_engine.h
index 961ef08..15b866b 100644
--- a/content/browser/speech/google_streaming_remote_engine.h
+++ b/content/browser/speech/google_streaming_remote_engine.h
@@ -79,6 +79,13 @@ class CONTENT_EXPORT GoogleStreamingRemoteEngine
   static const int kWebserviceStatusNoError;
   static const int kWebserviceStatusErrorNoMatch;
 
+  // Frame type for framed POST data. Do NOT change these. They must match
+  // values the server expects.
+  enum FrameType {
+    FRAME_PREAMBLE_AUDIO = 0,
+    FRAME_RECOGNITION_AUDIO = 1
+  };
+
   // Data types for the internal Finite State Machine (FSM).
   enum FSMState {
     STATE_IDLE = 0,
@@ -143,15 +150,21 @@ class CONTENT_EXPORT GoogleStreamingRemoteEngine
   std::string GetAcceptedLanguages() const;
   std::string GenerateRequestKey() const;
 
+  // Upload a single chunk of audio data. Handles both unframed and framed
+  // upload formats, and uses the appropriate one.
+  void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);
+
   SpeechRecognitionEngineConfig config_;
   scoped_ptr<net::URLFetcher> upstream_fetcher_;
   scoped_ptr<net::URLFetcher> downstream_fetcher_;
   scoped_refptr<net::URLRequestContextGetter> url_context_;
   scoped_ptr<AudioEncoder> encoder_;
+  scoped_ptr<AudioEncoder> preamble_encoder_;
   ChunkedByteBuffer chunked_byte_buffer_;
   size_t previous_response_length_;
   bool got_last_definitive_result_;
   bool is_dispatching_event_;
+  bool use_framed_post_data_;
   FSMState state_;
 
   DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine);
diff --git a/content/browser/speech/google_streaming_remote_engine_unittest.cc b/content/browser/speech/google_streaming_remote_engine_unittest.cc
index 57947e6..0f17fa0 100644
--- a/content/browser/speech/google_streaming_remote_engine_unittest.cc
+++ b/content/browser/speech/google_streaming_remote_engine_unittest.cc
@@ -4,6 +4,7 @@
 
 #include <queue>
 
+#include "base/big_endian.h"
 #include "base/memory/scoped_ptr.h"
 #include "base/message_loop/message_loop.h"
 #include "base/numerics/safe_conversions.h"
@@ -27,6 +28,10 @@ using net::TestURLFetcherFactory;
 
 namespace content {
 
+// Frame types for framed POST data.
+static const uint32_t kFrameTypePreamble = 0;
+static const uint32_t kFrameTypeRecognitionAudio = 1;
+
 // Note: the terms upstream and downstream are from the point-of-view of the
 // client (engine_under_test_).
 
@@ -73,10 +78,12 @@ class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate,
   void EndMockRecognition();
   void InjectDummyAudioChunk();
   size_t UpstreamChunksUploadedFromLastCall();
+  std::string LastUpstreamChunkUploaded();
   void ProvideMockProtoResultDownstream(
       const proto::SpeechRecognitionEvent& result);
   void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
   void ExpectResultsReceived(const SpeechRecognitionResults& result);
+  void ExpectFramedChunk(const std::string& chunk, uint32_t type);
   void CloseMockDownstream(DownstreamError error);
 
   scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
@@ -325,6 +332,56 @@ TEST_F(GoogleStreamingRemoteEngineTest, Stability) {
   ASSERT_EQ(0U, results_.size());
 }
 
+TEST_F(GoogleStreamingRemoteEngineTest, SendPreamble) {
+  const int kPreambleLength = 100;
+  scoped_refptr<SpeechRecognitionSessionPreamble> preamble =
+      new SpeechRecognitionSessionPreamble();
+  preamble->sample_rate = 16000;
+  preamble->sample_depth = 2;
+  preamble->sample_data = std::string(kPreambleLength, 0);
+  SpeechRecognitionEngine::Config config;
+  config.auth_token = "foo";
+  config.auth_scope = "bar";
+  config.preamble = preamble;
+  engine_under_test_->SetConfig(config);
+
+  StartMockRecognition();
+  ASSERT_TRUE(GetUpstreamFetcher());
+  // First chunk uploaded should be the preamble.
+  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+  std::string chunk = LastUpstreamChunkUploaded();
+  ExpectFramedChunk(chunk, kFrameTypePreamble);
+
+  for (int i = 0; i < 3; ++i) {
+    InjectDummyAudioChunk();
+    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+    chunk = LastUpstreamChunkUploaded();
+    ExpectFramedChunk(chunk, kFrameTypeRecognitionAudio);
+  }
+  engine_under_test_->AudioChunksEnded();
+  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+  // Simulate a protobuf message streamed from the server containing a single
+  // result with one hypotheses.
+  SpeechRecognitionResults results;
+  results.push_back(SpeechRecognitionResult());
+  SpeechRecognitionResult& result = results.back();
+  result.is_provisional = false;
+  result.hypotheses.push_back(
+      SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 1"), 0.1F));
+
+  ProvideMockResultDownstream(result);
+  ExpectResultsReceived(results);
+  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+  // Ensure everything is closed cleanly after the downstream is closed.
+  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
+  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+  EndMockRecognition();
+  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
+  ASSERT_EQ(0U, results_.size());
+}
+
 void GoogleStreamingRemoteEngineTest::SetUp() {
   engine_under_test_.reset(
       new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
@@ -397,6 +454,13 @@ size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
   return new_chunks;
 }
 
+std::string GoogleStreamingRemoteEngineTest::LastUpstreamChunkUploaded() {
+  TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
+  DCHECK(upstream_fetcher);
+  DCHECK(!upstream_fetcher->upload_chunks().empty());
+  return upstream_fetcher->upload_chunks().back();
+}
+
 void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream(
     const proto::SpeechRecognitionEvent& result) {
   TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
@@ -483,6 +547,15 @@ bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
   return true;
 }
 
+void GoogleStreamingRemoteEngineTest::ExpectFramedChunk(
+    const std::string& chunk, uint32_t type) {
+  uint32_t value;
+  base::ReadBigEndian(&chunk[0], &value);
+  EXPECT_EQ(chunk.size() - 8, value);
+  base::ReadBigEndian(&chunk[4], &value);
+  EXPECT_EQ(type, value);
+}
+
 std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
     const proto::SpeechRecognitionEvent& msg) {
   std::string msg_string;
diff --git a/content/browser/speech/speech_recognition_engine.h b/content/browser/speech/speech_recognition_engine.h
index b756833..4f945b5 100644
--- a/content/browser/speech/speech_recognition_engine.h
+++ b/content/browser/speech/speech_recognition_engine.h
@@ -9,6 +9,7 @@
 
 #include "base/basictypes.h"
 #include "content/common/content_export.h"
+#include "content/public/browser/speech_recognition_session_preamble.h"
 #include "content/public/common/speech_recognition_grammar.h"
 #include "content/public/common/speech_recognition_result.h"
 
@@ -61,6 +62,7 @@ class SpeechRecognitionEngine {
     int audio_num_bits_per_sample;
     std::string auth_token;
     std::string auth_scope;
+    scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
   };
 
   virtual ~SpeechRecognitionEngine() {}
diff --git a/content/browser/speech/speech_recognition_manager_impl.cc b/content/browser/speech/speech_recognition_manager_impl.cc
index 24ca6e5..8a4d1b4 100644
--- a/content/browser/speech/speech_recognition_manager_impl.cc
+++ b/content/browser/speech/speech_recognition_manager_impl.cc
@@ -134,6 +134,7 @@ int SpeechRecognitionManagerImpl::CreateSession(
       can_report_metrics ? config.origin_url : std::string();
   remote_engine_config.auth_token = config.auth_token;
   remote_engine_config.auth_scope = config.auth_scope;
+  remote_engine_config.preamble = config.preamble;
 
   SpeechRecognitionEngine* google_remote_engine;
   if (config.is_legacy_api) {