author     amistry <amistry@chromium.org>          2014-12-15 18:10:00 -0800
committer  Commit bot <commit-bot@chromium.org>    2014-12-16 02:10:31 +0000
commit     77b87626cc6b2a9f4d9b2b5b4f23eab33793fb65
tree       b3a93e1fec64ed5914c8ed3749be7a379e4ae9c7 /content/browser/speech
parent     388679724f9abba959ba70a600cb5946b7be9249
Implement support for uploading preamble audio in the voice recognition service.
The preamble is the few seconds of audio before starting voice recognition. In
particular, it contains the 'Ok Google' hotword used to start a voice query.
The 'Ok Google' hotword is the only scenario where this is recorded, and only
if the user has audio history enabled.
To support this, a new framed data format has been introduced for the /up POST
request. This change switches to that new format only if the recognition
request contains a preamble. The preamble is only uploaded when authentication
parameters are available.
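
Concretely, each chunk in the framed format is an 8-byte header followed by the
encoded audio payload: a 4-byte big-endian payload length and a 4-byte
big-endian frame type (0 = preamble audio, 1 = recognition audio), as
implemented by UploadAudioChunk() in the diff below. A minimal standalone
sketch of the framing (plain C++; the helper names WriteBigEndian32 and
FrameAudioChunk are illustrative only, the real code uses base::WriteBigEndian):

  #include <cstdint>
  #include <string>

  // Frame type values, matching the FrameType enum added to
  // GoogleStreamingRemoteEngine in this change.
  const uint32_t kFramePreambleAudio = 0;     // preamble ('Ok Google') audio
  const uint32_t kFrameRecognitionAudio = 1;  // regular recognition audio

  // Appends |value| to |out| in big-endian byte order.
  void WriteBigEndian32(uint32_t value, std::string* out) {
    out->push_back(static_cast<char>((value >> 24) & 0xFF));
    out->push_back(static_cast<char>((value >> 16) & 0xFF));
    out->push_back(static_cast<char>((value >> 8) & 0xFF));
    out->push_back(static_cast<char>(value & 0xFF));
  }

  // Wraps one encoded audio chunk in the framed POST format:
  // [4-byte payload size][4-byte frame type][payload bytes].
  std::string FrameAudioChunk(const std::string& payload, uint32_t type) {
    std::string frame;
    frame.reserve(payload.size() + 8);
    WriteBigEndian32(static_cast<uint32_t>(payload.size()), &frame);
    WriteBigEndian32(type, &frame);
    frame += payload;
    return frame;
  }

When no preamble is present, the request falls back to the existing unframed
upload, appending encoded audio chunks to the POST body unchanged.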
BUG=397019
Review URL: https://codereview.chromium.org/797913002
Cr-Commit-Position: refs/heads/master@{#308501}
Diffstat (limited to 'content/browser/speech')
5 files changed, 146 insertions, 3 deletions
diff --git a/content/browser/speech/google_streaming_remote_engine.cc b/content/browser/speech/google_streaming_remote_engine.cc
index ee84444..ca8e906 100644
--- a/content/browser/speech/google_streaming_remote_engine.cc
+++ b/content/browser/speech/google_streaming_remote_engine.cc
@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <vector>
 
+#include "base/big_endian.h"
 #include "base/bind.h"
 #include "base/rand_util.h"
 #include "base/strings/string_number_conversions.h"
@@ -82,6 +83,7 @@ GoogleStreamingRemoteEngine::GoogleStreamingRemoteEngine(
       previous_response_length_(0),
       got_last_definitive_result_(false),
       is_dispatching_event_(false),
+      use_framed_post_data_(false),
       state_(STATE_IDLE) {}
 
 GoogleStreamingRemoteEngine::~GoogleStreamingRemoteEngine() {}
@@ -298,6 +300,18 @@ GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
   DCHECK(encoder_.get());
   const std::string request_key = GenerateRequestKey();
 
+  // Only use the framed post data format when a preamble needs to be logged.
+  use_framed_post_data_ = (config_.preamble &&
+                           !config_.preamble->sample_data.empty() &&
+                           !config_.auth_token.empty() &&
+                           !config_.auth_scope.empty());
+  if (use_framed_post_data_) {
+    preamble_encoder_.reset(AudioEncoder::Create(
+        kDefaultAudioCodec,
+        config_.preamble->sample_rate,
+        config_.preamble->sample_depth * 8));
+  }
+
   // Setup downstream fetcher.
   std::vector<std::string> downstream_args;
   downstream_args.push_back(
@@ -349,13 +363,24 @@ GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
     upstream_args.push_back(
         "authToken=" + net::EscapeQueryParamValue(config_.auth_token, true));
   }
+  if (use_framed_post_data_) {
+    std::string audio_format;
+    if (preamble_encoder_)
+      audio_format = preamble_encoder_->mime_type() + ",";
+    audio_format += encoder_->mime_type();
+    upstream_args.push_back(
+        "audioFormat=" + net::EscapeQueryParamValue(audio_format, true));
+  }
   GURL upstream_url(std::string(kWebServiceBaseUrl) +
                     std::string(kUpstreamUrl) +
                     JoinString(upstream_args, '&'));
 
   upstream_fetcher_.reset(URLFetcher::Create(
       kUpstreamUrlFetcherIdForTesting, upstream_url, URLFetcher::POST, this));
-  upstream_fetcher_->SetChunkedUpload(encoder_->mime_type());
+  if (use_framed_post_data_)
+    upstream_fetcher_->SetChunkedUpload("application/octet-stream");
+  else
+    upstream_fetcher_->SetChunkedUpload(encoder_->mime_type());
   upstream_fetcher_->SetRequestContext(url_context_.get());
   upstream_fetcher_->SetReferrer(config_.origin_url);
   upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
@@ -363,6 +388,19 @@ GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
                                   net::LOAD_DO_NOT_SEND_AUTH_DATA);
   upstream_fetcher_->Start();
   previous_response_length_ = 0;
+
+  if (preamble_encoder_) {
+    // Encode and send preamble right away.
+    scoped_refptr<AudioChunk> chunk = new AudioChunk(
+        reinterpret_cast<const uint8*>(config_.preamble->sample_data.data()),
+        config_.preamble->sample_data.size(),
+        config_.preamble->sample_depth);
+    preamble_encoder_->Encode(*chunk);
+    preamble_encoder_->Flush();
+    scoped_refptr<AudioChunk> encoded_data(
+        preamble_encoder_->GetEncodedDataAndClear());
+    UploadAudioChunk(encoded_data->AsString(), FRAME_PREAMBLE_AUDIO, false);
+  }
   return STATE_BOTH_STREAMS_CONNECTED;
 }
 
@@ -376,7 +414,7 @@ GoogleStreamingRemoteEngine::TransmitAudioUpstream(
   DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
   encoder_->Encode(audio);
   scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
-  upstream_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
+  UploadAudioChunk(encoded_data->AsString(), FRAME_RECOGNITION_AUDIO, false);
   return state_;
 }
 
@@ -488,7 +526,9 @@ GoogleStreamingRemoteEngine::CloseUpstreamAndWaitForResults(
   DCHECK(!encoded_dummy_data->IsEmpty());
   encoder_.reset();
 
-  upstream_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
+  UploadAudioChunk(encoded_dummy_data->AsString(),
+                   FRAME_RECOGNITION_AUDIO,
+                   true);
   got_last_definitive_result_ = false;
   return STATE_WAITING_DOWNSTREAM_RESULTS;
 }
@@ -576,6 +616,20 @@ std::string GoogleStreamingRemoteEngine::GenerateRequestKey() const {
   return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
 }
 
+void GoogleStreamingRemoteEngine::UploadAudioChunk(const std::string& data,
+                                                   FrameType type,
+                                                   bool is_final) {
+  if (use_framed_post_data_) {
+    std::string frame(data.size() + 8, 0);
+    base::WriteBigEndian(&frame[0], static_cast<uint32_t>(data.size()));
+    base::WriteBigEndian(&frame[4], static_cast<uint32_t>(type));
+    frame.replace(8, data.size(), data);
+    upstream_fetcher_->AppendChunkToUpload(frame, is_final);
+  } else {
+    upstream_fetcher_->AppendChunkToUpload(data, is_final);
+  }
+}
+
 GoogleStreamingRemoteEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
     : event(event_value) {
 }
diff --git a/content/browser/speech/google_streaming_remote_engine.h b/content/browser/speech/google_streaming_remote_engine.h
index 961ef08..15b866b 100644
--- a/content/browser/speech/google_streaming_remote_engine.h
+++ b/content/browser/speech/google_streaming_remote_engine.h
@@ -79,6 +79,13 @@ class CONTENT_EXPORT GoogleStreamingRemoteEngine
   static const int kWebserviceStatusNoError;
   static const int kWebserviceStatusErrorNoMatch;
 
+  // Frame type for framed POST data. Do NOT change these. They must match
+  // values the server expects.
+  enum FrameType {
+    FRAME_PREAMBLE_AUDIO = 0,
+    FRAME_RECOGNITION_AUDIO = 1
+  };
+
   // Data types for the internal Finite State Machine (FSM).
   enum FSMState {
     STATE_IDLE = 0,
@@ -143,15 +150,21 @@ class CONTENT_EXPORT GoogleStreamingRemoteEngine
   std::string GetAcceptedLanguages() const;
   std::string GenerateRequestKey() const;
 
+  // Upload a single chunk of audio data. Handles both unframed and framed
+  // upload formats, and uses the appropriate one.
+  void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);
+
   SpeechRecognitionEngineConfig config_;
   scoped_ptr<net::URLFetcher> upstream_fetcher_;
   scoped_ptr<net::URLFetcher> downstream_fetcher_;
   scoped_refptr<net::URLRequestContextGetter> url_context_;
   scoped_ptr<AudioEncoder> encoder_;
+  scoped_ptr<AudioEncoder> preamble_encoder_;
   ChunkedByteBuffer chunked_byte_buffer_;
   size_t previous_response_length_;
   bool got_last_definitive_result_;
   bool is_dispatching_event_;
+  bool use_framed_post_data_;
   FSMState state_;
 
   DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine);
diff --git a/content/browser/speech/google_streaming_remote_engine_unittest.cc b/content/browser/speech/google_streaming_remote_engine_unittest.cc
index 57947e6..0f17fa0 100644
--- a/content/browser/speech/google_streaming_remote_engine_unittest.cc
+++ b/content/browser/speech/google_streaming_remote_engine_unittest.cc
@@ -4,6 +4,7 @@
 
 #include <queue>
 
+#include "base/big_endian.h"
 #include "base/memory/scoped_ptr.h"
 #include "base/message_loop/message_loop.h"
 #include "base/numerics/safe_conversions.h"
@@ -27,6 +28,10 @@ using net::TestURLFetcherFactory;
 
 namespace content {
 
+// Frame types for framed POST data.
+static const uint32_t kFrameTypePreamble = 0;
+static const uint32_t kFrameTypeRecognitionAudio = 1;
+
 // Note: the terms upstream and downstream are from the point-of-view of the
 // client (engine_under_test_).
 
@@ -73,10 +78,12 @@ class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate,
   void EndMockRecognition();
   void InjectDummyAudioChunk();
   size_t UpstreamChunksUploadedFromLastCall();
+  std::string LastUpstreamChunkUploaded();
   void ProvideMockProtoResultDownstream(
       const proto::SpeechRecognitionEvent& result);
   void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
   void ExpectResultsReceived(const SpeechRecognitionResults& result);
+  void ExpectFramedChunk(const std::string& chunk, uint32_t type);
   void CloseMockDownstream(DownstreamError error);
 
   scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
@@ -325,6 +332,56 @@ TEST_F(GoogleStreamingRemoteEngineTest, Stability) {
   ASSERT_EQ(0U, results_.size());
 }
 
+TEST_F(GoogleStreamingRemoteEngineTest, SendPreamble) {
+  const int kPreambleLength = 100;
+  scoped_refptr<SpeechRecognitionSessionPreamble> preamble =
+      new SpeechRecognitionSessionPreamble();
+  preamble->sample_rate = 16000;
+  preamble->sample_depth = 2;
+  preamble->sample_data = std::string(kPreambleLength, 0);
+  SpeechRecognitionEngine::Config config;
+  config.auth_token = "foo";
+  config.auth_scope = "bar";
+  config.preamble = preamble;
+  engine_under_test_->SetConfig(config);
+
+  StartMockRecognition();
+  ASSERT_TRUE(GetUpstreamFetcher());
+  // First chunk uploaded should be the preamble.
+  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+  std::string chunk = LastUpstreamChunkUploaded();
+  ExpectFramedChunk(chunk, kFrameTypePreamble);
+
+  for (int i = 0; i < 3; ++i) {
+    InjectDummyAudioChunk();
+    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
+    chunk = LastUpstreamChunkUploaded();
+    ExpectFramedChunk(chunk, kFrameTypeRecognitionAudio);
+  }
+  engine_under_test_->AudioChunksEnded();
+  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+  // Simulate a protobuf message streamed from the server containing a single
+  // result with one hypotheses.
+  SpeechRecognitionResults results;
+  results.push_back(SpeechRecognitionResult());
+  SpeechRecognitionResult& result = results.back();
+  result.is_provisional = false;
+  result.hypotheses.push_back(
+      SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 1"), 0.1F));
+
+  ProvideMockResultDownstream(result);
+  ExpectResultsReceived(results);
+  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
+
+  // Ensure everything is closed cleanly after the downstream is closed.
+  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
+  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
+  EndMockRecognition();
+  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
+  ASSERT_EQ(0U, results_.size());
+}
+
 void GoogleStreamingRemoteEngineTest::SetUp() {
   engine_under_test_.reset(
       new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
@@ -397,6 +454,13 @@ size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
   return new_chunks;
 }
 
+std::string GoogleStreamingRemoteEngineTest::LastUpstreamChunkUploaded() {
+  TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
+  DCHECK(upstream_fetcher);
+  DCHECK(!upstream_fetcher->upload_chunks().empty());
+  return upstream_fetcher->upload_chunks().back();
+}
+
 void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream(
     const proto::SpeechRecognitionEvent& result) {
   TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
@@ -483,6 +547,15 @@ bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
   return true;
 }
 
+void GoogleStreamingRemoteEngineTest::ExpectFramedChunk(
+    const std::string& chunk, uint32_t type) {
+  uint32_t value;
+  base::ReadBigEndian(&chunk[0], &value);
+  EXPECT_EQ(chunk.size() - 8, value);
+  base::ReadBigEndian(&chunk[4], &value);
+  EXPECT_EQ(type, value);
+}
+
 std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
     const proto::SpeechRecognitionEvent& msg) {
   std::string msg_string;
diff --git a/content/browser/speech/speech_recognition_engine.h b/content/browser/speech/speech_recognition_engine.h
index b756833..4f945b5 100644
--- a/content/browser/speech/speech_recognition_engine.h
+++ b/content/browser/speech/speech_recognition_engine.h
@@ -9,6 +9,7 @@
 
 #include "base/basictypes.h"
 #include "content/common/content_export.h"
+#include "content/public/browser/speech_recognition_session_preamble.h"
 #include "content/public/common/speech_recognition_grammar.h"
 #include "content/public/common/speech_recognition_result.h"
 
@@ -61,6 +62,7 @@ class SpeechRecognitionEngine {
     int audio_num_bits_per_sample;
     std::string auth_token;
     std::string auth_scope;
+    scoped_refptr<SpeechRecognitionSessionPreamble> preamble;
   };
 
   virtual ~SpeechRecognitionEngine() {}
diff --git a/content/browser/speech/speech_recognition_manager_impl.cc b/content/browser/speech/speech_recognition_manager_impl.cc
index 24ca6e5..8a4d1b4 100644
--- a/content/browser/speech/speech_recognition_manager_impl.cc
+++ b/content/browser/speech/speech_recognition_manager_impl.cc
@@ -134,6 +134,7 @@ int SpeechRecognitionManagerImpl::CreateSession(
       can_report_metrics ? config.origin_url : std::string();
   remote_engine_config.auth_token = config.auth_token;
   remote_engine_config.auth_scope = config.auth_scope;
+  remote_engine_config.preamble = config.preamble;
 
   SpeechRecognitionEngine* google_remote_engine;
   if (config.is_legacy_api) {