// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include <queue> #include "base/memory/scoped_ptr.h" #include "base/message_loop.h" #include "base/utf_string_conversions.h" #include "content/browser/speech/audio_buffer.h" #include "content/browser/speech/google_streaming_remote_engine.h" #include "content/browser/speech/proto/google_streaming_api.pb.h" #include "content/public/common/speech_recognition_error.h" #include "content/public/common/speech_recognition_result.h" #include "net/url_request/test_url_fetcher_factory.h" #include "net/url_request/url_request_context_getter.h" #include "net/url_request/url_request_status.h" #include "testing/gtest/include/gtest/gtest.h" using content::SpeechRecognitionHypothesis; using content::SpeechRecognitionResult; using net::URLRequestStatus; using net::TestURLFetcher; using net::TestURLFetcherFactory; namespace speech { // Note: the terms upstream and downstream are herein referring to the client // (engine_under_test_) viewpoint. class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate, public testing::Test { public: GoogleStreamingRemoteEngineTest() : last_number_of_upstream_chunks_seen_(0U), error_(content::SPEECH_RECOGNITION_ERROR_NONE) { } // Creates a speech recognition request and invokes its URL fetcher delegate // with the given test data. void CreateAndTestRequest(bool success, const std::string& http_response); // SpeechRecognitionRequestDelegate methods. virtual void OnSpeechRecognitionEngineResult( const SpeechRecognitionResult& result) OVERRIDE { results_.push(result); } virtual void OnSpeechRecognitionEngineError( const content::SpeechRecognitionError& error) OVERRIDE { error_ = error.code; } // testing::Test methods. virtual void SetUp() OVERRIDE; virtual void TearDown() OVERRIDE; protected: enum DownstreamError { DOWNSTREAM_ERROR_NONE, DOWNSTREAM_ERROR_HTTP500, DOWNSTREAM_ERROR_NETWORK, DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH }; static bool ResultsAreEqual(const SpeechRecognitionResult& a, const SpeechRecognitionResult& b); static std::string SerializeProtobufResponse(const HttpStreamingResult& msg); static std::string ToBigEndian32(uint32 value); TestURLFetcher* GetUpstreamFetcher(); TestURLFetcher* GetDownstreamFetcher(); void StartMockRecognition(); void EndMockRecognition(); void InjectDummyAudioChunk(); size_t UpstreamChunksUploadedFromLastCall(); void ProvideMockResultDownstream(const SpeechRecognitionResult& result); void ExpectResultReceived(const SpeechRecognitionResult& result); void CloseMockDownstream(DownstreamError error); scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_; TestURLFetcherFactory url_fetcher_factory_; size_t last_number_of_upstream_chunks_seen_; MessageLoop message_loop_; std::string response_buffer_; content::SpeechRecognitionErrorCode error_; std::queue<SpeechRecognitionResult> results_; }; TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) { StartMockRecognition(); ASSERT_TRUE(GetUpstreamFetcher()); ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); // Inject some dummy audio chunks and check a corresponding chunked upload // is performed every time on the server. for (int i = 0; i < 3; ++i) { InjectDummyAudioChunk(); ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); } // Ensure that a final (empty) audio chunk is uploaded on chunks end. engine_under_test_->AudioChunksEnded(); ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); // Simulate a protobuf message streamed from the server containing a single // result with two hypotheses. SpeechRecognitionResult result; result.is_provisional = false; result.hypotheses.push_back( SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 1"), 0.1F)); result.hypotheses.push_back( SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 2"), 0.2F)); ProvideMockResultDownstream(result); ExpectResultReceived(result); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); // Ensure everything is closed cleanly after the downstream is closed. CloseMockDownstream(DOWNSTREAM_ERROR_NONE); ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); EndMockRecognition(); ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); ASSERT_EQ(0U, results_.size()); } TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) { StartMockRecognition(); ASSERT_TRUE(GetUpstreamFetcher()); ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); for (int i = 0; i < 4; ++i) { InjectDummyAudioChunk(); ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); SpeechRecognitionResult result; result.is_provisional = (i % 2 == 0); // Alternate result types. float confidence = result.is_provisional ? 0.0F : (i * 0.1F); result.hypotheses.push_back( SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), confidence)); ProvideMockResultDownstream(result); ExpectResultReceived(result); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); } // Ensure that a final (empty) audio chunk is uploaded on chunks end. engine_under_test_->AudioChunksEnded(); ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); // Simulate a final definitive result. SpeechRecognitionResult result; result.is_provisional = false; result.hypotheses.push_back( SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 1.0F)); ProvideMockResultDownstream(result); ExpectResultReceived(result); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); // Ensure everything is closed cleanly after the downstream is closed. CloseMockDownstream(DOWNSTREAM_ERROR_NONE); ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); EndMockRecognition(); ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); ASSERT_EQ(0U, results_.size()); } TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) { StartMockRecognition(); ASSERT_TRUE(GetUpstreamFetcher()); ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); // Simulate one pushed audio chunk. InjectDummyAudioChunk(); ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); // Simulate the corresponding definitive result. SpeechRecognitionResult result; result.hypotheses.push_back( SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), 1.0F)); ProvideMockResultDownstream(result); ExpectResultReceived(result); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); // Simulate a silent downstream closure after |AudioChunksEnded|. engine_under_test_->AudioChunksEnded(); ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); CloseMockDownstream(DOWNSTREAM_ERROR_NONE); // Expect an empty result, aimed at notifying recognition ended with no // actual results nor errors. SpeechRecognitionResult empty_result; ExpectResultReceived(empty_result); // Ensure everything is closed cleanly after the downstream is closed. ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); EndMockRecognition(); ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NONE, error_); ASSERT_EQ(0U, results_.size()); } TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) { StartMockRecognition(); ASSERT_TRUE(GetUpstreamFetcher()); ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); for (int i = 0; i < 3; ++i) InjectDummyAudioChunk(); engine_under_test_->AudioChunksEnded(); ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall()); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); // Simulate only a provisional result. SpeechRecognitionResult result; result.is_provisional = true; result.hypotheses.push_back( SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 0.0F)); ProvideMockResultDownstream(result); ExpectResultReceived(result); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH); // Expect a SPEECH_RECOGNITION_ERROR_NO_MATCH error to be raised. ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); EndMockRecognition(); ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NO_MATCH, error_); ASSERT_EQ(0U, results_.size()); } TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) { StartMockRecognition(); ASSERT_TRUE(GetUpstreamFetcher()); ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); InjectDummyAudioChunk(); ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); // Close the downstream with a HTTP 500 error. CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500); // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised. ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); EndMockRecognition(); ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_); ASSERT_EQ(0U, results_.size()); } TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) { StartMockRecognition(); ASSERT_TRUE(GetUpstreamFetcher()); ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall()); InjectDummyAudioChunk(); ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall()); // Close the downstream fetcher simulating a network failure. CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK); // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised. ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); EndMockRecognition(); ASSERT_EQ(content::SPEECH_RECOGNITION_ERROR_NETWORK, error_); ASSERT_EQ(0U, results_.size()); } void GoogleStreamingRemoteEngineTest::SetUp() { engine_under_test_.reset( new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/)); engine_under_test_->set_delegate(this); } void GoogleStreamingRemoteEngineTest::TearDown() { engine_under_test_.reset(); } TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() { return url_fetcher_factory_.GetFetcherByID( GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTests); } TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() { return url_fetcher_factory_.GetFetcherByID( GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTests); } // Starts recognition on the engine, ensuring that both stream fetchers are // created. void GoogleStreamingRemoteEngineTest::StartMockRecognition() { DCHECK(engine_under_test_.get()); ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); engine_under_test_->StartRecognition(); ASSERT_TRUE(engine_under_test_->IsRecognitionPending()); TestURLFetcher* upstream_fetcher = GetUpstreamFetcher(); ASSERT_TRUE(upstream_fetcher); upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL()); TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); ASSERT_TRUE(downstream_fetcher); downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL()); } void GoogleStreamingRemoteEngineTest::EndMockRecognition() { DCHECK(engine_under_test_.get()); engine_under_test_->EndRecognition(); ASSERT_FALSE(engine_under_test_->IsRecognitionPending()); // TODO(primiano): In order to be very pedantic we should check that both the // upstream and downstream URL fetchers have been disposed at this time. // Unfortunately it seems that there is no direct way to detect (in tests) // if a url_fetcher has been freed or not, since they are not automatically // de-registered from the TestURLFetcherFactory on destruction. } void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() { unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'}; scoped_refptr<AudioChunk> dummy_audio_chunk( new AudioChunk(&dummy_audio_buffer_data[0], sizeof(dummy_audio_buffer_data), 2 /* bytes per sample */)); DCHECK(engine_under_test_.get()); engine_under_test_->TakeAudioChunk(*dummy_audio_chunk); } size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() { TestURLFetcher* upstream_fetcher = GetUpstreamFetcher(); DCHECK(upstream_fetcher); const size_t number_of_chunks = upstream_fetcher->upload_chunks().size(); DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_); const size_t new_chunks = number_of_chunks - last_number_of_upstream_chunks_seen_; last_number_of_upstream_chunks_seen_ = number_of_chunks; return new_chunks; } void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream( const SpeechRecognitionResult& result) { TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); ASSERT_TRUE(downstream_fetcher); downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */)); downstream_fetcher->set_response_code(200); HttpStreamingResult response; if (result.is_provisional) { DCHECK_EQ(result.hypotheses.size(), 1U); const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[0]; response.set_provisional(UTF16ToUTF8(hypothesis.utterance)); } else { response.set_status(GoogleStreamingRemoteEngine::kWebserviceStatusNoError); for (size_t i = 0; i < result.hypotheses.size(); ++i) { const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i]; HttpStreamingHypothesis* ws_hypothesis = response.add_hypotheses(); ws_hypothesis->set_confidence(hypothesis.confidence); ws_hypothesis->set_utterance(UTF16ToUTF8(hypothesis.utterance)); } } std::string response_string = SerializeProtobufResponse(response); response_buffer_.append(response_string); downstream_fetcher->SetResponseString(response_buffer_); downstream_fetcher->delegate()->OnURLFetchDownloadProgress( downstream_fetcher, response_buffer_.size(), -1 /* total response length not used */); } void GoogleStreamingRemoteEngineTest::CloseMockDownstream( DownstreamError error) { TestURLFetcher* downstream_fetcher = GetDownstreamFetcher(); ASSERT_TRUE(downstream_fetcher); const URLRequestStatus::Status fetcher_status = (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED : URLRequestStatus::SUCCESS; downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0)); downstream_fetcher->set_response_code( (error == DOWNSTREAM_ERROR_HTTP500) ? 500 : 200); if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) { HttpStreamingResult response; response.set_status( GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch); response_buffer_.append(SerializeProtobufResponse(response)); } downstream_fetcher->SetResponseString(response_buffer_); downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher); } void GoogleStreamingRemoteEngineTest::ExpectResultReceived( const SpeechRecognitionResult& result) { ASSERT_GE(1U, results_.size()); ASSERT_TRUE(ResultsAreEqual(result, results_.front())); results_.pop(); } bool GoogleStreamingRemoteEngineTest::ResultsAreEqual( const SpeechRecognitionResult& a, const SpeechRecognitionResult& b) { if (a.is_provisional != b.is_provisional || a.hypotheses.size() != b.hypotheses.size()) { return false; } for (size_t i = 0; i < a.hypotheses.size(); ++i) { const SpeechRecognitionHypothesis& hyp_a = a.hypotheses[i]; const SpeechRecognitionHypothesis& hyp_b = b.hypotheses[i]; if (hyp_a.utterance != hyp_b.utterance || hyp_a.confidence != hyp_b.confidence) { return false; } } return true; } std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse( const HttpStreamingResult& msg) { std::string response_string; msg.SerializeToString(&response_string); // Append 4 byte prefix length indication to the protobuf message as envisaged // by the google streaming recognition webservice protocol. response_string.insert(0, ToBigEndian32(response_string.size())); return response_string; } std::string GoogleStreamingRemoteEngineTest::ToBigEndian32(uint32 value) { char raw_data[4]; raw_data[0] = static_cast<uint8>((value >> 24) & 0xFF); raw_data[1] = static_cast<uint8>((value >> 16) & 0xFF); raw_data[2] = static_cast<uint8>((value >> 8) & 0xFF); raw_data[3] = static_cast<uint8>(value & 0xFF); return std::string(raw_data, sizeof(raw_data)); } } // namespace speech