// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_ #include "base/macros.h" #include "base/memory/scoped_ptr.h" #include "content/browser/speech/endpointer/endpointer.h" #include "content/browser/speech/speech_recognition_engine.h" #include "content/browser/speech/speech_recognizer.h" #include "content/public/common/speech_recognition_error.h" #include "content/public/common/speech_recognition_result.h" #include "media/audio/audio_input_controller.h" #include "media/audio/audio_logging.h" #include "net/url_request/url_request_context_getter.h" namespace media { class AudioBus; class AudioManager; } namespace content { class SpeechRecognitionEventListener; // Handles speech recognition for a session (identified by |session_id|), taking // care of audio capture, silence detection/endpointer and interaction with the // SpeechRecognitionEngine. class CONTENT_EXPORT SpeechRecognizerImpl : public SpeechRecognizer, public media::AudioInputController::EventHandler, public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) { public: static const int kAudioSampleRate; static const media::ChannelLayout kChannelLayout; static const int kNumBitsPerAudioSample; static const int kNoSpeechTimeoutMs; static const int kEndpointerEstimationTimeMs; static void SetAudioManagerForTesting(media::AudioManager* audio_manager); SpeechRecognizerImpl(SpeechRecognitionEventListener* listener, int session_id, bool continuous, bool provisional_results, SpeechRecognitionEngine* engine); void StartRecognition(const std::string& device_id) override; void AbortRecognition() override; void StopAudioCapture() override; bool IsActive() const override; bool IsCapturingAudio() const override; const SpeechRecognitionEngine& recognition_engine() const; private: friend class SpeechRecognizerTest; enum FSMState { STATE_IDLE = 0, STATE_STARTING, STATE_ESTIMATING_ENVIRONMENT, STATE_WAITING_FOR_SPEECH, STATE_RECOGNIZING, STATE_WAITING_FINAL_RESULT, STATE_ENDED, STATE_MAX_VALUE = STATE_ENDED }; enum FSMEvent { EVENT_ABORT = 0, EVENT_START, EVENT_STOP_CAPTURE, EVENT_AUDIO_DATA, EVENT_ENGINE_RESULT, EVENT_ENGINE_ERROR, EVENT_AUDIO_ERROR, EVENT_MAX_VALUE = EVENT_AUDIO_ERROR }; struct FSMEventArgs { explicit FSMEventArgs(FSMEvent event_value); FSMEventArgs(const FSMEventArgs& other); ~FSMEventArgs(); FSMEvent event; scoped_refptr audio_data; SpeechRecognitionResults engine_results; SpeechRecognitionError engine_error; }; ~SpeechRecognizerImpl() override; // Entry point for pushing any new external event into the recognizer FSM. void DispatchEvent(const FSMEventArgs& event_args); // Defines the behavior of the recognizer FSM, selecting the appropriate // transition according to the current state and event. FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args); // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc). void ProcessAudioPipeline(const AudioChunk& raw_audio); // The methods below handle transitions of the recognizer FSM. FSMState StartRecording(const FSMEventArgs& event_args); FSMState StartRecognitionEngine(const FSMEventArgs& event_args); FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args); FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args); FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args); FSMState ProcessIntermediateResult(const FSMEventArgs& event_args); FSMState ProcessFinalResult(const FSMEventArgs& event_args); FSMState AbortSilently(const FSMEventArgs& event_args); FSMState AbortWithError(const FSMEventArgs& event_args); FSMState Abort(const SpeechRecognitionError& error); FSMState DetectEndOfSpeech(const FSMEventArgs& event_args); FSMState DoNothing(const FSMEventArgs& event_args) const; FSMState NotFeasible(const FSMEventArgs& event_args); // Returns the time span of captured audio samples since the start of capture. int GetElapsedTimeMs() const; // Calculates the input volume to be displayed in the UI, triggering the // OnAudioLevelsChange event accordingly. void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected); void CloseAudioControllerAsynchronously(); // Callback called on IO thread by audio_controller->Close(). void OnAudioClosed(media::AudioInputController*); // AudioInputController::EventHandler methods. void OnCreated(media::AudioInputController* controller) override {} void OnRecording(media::AudioInputController* controller) override {} void OnError(media::AudioInputController* controller, media::AudioInputController::ErrorCode error_code) override; void OnData(media::AudioInputController* controller, const media::AudioBus* data) override; void OnLog(media::AudioInputController* controller, const std::string& message) override {} // SpeechRecognitionEngineDelegate methods. void OnSpeechRecognitionEngineResults( const SpeechRecognitionResults& results) override; void OnSpeechRecognitionEngineEndOfUtterance() override; void OnSpeechRecognitionEngineError( const SpeechRecognitionError& error) override; static media::AudioManager* audio_manager_for_tests_; scoped_ptr recognition_engine_; Endpointer endpointer_; scoped_refptr audio_controller_; scoped_ptr audio_log_; int num_samples_recorded_; float audio_level_; bool is_dispatching_event_; bool provisional_results_; bool end_of_utterance_; FSMState state_; std::string device_id_; class OnDataConverter; // Converts data between native input format and a WebSpeech specific // output format. scoped_ptr audio_converter_; DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl); }; } // namespace content #endif // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_