// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognizer.h"

#include "base/time.h"
#include "chrome/browser/profiles/profile.h"
#include "content/browser/browser_thread.h"
#include "net/url_request/url_request_context_getter.h"

using media::AudioInputController;
using std::string;

namespace {

// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when new volume is greater than previous level.
const float kUpSmoothingFactor = 1.0f;
// Multiplier used when new volume is lesser than previous level.
const float kDownSmoothingFactor = 0.7f;
// RMS dB value of a maximum (unclipped) sine wave for int16 samples.
const float kAudioMeterMaxDb = 90.31f;
// This value corresponds to RMS dB for int16 with 6 most-significant-bits = 0.
// Values lower than this will display as empty level-meter.
const float kAudioMeterMinDb = 30.0f;
const float kAudioMeterDbRange = kAudioMeterMaxDb - kAudioMeterMinDb;

// Maximum level to draw to display unclipped meter. (1.0f displays clipping.)
const float kAudioMeterRangeMaxUnclipped = 47.0f / 48.0f;

// Returns true if more than 5% of the samples are at min or max value.
bool Clipping(const int16* samples, int num_samples) {
  int clipping_samples = 0;
  const int kThreshold = num_samples / 20;
  for (int i = 0; i < num_samples; ++i) {
    if (samples[i] <= -32767 || samples[i] >= 32767) {
      if (++clipping_samples > kThreshold)
        return true;
    }
  }
  return false;
}

}  // namespace

namespace speech_input {

const int SpeechRecognizer::kAudioSampleRate = 16000;
const int SpeechRecognizer::kAudioPacketIntervalMs = 100;
const ChannelLayout SpeechRecognizer::kChannelLayout = CHANNEL_LAYOUT_MONO;
const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;

SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
                                   int caller_id,
                                   const std::string& language,
                                   const std::string& grammar,
                                   const std::string& hardware_info,
                                   const std::string& origin_url)
    : delegate_(delegate),
      caller_id_(caller_id),
      language_(language),
      grammar_(grammar),
      hardware_info_(hardware_info),
      origin_url_(origin_url),
      codec_(AudioEncoder::CODEC_FLAC),
      encoder_(NULL),
      endpointer_(kAudioSampleRate),
      num_samples_recorded_(0),
      audio_level_(0.0f) {
  endpointer_.set_speech_input_complete_silence_length(
      base::Time::kMicrosecondsPerSecond / 2);
  endpointer_.set_long_speech_input_complete_silence_length(
      base::Time::kMicrosecondsPerSecond);
  endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
  endpointer_.StartSession();
}

SpeechRecognizer::~SpeechRecognizer() {
  // Recording should have stopped earlier due to the endpointer or
  // |StopRecording| being called.
  DCHECK(!audio_controller_.get());
  DCHECK(!request_.get() || !request_->HasPendingRequest());
  DCHECK(!encoder_.get());
  endpointer_.EndSession();
}

bool SpeechRecognizer::StartRecording() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK(!audio_controller_.get());
  DCHECK(!request_.get() || !request_->HasPendingRequest());
  DCHECK(!encoder_.get());

  // The endpointer needs to estimate the environment/background noise before
  // starting to treat the audio as user input. In |HandleOnData| we wait until
  // such time has passed before switching to user input mode.
  endpointer_.SetEnvironmentEstimationMode();

  encoder_.reset(AudioEncoder::Create(codec_, kAudioSampleRate,
                                      kNumBitsPerAudioSample));
  int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
  AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kChannelLayout,
                         kAudioSampleRate, kNumBitsPerAudioSample,
                         samples_per_packet);
  audio_controller_ = AudioInputController::Create(this, params);
  DCHECK(audio_controller_.get());
  VLOG(1) << "SpeechRecognizer starting record.";
  num_samples_recorded_ = 0;
  audio_controller_->Record();

  return true;
}

void SpeechRecognizer::CancelRecognition() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
  DCHECK(audio_controller_.get() || request_.get());

  // Stop recording if required.
  if (audio_controller_.get()) {
    VLOG(1) << "SpeechRecognizer stopping record.";
    audio_controller_->Close();
    audio_controller_ = NULL;  // Releases the ref ptr.
  }

  VLOG(1) << "SpeechRecognizer canceling recognition.";
  encoder_.reset();
  request_.reset();
}

void SpeechRecognizer::StopRecording() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));

  // If audio recording has already stopped and we are in recognition phase,
  // silently ignore any more calls to stop recording.
  if (!audio_controller_.get())
    return;

  VLOG(1) << "SpeechRecognizer stopping record.";
  audio_controller_->Close();
  audio_controller_ = NULL;  // Releases the ref ptr.

  delegate_->DidCompleteRecording(caller_id_);

  // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
  // of silence in case encoder had no data already.
  std::vector<short> samples((kAudioSampleRate * kAudioPacketIntervalMs) /
                             1000);
  encoder_->Encode(&samples[0], samples.size());
  encoder_->Flush();
  string encoded_data;
  encoder_->GetEncodedDataAndClear(&encoded_data);
  DCHECK(!encoded_data.empty());
  encoder_.reset();

  // If we haven't got any audio yet end the recognition sequence here.
  if (request_ == NULL) {
    // Guard against the delegate freeing us until we finish our job.
    scoped_refptr<SpeechRecognizer> me(this);
    delegate_->DidCompleteRecognition(caller_id_);
  } else {
    request_->UploadAudioChunk(encoded_data, true /* is_last_chunk */);
  }
}

// Invoked in the audio thread.
void SpeechRecognizer::OnError(AudioInputController* controller,
                               int error_code) {
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          NewRunnableMethod(this,
                                            &SpeechRecognizer::HandleOnError,
                                            error_code));
}

void SpeechRecognizer::HandleOnError(int error_code) {
  LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;

  // Check if we are still recording before canceling recognition, as
  // recording might have been stopped after this error was posted to the
  // queue by |OnError|.
  if (!audio_controller_.get())
    return;

  InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
}

void SpeechRecognizer::OnData(AudioInputController* controller,
                              const uint8* data, uint32 size) {
  if (size == 0)  // This could happen when recording stops and is normal.
    return;

  string* str_data = new string(reinterpret_cast<const char*>(data), size);
  BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
                          NewRunnableMethod(this,
                                            &SpeechRecognizer::HandleOnData,
                                            str_data));
}

void SpeechRecognizer::HandleOnData(string* data) {
  // Check if we are still recording and if not discard this buffer, as
  // recording might have been stopped after this buffer was posted to the
  // queue by |OnData|.
  if (!audio_controller_.get()) {
    delete data;
    return;
  }

  const short* samples = reinterpret_cast<const short*>(data->data());
  DCHECK((data->length() % sizeof(short)) == 0);
  int num_samples = data->length() / sizeof(short);

  encoder_->Encode(samples, num_samples);
  float rms;
  endpointer_.ProcessAudio(samples, num_samples, &rms);
  bool did_clip = Clipping(samples, num_samples);
  delete data;
  num_samples_recorded_ += num_samples;

  if (request_ == NULL) {
    // This was the first audio packet recorded, so start a request to the
    // server to send the data and inform the delegate.
    delegate_->DidStartReceivingAudio(caller_id_);
    request_.reset(new SpeechRecognitionRequest(
        Profile::GetDefaultRequestContext(), this));
    request_->Start(language_, grammar_, hardware_info_, origin_url_,
                    encoder_->mime_type());
  }

  string encoded_data;
  encoder_->GetEncodedDataAndClear(&encoded_data);
  DCHECK(!encoded_data.empty());
  request_->UploadAudioChunk(encoded_data, false /* is_last_chunk */);

  if (endpointer_.IsEstimatingEnvironment()) {
    // Check if we have gathered enough audio for the endpointer to do
    // environment estimation and should move on to detect speech/end of
    // speech.
    if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
                                  kAudioSampleRate) / 1000) {
      endpointer_.SetUserInputMode();
      delegate_->DidCompleteEnvironmentEstimation(caller_id_);
    }
    return;  // No more processing since we are still estimating environment.
  }

  // Check if we have waited too long without hearing any speech.
  if (!endpointer_.DidStartReceivingSpeech() &&
      num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
    InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
    return;
  }

  // Calculate the input volume to display in the UI, smoothing towards the
  // new level.
  float level = (rms - kAudioMeterMinDb) /
                (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  level = std::min(std::max(0.0f, level), kAudioMeterRangeMaxUnclipped);
  if (level > audio_level_) {
    audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
  } else {
    audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
  }

  float noise_level = (endpointer_.NoiseLevelDb() - kAudioMeterMinDb) /
                      (kAudioMeterDbRange / kAudioMeterRangeMaxUnclipped);
  noise_level = std::min(std::max(0.0f, noise_level),
                         kAudioMeterRangeMaxUnclipped);

  delegate_->SetInputVolume(caller_id_, did_clip ? 1.0f : audio_level_,
                            noise_level);

  if (endpointer_.speech_input_complete()) {
    StopRecording();
  }

  // TODO(satish): Once we have streaming POST, start sending the data received
  // here as POST chunks.
}

void SpeechRecognizer::SetRecognitionResult(
    bool error, const SpeechInputResultArray& result) {
  if (error || result.empty()) {
    InformErrorAndCancelRecognition(error ? RECOGNIZER_ERROR_NETWORK
                                          : RECOGNIZER_ERROR_NO_RESULTS);
    return;
  }

  delegate_->SetRecognitionResult(caller_id_, error, result);

  // Guard against the delegate freeing us until we finish our job.
  scoped_refptr<SpeechRecognizer> me(this);
  delegate_->DidCompleteRecognition(caller_id_);
}

void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
  CancelRecognition();

  // Guard against the delegate freeing us until we finish our job.
  scoped_refptr<SpeechRecognizer> me(this);
  delegate_->OnRecognizerError(caller_id_, error);
}

}  // namespace speech_input