// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/spellcheck_worditerator.h"

#include <map>
#include <string>

#include "base/basictypes.h"
#include "base/string_util.h"

#include "third_party/icu38/public/common/unicode/uchar.h"
#include "third_party/icu38/public/common/unicode/unorm.h"
#include "third_party/icu38/public/common/unicode/uscript.h"
#include "third_party/icu38/public/common/unicode/uset.h"
#include "third_party/icu38/public/i18n/unicode/ulocdata.h"

// Builds the default character attributes: a script table with every entry
// cleared (and USCRIPT_COMMON explicitly marked as a non-word script) plus the
// table of characters that may appear inside a contraction.
SpellcheckCharAttribute::SpellcheckCharAttribute() {
  // Code points of the 'MidLetter' and 'MidNumLet' entries of the word-break
  // property list provided by Unicode, Inc.:
  //   http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
  static const UChar32 kContractionChars[] = {
      0x003A,  // MidLetter # COLON
      0x00B7,  // MidLetter # MIDDLE DOT
      0x0387,  // MidLetter # GREEK ANO TELEIA
      0x05F4,  // MidLetter # HEBREW PUNCTUATION GERSHAYIM
      0x2027,  // MidLetter # HYPHENATION POINT
      0xFE13,  // MidLetter # PRESENTATION FORM FOR VERTICAL COLON
      0xFE55,  // MidLetter # SMALL COLON
      0xFF1A,  // MidLetter # FULLWIDTH COLON
      0x0027,  // MidNumLet # APOSTROPHE
      0x002E,  // MidNumLet # FULL STOP
      0x2018,  // MidNumLet # LEFT SINGLE QUOTATION MARK
      0x2019,  // MidNumLet # RIGHT SINGLE QUOTATION MARK
      0x2024,  // MidNumLet # ONE DOT LEADER
      0xFE52,  // MidNumLet # SMALL FULL STOP
      0xFF07,  // MidNumLet # FULLWIDTH APOSTROPHE
      0xFF0E,  // MidNumLet # FULLWIDTH FULL STOP
  };

  InitializeScriptTable();

  // Many dictionaries treat numbers and contractions as words and treat
  // USCRIPT_COMMON characters as word characters. The SpellcheckWordIterator
  // class nevertheless treats USCRIPT_COMMON characters as non-word
  // characters so that contraction characters stay strictly distinguishable
  // from word characters.
  SetWordScript(USCRIPT_COMMON, false);

  // Register every listed code point as a contraction character.
  const UChar32* const table_end =
      kContractionChars + arraysize(kContractionChars);
  for (const UChar32* entry = kContractionChars; entry != table_end; ++entry)
    middle_letters_[*entry] = true;
}

// Nothing to release explicitly; the body is intentionally empty.
SpellcheckCharAttribute::~SpellcheckCharAttribute() {}

// Sets the default language for this object.
// This function retrieves the exemplar set to set up the default character
// attributes.
void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {
  // Retrieves the locale data of the given language.
  std::string language_encoded;
  WideToCodepage(language, "us-ascii", OnStringUtilConversionError::SKIP,
                 &language_encoded);
  UErrorCode status = U_ZERO_ERROR;
  ULocaleData* locale_data = ulocdata_open(language_encoded.c_str(), &status);
  if (U_FAILURE(status))
    return;

  // Retrieves the exemplar set of the given language and update the
  // character-attribute table to treat its characters as word characters.
  USet* exemplar_set = uset_open(1, 0);
  ulocdata_getExemplarSet(locale_data, exemplar_set, 0, ULOCDATA_ES_STANDARD,
                          &status);
  ulocdata_close(locale_data);
  if (U_SUCCESS(status)) {
    int length = uset_size(exemplar_set);
    for (int i = 0; i < length; ++i) {
      UChar32 character = uset_charAt(exemplar_set, i);
      SetWordScript(GetScriptCode(character), true);
    }
  }
  uset_close(exemplar_set);
}

// Returns true when |character| belongs to a script flagged as a word script
// and is not a digit according to u_isdigit().
bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
  // A character whose script is not in use can never be a word character.
  if (!IsWordScript(GetScriptCode(character)))
    return false;
  // Digits are excluded even when their script is in use.
  return !u_isdigit(character);
}

// Returns true when |character| is registered in |middle_letters_| with a
// true value, i.e. it is a character used by contractions.
bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
  const std::map<UChar32, bool>::const_iterator entry =
      middle_letters_.find(character);
  // Characters missing from the table are not contraction characters.
  return entry != middle_letters_.end() && entry->second;
}

// Resets the script table so that no script is treated as a word script.
void SpellcheckCharAttribute::InitializeScriptTable() {
  const size_t table_size = arraysize(script_attributes_);
  size_t slot = 0;
  while (slot < table_size) {
    script_attributes_[slot] = false;
    ++slot;
  }
}

// Looks up the ICU script code of |character| through uscript_getScript().
// Returns USCRIPT_INVALID_CODE when ICU reports a failure.
UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
  UErrorCode icu_status = U_ZERO_ERROR;
  const UScriptCode script = uscript_getScript(character, &icu_status);
  if (U_FAILURE(icu_status))
    return USCRIPT_INVALID_CODE;
  return script;
}

// Records whether the script identified by |script_code| is used by the
// selected dictionary. Codes that are negative or beyond the end of the
// script table are silently ignored.
void SpellcheckCharAttribute::SetWordScript(const int script_code,
                                            bool in_use) {
  const bool in_table =
      script_code >= 0 &&
      static_cast<size_t>(script_code) < arraysize(script_attributes_);
  if (in_table)
    script_attributes_[script_code] = in_use;
}

// Reports whether the script identified by |script_code| is used by the
// selected dictionary. Codes that are negative or beyond the end of the
// script table are reported as not in use.
bool SpellcheckCharAttribute::IsWordScript(
    const UScriptCode script_code) const {
  const bool in_table =
      script_code >= 0 &&
      static_cast<size_t>(script_code) < arraysize(script_attributes_);
  return in_table && script_attributes_[script_code];
}

// Creates an iterator with no text and no character attributes attached;
// Initialize() supplies both.
SpellcheckWordIterator::SpellcheckWordIterator()
    : word_(NULL),
      length_(0),
      position_(0),
      allow_contraction_(false),
      attribute_(NULL) {}

// Nothing to release explicitly; the body is intentionally empty.
SpellcheckWordIterator::~SpellcheckWordIterator() {}

// Attaches this iterator to the text |word| of |length| UTF-16 code units and
// rewinds it to the beginning of that text.
// |attribute| supplies the word/contraction character classification and
// |allow_contraction| selects whether contraction characters may be part of
// a segment. Only the pointers are stored; nothing is copied.
void SpellcheckWordIterator::Initialize(
    const SpellcheckCharAttribute* attribute,
    const char16* word,
    size_t length,
    bool allow_contraction) {
  // Classification settings.
  attribute_ = attribute;
  allow_contraction_ = allow_contraction;

  // Text to iterate over, starting from its first code unit.
  word_ = word;
  length_ = static_cast<int>(length);
  position_ = 0;
}

// Retrieves a word (or a contraction).
// When a contraction is enclosed with contraction characters (e.g. 'isn't',
// 'rock'n'roll'), we should discard the beginning and the end of the
// contraction but we should never split the contraction.
// To handle this case easily, we first extract a segment consisting of word
// characters and contraction characters, and then discard the non-word
// characters at the beginning and the end of the extracted segment.
//
// On success, returns the result of Normalize() for the word found and sets
// |*word_start| and |*word_length| to the offset and length (in UTF-16 code
// units) of the word within the text given to Initialize().
// When no further word exists, returns false with |*word_string| empty and
// |*word_start| and |*word_length| set to 0.
bool SpellcheckWordIterator::GetNextWord(string16* word_string,
                                         int* word_start,
                                         int* word_length) {
  // Reset the outputs. Note that clear() is required here: empty() only
  // queries the string and would leave stale caller content in place.
  word_string->clear();
  *word_start = 0;
  *word_length = 0;
  while (position_ < length_) {
    // GetSegment() always advances |position_|, so this loop terminates.
    int segment_start = 0;
    int segment_end = 0;
    GetSegment(&segment_start, &segment_end);
    TrimSegment(segment_start, segment_end, word_start, word_length);
    // A zero length means the segment contained no word characters; keep
    // scanning from the new position.
    if (*word_length > 0)
      return Normalize(*word_start, *word_length, word_string);
  }

  return false;
}

// Retrieves a segment consisting of word characters (and contraction
// characters if the |allow_contraction_| value is true).
// When the current position refers to a non-word character, this function
// returns a non-empty segment consisting of the character itself. In this
// case, the TrimSegment() function discards the character and returns an
// empty word (i.e. |word_length| == 0).
void SpellcheckWordIterator::GetSegment(int* segment_start,
                                        int* segment_end) {
  int position = position_;
  while (position <  length_) {
    UChar32 character;
    U16_NEXT(word_, position, length_, character);
    if (!attribute_->IsWordChar(character)) {
      if (!allow_contraction_ || !attribute_->IsContractionChar(character))
        break;
    }
  }
  *segment_start = position_;
  *segment_end = position;
  position_ = position;
}

// Discards non-word characters at the beginning and the end of the given
// segment [|segment_start|, |segment_end|) of |word_|.
// When the segment contains a word character, |*word_start| receives the
// offset of the first one and |*word_length| the number of UTF-16 code units
// up to and including the last one. When it contains none, both outputs are
// left untouched, so callers must pre-initialize them.
void SpellcheckWordIterator::TrimSegment(int segment_start,
                                         int segment_end,
                                         int* word_start,
                                         int* word_length) const {
  // Skip leading non-word characters.
  while (segment_start < segment_end) {
    UChar32 character;
    int segment_next = segment_start;
    U16_NEXT(word_, segment_next, segment_end, character);
    if (attribute_->IsWordChar(character)) {
      *word_start = segment_start;
      break;
    }
    segment_start = segment_next;
  }
  // Skip trailing non-word characters. The loop must stop as soon as the
  // segment becomes empty: U16_PREV() requires its offset to be greater than
  // its start, and stepping back from |segment_start| would read the code
  // unit in front of the segment.
  while (segment_end > segment_start) {
    UChar32 character;
    int segment_prev = segment_end;
    U16_PREV(word_, segment_start, segment_prev, character);
    if (attribute_->IsWordChar(character)) {
      *word_length = segment_end - segment_start;
      break;
    }
    segment_end = segment_prev;
  }
}

// Normalizes a non-terminated string into its canonical form so that
// a spellchecker object can check spellings of words which contain ligatures,
// full-width letters, etc.
// USCRIPT_LATIN does not only consists of US-ASCII and ISO/IEC 8859-1, but
// also consists of ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin,
// etc. For its details, please read the script table in
// "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
bool SpellcheckWordIterator::Normalize(int input_start,
                                       int input_length,
                                       string16* output_string) const {
  // Unicode Standard Annex #15 "http://www.unicode.org/unicode/reports/tr15/"
  // does not only write NFKD and NFKC can compose ligatures into their ASCII
  // alternatives, but also write NFKC keeps accents of characters.
  // Therefore, NFKC seems to be the best option for hunspell.
  // To use NKFC for normalization, the length of the output string is mostly
  // equal to the one of the input string. (One exception is ligatures.)
  // To avoid the unorm_normalize() function from being called always twice,
  // we temporarily allocate |input_length| + 1 characters to the output string
  // and call the function with it. We re-allocate the output string
  // only if it cannot store the normalized string, i.e. the output string is
  // longer than the input one.
  const char16* input_string = &word_[input_start];
  UErrorCode error_code = U_ZERO_ERROR;
  int output_length = input_length + 1;
  char16* output_buffer = WriteInto(output_string, output_length);
  output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
                                  output_buffer, output_length, &error_code);
  if (error_code == U_BUFFER_OVERFLOW_ERROR) {
    error_code = U_ZERO_ERROR;
    output_buffer = WriteInto(output_string, ++output_length);
    output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
                                    output_buffer, output_length, &error_code);
  }
  return (error_code == U_ZERO_ERROR);
}