Add missing gclient dependencies to .gitignore.

Fix the format of many directories so they don't show up in git status anymore. Run dos2unix on *.cc, caught many inconsistent and CRLF files. TBR=evan TEST=still build, git status shows nothing BUG=none Review URL: http://codereview.chromium.org/211010 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@26441 0039d316-1c4b-4281-b951-d872f2087c98
author: maruel@chromium.org <maruel@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-09-17 13:21:07 +0000
committer: maruel@chromium.org <maruel@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-09-17 13:21:07 +0000
commit: a0abcf44533e242c524ed2955838880d00feadc9 (patch)
tree: a2e15f339acf72930b63122040e6fdaace0f1e70 /third_party/cld
parent: 1f01d47474d6b06a2d1351416eeed320f892dde2 (diff)
download: chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.zip
chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.tar.gz
chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.tar.bz2
1 files changed, 570 insertions, 570 deletions
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc
index 05d4a45..146f862 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc
@@ -1,570 +1,570 @@
-// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h"
-#include <stdio.h>
-#include <string.h>
-
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propjustletter.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propletterscriptnum.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8scannotjustletterspecial.h"
-
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h"
-
-static const Language GRAY_LANG = (Language)254;
-
-static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
-                                                  // else make shorter
-static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
-                                                  // to round to word boundary,
-                                                  // direction above
-
-static const char kSpecialSymbol[256] = {       // true for < > &
-  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
-  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-
-  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
-};
-
-
-
-#define LT 0      // <
-#define GT 1      // >
-#define EX 2      // !
-#define HY 3      // -
-#define QU 4      // "
-#define AP 5      // '
-#define SL 6      // /
-#define S_ 7
-#define C_ 8
-#define R_ 9
-#define I_ 10
-#define P_ 11
-#define T_ 12
-#define Y_ 13
-#define L_ 14
-#define E_ 15
-#define CR 16     // <cr> or <lf>
-#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
-#define PL 18     // possible letter, incl. &
-#define xx 19     // <unused>
-
-// Map byte to one of ~20 interesting categories for cheap tag parsing
-static const uint8 kCharToSub[256] = {
-  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
-  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
-  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
-  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
-
-  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
-  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
-  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
-  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
-
-  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
-  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
-  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
-  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
-
-  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
-  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
-  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
-  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
-};
-
-#undef LT
-#undef GT
-#undef EX
-#undef HY
-#undef QU
-#undef AP
-#undef SL
-#undef S_
-#undef C_
-#undef R_
-#undef I_
-#undef P_
-#undef T_
-#undef Y_
-#undef L_
-#undef E_
-#undef CR
-#undef NL
-#undef PL
-#undef xx
-
-
-#define OK 0
-#define X_ 1
-
-// State machine to do cheap parse of non-letter strings incl. tags
-// advances <tag>
-//          |    |
-// advances <tag> ... </tag>  for <script> <style>
-//          |               |
-// advances <!-- ... <tag> ... -->
-//          |                     |
-// advances <tag
-//          ||  (0)
-// advances <tag <tag2>
-//          ||  (0)
-static const uint8 kTagParseTbl_0[] = {
-// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
-   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
-  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
-   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
-  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
-  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
-  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
-   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
-   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
-   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
-  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
-  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
-  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
-  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
-
-// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
-  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
-  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
-  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
-  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
-  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
-  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
-  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
-  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
-  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
-  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
-  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
-  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
-  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
-  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
-  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
-
-// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
-  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
-  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
-  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
-  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
-  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
-  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
-  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
-  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
-  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
-  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
-  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
-  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
-};
-
-#undef OK
-#undef X_
-
-
-/*
-// Convert GetTimeOfDay output to 64-bit usec
-static inline uint64 Microseconds(const struct timeval& t) {
-  // The SumReducer uses uint64, so convert to (uint64) microseconds,
-  // not (double) seconds.
-  return t.tv_sec * 1000000ULL + t.tv_usec;
-}
-*/
-
-
-// Returns true if character is < > or &
-bool inline IsSpecial(char c) {
-  if ((c & 0xe0) == 0x20) {
-    return kSpecialSymbol[static_cast<uint8>(c)];
-  }
-  return false;
-}
-
-// Quick Skip to next letter or < > & or to end of string (eos)
-// Always return is_letter for eos
-int ScanToLetterOrSpecial(const char* src, int len) {
-  int bytes_consumed;
-  cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
-                       &bytes_consumed);
-  return bytes_consumed;
-}
-
-
-
-// src points to non-letter, such as tag-opening '<'
-// Return length from here to next possible letter
-// On eos or another < before >, return 1
-// advances <tag>
-//          |    |
-// advances <tag> ... </tag>  for <script> <style>
-//          |               |
-// advances <!-- ... <tag> ... -->
-//          |                     |
-// advances <tag
-//          ||  (1)
-// advances <tag <tag2>
-//          ||  (1)
-int ScanToPossibleLetter(const char* isrc, int len) {
-  const uint8* src = reinterpret_cast<const uint8*>(isrc);
-  const uint8* srclimit = src + len;
-  const uint8* tagParseTbl = kTagParseTbl_0;
-  int e = 0;
-  while (src < srclimit) {
-    e = tagParseTbl[kCharToSub[*src++]];
-    if ((e & ~1) == 0) {
-      // We overshot by one byte
-      --src;
-      break;
-    }
-    tagParseTbl = &kTagParseTbl_0[e * 20];
-  }
-
-  if (src >= srclimit) {
-    // We fell off the end of the text.
-    // It looks like the most common case for this is a truncated file, not
-    // mismatched angle brackets. So we pretend that the last char was '>'
-    return len;
-  }
-
-  // OK to be in state 0 or state 2 at exit
-  if ((e != 0) && (e != 2)) {
-    // Error, '<' followed by '<'
-    // We want to back up to first <, then advance by one byte past it
-    int offset = src - reinterpret_cast<const uint8*>(isrc);
-    // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
-
-    // Backscan to first '<' and return enough length to just get past it
-    --offset;   // back up over the second '<', which caused us to stop
-    while ((0 < offset) && (isrc[offset] != '<')) {
-      // Find the first '<', which is unmatched
-      --offset;
-    }
-    // skip to just beyond first '<'
-    // printf("  returning %d\n", offset + 1);
-    return offset + 1;
-  }
-
-  return src - reinterpret_cast<const uint8*>(isrc);
-}
-
-
-
-ScriptScanner::ScriptScanner(const char* buffer,
-                             int buffer_length,
-                             bool is_plain_text)
-  : start_byte_(buffer),
-  next_byte_(buffer),
-  next_byte_limit_(buffer + buffer_length),
-  byte_length_(buffer_length),
-  is_plain_text_(is_plain_text) {
-    script_buffer_ = new char[getone::kMaxScriptBuffer];
-    script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
-}
-
-ScriptScanner::~ScriptScanner() {
-  delete[] script_buffer_;
-  delete[] script_buffer_lower_;
-}
-
-
-
-
-// Get to the first real non-tag letter or entity that is a letter
-// Sets script of that letter
-// Return len if no more letters
-int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
-  int sc = UNKNOWN_LSCRIPT;
-  int skip = 0;
-  int tlen, plen;
-
-  // Do run of non-letters (tag | &NL | NL)*
-  while (skip < len) {
-    // Do fast scan to next interesting byte
-    // int oldskip = skip;
-    skip += ScanToLetterOrSpecial(src + skip, len - skip);
-    // TEMP
-    // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
-    //       oldskip, src[oldskip], skip, src[skip]);
-
-    // Check for no more letters/specials
-    if (skip >= len) {
-      // All done
-      return len;
-    }
-
-    // We are at a letter, nonletter, tag, or entity
-    if (IsSpecial(src[skip]) && !is_plain_text_) {
-      if (src[skip] == '<') {
-        // Begining of tag; skip to end and go around again
-        tlen = ScanToPossibleLetter(src + skip, len - skip);
-        sc = 0;
-        // printf("<...> ");
-      } else if (src[skip] == '>') {
-        // Unexpected end of tag; skip it and go around again
-        tlen = 1;         // Over the >
-        sc = 0;
-        // printf("..> ");
-      } else if (src[skip] == '&') {
-        // Expand entity, no advance
-        char temp[4];
-        EntityToBuffer(src + skip, len - skip,
-                       temp, &tlen, &plen);
-        sc = getone::GetUTF8LetterScriptNum(temp);
-        // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
-      }
-    } else {
-      // Update 1..4 bytes
-      tlen = cld_UniLib::OneCharLen(src + skip);
-      sc = getone::GetUTF8LetterScriptNum(src + skip);
-      // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
-    }
-    // TEMP
-    // printf("sc=%d ", sc);
-    if (sc != 0) {break;}           // Letter found
-    skip += tlen;                   // Advance
-  }
-
-  *script = sc;
-  return skip;
-}
-
-
-
-// Copy next run of same-script non-tag letters to buffer [NUL terminated]
-// Buffer has leading space and all text is lowercased
-bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
-  span->text = script_buffer_;
-  span->text_bytes = 0;
-  span->offset = next_byte_ - start_byte_;
-  span->script = UNKNOWN_LSCRIPT;
-  span->lang = UNKNOWN_LANGUAGE;
-  span->truncated = false;
-
-  // printf("GetOneScriptSpan[[ ");
-  // struct timeval script_start, script_mid, script_end;
-
-  int spanscript;           // The script of this span
-  int sc = UNKNOWN_LSCRIPT;  // The script of next character
-  int tlen, plen;
-
-
-  script_buffer_[0] = ' ';  // Always a space at front of output
-  script_buffer_[1] = '\0';
-  int take = 0;
-  int put = 1;              // Start after the initial space
-
-  // gettimeofday(&script_start, NULL);
-  // Get to the first real non-tag letter or entity that is a letter
-  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
-  next_byte_ += skip;
-  byte_length_ -= skip;
-  if (byte_length_ <= 0) {
-    // printf("]]\n");
-    return false;               // No more letters to be found
-  }
-
-  // gettimeofday(&script_mid, NULL);
-
-  // There is at least one letter, so we know the script for this span
-  // printf("{%d} ", spanscript);
-  span->script = (UnicodeLScript)spanscript;
-
-
-  // Go over alternating spans of same-script letters and non-letters,
-  // copying letters to buffer with single spaces for each run of non-letters
-  while (take < byte_length_) {
-    // Copy run of letters in same script (&LS | LS)*
-    int letter_count = 0;              // Keep track of word length
-    bool need_break = false;
-    while (take < byte_length_) {
-      // We are at a letter, nonletter, tag, or entity
-      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
-        // printf("\"%c\" ", next_byte_[take]);
-        if (next_byte_[take] == '<') {
-          // Begining of tag
-          sc = 0;
-          break;
-        } else if (next_byte_[take] == '>') {
-          // Unexpected end of tag
-          sc = 0;
-          break;
-        } else if (next_byte_[take] == '&') {
-          // Copy entity, no advance
-          EntityToBuffer(next_byte_ + take, byte_length_ - take,
-                         script_buffer_ + put, &tlen, &plen);
-          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
-        }
-      } else {
-        // Real letter, safely copy up to 4 bytes, increment by 1..4
-        // Will update by 1..4 bytes at Advance, below
-        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
-        if (take < (byte_length_ - 3)) {
-          // Fast case
-          *reinterpret_cast<uint32*>(script_buffer_ + put) =
-            *reinterpret_cast<const uint32*>(next_byte_ + take);
-        } else {
-          // Slow case, happens 1-3 times per input document
-          memcpy(script_buffer_ + put, next_byte_ + take, plen);
-        }
-        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
-      }
-      // printf("sc(%c)=%d ", next_byte_[take], sc);
-      // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
-      // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
-
-      // Allow continue across a single letter in a different script:
-      // A B D = three scripts, c = common script, i = inherited script,
-      // - = don't care, ( = take position before the += below
-      //  AAA(A-    continue
-      //
-      //  AAA(BA    continue
-      //  AAA(BB    break
-      //  AAA(Bc    continue (breaks after B)
-      //  AAA(BD    break
-      //  AAA(Bi    break
-      //
-      //  AAA(c-    break
-      //
-      //  AAA(i-    continue
-      //
-
-      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
-        // Might need to break this script span
-        if (sc == ULScript_Common) {
-          need_break = true;
-        } else {
-          // Look at next following character, ignoring entity as Common
-          int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
-          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
-            need_break = true;
-          }
-        }
-      }
-      if (need_break) {break;}  // Non-letter or letter in wrong script
-
-      take += tlen;                   // Advance
-      put += plen;                    // Advance
-      ++letter_count;
-      if (put >= getone::kMaxScriptBytes) {
-        // Buffer is full
-        span->truncated = true;
-        break;
-      }
-    }     // End while letters
-
-    // Do run of non-letters (tag | &NL | NL)*
-    while (take < byte_length_) {
-      // Do fast scan to next interesting byte
-      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
-      
-      // Check for no more letters/specials
-      if (take >= byte_length_) {
-        take = byte_length_;
-        break;
-      }
-
-      // We are at a letter, nonletter, tag, or entity
-      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
-        // printf("\"%c\" ", next_byte_[take]);
-        if (next_byte_[take] == '<') {
-          // Begining of tag; skip to end and go around again
-          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
-          sc = 0;
-          // printf("<...> ");
-        } else if (next_byte_[take] == '>') {
-          // Unexpected end of tag; skip it and go around again
-          tlen = 1;         // Over the >
-          sc = 0;
-          // printf("..> ");
-        } else if (next_byte_[take] == '&') {
-          // Expand entity, no advance
-          EntityToBuffer(next_byte_ + take, byte_length_ - take,
-                         script_buffer_ + put, &tlen, &plen);
-          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
-        }
-      } else {
-        // Update 1..4
-        tlen = cld_UniLib::OneCharLen(next_byte_ + take);
-        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
-      }
-      // printf("sc[%c]=%d ", next_byte_[take], sc);
-      if (sc != 0) {break;}           // Letter found
-      take += tlen;                   // Advance
-    }     // End while not-letters
-
-    script_buffer_[put++] = ' ';
-
-    // We are at a letter again (or eos), after letter* not-letter*
-    if (sc != spanscript) {break;}            // Letter in wrong script
-    if (put >= getone::kMaxScriptBytes - 8) {
-      // Buffer is almost full
-      span->truncated = true;
-      break;
-    }
-  }
-
-  // Update input position
-  next_byte_ += take;
-  byte_length_ -= take;
-
-  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
-  //                          kMaxScriptBytes |   | put
-  script_buffer_[put + 0] = ' ';
-  script_buffer_[put + 1] = ' ';
-  script_buffer_[put + 2] = ' ';
-  script_buffer_[put + 3] = '\0';
-
-  span->text_bytes = put;       // Does not include the last four chars above
-
-  // printf(" %d]]\n\n", put);
-  return true;
-}
-
-// Force Latin, Cyrillic, Greek scripts to be lowercase
-void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
-  // On Windows, text is lowercased beforehand, so no need to do anything here.
-#if !defined(CLD_WINDOWS)
-  // If needed, lowercase all the text. If we do it sooner, might miss
-  // lowercasing an entity such as &Aacute;
-  // We only need to do this for Latn and Cyrl scripts
-  if ((span->script == ULScript_Latin) ||
-      (span->script == ULScript_Cyrillic) ||
-      (span->script == ULScript_Greek)) {
-    // Full Unicode lowercase of the entire buffer, including
-    // four pad bytes off the end
-    int consumed, filled;
-    UniLib::ToLower(span->text, span->text_bytes + 4,
-                    script_buffer_lower_, getone::kMaxScriptLowerBuffer,
-                    &consumed, &filled);
-    span->text = script_buffer_lower_;
-    span->text_bytes = filled - 4;
-  }
-#endif
-}
-
-// Copy next run of same-script non-tag letters to buffer [NUL terminated]
-// Force Latin and Cyrillic scripts to be lowercase
-bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
-  bool ok = GetOneScriptSpan(span);
-  LowerScriptSpan(span);
-  return ok;
-}
-
-// Gets lscript number for letters; always returns
-//   0 (common script) for non-letters
-int getone::GetUTF8LetterScriptNum(const char* src) {
-  int srclen = cld_UniLib::OneCharLen(src);
-  const uint8* usrc = reinterpret_cast<const uint8*>(src);
-  return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);
-}
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h"
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propjustletter.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propletterscriptnum.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8scannotjustletterspecial.h"
+
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h"
+
+static const Language GRAY_LANG = (Language)254;
+
+static const int kMaxUpToWordBoundary = 50;       // span < this make longer,
+                                                  // else make shorter
+static const int kMaxAdvanceToWordBoundary = 10;  // +/- this many bytes
+                                                  // to round to word boundary,
+                                                  // direction above
+
+static const char kSpecialSymbol[256] = {       // true for < > &
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+};
+
+
+
+#define LT 0      // <
+#define GT 1      // >
+#define EX 2      // !
+#define HY 3      // -
+#define QU 4      // "
+#define AP 5      // '
+#define SL 6      // /
+#define S_ 7
+#define C_ 8
+#define R_ 9
+#define I_ 10
+#define P_ 11
+#define T_ 12
+#define Y_ 13
+#define L_ 14
+#define E_ 15
+#define CR 16     // <cr> or <lf>
+#define NL 17     // non-letter: ASCII whitespace, digit, punctuation
+#define PL 18     // possible letter, incl. &
+#define xx 19     // <unused>
+
+// Map byte to one of ~20 interesting categories for cheap tag parsing
+static const uint8 kCharToSub[256] = {
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
+  NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
+
+  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
+  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
+  PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
+  P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
+
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
+  NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
+
+  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
+  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
+  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
+  PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
+};
+
+#undef LT
+#undef GT
+#undef EX
+#undef HY
+#undef QU
+#undef AP
+#undef SL
+#undef S_
+#undef C_
+#undef R_
+#undef I_
+#undef P_
+#undef T_
+#undef Y_
+#undef L_
+#undef E_
+#undef CR
+#undef NL
+#undef PL
+#undef xx
+
+
+#define OK 0
+#define X_ 1
+
+// State machine to do cheap parse of non-letter strings incl. tags
+// advances <tag>
+//          |    |
+// advances <tag> ... </tag>  for <script> <style>
+//          |               |
+// advances <!-- ... <tag> ... -->
+//          |                     |
+// advances <tag
+//          ||  (0)
+// advances <tag <tag2>
+//          ||  (0)
+static const uint8 kTagParseTbl_0[] = {
+// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
+   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK
+  X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
+   3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*
+  X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
+  X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
+  X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
+   6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
+   6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
+   6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
+  10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
+  11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
+  X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
+
+// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
+  X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
+  X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
+  X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
+  20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
+  19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
+  19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
+  19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
+  19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
+  19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
+  19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
+  19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
+  19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
+
+// <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
+  X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
+  X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
+  33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
+  32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
+  32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
+  32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
+  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
+  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
+  32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
+  32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
+};
+
+#undef OK
+#undef X_
+
+
+/*
+// Convert GetTimeOfDay output to 64-bit usec
+static inline uint64 Microseconds(const struct timeval& t) {
+  // The SumReducer uses uint64, so convert to (uint64) microseconds,
+  // not (double) seconds.
+  return t.tv_sec * 1000000ULL + t.tv_usec;
+}
+*/
+
+
+// Returns true if character is < > or & (as recorded in kSpecialSymbol).
+// Only bytes in the range 0x20..0x3f are looked up in the table; any other
+// byte is rejected immediately without a table access.
+inline bool IsSpecial(char c) {
+  const bool in_table_range = ((c & 0xe0) == 0x20);
+  return in_table_range && kSpecialSymbol[static_cast<uint8>(c)];
+}
+
+// Quick skip to the next letter, to one of < > &, or to end of string (eos).
+// Always return is_letter for eos
+// The scan itself is done by cld::UTF8GenericScan; the value returned here is
+// the number of bytes of src that the scanner reported as consumed.
+int ScanToLetterOrSpecial(const char* src, int len) {
+  int skipped;
+  cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len, &skipped);
+  return skipped;
+}
+
+
+
+// isrc points to a non-letter, such as a tag-opening '<'.
+// Returns the number of bytes from isrc to the next possible letter.
+// If the text runs out before the tag parser reaches an exit state, returns
+// len (a truncated file is treated as if the last char had been '>').
+// On another '<' before the closing '>', returns just enough to step over
+// the first, unmatched '<'.
+// advances <tag>
+//          |    |
+// advances <tag> ... </tag>  for <script> <style>
+//          |               |
+// advances <!-- ... <tag> ... -->
+//          |                     |
+// advances <tag
+//          ||  (1)
+// advances <tag <tag2>
+//          ||  (1)
+int ScanToPossibleLetter(const char* isrc, int len) {
+  const uint8* const start = reinterpret_cast<const uint8*>(isrc);
+  const uint8* const limit = start + len;
+  const uint8* cursor = start;
+  const uint8* row = kTagParseTbl_0;   // Table row for the current state
+  int state = 0;
+  bool hit_exit_state = false;
+
+  // Drive the tag-parsing state machine one byte at a time. Each row of the
+  // table holds 20 entries, one per character class from kCharToSub.
+  while (cursor < limit) {
+    state = row[kCharToSub[*cursor]];
+    if ((state & ~1) == 0) {
+      // Exit state (0 or 1): leave cursor on the byte that caused the exit
+      hit_exit_state = true;
+      break;
+    }
+    ++cursor;
+    row = &kTagParseTbl_0[state * 20];
+  }
+
+  if (!hit_exit_state) {
+    // We fell off the end of the text.
+    // It looks like the most common case for this is a truncated file, not
+    // mismatched angle brackets. So we pretend that the last char was '>'
+    return len;
+  }
+
+  const int stop_offset = cursor - start;
+
+  // OK to be in state 0 or state 2 at exit
+  if ((state == 0) || (state == 2)) {
+    return stop_offset;
+  }
+
+  // Error, '<' followed by '<'
+  // Start one byte before the second '<' (the byte that stopped the scan) and
+  // walk backward to the first '<', which is the unmatched one.
+  int first_open = stop_offset - 1;
+  while ((first_open > 0) && (isrc[first_open] != '<')) {
+    --first_open;
+  }
+
+  // Return enough length to land just beyond the first '<'
+  return first_open + 1;
+}
+
+
+
+// Sets up a scanner over buffer[0..buffer_length). The input is only pointed
+// to, not copied. Two working buffers are allocated here and released in the
+// destructor: one for extracted span text and one for its lowercased form.
+ScriptScanner::ScriptScanner(const char* buffer,
+                             int buffer_length,
+                             bool is_plain_text)
+    : start_byte_(buffer),
+      next_byte_(buffer),
+      next_byte_limit_(buffer + buffer_length),
+      byte_length_(buffer_length),
+      is_plain_text_(is_plain_text) {
+  // Output buffer filled by GetOneScriptSpan
+  script_buffer_ = new char[getone::kMaxScriptBuffer];
+  // Output buffer filled by LowerScriptSpan
+  script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];
+}
+
+// Releases the two working buffers allocated by the constructor.
+ScriptScanner::~ScriptScanner() {
+  delete[] script_buffer_lower_;
+  delete[] script_buffer_;
+}
+
+
+
+
+// Get to the first real non-tag letter or entity that is a letter.
+// On finding one, stores its script number in *script and returns the number
+// of bytes to skip to reach it.
+// Returns len if the fast scan reaches the end of the input without finding
+// a letter; *script is not written on that path.
+int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
+  int found_script = UNKNOWN_LSCRIPT;
+  int pos = 0;
+  int advance, expanded;
+
+  // Do run of non-letters (tag | &NL | NL)*
+  while (pos < len) {
+    // Fast scan to the next interesting byte
+    pos += ScanToLetterOrSpecial(src + pos, len - pos);
+
+    // Nothing interesting left: no more letters/specials
+    if (pos >= len) {
+      return len;
+    }
+
+    // We are at a letter, nonletter, tag, or entity
+    const char* here = src + pos;
+    if (IsSpecial(*here) && !is_plain_text_) {
+      switch (*here) {
+        case '<':
+          // Beginning of tag; skip to end and go around again
+          advance = ScanToPossibleLetter(here, len - pos);
+          found_script = 0;
+          break;
+        case '>':
+          // Unexpected end of tag; step over the > and go around again
+          advance = 1;
+          found_script = 0;
+          break;
+        case '&': {
+          // Expand entity into a scratch buffer, no advance yet
+          char temp[4];
+          EntityToBuffer(here, len - pos, temp, &advance, &expanded);
+          found_script = getone::GetUTF8LetterScriptNum(temp);
+          break;
+        }
+        default:
+          break;
+      }
+    } else {
+      // Ordinary character of 1..4 bytes
+      advance = cld_UniLib::OneCharLen(here);
+      found_script = getone::GetUTF8LetterScriptNum(here);
+    }
+
+    if (found_script != 0) {
+      break;                        // Letter found
+    }
+    pos += advance;                 // Not a letter; step over it
+  }
+
+  *script = found_script;
+  return pos;
+}
+
+
+
+// Copy next run of same-script non-tag letters to buffer [NUL terminated]
+// Buffer has a leading space, and each run of non-letters is replaced by a
+// single space. This function does not change case; lowercasing is done
+// separately by LowerScriptSpan.
+//
+// Fills in *span: text points at the internal script buffer, text_bytes is
+// the number of bytes written (excluding the trailing pad), offset is the
+// input position at the time of the call (before leading non-letters are
+// skipped), script is the script of the span, and truncated is set when the
+// span was cut short because the buffer filled up.
+//
+// Returns false if there are no more letters in the input, true otherwise.
+bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
+  span->text = script_buffer_;
+  span->text_bytes = 0;
+  span->offset = next_byte_ - start_byte_;
+  span->script = UNKNOWN_LSCRIPT;
+  span->lang = UNKNOWN_LANGUAGE;
+  span->truncated = false;
+
+  int spanscript;           // The script of this span
+  int sc = UNKNOWN_LSCRIPT;  // The script of next character
+  int tlen, plen;           // Bytes taken from input / put into output
+
+  script_buffer_[0] = ' ';  // Always a space at front of output
+  script_buffer_[1] = '\0';
+  int take = 0;
+  int put = 1;              // Start after the initial space
+
+  // Get to the first real non-tag letter or entity that is a letter
+  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
+  next_byte_ += skip;
+  byte_length_ -= skip;
+  if (byte_length_ <= 0) {
+    return false;               // No more letters to be found
+  }
+
+  // There is at least one letter, so we know the script for this span
+  span->script = static_cast<UnicodeLScript>(spanscript);
+
+  // Go over alternating spans of same-script letters and non-letters,
+  // copying letters to buffer with single spaces for each run of non-letters
+  while (take < byte_length_) {
+    // Copy run of letters in same script (&LS | LS)*
+    int letter_count = 0;              // Keep track of word length
+    bool need_break = false;
+    while (take < byte_length_) {
+      // We are at a letter, nonletter, tag, or entity
+      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
+        if (next_byte_[take] == '<') {
+          // Beginning of tag
+          sc = 0;
+          break;
+        } else if (next_byte_[take] == '>') {
+          // Unexpected end of tag
+          sc = 0;
+          break;
+        } else if (next_byte_[take] == '&') {
+          // Copy entity, no advance
+          EntityToBuffer(next_byte_ + take, byte_length_ - take,
+                         script_buffer_ + put, &tlen, &plen);
+          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
+        }
+      } else {
+        // Real letter, safely copy up to 4 bytes, increment by 1..4
+        // Will update by 1..4 bytes at Advance, below
+        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
+        if (take < (byte_length_ - 3)) {
+          // Fast case: at least 4 input bytes remain, so copy a fixed 4.
+          // memcpy is used instead of a uint32 load/store through cast
+          // pointers because neither pointer is guaranteed to be 4-byte
+          // aligned and char data must not be accessed as uint32.
+          memcpy(script_buffer_ + put, next_byte_ + take, 4);
+        } else {
+          // Slow case, happens 1-3 times per input document
+          memcpy(script_buffer_ + put, next_byte_ + take, plen);
+        }
+        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
+      }
+
+      // Allow continue across a single letter in a different script:
+      // A B D = three scripts, c = common script, i = inherited script,
+      // - = don't care, ( = take position before the += below
+      //  AAA(A-    continue
+      //
+      //  AAA(BA    continue
+      //  AAA(BB    break
+      //  AAA(Bc    continue (breaks after B)
+      //  AAA(BD    break
+      //  AAA(Bi    break
+      //
+      //  AAA(c-    break
+      //
+      //  AAA(i-    continue
+      //
+
+      if ((sc != spanscript) && (sc != ULScript_Inherited)) {
+        // Might need to break this script span
+        if (sc == ULScript_Common) {
+          need_break = true;
+        } else {
+          // Look at next following character, ignoring entity as Common.
+          // If the current character is the last one in the input there is
+          // nothing to look at; treat that as Common rather than reading
+          // beyond the end of the input.
+          int sc2 = ULScript_Common;
+          if ((take + tlen) < byte_length_) {
+            sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
+          }
+          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
+            need_break = true;
+          }
+        }
+      }
+      if (need_break) {break;}  // Non-letter or letter in wrong script
+
+      take += tlen;                   // Advance
+      put += plen;                    // Advance
+      ++letter_count;
+      if (put >= getone::kMaxScriptBytes) {
+        // Buffer is full
+        span->truncated = true;
+        break;
+      }
+    }     // End while letters
+
+    // Do run of non-letters (tag | &NL | NL)*
+    while (take < byte_length_) {
+      // Do fast scan to next interesting byte
+      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
+
+      // Check for no more letters/specials
+      if (take >= byte_length_) {
+        take = byte_length_;
+        break;
+      }
+
+      // We are at a letter, nonletter, tag, or entity
+      if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
+        if (next_byte_[take] == '<') {
+          // Beginning of tag; skip to end and go around again
+          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
+          sc = 0;
+        } else if (next_byte_[take] == '>') {
+          // Unexpected end of tag; skip it and go around again
+          tlen = 1;         // Over the >
+          sc = 0;
+        } else if (next_byte_[take] == '&') {
+          // Expand entity, no advance
+          EntityToBuffer(next_byte_ + take, byte_length_ - take,
+                         script_buffer_ + put, &tlen, &plen);
+          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
+        }
+      } else {
+        // Update 1..4
+        tlen = cld_UniLib::OneCharLen(next_byte_ + take);
+        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
+      }
+      if (sc != 0) {break;}           // Letter found
+      take += tlen;                   // Advance
+    }     // End while not-letters
+
+    // One space stands in for the whole run of non-letters
+    script_buffer_[put++] = ' ';
+
+    // We are at a letter again (or eos), after letter* not-letter*
+    if (sc != spanscript) {break;}            // Letter in wrong script
+    if (put >= getone::kMaxScriptBytes - 8) {
+      // Buffer is almost full
+      span->truncated = true;
+      break;
+    }
+  }
+
+  // Update input position
+  next_byte_ += take;
+  byte_length_ -= take;
+
+  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
+  //                          kMaxScriptBytes |   | put
+  script_buffer_[put + 0] = ' ';
+  script_buffer_[put + 1] = ' ';
+  script_buffer_[put + 2] = ' ';
+  script_buffer_[put + 3] = '\0';
+
+  span->text_bytes = put;       // Does not include the last four chars above
+
+  return true;
+}
+
+// Force Latin, Cyrillic, Greek scripts to be lowercase.
+// When the span is in one of those scripts, its text is lowercased into the
+// scanner's second buffer and span->text / span->text_bytes are repointed at
+// the result. Spans in any other script are left untouched.
+void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
+  // On Windows, text is lowercased beforehand, so no need to do anything here.
+#if !defined(CLD_WINDOWS)
+  // If needed, lowercase all the text. If we do it sooner, might miss
+  // lowercasing an entity such as &Aacute;
+  // We only need to do this for Latin, Cyrillic and Greek scripts
+  const bool needs_lowering = (span->script == ULScript_Latin) ||
+                              (span->script == ULScript_Cyrillic) ||
+                              (span->script == ULScript_Greek);
+  if (!needs_lowering) {
+    return;
+  }
+
+  // Full Unicode lowercase of the entire buffer, including
+  // four pad bytes off the end
+  int consumed, filled;
+  UniLib::ToLower(span->text, span->text_bytes + 4,
+                  script_buffer_lower_, getone::kMaxScriptLowerBuffer,
+                  &consumed, &filled);
+  span->text = script_buffer_lower_;
+  span->text_bytes = filled - 4;    // Exclude the four pad bytes again
+#endif
+}
+
+// Copy next run of same-script non-tag letters to buffer [NUL terminated]
+// Force Latin, Cyrillic, and Greek scripts to be lowercase
+// Returns whatever GetOneScriptSpan returned; the lowercasing step is run
+// regardless of that result.
+bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
+  const bool found_span = GetOneScriptSpan(span);
+  LowerScriptSpan(span);
+  return found_span;
+}
+
+// Gets lscript number for letters; always returns
+//   0 (common script) for non-letters
+// Looks at exactly one character at src, whose byte length is taken from
+// cld_UniLib::OneCharLen.
+int getone::GetUTF8LetterScriptNum(const char* src) {
+  const uint8* cursor = reinterpret_cast<const uint8*>(src);
+  int remaining = cld_UniLib::OneCharLen(src);
+  return UTF8GenericProperty(&utf8propletterscriptnum_obj,
+                             &cursor, &remaining);
+}
author	maruel@chromium.org <maruel@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-09-17 13:21:07 +0000
committer	maruel@chromium.org <maruel@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-09-17 13:21:07 +0000
commit	a0abcf44533e242c524ed2955838880d00feadc9 (patch)
tree	a2e15f339acf72930b63122040e6fdaace0f1e70 /third_party/cld
parent	1f01d47474d6b06a2d1351416eeed320f892dde2 (diff)
download	chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.zip chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.tar.gz chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.tar.bz2