diff options
author | maruel@chromium.org <maruel@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-09-17 13:21:07 +0000 |
---|---|---|
committer | maruel@chromium.org <maruel@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-09-17 13:21:07 +0000 |
commit | a0abcf44533e242c524ed2955838880d00feadc9 (patch) | |
tree | a2e15f339acf72930b63122040e6fdaace0f1e70 /third_party/cld | |
parent | 1f01d47474d6b06a2d1351416eeed320f892dde2 (diff) | |
download | chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.zip chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.tar.gz chromium_src-a0abcf44533e242c524ed2955838880d00feadc9.tar.bz2 |
Add missing gclient dependencies to .gitignore.
Fix the format of many directories so they don't show up in git status anymore.
Run dos2unix on *.cc, caught many inconsistent and CRLF files.
TBR=evan
TEST=still build, git status shows nothing
BUG=none
Review URL: http://codereview.chromium.org/211010
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@26441 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party/cld')
-rw-r--r-- | third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc | 1140 |
1 files changed, 570 insertions, 570 deletions
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc index 05d4a45..146f862 100644 --- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc +++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc @@ -1,570 +1,570 @@ -// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h"
-#include <stdio.h>
-#include <string.h>
-
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propjustletter.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propletterscriptnum.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8scannotjustletterspecial.h"
-
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h"
-#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h"
-
-static const Language GRAY_LANG = (Language)254; // NOTE(review): not referenced again in this file as listed
-
-static const int kMaxUpToWordBoundary = 50; // span < this make longer,
- // else make shorter
-static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes
- // to round to word boundary,
- // direction above. NOTE(review): neither k* constant is referenced again in this file as listed
-
-static const char kSpecialSymbol[256] = { // byte-indexed flag, 32 entries per row; nonzero only for '&' (0x26), '<' (0x3C), '>' (0x3E)
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x00-0x1F
- 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, // 0x20-0x3F: '&' '<' '>'
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x40-0x5F
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x60-0x7F
-
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0x80-0x9F
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xA0-0xBF
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xC0-0xDF
- 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, // 0xE0-0xFF
-};
-
-
-
-#define LT 0 // <
-#define GT 1 // >
-#define EX 2 // !
-#define HY 3 // -
-#define QU 4 // "
-#define AP 5 // '
-#define SL 6 // /
-#define S_ 7 // S or s
-#define C_ 8 // C or c
-#define R_ 9 // R or r
-#define I_ 10 // I or i
-#define P_ 11 // P or p
-#define T_ 12 // T or t
-#define Y_ 13 // Y or y
-#define L_ 14 // L or l
-#define E_ 15 // E or e
-#define CR 16 // <cr> or <lf>
-#define NL 17 // non-letter: ASCII whitespace, digit, punctuation; also used for bytes 0x80-0xBF
-#define PL 18 // possible letter, incl. & (also '@', '`', and bytes 0xC0-0xFF per the table below)
-#define xx 19 // <unused>
-
-// Map byte to one of ~20 interesting categories for cheap tag parsing (the category is the column index into kTagParseTbl_0 rows); 16 entries per row
-static const uint8 kCharToSub[256] = {
- NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, // 0x00-0x0F (0x0A and 0x0D are CR)
- NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0x10-0x1F
- NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, // 0x20-0x2F: ! " & ' - /
- NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, // 0x30-0x3F: digits are NL; < >
-
- PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, // 0x40-0x4F: @ A-O
- P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, // 0x50-0x5F: P-Z, then [ \ ] ^ _
- PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, // 0x60-0x6F: ` a-o
- P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, // 0x70-0x7F: p-z, then { | } ~ DEL
-
- NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0x80-0x8F
- NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0x90-0x9F
- NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0xA0-0xAF
- NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, // 0xB0-0xBF
-
- PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, // 0xC0-0xCF
- PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, // 0xD0-0xDF
- PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, // 0xE0-0xEF
- PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, // 0xF0-0xFF
-};
-
-#undef LT
-#undef GT
-#undef EX
-#undef HY
-#undef QU
-#undef AP
-#undef SL
-#undef S_
-#undef C_
-#undef R_
-#undef I_
-#undef P_
-#undef T_
-#undef Y_
-#undef L_
-#undef E_
-#undef CR
-#undef NL
-#undef PL
-#undef xx
-
-
-#define OK 0 // exit value: ScanToPossibleLetter stops here and does not treat it as an error
-#define X_ 1 // exit value: ScanToPossibleLetter stops here and takes its error path
-
-// State machine to do cheap parse of non-letter strings incl. tags. Layout: one row of 20 entries per state, column = kCharToSub category, entry = next state (or OK / X_ to stop).
-// advances <tag>
-// | |
-// advances <tag> ... </tag> for <script> <style>
-// | |
-// advances <!-- ... <tag> ... -->
-// | |
-// advances <tag
-// || (0)
-// advances <tag <tag2>
-// || (0)
-static const uint8 kTagParseTbl_0[] = {
-// < > ! - " ' / S C R I P T Y L E CR NL PL xx
- 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK
- X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error
- 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL*
- X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <
- X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!
- X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-
- 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*
- 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-
- 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--
- X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*
- 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
- 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
- X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
-
-// < > ! - " ' / S C R I P T Y L E CR NL PL xx
- X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S
- X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC
- X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR
- X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI
- X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP
- X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
- 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
- 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
- 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</
- 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
- 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
- 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
- 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
- 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
- 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
-
-// < > ! - " ' / S C R I P T Y L E CR NL PL xx
- X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST
- X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY
- X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL
- X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
- 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
- 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
- 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</
- 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
- 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
- 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
- 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
- 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
-};
-
-#undef OK
-#undef X_
-
-
-/*
-// Convert GetTimeOfDay output to 64-bit usec
-static inline uint64 Microseconds(const struct timeval& t) {
- // The SumReducer uses uint64, so convert to (uint64) microseconds,
- // not (double) seconds.
- return t.tv_sec * 1000000ULL + t.tv_usec;
-}
-*/
-
-
-// Returns true if character is < > or & (looked up in kSpecialSymbol)
-bool inline IsSpecial(char c) {
- if ((c & 0xe0) == 0x20) { // only bytes 0x20-0x3F pass this mask, so other bytes skip the table lookup
- return kSpecialSymbol[static_cast<uint8>(c)];
- }
- return false;
-}
-
-// Quick Skip to next letter or < > & or to end of string (eos)
-// Returns the bytes_consumed value reported by cld::UTF8GenericScan, i.e. how far the caller should advance from src. NOTE(review): the earlier "return is_letter for eos" wording did not match the int return; confirm eos behavior against UTF8GenericScan
-int ScanToLetterOrSpecial(const char* src, int len) {
- int bytes_consumed;
- cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,
- &bytes_consumed);
- return bytes_consumed;
-}
-
-
-
-// src points to non-letter, such as tag-opening '<'
-// Return length from here to next possible letter
-// On another < before >, return just past the first < (1 when isrc starts at that <); on eos, return len
-// advances <tag>
-// | |
-// advances <tag> ... </tag> for <script> <style>
-// | |
-// advances <!-- ... <tag> ... -->
-// | |
-// advances <tag
-// || (1) NOTE: stale for the eos case -- the code below returns len when the text runs out
-// advances <tag <tag2>
-// || (1)
-int ScanToPossibleLetter(const char* isrc, int len) {
- const uint8* src = reinterpret_cast<const uint8*>(isrc);
- const uint8* srclimit = src + len;
- const uint8* tagParseTbl = kTagParseTbl_0; // start in row [0]
- int e = 0;
- while (src < srclimit) {
- e = tagParseTbl[kCharToSub[*src++]];
- if ((e & ~1) == 0) { // e is 0 or 1: one of the two exit values
- // We overshot by one byte
- --src;
- break;
- }
- tagParseTbl = &kTagParseTbl_0[e * 20]; // 20 entries per state row
- }
-
- if (src >= srclimit) {
- // We fell off the end of the text.
- // It looks like the most common case for this is a truncated file, not
- // mismatched angle brackets. So we pretend that the last char was '>'
- return len;
- }
-
- // OK to be in state 0 or state 2 at exit
- if ((e != 0) && (e != 2)) {
- // Error, '<' followed by '<'
- // We want to back up to first <, then advance by one byte past it
- int offset = src - reinterpret_cast<const uint8*>(isrc);
- // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);
-
- // Backscan to first '<' and return enough length to just get past it
- --offset; // back up over the second '<', which caused us to stop
- while ((0 < offset) && (isrc[offset] != '<')) {
- // Find the first '<', which is unmatched
- --offset;
- }
- // skip to just beyond first '<'
- // printf(" returning %d\n", offset + 1);
- return offset + 1;
- }
-
- return src - reinterpret_cast<const uint8*>(isrc);
-}
-
-
-
-ScriptScanner::ScriptScanner(const char* buffer, // input text; only the pointer is stored, the bytes are not copied
- int buffer_length, // number of bytes at buffer
- bool is_plain_text) // when true, the scanning methods skip their < > & special handling
- : start_byte_(buffer),
- next_byte_(buffer),
- next_byte_limit_(buffer + buffer_length),
- byte_length_(buffer_length),
- is_plain_text_(is_plain_text) {
- script_buffer_ = new char[getone::kMaxScriptBuffer]; // output buffer for GetOneScriptSpan; freed in the destructor
- script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer]; // output buffer for LowerScriptSpan; freed in the destructor
-}
-
-ScriptScanner::~ScriptScanner() { // releases the two arrays allocated with new[] in the constructor
- delete[] script_buffer_;
- delete[] script_buffer_lower_;
-}
-
-
-
-
-// Get to the first real non-tag letter or entity that is a letter
-// Sets *script to the script of that letter and returns its byte offset from src
-// Return len if no more letters (*script is not written on that path)
-int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
- int sc = UNKNOWN_LSCRIPT;
- int skip = 0;
- int tlen, plen; // tlen: input bytes to step over; plen is only an out-parameter for EntityToBuffer here
-
- // Do run of non-letters (tag | &NL | NL)*
- while (skip < len) {
- // Do fast scan to next interesting byte
- // int oldskip = skip;
- skip += ScanToLetterOrSpecial(src + skip, len - skip);
- // TEMP
- // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",
- // oldskip, src[oldskip], skip, src[skip]);
-
- // Check for no more letters/specials
- if (skip >= len) {
- // All done
- return len;
- }
-
- // We are at a letter, nonletter, tag, or entity
- if (IsSpecial(src[skip]) && !is_plain_text_) {
- if (src[skip] == '<') {
- // Beginning of tag; skip to end and go around again
- tlen = ScanToPossibleLetter(src + skip, len - skip);
- sc = 0;
- // printf("<...> ");
- } else if (src[skip] == '>') {
- // Unexpected end of tag; skip it and go around again
- tlen = 1; // Over the >
- sc = 0;
- // printf("..> ");
- } else if (src[skip] == '&') {
- // Expand entity, no advance
- char temp[4]; // scratch for the expanded entity; NOTE(review): assumes EntityToBuffer writes at most 4 bytes -- confirm
- EntityToBuffer(src + skip, len - skip,
- temp, &tlen, &plen);
- sc = getone::GetUTF8LetterScriptNum(temp);
- // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);
- }
- } else {
- // Update 1..4 bytes
- tlen = cld_UniLib::OneCharLen(src + skip);
- sc = getone::GetUTF8LetterScriptNum(src + skip);
- // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);
- }
- // TEMP
- // printf("sc=%d ", sc);
- if (sc != 0) {break;} // Letter found
- skip += tlen; // Advance
- }
-
- *script = sc;
- return skip;
-}
-
-
-
-// Copy next run of same-script non-tag letters to buffer [NUL terminated]
-// Buffer has leading space; no lowercasing happens here (LowerScriptSpan / GetOneScriptSpanLower do that). Returns false when no letters remain.
-bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {
- span->text = script_buffer_;
- span->text_bytes = 0;
- span->offset = next_byte_ - start_byte_;
- span->script = UNKNOWN_LSCRIPT;
- span->lang = UNKNOWN_LANGUAGE;
- span->truncated = false;
-
- // printf("GetOneScriptSpan[[ ");
- // struct timeval script_start, script_mid, script_end;
-
- int spanscript; // The script of this span
- int sc = UNKNOWN_LSCRIPT; // The script of next character
- int tlen, plen; // tlen: input bytes taken by the current item; plen: bytes it contributes to script_buffer_
-
-
- script_buffer_[0] = ' '; // Always a space at front of output
- script_buffer_[1] = '\0';
- int take = 0; // bytes consumed from next_byte_ so far
- int put = 1; // Start after the initial space
-
- // gettimeofday(&script_start, NULL);
- // Get to the first real non-tag letter or entity that is a letter
- int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
- next_byte_ += skip;
- byte_length_ -= skip;
- if (byte_length_ <= 0) {
- // printf("]]\n");
- return false; // No more letters to be found
- }
-
- // gettimeofday(&script_mid, NULL);
-
- // There is at least one letter, so we know the script for this span
- // printf("{%d} ", spanscript);
- span->script = (UnicodeLScript)spanscript;
-
-
- // Go over alternating spans of same-script letters and non-letters,
- // copying letters to buffer with single spaces for each run of non-letters
- while (take < byte_length_) {
- // Copy run of letters in same script (&LS | LS)*
- int letter_count = 0; // Keep track of word length (incremented below but never read)
- bool need_break = false;
- while (take < byte_length_) {
- // We are at a letter, nonletter, tag, or entity
- if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
- // printf("\"%c\" ", next_byte_[take]);
- if (next_byte_[take] == '<') {
- // Beginning of tag
- sc = 0;
- break;
- } else if (next_byte_[take] == '>') {
- // Unexpected end of tag
- sc = 0;
- break;
- } else if (next_byte_[take] == '&') {
- // Copy entity, no advance
- EntityToBuffer(next_byte_ + take, byte_length_ - take,
- script_buffer_ + put, &tlen, &plen);
- sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
- }
- } else {
- // Real letter, safely copy up to 4 bytes, increment by 1..4
- // Will update by 1..4 bytes at Advance, below
- tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);
- if (take < (byte_length_ - 3)) {
- // Fast case: copies 4 bytes even when plen < 4. NOTE(review): type-punned, possibly unaligned uint32 access
- *reinterpret_cast<uint32*>(script_buffer_ + put) =
- *reinterpret_cast<const uint32*>(next_byte_ + take);
- } else {
- // Slow case, happens 1-3 times per input document
- memcpy(script_buffer_ + put, next_byte_ + take, plen);
- }
- sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
- }
- // printf("sc(%c)=%d ", next_byte_[take], sc);
- // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);
- // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);
-
- // Allow continue across a single letter in a different script:
- // A B D = three scripts, c = common script, i = inherited script,
- // - = don't care, ( = take position before the += below
- // AAA(A- continue
- //
- // AAA(BA continue
- // AAA(BB break
- // AAA(Bc continue (breaks after B)
- // AAA(BD break
- // AAA(Bi break
- //
- // AAA(c- break
- //
- // AAA(i- continue
- //
-
- if ((sc != spanscript) && (sc != ULScript_Inherited)) {
- // Might need to break this script span
- if (sc == ULScript_Common) {
- need_break = true;
- } else {
- // Look at next following character, ignoring entity as Common. NOTE(review): reads at take + tlen with no check against byte_length_
- int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);
- if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
- need_break = true;
- }
- }
- }
- if (need_break) {break;} // Non-letter or letter in wrong script
-
- take += tlen; // Advance
- put += plen; // Advance
- ++letter_count;
- if (put >= getone::kMaxScriptBytes) {
- // Buffer is full
- span->truncated = true;
- break;
- }
- } // End while letters
-
- // Do run of non-letters (tag | &NL | NL)*
- while (take < byte_length_) {
- // Do fast scan to next interesting byte
- take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
-
- // Check for no more letters/specials
- if (take >= byte_length_) {
- take = byte_length_;
- break;
- }
-
- // We are at a letter, nonletter, tag, or entity
- if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
- // printf("\"%c\" ", next_byte_[take]);
- if (next_byte_[take] == '<') {
- // Beginning of tag; skip to end and go around again
- tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);
- sc = 0;
- // printf("<...> ");
- } else if (next_byte_[take] == '>') {
- // Unexpected end of tag; skip it and go around again
- tlen = 1; // Over the >
- sc = 0;
- // printf("..> ");
- } else if (next_byte_[take] == '&') {
- // Expand entity, no advance (the bytes written at script_buffer_ + put are scratch: put is not advanced in this loop)
- EntityToBuffer(next_byte_ + take, byte_length_ - take,
- script_buffer_ + put, &tlen, &plen);
- sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);
- }
- } else {
- // Update 1..4
- tlen = cld_UniLib::OneCharLen(next_byte_ + take);
- sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);
- }
- // printf("sc[%c]=%d ", next_byte_[take], sc);
- if (sc != 0) {break;} // Letter found
- take += tlen; // Advance
- } // End while not-letters
-
- script_buffer_[put++] = ' ';
-
- // We are at a letter again (or eos), after letter* not-letter*
- if (sc != spanscript) {break;} // Letter in wrong script
- if (put >= getone::kMaxScriptBytes - 8) {
- // Buffer is almost full
- span->truncated = true;
- break;
- }
- }
-
- // Update input position
- next_byte_ += take;
- byte_length_ -= take;
-
- // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
- // kMaxScriptBytes | | put
- script_buffer_[put + 0] = ' ';
- script_buffer_[put + 1] = ' ';
- script_buffer_[put + 2] = ' ';
- script_buffer_[put + 3] = '\0';
-
- span->text_bytes = put; // Does not include the last four chars above
-
- // printf(" %d]]\n\n", put);
- return true;
-}
-
-// Force Latin, Cyrillic, Greek scripts to be lowercase
-void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {
- // On Windows, text is lowercased beforehand, so no need to do anything here.
-#if !defined(CLD_WINDOWS)
- // If needed, lowercase all the text. If we do it sooner, might miss
- // lowercasing an entity such as &Aacute; (which expands to an uppercase letter)
- // We only need to do this for Latn, Cyrl and Grek scripts (the three checked below)
- if ((span->script == ULScript_Latin) ||
- (span->script == ULScript_Cyrillic) ||
- (span->script == ULScript_Greek)) {
- // Full Unicode lowercase of the entire buffer, including
- // four pad bytes off the end
- int consumed, filled;
- UniLib::ToLower(span->text, span->text_bytes + 4,
- script_buffer_lower_, getone::kMaxScriptLowerBuffer,
- &consumed, &filled);
- span->text = script_buffer_lower_; // span now points at the lowercased copy
- span->text_bytes = filled - 4; // exclude the four pad bytes again
- }
-#endif
-}
-
-// Copy next run of same-script non-tag letters to buffer [NUL terminated]
-// Force Latin, Cyrillic and Greek scripts to be lowercase (via LowerScriptSpan); returns GetOneScriptSpan's result
-bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {
- bool ok = GetOneScriptSpan(span);
- LowerScriptSpan(span); // called even when ok is false
- return ok;
-}
-
-// Gets lscript number for the single character starting at src (length taken from cld_UniLib::OneCharLen); always returns
-// 0 (common script) for non-letters
-int getone::GetUTF8LetterScriptNum(const char* src) {
- int srclen = cld_UniLib::OneCharLen(src);
- const uint8* usrc = reinterpret_cast<const uint8*>(src);
- return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen); // property lookup in the letter-script table object
-}
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h" +#include <stdio.h> +#include <string.h> + +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propjustletter.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propletterscriptnum.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8scannotjustletterspecial.h" + +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h" +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h" + +static const Language GRAY_LANG = (Language)254; + +static const int kMaxUpToWordBoundary = 50; // span < this make longer, + // else make shorter +static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes + // to round to word boundary, + // direction above + +static const char kSpecialSymbol[256] = { // true for < > & + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + + 
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, +}; + + + +#define LT 0 // < +#define GT 1 // > +#define EX 2 // ! +#define HY 3 // - +#define QU 4 // " +#define AP 5 // ' +#define SL 6 // / +#define S_ 7 +#define C_ 8 +#define R_ 9 +#define I_ 10 +#define P_ 11 +#define T_ 12 +#define Y_ 13 +#define L_ 14 +#define E_ 15 +#define CR 16 // <cr> or <lf> +#define NL 17 // non-letter: ASCII whitespace, digit, punctuation +#define PL 18 // possible letter, incl. & +#define xx 19 // <unused> + +// Map byte to one of ~20 interesting categories for cheap tag parsing +static const uint8 kCharToSub[256] = { + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, + + PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, + P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, + PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, + P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, + + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, + + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, + PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, +}; + +#undef LT +#undef GT +#undef EX +#undef HY +#undef QU +#undef AP +#undef SL +#undef S_ +#undef C_ +#undef R_ +#undef I_ +#undef P_ +#undef T_ +#undef Y_ +#undef L_ +#undef E_ +#undef CR +#undef NL +#undef PL +#undef xx + + +#define OK 0 +#define X_ 1 + +// State machine to do cheap 
parse of non-letter strings incl. tags +// advances <tag> +// | | +// advances <tag> ... </tag> for <script> <style> +// | | +// advances <!-- ... <tag> ... --> +// | | +// advances <tag +// || (0) +// advances <tag <tag2> +// || (0) +static const uint8 kTagParseTbl_0[] = { +// < > ! - " ' / S C R I P T Y L E CR NL PL xx + 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK + X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error + 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* + X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < + X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! + X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- + 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* + 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- + 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* + 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" + 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' + X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' + +// < > ! 
- " ' / S C R I P T Y L E CR NL PL xx + X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S + X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC + X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP + X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT + 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* + 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< + 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</ + 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S + 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC + 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR + 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI + 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP + 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT + +// < > ! 
- " ' / S C R I P T Y L E CR NL PL xx + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY + X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL + X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE + 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* + 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< + 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</ + 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY + 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL + 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE +}; + +#undef OK +#undef X_ + + +/* +// Convert GetTimeOfDay output to 64-bit usec +static inline uint64 Microseconds(const struct timeval& t) { + // The SumReducer uses uint64, so convert to (uint64) microseconds, + // not (double) seconds. 
+ return t.tv_sec * 1000000ULL + t.tv_usec; +} +*/ + + +// Returns true if character is < > or & +bool inline IsSpecial(char c) { + if ((c & 0xe0) == 0x20) { + return kSpecialSymbol[static_cast<uint8>(c)]; + } + return false; +} + +// Quick Skip to next letter or < > & or to end of string (eos) +// Always return is_letter for eos +int ScanToLetterOrSpecial(const char* src, int len) { + int bytes_consumed; + cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len, + &bytes_consumed); + return bytes_consumed; +} + + + +// src points to non-letter, such as tag-opening '<' +// Return length from here to next possible letter +// On eos or another < before >, return 1 +// advances <tag> +// | | +// advances <tag> ... </tag> for <script> <style> +// | | +// advances <!-- ... <tag> ... --> +// | | +// advances <tag +// || (1) +// advances <tag <tag2> +// || (1) +int ScanToPossibleLetter(const char* isrc, int len) { + const uint8* src = reinterpret_cast<const uint8*>(isrc); + const uint8* srclimit = src + len; + const uint8* tagParseTbl = kTagParseTbl_0; + int e = 0; + while (src < srclimit) { + e = tagParseTbl[kCharToSub[*src++]]; + if ((e & ~1) == 0) { + // We overshot by one byte + --src; + break; + } + tagParseTbl = &kTagParseTbl_0[e * 20]; + } + + if (src >= srclimit) { + // We fell off the end of the text. + // It looks like the most common case for this is a truncated file, not + // mismatched angle brackets. 
So we pretend that the last char was '>' + return len; + } + + // OK to be in state 0 or state 2 at exit + if ((e != 0) && (e != 2)) { + // Error, '<' followed by '<' + // We want to back up to first <, then advance by one byte past it + int offset = src - reinterpret_cast<const uint8*>(isrc); + // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc); + + // Backscan to first '<' and return enough length to just get past it + --offset; // back up over the second '<', which caused us to stop + while ((0 < offset) && (isrc[offset] != '<')) { + // Find the first '<', which is unmatched + --offset; + } + // skip to just beyond first '<' + // printf(" returning %d\n", offset + 1); + return offset + 1; + } + + return src - reinterpret_cast<const uint8*>(isrc); +} + + + +ScriptScanner::ScriptScanner(const char* buffer, + int buffer_length, + bool is_plain_text) + : start_byte_(buffer), + next_byte_(buffer), + next_byte_limit_(buffer + buffer_length), + byte_length_(buffer_length), + is_plain_text_(is_plain_text) { + script_buffer_ = new char[getone::kMaxScriptBuffer]; + script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer]; +} + +ScriptScanner::~ScriptScanner() { + delete[] script_buffer_; + delete[] script_buffer_lower_; +} + + + + +// Get to the first real non-tag letter or entity that is a letter +// Sets script of that letter +// Return len if no more letters +int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { + int sc = UNKNOWN_LSCRIPT; + int skip = 0; + int tlen, plen; + + // Do run of non-letters (tag | &NL | NL)* + while (skip < len) { + // Do fast scan to next interesting byte + // int oldskip = skip; + skip += ScanToLetterOrSpecial(src + skip, len - skip); + // TEMP + // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n", + // oldskip, src[oldskip], skip, src[skip]); + + // Check for no more letters/specials + if (skip >= len) { + // All done + return len; + } + + // We are at a letter, nonletter, 
tag, or entity + if (IsSpecial(src[skip]) && !is_plain_text_) { + if (src[skip] == '<') { + // Begining of tag; skip to end and go around again + tlen = ScanToPossibleLetter(src + skip, len - skip); + sc = 0; + // printf("<...> "); + } else if (src[skip] == '>') { + // Unexpected end of tag; skip it and go around again + tlen = 1; // Over the > + sc = 0; + // printf("..> "); + } else if (src[skip] == '&') { + // Expand entity, no advance + char temp[4]; + EntityToBuffer(src + skip, len - skip, + temp, &tlen, &plen); + sc = getone::GetUTF8LetterScriptNum(temp); + // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc); + } + } else { + // Update 1..4 bytes + tlen = cld_UniLib::OneCharLen(src + skip); + sc = getone::GetUTF8LetterScriptNum(src + skip); + // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc); + } + // TEMP + // printf("sc=%d ", sc); + if (sc != 0) {break;} // Letter found + skip += tlen; // Advance + } + + *script = sc; + return skip; +} + + + +// Copy next run of same-script non-tag letters to buffer [NUL terminated] +// Buffer has leading space and all text is lowercased +bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) { + span->text = script_buffer_; + span->text_bytes = 0; + span->offset = next_byte_ - start_byte_; + span->script = UNKNOWN_LSCRIPT; + span->lang = UNKNOWN_LANGUAGE; + span->truncated = false; + + // printf("GetOneScriptSpan[[ "); + // struct timeval script_start, script_mid, script_end; + + int spanscript; // The script of this span + int sc = UNKNOWN_LSCRIPT; // The script of next character + int tlen, plen; + + + script_buffer_[0] = ' '; // Always a space at front of output + script_buffer_[1] = '\0'; + int take = 0; + int put = 1; // Start after the initial space + + // gettimeofday(&script_start, NULL); + // Get to the first real non-tag letter or entity that is a letter + int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); + next_byte_ += skip; + byte_length_ -= skip; + if (byte_length_ <= 0) { + // 
printf("]]\n"); + return false; // No more letters to be found + } + + // gettimeofday(&script_mid, NULL); + + // There is at least one letter, so we know the script for this span + // printf("{%d} ", spanscript); + span->script = (UnicodeLScript)spanscript; + + + // Go over alternating spans of same-script letters and non-letters, + // copying letters to buffer with single spaces for each run of non-letters + while (take < byte_length_) { + // Copy run of letters in same script (&LS | LS)* + int letter_count = 0; // Keep track of word length + bool need_break = false; + while (take < byte_length_) { + // We are at a letter, nonletter, tag, or entity + if (IsSpecial(next_byte_[take]) && !is_plain_text_) { + // printf("\"%c\" ", next_byte_[take]); + if (next_byte_[take] == '<') { + // Begining of tag + sc = 0; + break; + } else if (next_byte_[take] == '>') { + // Unexpected end of tag + sc = 0; + break; + } else if (next_byte_[take] == '&') { + // Copy entity, no advance + EntityToBuffer(next_byte_ + take, byte_length_ - take, + script_buffer_ + put, &tlen, &plen); + sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); + } + } else { + // Real letter, safely copy up to 4 bytes, increment by 1..4 + // Will update by 1..4 bytes at Advance, below + tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take); + if (take < (byte_length_ - 3)) { + // Fast case + *reinterpret_cast<uint32*>(script_buffer_ + put) = + *reinterpret_cast<const uint32*>(next_byte_ + take); + } else { + // Slow case, happens 1-3 times per input document + memcpy(script_buffer_ + put, next_byte_ + take, plen); + } + sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); + } + // printf("sc(%c)=%d ", next_byte_[take], sc); + // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen); + // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc); + + // Allow continue across a single letter in a different script: + // A B D = three scripts, c = common script, i = inherited script, + // - = don't care, ( = 
take position before the += below + // AAA(A- continue + // + // AAA(BA continue + // AAA(BB break + // AAA(Bc continue (breaks after B) + // AAA(BD break + // AAA(Bi break + // + // AAA(c- break + // + // AAA(i- continue + // + + if ((sc != spanscript) && (sc != ULScript_Inherited)) { + // Might need to break this script span + if (sc == ULScript_Common) { + need_break = true; + } else { + // Look at next following character, ignoring entity as Common + int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen); + if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { + need_break = true; + } + } + } + if (need_break) {break;} // Non-letter or letter in wrong script + + take += tlen; // Advance + put += plen; // Advance + ++letter_count; + if (put >= getone::kMaxScriptBytes) { + // Buffer is full + span->truncated = true; + break; + } + } // End while letters + + // Do run of non-letters (tag | &NL | NL)* + while (take < byte_length_) { + // Do fast scan to next interesting byte + take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); + + // Check for no more letters/specials + if (take >= byte_length_) { + take = byte_length_; + break; + } + + // We are at a letter, nonletter, tag, or entity + if (IsSpecial(next_byte_[take]) && !is_plain_text_) { + // printf("\"%c\" ", next_byte_[take]); + if (next_byte_[take] == '<') { + // Begining of tag; skip to end and go around again + tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take); + sc = 0; + // printf("<...> "); + } else if (next_byte_[take] == '>') { + // Unexpected end of tag; skip it and go around again + tlen = 1; // Over the > + sc = 0; + // printf("..> "); + } else if (next_byte_[take] == '&') { + // Expand entity, no advance + EntityToBuffer(next_byte_ + take, byte_length_ - take, + script_buffer_ + put, &tlen, &plen); + sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); + } + } else { + // Update 1..4 + tlen = cld_UniLib::OneCharLen(next_byte_ + take); + sc = 
getone::GetUTF8LetterScriptNum(next_byte_ + take); + } + // printf("sc[%c]=%d ", next_byte_[take], sc); + if (sc != 0) {break;} // Letter found + take += tlen; // Advance + } // End while not-letters + + script_buffer_[put++] = ' '; + + // We are at a letter again (or eos), after letter* not-letter* + if (sc != spanscript) {break;} // Letter in wrong script + if (put >= getone::kMaxScriptBytes - 8) { + // Buffer is almost full + span->truncated = true; + break; + } + } + + // Update input position + next_byte_ += take; + byte_length_ -= take; + + // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 + // kMaxScriptBytes | | put + script_buffer_[put + 0] = ' '; + script_buffer_[put + 1] = ' '; + script_buffer_[put + 2] = ' '; + script_buffer_[put + 3] = '\0'; + + span->text_bytes = put; // Does not include the last four chars above + + // printf(" %d]]\n\n", put); + return true; +} + +// Force Latin, Cyrillic, Greek scripts to be lowercase +void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) { + // On Windows, text is lowercased beforehand, so no need to do anything here. +#if !defined(CLD_WINDOWS) + // If needed, lowercase all the text. 
If we do it sooner, might miss + // lowercasing an entity such as Á + // We only need to do this for Latn and Cyrl scripts + if ((span->script == ULScript_Latin) || + (span->script == ULScript_Cyrillic) || + (span->script == ULScript_Greek)) { + // Full Unicode lowercase of the entire buffer, including + // four pad bytes off the end + int consumed, filled; + UniLib::ToLower(span->text, span->text_bytes + 4, + script_buffer_lower_, getone::kMaxScriptLowerBuffer, + &consumed, &filled); + span->text = script_buffer_lower_; + span->text_bytes = filled - 4; + } +#endif +} + +// Copy next run of same-script non-tag letters to buffer [NUL terminated] +// Force Latin and Cyrillic scripts to be lowercase +bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) { + bool ok = GetOneScriptSpan(span); + LowerScriptSpan(span); + return ok; +} + +// Gets lscript number for letters; always returns +// 0 (common script) for non-letters +int getone::GetUTF8LetterScriptNum(const char* src) { + int srclen = cld_UniLib::OneCharLen(src); + const uint8* usrc = reinterpret_cast<const uint8*>(src); + return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen); +} |