1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
#include "encodings/compact_lang_det/letterscript_enum.h"
#include "encodings/compact_lang_det/compact_lang_det_impl.h"
namespace getone {
// Base size, in bytes, from which the other script-buffer limits are derived.
// (Presumably the allocation size of ScriptScanner's script buffer -- confirm
// in the implementation file.)
static const int kMaxScriptBuffer = 4096;

// One and a half times kMaxScriptBuffer. (Presumably the size of the
// lowercased-text buffer -- confirm in the implementation file.)
static const int kMaxScriptLowerBuffer =
    kMaxScriptBuffer + (kMaxScriptBuffer / 2);

// Eight bytes less than kMaxScriptBuffer, to leave some room.
static const int kMaxScriptBytes = kMaxScriptBuffer - 8;

// Size of each of LangScanner's debugging answer buffers.
static const int kMaxAnswerBuffer = 256;
// Short alias for the letter-script enumeration.
typedef enum UnicodeLScript ULScript;

// Describes one run of text handed out by the scanners declared in this
// header: where it is, how long it is, and what script and language it was
// assigned.
struct LangSpan {
  // Start of the span's text; the header does not say which buffer it
  // points into.
  char* text;

  // Length of the span's text, in bytes.
  int text_bytes;

  // Where the span starts, as an offset into the original input buffer.
  int offset;

  // The script shared by every letter in the span.
  ULScript script;

  // The language identified for the span.
  Language lang;

  // True when the buffer filled up before a different script or the end of
  // the input was reached.
  bool truncated;
};
// Reports whether c is a UTF-8 continuation byte, i.e. a byte whose two
// high bits are 10 (0x80 through 0xBF).
static inline bool IsContinuationByte(char c) {
  const int top_two_bits = static_cast<unsigned char>(c) & 0xC0;
  return top_two_bits == 0x80;
}
// Returns the lscript (letter script) number for the text at src.
// Letters yield their script number; non-letters always yield
// 0 (common script).
// NOTE(review): presumably src must point at the first byte of a UTF-8
// character -- confirm against the implementation.
int GetUTF8LetterScriptNum(const char* src);
// Advances to the next quadgram, which starts 2 to 5 bytes past src.
// src is passed by value, so the caller's pointer is not modified; the
// advanced position is the return value.
// Looks at src[0..4]
const char* AdvanceQuad(const char* src);
} // end namespace getone
// Hands out successive single-script runs of letters from an input buffer.
// Only declarations are visible in this header; the notes below restate the
// contract given by the header's own comments.
//
// NOTE(review): the class declares a destructor and holds raw char* buffers
// but declares no copy constructor or assignment operator. If the destructor
// frees those buffers, copying a ScriptScanner would be unsafe -- confirm in
// the implementation file.
class ScriptScanner {
 public:
  // buffer / buffer_length: the input text and its length.
  // is_plain_text: presumably false when the input may contain tags and
  // entities -- TODO confirm.
  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
  ~ScriptScanner();

  // Copies the next run of same-script, non-tag letters into a buffer
  // (NUL terminated) and describes that run in *span.
  // NOTE(review): the return value presumably signals whether a span was
  // produced -- confirm.
  bool GetOneScriptSpan(getone::LangSpan* span);

  // Forces Latin and Cyrillic script text in *span to lowercase.
  void LowerScriptSpan(getone::LangSpan* span);

  // Like GetOneScriptSpan: copies the next run of same-script, non-tag
  // letters into a buffer (NUL terminated), and additionally forces Latin
  // and Cyrillic scripts to lowercase.
  bool GetOneScriptSpanLower(getone::LangSpan* span);

 private:
  // Helper; by its name it positions the scan at the start of the next span
  // and reports the script through *script -- confirm in the implementation.
  int SkipToFrontOfSpan(const char* src, int len, int* script);

  // Scan position bookkeeping over the input text.
  const char* start_byte_;
  const char* next_byte_;
  const char* next_byte_limit_;
  int byte_length_;

  bool is_plain_text_;

  // Holds text with expanded entities.
  char* script_buffer_;
  // Holds lowercased text.
  char* script_buffer_lower_;
};
class LangScanner {
public:
LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
int maxlangs, int minlangspan);
~LangScanner();
int script() {return script_;}
// Use new text
// Keep smoothing state if same script, otherwise reinit smoothing
void NewText(getone::LangSpan* spn);
bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping
// The real ones
bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
getone::LangSpan* span);
bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
getone::LangSpan* span);
// Increases language bias by delta
void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
Language key, int delta);
// For debugging output
int next_answer_;
char answer_buffer_[getone::kMaxAnswerBuffer];
char answer_buffer2_[getone::kMaxAnswerBuffer];
char answer_buffer3_[getone::kMaxAnswerBuffer];
char answer_buffer4_[getone::kMaxAnswerBuffer];
private:
const char* start_byte_;
const char* next_byte_limit_;
const char* next_byte_;
const char* onelangspan_begin_;
int byte_length_;
int script_;
Language spanlang_;
int smoothwidth_;
int smoothwidth_2_;
int smoothcandidates_;
int maxlangs_;
int minlangspan_;
int rb_size_;
int next_rb_;
int rb_mask_;
uint32* rb_;
int* offset_rb_;
};
#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
|