summaryrefslogtreecommitdiffstats
path: root/third_party/cld/encodings/lang_enc.h
blob: 0d2a4f19dd12914f466c42e0db544e963fb3b507 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is for i18n. It works with two enums, namely Language and
// Encoding (not defined here; presumably declared in the headers
// included below), where Language is the linguistic convention, and
// Encoding contains information on both language encoding and
// character set.
//
// The language and encoding are both based on Teragram's conventions,
// except for some common ISO-8859 encodings that are not detected by
// Teragram but might be in the future.
//
// This file also declares functions that map among Language/Encoding
// enums, language/encoding string names (typically the output from
// the Language Encoding identifier), language codes (ISO 639), and
// two-letter country codes (ISO 3166).
//
// NOTE: Both Language and Encoding enums should always start from
// zero value. This assumption has been made and used.
//

#ifndef ENCODINGS_LANG_ENC_H__
#define ENCODINGS_LANG_ENC_H__

#include "languages/public/languages.h"
#include "encodings/public/encodings.h"


// EncodingsForLanguage
// --------------------
//
// Given the language, returns a pointer to an array of the encodings
// this language supports. Typically, the returned array has at least
// one element: UNKNOWN_ENCODING, which is always the last element of
// the array. The first encoding is the default encoding of the
// language. Returns NULL if the input is invalid.
//
// Note: The output encoding array does not include ASCII_7BIT, UTF8
// or UNICODE, which are good for all languages. TODO: Find out whether
// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
// as special cases.
//
const Encoding* EncodingsForLanguage(Language lang);


// DefaultEncodingForLanguage
// --------------------------
//
// Given the language, returns the default encoding for the language
// via the output argument encoding.
//
// The function returns true if the input lang is valid. Otherwise,
// false is returned, and *encoding is set to UNKNOWN_ENCODING.
//
bool DefaultEncodingForLanguage(Language lang,
                                Encoding *encoding);

// LanguagesForEncoding
// --------------------
//
// Given the encoding, returns a pointer to an array of the languages
// this encoding supports. Typically, the returned array has at least
// one element: UNKNOWN_LANGUAGE, which is always the last element of
// the array. The first language in the array is the most popular
// language for that encoding. NULL is returned if the input is
// invalid.
//
// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
// the languages or to treat these encodings as special cases.
//
// For other known encodings, ENGLISH is always included. This is
// because English (Latin) characters are included in each encoding.
//
const Language* LanguagesForEncoding(Encoding enc);

// DefaultLanguageForEncoding
// --------------------------
//
// Given the encoding, returns the default language for that encoding
// via the output argument language.
//
// The function returns true if the input enc is valid. Otherwise,
// false is returned, and *language is set to UNKNOWN_LANGUAGE.
//
// Note: this function is more useful for the encodings that have only
// one corresponding language, e.g. shift_jis => Japanese. There are
// cases where multiple languages share the same encoding, for which
// the default language is an arbitrary choice among them.
//
bool DefaultLanguageForEncoding(Encoding enc, Language* language);

//
// IsLangEncCompatible
// -------------------
//
// This function determines whether the input language and encoding
// are compatible. For example, FRENCH and LATIN1 are compatible, but
// FRENCH and GB are not.
//
// If either lang or enc is invalid, return false.
// If lang is unknown, return true.
//    (e.g. we can detect a page's encoding as Latin1 from metatag info,
//     but cannot derive its language, since more than one language is
//     encoded in Latin1)
// If language is known, but encoding is unknown, return false.
//    (returning true would do us no good, since we cannot convert to
//     UTF8 anyway)
// If enc is unicode or utf8, return true.
// Otherwise check if lang is supported by enc and enc supported by
// lang.
//
bool IsLangEncCompatible(Language lang, Encoding enc);

//
// DominantLanguageFromEncoding
// ----------------------------
//
// This function determines whether there exists a dominant language
// for the input encoding, and returns it. For example, the encoding GB
// has a dominant language (Chinese), but Latin1 does not.
//
// The word "dominant" is used here because English characters are
// included in each encoding.
//
// If there is no dominant language for the encoding, such as Latin1,
// UNKNOWN_LANGUAGE is returned.
//
Language DominantLanguageFromEncoding(Encoding enc);

// LanguageCode
// ------------
//
// Given the Language and Encoding, returns the language code with
// dialects (>= 2 letters). The Encoding is necessary to disambiguate
// between Simplified and Traditional Chinese.
//
// See the note on Chinese Language Codes in
// i18n/languages/public/languages.h
// for the details.

const char* LanguageCode(Language lang, Encoding enc);

//
// IsEncodingWithSupportedLanguage()
// ---------------------------------
//
// Some encodings are listed here just because they are commonly
// used. There is no interface language for them yet. They are not
// detected by Teragram, but can be detected from the meta info of the
// HTML page.
//
// For example, we list ARABIC_ENCODING but there is no Arabic in
// the Language enum. If the user inputs an Arabic query from the Google
// main page, Netscape will just send the raw bytes to GWS, and GWS
// will treat them as Latin1. Therefore, there is no use in detecting
// ARABIC_ENCODING for indexing, since such pages will never match the
// queries which are treated as Latin1 by GWS. On the contrary, if we
// treat a page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
// let it fall through as Latin1 at indexing time. And there might be a
// match for some Arabic queries which are also treated as Latin1 by
// GWS. In fact, some people are relying on this feature to do Arabic
// searches.
//
// Thus for these types of encodings, until we have UI support for
// their language and a pretty comprehensive language/encoding
// identification quality, it is better to treat them as
// UNKNOWN_ENCODING.
//
// This function checks whether the input encoding is one with
// an interface language.
bool IsEncodingWithSupportedLanguage(Encoding enc);


//
// LangsFromCountryCode and EncFromCountryCode
// -------------------------------------------
//
// These two functions return the possible languages and encodings,
// respectively, according to the input country code, which is a
// 2-letter string. The country code is usually specified in the url
// of a document.
//
//

// LangsFromCountryCode
// --------------------
//
// This function takes a string of arbitrary length. It treats the
// first 2 bytes of the string as the country code, as defined in ISO
// 3166-1993 (E). It returns, via the output arguments lang_arry and
// num_langs, an array of the languages that are popular in that
// country, roughly in order of popularity, together with the size of
// the array.
//
// This function returns true if we have language information for
// country_code. Otherwise, it returns false.
//
bool LangsFromCountryCode(const char* country_code,
                          const Language** lang_arry,
                          int* num_langs);


//
// EncFromCountryCode
// ------------------
//
// This function takes a string of arbitrary length. It treats the
// first 2 bytes of that string as the country code, as defined in ISO
// 3166-1993 (E). It sets *enc to the encoding that is most often used
// for the languages spoken in that country.
//
// This function returns true if we have encoding information for
// country_code. Otherwise, it returns false, and *enc is set to
// UNKNOWN_ENCODING.
//
bool EncFromCountryCode(const char* country_code, Encoding* enc);



// VisualType
// ----------
//
// Describes whether a document is stored in visual order. Right-to-left
// documents may be in logical or visual order; those in visual order are
// converted to logical order before processing. This enum lists the
// kinds of visual document we can encounter. Some, but not all,
// documents in Hebrew/Arabic/Persian etc. will be visual. The other
// documents in those languages, and all documents in non-RTL languages,
// will be NOT_VISUAL_DOCUMENT.
enum VisualType {
  // Not a visual-order document.
  NOT_VISUAL_DOCUMENT = 0,
  // HTML documents in the legacy visual order.
  VISUAL_HEBREW_HTML = 1,
  // Converted RTL PDFs, which are always visual.
  CONVERTED_RTL_PDF = 2
};

// default_visualtype
// ------------------
//
// Takes no arguments and returns a VisualType value.
// NOTE(review): judging by the name, this is presumably the default
// visual type (likely NOT_VISUAL_DOCUMENT); the definition is not in
// this header, so confirm against the implementation.
VisualType default_visualtype();

// VisualTypeName
// --------------
//
// Given the visual type, returns its name as a string, useful for
// debug output.
const char* VisualTypeName(VisualType visualtype);



// InitLangEnc
// -----------
//
// Ensures the LangEnc module has been initialized. Normally this
// happens during InitGoogle, but this function allows access for
// scripts that don't support InitGoogle. InitLangEnc calls
// InitEncodings (see i18n/encodings/public/encodings.h) and also
// initializes the data structures used in lang_enc.cc.
//
void InitLangEnc();

#endif  // ENCODINGS_LANG_ENC_H__