// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // This file is for i18n. It contains two enums, namely Language and // Encoding, where Language is the linguistic convention, and Encoding // contains information on both language encoding and character set. // // The language and encoding are both based on Teragram's conventions, // except for some common ISO-8859 encodings that are not detected by // Teragram but might be in the future. // // This file also includes functions that do mappings among // Language/Encoding enums, language/encoding string names (typically // the output from Language Encoding identifier), and language codes // (iso 639), and two-letter country codes (iso 3166) // // NOTE: Both Language and Encoding enums should always start from // zero value. This assumption has been made and used. // #ifndef ENCODINGS_LANG_ENC_H__ #define ENCODINGS_LANG_ENC_H__ #include "languages/public/languages.h" #include "encodings/public/encodings.h" // EncodingsForLanguage // -------------------- // // Given the language, returns a pointer to an array of encodings this // language supports. Typically, the encs array has at least one // element: UNKNOWN_ENCODING, which is always the last element of the // array. The first encoding is the default encoding of the language. // Return NULL if the input is invalid. // // Note: The output encoding array does not include ASCII_7BIT, UTF8 // or UNICODE which are good for all languages. TODO: Find out whether // it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them // as special cases. // const Encoding* EncodingsForLanguage(Language lang); // DefaultEncodingForLanguage // -------------------------- // // Given the language, returns the default encoding for the language // via the argument encoding. // // The function returns true if the input lang is valid. Otherwise, // false is returned, and encoding is set to UNKNOWN_ENCODING. // bool DefaultEncodingForLanguage(Language lang, Encoding *encoding); // LanguagesForEncoding // -------------------- // // Given the encoding, returns a pointer to an array of languages this // encoding supports. Typically, the langs array has at least one // element: UNKNOWN_LANGUAGE, which is always the last element of the // array. The first language in the array if the most popular // language for that encoding. NULL is returned if the input is // invalid. // // Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and // UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all // the languages or to treat these two encodings as special cases. // // For other known encodings, ENGLISH is always included. This is // because English (Latin) characters are included in each encoding. // const Language* LanguagesForEncoding(Encoding enc); // DefaultLanguageForEncoding // -------------------------- // // Given the encoding, returns the default language for that encoding // via the argument language. // // The function returns true if the input enc is valid. Otherwise, // false is returned, and language is set to UNKNOWN_LANGUAGE. // // Note, this function is more useful for the encodings that have only // one corresponding language i.e. shift_jis => Japanese. There are // cases that multiple langauges have the same encoding, for which the // default language is an arbitrary choice from them. // bool DefaultLanguageForEncoding(Encoding enc, Language* language); // // IsLangEncCompatible // ------------------- // // This function is to determine whether the input language and // encoding are compatible. For example, FRENCH and LATIN1 are // compatible, but FRENCH and GB are not. // // If either lang or enc is invalid return false. // If either lang is unknown, return true. // (e.g. we can detect a page's encoding as latin1 from metatag info, but // cannot derive it language since there are more than one // language encoding in Latin1 ) // If language is known, but encoding is unknown, return false. // (return true will do us no good since we cannot convert to UTF8 anyway) // If enc is unicode or utf8, return true. // Otherwise check if lang is supported by enc and enc supported by // lang. // bool IsLangEncCompatible(Language lang, Encoding enc); // // DominantLanguageFromEncoding // ---------------------------- // // This function determine if there exists a dominant language for the // input encoding. For example, the encoding GB has a dominant // language (Chinese), but Latin1 does not. // // The word "dominant" is used here because English characters are // included in each encoding. // // If there is no dominant langauge for the encoding, such as Latin1, // UNKNOWN_LANGUAGE is returned. // Language DominantLanguageFromEncoding(Encoding enc); // LanguageCode // ------------------------ // Given the Language and Encoding, return language code with dialects // (>= 2 letters). Encoding is necessary to disambiguate between // Simplified and Traditional Chinese. // // See the note on Chinese Language Codes in // i18n/languages/public/languages.h // for the details. const char* LanguageCode(Language lang, Encoding enc); // // IsEncodingWithSupportedLanguage() // --------------------------------- // // There are some encoding listed here just because they are commonly // used. There is no interface language for them yet. They are not // detected by Teragram, but can be detected from the meta info of the // HTML page. // // For example, we have list ARABIC_ENCODING but there is no arabic in // the Language enum. If the user input an Arabic query from Google // main page, Netscape will just send the raw bytes to GWS, and GWS // will treat them as Latin1. Therefore, there is no use to detect // ARABIC_ENCODING for indexing, since they will never match the // queries which are treated as Latin1 by GWS. On the contrary, if we // treat page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will // fall them through as Latin1 in indexing time. And there might be a // match for some ARABIC queries which are also treated as Latin1 by // GWS. In fact, some people are relying on this feature to do Arabic // searches. // // Thus for these type of encoding, before we have the UI support for // their language and have a pretty comprehensive language/encoding // identification quality, it is better to revert them as // UNKNOWN_ENCODING. // // This function checks whether the input encoding is one with // an interface language. bool IsEncodingWithSupportedLanguage(Encoding enc); // // LangsFromCountryCode and EncFromCountryCode // ------------------------------------------- // // These two functions return the possible languages and encodings, // respectively, according to the input country code, which is a // 2-letter string. The country code is usually specified in the url // of a document. // // // LangsFromCountryCode // -------------------- // // This function takes a string of arbitrary length. It treats the // first 2 bytes of the string as the country code, as defined in iso // 3166-1993 (E). It returns, via arguments, an array of the // languages that are popular in that country, roughly in order of // popularity, together with the size of the array. // // This function returns true if we have language information for // country_code. Otherwise, it returns false. // bool LangsFromCountryCode(const char* country_code, const Language** lang_arry, int* num_langs); // // EncFromCountryCode // ------------------ // // This function takes a string of arbitrary length. It treats the // first 2 bytes of that string as the country code, as defined in iso // 3166-1993 (E). It sets *enc to the encoding that is // most often used for the languages spoken in that country. // // This function returns true if we have encoding information for // country_code. Otherwise, it returns false, and *enc is set to // UNKNOWN_ENCODING. // bool EncFromCountryCode(const char* country_code, Encoding* enc); // VisualType // ---------- // // Right-to-left documents may be in logical or visual order. When they // are in visual order we convert them to logical order before processing. // This enum lists the types of visual document we can encounter. // Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual. // The other documents in those languages, and all documents in non-RTL // languages, will be NOT_VISUAL_DOCUMENT. enum VisualType { NOT_VISUAL_DOCUMENT = 0, VISUAL_HEBREW_HTML, // HTML documents in the legacy visual order. CONVERTED_RTL_PDF, // Converted RTL PDFs, which are always visual. }; VisualType default_visualtype(); // VisualTypeName // -------------- // // Given the visual type, returns a string name useful for debug output. const char* VisualTypeName(VisualType visualtype); // InitLangEnc // ----------- // // Ensures the LangEnc module has been initialized. Normally this // happens during InitGoogle, but this allows access for scripts that // don't support InitGoogle. InitLangEnc calls InitEncodings (see // i18n/encodings/public/encodings.h) and also initializes data // structures used in lang_enc.cc. // void InitLangEnc(); #endif // ENCODINGS_LANG_ENC_H__