1// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// This file is for i18n. It contains two enums, namely Language and 6// Encoding, where Language is the linguistic convention, and Encoding 7// contains information on both language encoding and character set. 8// 9// The language and encoding are both based on Teragram's conventions, 10// except for some common ISO-8859 encodings that are not detected by 11// Teragram but might be in the future. 12// 13// This file also includes functions that do mappings among 14// Language/Encoding enums, language/encoding string names (typically 15// the output from Language Encoding identifier), and language codes 16// (iso 639), and two-letter country codes (iso 3166) 17// 18// NOTE: Both Language and Encoding enums should always start from 19// zero value. This assumption has been made and used. 20// 21 22#ifndef ENCODINGS_LANG_ENC_H__ 23#define ENCODINGS_LANG_ENC_H__ 24 25#include "languages/public/languages.h" 26#include "encodings/public/encodings.h" 27 28 29// EncodingsForLanguage 30// -------------------- 31// 32// Given the language, returns a pointer to an array of encodings this 33// language supports. Typically, the encs array has at least one 34// element: UNKNOWN_ENCODING, which is always the last element of the 35// array. The first encoding is the default encoding of the language. 36// Return NULL if the input is invalid. 37// 38// Note: The output encoding array does not include ASCII_7BIT, UTF8 39// or UNICODE which are good for all languages. TODO: Find out whether 40// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them 41// as special cases. 42// 43const Encoding* EncodingsForLanguage(Language lang); 44 45 46// DefaultEncodingForLanguage 47// -------------------------- 48// 49// Given the language, returns the default encoding for the language 50// via the argument encoding. 51// 52// The function returns true if the input lang is valid. Otherwise, 53// false is returned, and encoding is set to UNKNOWN_ENCODING. 54// 55bool DefaultEncodingForLanguage(Language lang, 56 Encoding *encoding); 57 58// LanguagesForEncoding 59// -------------------- 60// 61// Given the encoding, returns a pointer to an array of languages this 62// encoding supports. Typically, the langs array has at least one 63// element: UNKNOWN_LANGUAGE, which is always the last element of the 64// array. The first language in the array if the most popular 65// language for that encoding. NULL is returned if the input is 66// invalid. 67// 68// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and 69// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all 70// the languages or to treat these two encodings as special cases. 71// 72// For other known encodings, ENGLISH is always included. This is 73// because English (Latin) characters are included in each encoding. 74// 75const Language* LanguagesForEncoding(Encoding enc); 76 77// DefaultLanguageForEncoding 78// -------------------------- 79// 80// Given the encoding, returns the default language for that encoding 81// via the argument language. 82// 83// The function returns true if the input enc is valid. Otherwise, 84// false is returned, and language is set to UNKNOWN_LANGUAGE. 85// 86// Note, this function is more useful for the encodings that have only 87// one corresponding language i.e. shift_jis => Japanese. There are 88// cases that multiple langauges have the same encoding, for which the 89// default language is an arbitrary choice from them. 90// 91bool DefaultLanguageForEncoding(Encoding enc, Language* language); 92 93// 94// IsLangEncCompatible 95// ------------------- 96// 97// This function is to determine whether the input language and 98// encoding are compatible. For example, FRENCH and LATIN1 are 99// compatible, but FRENCH and GB are not. 100// 101// If either lang or enc is invalid return false. 102// If either lang is unknown, return true. 103// (e.g. we can detect a page's encoding as latin1 from metatag info, but 104// cannot derive it language since there are more than one 105// language encoding in Latin1 ) 106// If language is known, but encoding is unknown, return false. 107// (return true will do us no good since we cannot convert to UTF8 anyway) 108// If enc is unicode or utf8, return true. 109// Otherwise check if lang is supported by enc and enc supported by 110// lang. 111// 112bool IsLangEncCompatible(Language lang, Encoding enc); 113 114// 115// DominantLanguageFromEncoding 116// ---------------------------- 117// 118// This function determine if there exists a dominant language for the 119// input encoding. For example, the encoding GB has a dominant 120// language (Chinese), but Latin1 does not. 121// 122// The word "dominant" is used here because English characters are 123// included in each encoding. 124// 125// If there is no dominant langauge for the encoding, such as Latin1, 126// UNKNOWN_LANGUAGE is returned. 127// 128Language DominantLanguageFromEncoding(Encoding enc); 129 130// LanguageCode 131// ------------------------ 132// Given the Language and Encoding, return language code with dialects 133// (>= 2 letters). Encoding is necessary to disambiguate between 134// Simplified and Traditional Chinese. 135// 136// See the note on Chinese Language Codes in 137// i18n/languages/public/languages.h 138// for the details. 139 140const char* LanguageCode(Language lang, Encoding enc); 141 142// 143// IsEncodingWithSupportedLanguage() 144// --------------------------------- 145// 146// There are some encoding listed here just because they are commonly 147// used. There is no interface language for them yet. They are not 148// detected by Teragram, but can be detected from the meta info of the 149// HTML page. 150// 151// For example, we have list ARABIC_ENCODING but there is no arabic in 152// the Language enum. If the user input an Arabic query from Google 153// main page, Netscape will just send the raw bytes to GWS, and GWS 154// will treat them as Latin1. Therefore, there is no use to detect 155// ARABIC_ENCODING for indexing, since they will never match the 156// queries which are treated as Latin1 by GWS. On the contrary, if we 157// treat page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will 158// fall them through as Latin1 in indexing time. And there might be a 159// match for some ARABIC queries which are also treated as Latin1 by 160// GWS. In fact, some people are relying on this feature to do Arabic 161// searches. 162// 163// Thus for these type of encoding, before we have the UI support for 164// their language and have a pretty comprehensive language/encoding 165// identification quality, it is better to revert them as 166// UNKNOWN_ENCODING. 167// 168// This function checks whether the input encoding is one with 169// an interface language. 170bool IsEncodingWithSupportedLanguage(Encoding enc); 171 172 173// 174// LangsFromCountryCode and EncFromCountryCode 175// ------------------------------------------- 176// 177// These two functions return the possible languages and encodings, 178// respectively, according to the input country code, which is a 179// 2-letter string. The country code is usually specified in the url 180// of a document. 181// 182// 183 184// LangsFromCountryCode 185// -------------------- 186// 187// This function takes a string of arbitrary length. It treats the 188// first 2 bytes of the string as the country code, as defined in iso 189// 3166-1993 (E). It returns, via arguments, an array of the 190// languages that are popular in that country, roughly in order of 191// popularity, together with the size of the array. 192// 193// This function returns true if we have language information for 194// country_code. Otherwise, it returns false. 195// 196bool LangsFromCountryCode(const char* country_code, 197 const Language** lang_arry, 198 int* num_langs); 199 200 201// 202// EncFromCountryCode 203// ------------------ 204// 205// This function takes a string of arbitrary length. It treats the 206// first 2 bytes of that string as the country code, as defined in iso 207// 3166-1993 (E). It sets *enc to the encoding that is 208// most often used for the languages spoken in that country. 209// 210// This function returns true if we have encoding information for 211// country_code. Otherwise, it returns false, and *enc is set to 212// UNKNOWN_ENCODING. 213// 214bool EncFromCountryCode(const char* country_code, Encoding* enc); 215 216 217 218// VisualType 219// ---------- 220// 221// Right-to-left documents may be in logical or visual order. When they 222// are in visual order we convert them to logical order before processing. 223// This enum lists the types of visual document we can encounter. 224// Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual. 225// The other documents in those languages, and all documents in non-RTL 226// languages, will be NOT_VISUAL_DOCUMENT. 227enum VisualType { 228 NOT_VISUAL_DOCUMENT = 0, 229 VISUAL_HEBREW_HTML, // HTML documents in the legacy visual order. 230 CONVERTED_RTL_PDF, // Converted RTL PDFs, which are always visual. 231}; 232 233VisualType default_visualtype(); 234 235// VisualTypeName 236// -------------- 237// 238// Given the visual type, returns a string name useful for debug output. 239const char* VisualTypeName(VisualType visualtype); 240 241 242 243// InitLangEnc 244// ----------- 245// 246// Ensures the LangEnc module has been initialized. Normally this 247// happens during InitGoogle, but this allows access for scripts that 248// don't support InitGoogle. InitLangEnc calls InitEncodings (see 249// i18n/encodings/public/encodings.h) and also initializes data 250// structures used in lang_enc.cc. 251// 252void InitLangEnc(); 253 254#endif // ENCODINGS_LANG_ENC_H__ 255