1// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// This file is for i18n. It contains two enums, namely Language and
6// Encoding, where Language is the linguistic convention, and Encoding
7// contains information on both language encoding and character set.
8//
9// The language and encoding are both based on Teragram's conventions,
10// except for some common ISO-8859 encodings that are not detected by
11// Teragram but might be in the future.
12//
13// This file also includes functions that do mappings among
14// Language/Encoding enums, language/encoding string names (typically
15// the output from Language Encoding identifier), and language codes
16// (iso 639), and two-letter country codes (iso 3166)
17//
18// NOTE: Both Language and Encoding enums should always start from
19// zero value. This assumption has been made and used.
20//
21
22#ifndef ENCODINGS_LANG_ENC_H__
23#define ENCODINGS_LANG_ENC_H__
24
25#include "languages/public/languages.h"
26#include "encodings/public/encodings.h"
27
28
29// EncodingsForLanguage
30// --------------------
31//
32// Given the language, returns a pointer to an array of the encodings
33// this language supports. Typically, the returned array has at least
34// one element: UNKNOWN_ENCODING, which is always the last element of
35// the array. The first encoding is the default encoding of the language.
36// Returns NULL if the input is invalid.
37//
38// Note: The returned array does not include ASCII_7BIT, UTF8 or
39// UNICODE, which are good for all languages. TODO: Find out whether
40// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
41// as special cases.
42//
43const Encoding* EncodingsForLanguage(Language lang);
44
45
46// DefaultEncodingForLanguage
47// --------------------------
48//
49// Given the language, returns the default encoding for the language
50// via the output argument encoding.
51//
52// The function returns true if the input lang is valid. Otherwise,
53// false is returned, and *encoding is set to UNKNOWN_ENCODING.
54//
55bool DefaultEncodingForLanguage(Language lang,
56                                Encoding *encoding);
57
58// LanguagesForEncoding
59// --------------------
60//
61// Given the encoding, returns a pointer to an array of the languages
62// this encoding supports. Typically, the returned array has at least
63// one element: UNKNOWN_LANGUAGE, which is always the last element of
64// the array. The first language in the array is the most popular
65// language for that encoding. NULL is returned if the input is
66// invalid.
67//
68// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
69// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
70// the languages or to treat these three encodings as special cases.
71//
72// For other known encodings, ENGLISH is always included. This is
73// because English (Latin) characters are included in each encoding.
74//
75const Language* LanguagesForEncoding(Encoding enc);
76
77// DefaultLanguageForEncoding
78// --------------------------
79//
80// Given the encoding, returns the default language for that encoding
81// via the output argument language.
82//
83// The function returns true if the input enc is valid. Otherwise,
84// false is returned, and *language is set to UNKNOWN_LANGUAGE.
85//
86// Note: this function is most useful for encodings that have only
87// one corresponding language, e.g. shift_jis => Japanese. There are
88// cases where multiple languages share the same encoding, for which
89// the default language is an arbitrary choice among them.
90//
91bool DefaultLanguageForEncoding(Encoding enc, Language* language);
92
93//
94// IsLangEncCompatible
95// -------------------
96//
97// This function determines whether the input language and
98// encoding are compatible. For example, FRENCH and LATIN1 are
99// compatible, but FRENCH and GB are not.
100//
101// If either lang or enc is invalid, return false.
102// If lang is unknown, return true.
103//    (e.g. we can detect a page's encoding as Latin1 from metatag info, but
104//     cannot derive its language, since more than one language is
105//     encoded in Latin1)
106// If lang is known but enc is unknown, return false.
107//    (returning true would do us no good since we cannot convert to UTF8 anyway)
108// If enc is unicode or utf8, return true.
109// Otherwise, check whether lang is supported by enc and enc is supported by
110// lang.
111//
112bool IsLangEncCompatible(Language lang, Encoding enc);
113
114//
115// DominantLanguageFromEncoding
116// ----------------------------
117//
118// This function determines whether there exists a dominant language for
119// the input encoding. For example, the encoding GB has a dominant
120// language (Chinese), but Latin1 does not.
121//
122// The word "dominant" is used here because English characters are
123// included in each encoding.
124//
125// If there is no dominant language for the encoding, such as Latin1,
126// UNKNOWN_LANGUAGE is returned.
127//
128Language DominantLanguageFromEncoding(Encoding enc);
129
130// LanguageCode
131// ------------
132// Given the Language and Encoding, returns the language code with dialects
133// (>= 2 letters).  The Encoding is necessary to disambiguate between
134// Simplified and Traditional Chinese.
135//
136// See the note on Chinese Language Codes in
137// i18n/languages/public/languages.h
138// for the details.
139//
140const char* LanguageCode(Language lang, Encoding enc);
141
142//
143// IsEncodingWithSupportedLanguage()
144// ---------------------------------
145//
146// There are some encodings listed here just because they are commonly
147// used.  There is no interface language for them yet. They are not
148// detected by Teragram, but can be detected from the meta info of the
149// HTML page.
150//
151// For example, we list ARABIC_ENCODING but there is no Arabic in
152// the Language enum. If the user inputs an Arabic query from the Google
153// main page, Netscape will just send the raw bytes to GWS, and GWS
154// will treat them as Latin1.  Therefore, there is no use in detecting
155// ARABIC_ENCODING for indexing, since such pages will never match the
156// queries, which are treated as Latin1 by GWS. On the contrary, if we
157// treat pages with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
158// let them fall through as Latin1 at indexing time. And there might be a
159// match for some Arabic queries, which are also treated as Latin1 by
160// GWS. In fact, some people are relying on this feature to do Arabic
161// searches.
162//
163// Thus, for these types of encodings, until we have UI support for
164// their languages and pretty comprehensive language/encoding
165// identification quality, it is better to revert them to
166// UNKNOWN_ENCODING.
167//
168// This function checks whether the input encoding is one with
169// an interface language.
170bool IsEncodingWithSupportedLanguage(Encoding enc);
171
172
173//
174// LangsFromCountryCode and EncFromCountryCode
175// -------------------------------------------
176//
177// These two functions return the possible languages and encodings,
178// respectively, according to the input country code, which is a
179// 2-letter string. The country code is usually specified in the url
180// of a document.
181//
182//
183
184// LangsFromCountryCode
185// --------------------
186//
187// This function takes a string of arbitrary length. It treats the
188// first 2 bytes of the string as the country code, as defined in ISO
189// 3166-1993 (E).  It returns, via *lang_arry and *num_langs, an array
190// of the languages that are popular in that country, roughly in order
191// of popularity, together with the size of the array.
192//
193// This function returns true if we have language information for
194// country_code.  Otherwise, it returns false.
195//
196bool LangsFromCountryCode(const char* country_code,
197                          const Language** lang_arry,
198                          int* num_langs);
199
200
201//
202// EncFromCountryCode
203// ------------------
204//
205// This function takes a string of arbitrary length. It treats the
206// first 2 bytes of that string as the country code, as defined in ISO
207// 3166-1993 (E). It sets *enc to the encoding that is
208// most often used for the languages spoken in that country.
209//
210// This function returns true if we have encoding information for
211// country_code.  Otherwise, it returns false, and *enc is set to
212// UNKNOWN_ENCODING.
213//
214bool EncFromCountryCode(const char* country_code, Encoding* enc);
215
216
217
218// VisualType
219// ----------
220//
221// Right-to-left documents may be in logical or visual order. When they
222// are in visual order we convert them to logical order before processing.
223// This enum lists the types of visual document we can encounter.
224// Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
225// The other documents in those languages, and all documents in non-RTL
226// languages, will be NOT_VISUAL_DOCUMENT.
227enum VisualType {
228  NOT_VISUAL_DOCUMENT = 0,  // Logical-order RTL and all non-RTL documents.
229  VISUAL_HEBREW_HTML,  // HTML documents in the legacy visual order.
230  CONVERTED_RTL_PDF,   // Converted RTL PDFs, which are always visual.
231};
232
// Returns the default VisualType.
// NOTE(review): the actual default value is not visible in this header
// (presumably NOT_VISUAL_DOCUMENT) -- confirm against the implementation.
233VisualType default_visualtype();
234
235// VisualTypeName
236// --------------
237//
238// Given the visual type, returns a string name useful for debug output.
// NOTE(review): presumably the returned pointer refers to a static string
// that the caller must not free -- confirm against the implementation.
239const char* VisualTypeName(VisualType visualtype);
240
241
242
243// InitLangEnc
244// -----------
245//
246// Ensures the LangEnc module has been initialized.  Normally this
247// happens during InitGoogle, but this function allows access for scripts
248// that don't support InitGoogle. InitLangEnc calls InitEncodings (see
249// i18n/encodings/public/encodings.h) and also initializes the data
250// structures used in lang_enc.cc.
251//
// NOTE(review): "ensures" suggests repeated calls are harmless -- confirm
// against the implementation before relying on it.
252void InitLangEnc();
253
254#endif  // ENCODINGS_LANG_ENC_H__
255