languages.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright (c) 2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef LANGUAGES_PUBLIC_LANGUAGES_H_
6#define LANGUAGES_PUBLIC_LANGUAGES_H_
7
8// This interface defines the Language enum and functions that depend
9// only on Language values.
10
11// A hash-function for Language, hash<Language>, is defined in
12// i18n/languages/public/languages-hash.h
13
14#ifndef SWIG
// The Language enum is defined in languages.proto, which also describes
// how to add languages.
17#include "languages/proto/languages.pb.h"
18
19// We need this for compatibility:
20// - The Language enum in the default namespace.
21// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
22//using namespace i18n::languages;
23
24#else
25// And we must have a swig-compatible enum.
26// This one is a simple cleaned up version of language.proto, making the enum
27// compatible with C++.
28#include "i18n/languages/internal/languages_proto_wrapper.h"
29
30#endif
31
32// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
33//#include "util/utf8/proptables/script_enum.h"
34
35const int kNumLanguages = NUM_LANGUAGES;
36
37// Return the default language (ENGLISH).
38Language default_language();
39
40
41// *******************************************
42// Language predicates
43//   IsValidLanguage()
44//   IS_LANGUAGE_UNKNOWN()
45//   IsCJKLanguage()
46//   IsChineseLanguage()
47//   IsNorwegianLanguage()
48//   IsPortugueseLanguage()
49//   IsRightToLeftLanguage()
50//   IsMaybeRightToLeftLanguage()
51//   IsSameLanguage()
52//   IsScriptRequiringLongerSnippets()
53// *******************************************
54
55// IsValidLanguage
56// ===============
57//
58// Function to check if the input is within range of the Language enum. If
59// IsValidLanguage(lang) returns true, it is safe to call
60// static_cast<Language>(lang).
61//
62inline bool IsValidLanguage(int lang) {
63  return ((lang >= 0) && (lang < kNumLanguages));
64}
65
66// Return true if the language is "unknown". (This function was
67// previously a macro, hence the spelling in all caps.)
68//
69inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
70  return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;
71}
72
73// IsCJKLanguage
74// -------------
75//
76// This function returns true if the language is either Chinese
77// (simplified or traditional), Japanese, or Korean.
78bool IsCJKLanguage(Language lang);
79
80// IsChineseLanguage
81// -----------------
82//
83// This function returns true if the language is either Chinese
84// (simplified or traditional)
85bool IsChineseLanguage(Language lang);
86
87// IsNorwegianLanguage
88// --------------------
89//
90// This function returns true if the language is any of the Norwegian
91// (regular or Nynorsk).
92bool IsNorwegianLanguage(Language lang);
93
94// IsPortugueseLanguage
95// --------------------
96//
97// This function returns true if the language is any of the Portuguese
98// languages (regular, Portugal or Brazil)
99bool IsPortugueseLanguage(Language lang);
100
101// IsSameLanguage
102// --------------
103//
104// WARNING: This function provides only a simple test on the values of
105// the two Language arguments. It returns false if either language is
106// invalid. It returns true if the language arguments are equal, or
107// if they are both Chinese languages, both Norwegian languages, or
108// both Portuguese languages, as defined by IsChineseLanguage,
109// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
110// false.
111bool IsSameLanguage(Language lang1, Language lang2);
112
113
114// IsRightToLeftLanguage
115// ---------------------
116//
117// This function returns true if the language is only written right-to-left
118// (E.g., Hebrew, Arabic, Persian etc.)
119//
120// IMPORTANT NOTE: Technically we're talking about scripts, not languages.
121// There are languages that can be written in more than one script.
122// Examples:
123//   - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
124//     Latin or Cyrillic script, and right-to-left in Arabic script.
125//   - Sindhi and Punjabi are written in different scripts, depending on
126//     region and dialect.
127//   - Turkmen used an Arabic script historically, but not any more.
128//   - Pashto and Uyghur can use Arabic script, but use a Roman script
129//     on the Internet.
130//   - Kashmiri and Urdu are written either with Arabic or Devanagari script.
131//
132// This function only returns true for languages that are always, unequivocally
133// written in right-to-left script.
134//
135// TODO(benjy): If we want to do anything special with multi-script languages
136// we should create new 'languages' for each language+script, as we do for
137// traditional vs. simplified Chinese. However most such languages are rare in
138// use and even rarer on the web, so this is unlikely to be something we'll
139// be concerned with for a while.
140bool IsRightToLeftLanguage(Language lang);
141
142// IsMaybeRightToLeftLanguage
143// --------------------------
144//
145// This function returns true if the language may appear on the web in a
146// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
147//
148// NOTE: See important notes under IsRightToLeftLanguage(...).
149//
150// This function returns true for languages that *may* appear on the web in a
151// right-to-left script, even if they may also appear in a left-to-right
152// script.
153//
154// This function should typically be used in cases where doing some work on
155// left-to-right text would be OK (usually a no-op), and this function is used
156// just to cut down on unnecessary work on regular, LTR text.
157bool IsMaybeRightToLeftLanguage(Language lang);
158
159// IsScriptRequiringLongerSnippets
160// --------------------
161//
// This function returns true if the script characteristics require longer
// snippet length (Devanagari, Bengali, Gurmukhi,
// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
165// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
166// bool IsScriptRequiringLongerSnippets(UnicodeScript script);
167
168
169// *******************************************
170// LANGUAGE NAMES
171//
172// This interface defines a standard name for each valid Language,
173// and a standard name for invalid languages. Some language names use all
174// uppercase letters, but others use mixed case.
175//   LanguageName() [Language to name]
176//   LanguageEnumName() [language to enum name]
177//   LanguageFromName() [name to Language]
178//   default_language_name()
179//   invalid_language_name()
180// *******************************************
181
182// Given a Language, returns its standard name.
183// Return invalid_language_name() if the language is invalid.
184const char* LanguageName(Language lang);
185
186// Given a Language, return the name of the enum constant for that
187// language. In all but a few cases, this is the same as its standard
188// name. For example, LanguageName(CHINESE) returns "Chinese", but
189// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
190// code that is generating C++ code, where the enum constant is more
191// useful than its integer value.  Return "NUM_LANGUAGES" if
192// the language is invalid.
193const char* LanguageEnumName(Language lang);
194
// The maximum length of a standard language name.
const int kMaxLanguageNameSize = 50;
197
// The standard name for the default language.
const char* default_language_name();
200
// The standard name for all invalid languages.
const char* invalid_language_name();
203
204// If lang_name matches the standard name of a Language, using a
205// case-insensitive comparison, set *language to that Language and
206// return true.
207// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
208//
209// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
210// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
211// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
212// as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
213// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
214// CHINESE_T (i.e., a synonym for "ChineseT").
215//
216// REQUIRES: language must not be NULL.
217//
218bool LanguageFromName(const char* lang_name, Language *language);
219
220
221
222// *******************************************
223// LANGUAGE CODES
224//
225// This interface defines a standard code for each valid language, and
226// a standard code for invalid languages. These are derived from ISO codes,
227// with some Google additions.
228//   LanguageCode()
229//   default_language_code()
230//   invalid_language_code()
231//   LanguageCodeWithDialects()
232//   LanguageCodeISO639_1()
233//   LanguageCodeISO639_2()
234// *******************************************
235
236// Given a Language, return its standard code. There are Google-specific codes:
237//     For CHINESE_T, return "zh-TW".
238//     For TG_UNKNOWN_LANGUAGE, return "ut".
239//     For UNKNOWN_LANGUAGE, return "un".
240//     For PORTUGUESE_P, return "pt-PT".
241//     For PORTUGUESE_B, return "pt-BR".
242//     For LIMBU, return "sit-NP".
243//     For CHEROKEE, return "chr".
244//     For SYRIAC, return "syr".
245// Otherwise return the ISO 639-1 two-letter language code for lang.
246// If lang is invalid, return invalid_language_code().
247//
248// NOTE: See the note below about the codes for Chinese languages.
249//
250const char* LanguageCode(Language lang);
251
// The maximum length of a language code.
const int kMaxLanguageCodeSize = 50;
254
// The standard code for the default language.
const char* default_language_code();
257
// The standard code for all invalid languages.
const char* invalid_language_code();
260
261
262// --------------------------------------------
263// NOTE: CHINESE LANGUAGE CODES
264//
265// There are three functions that return codes for Chinese languages.
266// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
267// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
268// The following list shows the different results.
269//
270// LanguageCode(CHINESE) returns "zh"
271// LanguageCode(CHINESE_T) returns "zh-TW".
272//
273// LanguageCodeWithDialects(CHINESE) returns "zh-CN".
274// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
275//
276// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
277// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
278// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
279//
280// --------------------------------------------
281
282// LanguageCodeWithDialects
283// ------------------------
284//
285// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
286const char* LanguageCodeWithDialects(Language lang);
287
288// LanguageCodeISO639_1
289// --------------------
290//
291// Return the ISO 639-1 two-letter language code for lang.
292// Return invalid_language_code() if lang is invalid or does not have
293// an ISO 639-1 two-letter language code.
294const char* LanguageCodeISO639_1(Language lang);
295
296// LanguageCodeISO639_2
297// --------------------
298//
299// Return the ISO 639-2 three-letter language for lang.
300// Return invalid_language_code() if lang is invalid or does not have
301// an ISO 639-2 three-letter language code.
302const char* LanguageCodeISO639_2(Language lang);
303
304// LanguageFromCode
305// ----------------
306//
307// If lang_code matches the code for a Language, using a case-insensitive
308// comparison, set *lang to that Language and return true.
309// Otherwise, set *lang to UNKNOWN_LANGUAGE and return false.
310//
311// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
312// (three-letter) code, or a Google-specific code (see LanguageCode).
313//
314// Certain language-code aliases are also allowed:
315//   For "zh-cn" and "zh_cn", set *lang to CHINESE.
316//   For "zh-tw" and "zh_tw", set *lang to CHINESE_T.
317//   For "he", set *lang to HEBREW.
318//   For "in", set *lang to INDONESIAN.
319//   For "ji", set *lang to YIDDISH.
320//   For "fil", set *lang to TAGALOG.
321//
322// REQUIRES: 'lang' must not be NULL.
323bool LanguageFromCode(const char* lang_code, Language *language);
324
325
326// LanguageFromCodeOrName
327// ----------------------
328//
329// If lang_code_or_name is a language code or a language name.
330// set *language to the corresponding Language and return true.
331// Otherwise set *language to UNKNOWN_LANGUAGE and return false.
332//
333bool LanguageFromCodeOrName(const char* lang_code_or_name,
334                            Language* language);
335
// LanguageNameFromCode
// --------------------
//
// If language_code is the code for a Language (see LanguageFromCode),
// return the standard name of that language (see LanguageName).
// Otherwise return invalid_language_name().
//
const char* LanguageNameFromCode(const char* language_code);
344
345
346// Miscellany
347
// LanguageCodeToUnderscoreForm
// ----------------------------
//
// Given a language code, convert the dash "-" to underscore "_".
//
// Specifically, if result_length <= strlen(lang_code), set result[0]
// to '\0' and return false. Otherwise, copy lang_code to result,
// converting every dash to an underscore, converting every character
// before the first dash or underscore to lower case, and converting
// every character after the first dash or underscore to upper
// case. If there is no dash or underscore, convert the entire string
// to lower case.
//
// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
//
bool LanguageCodeToUnderscoreForm(const char* lang_code,
                                  char* result,
                                  int result_length);
366
367//
368// AlwaysPutInExpectedRestrict
369// ---------------------------
370//
371// For Web pages in certain top-level domains, Web Search always
372// applies a "country restrict". If 'tld' matches one of those, using
373// a case-SENSITIVE comparison, set *expected_language to the Language
374// most commonly found in that top-level domain and return true.
375// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
376bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
377
378
379#endif  // LANGUAGES_PUBLIC_LANGUAGES_H_
380