// languages.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright (c) 2009 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef LANGUAGES_PUBLIC_LANGUAGES_H_ 6#define LANGUAGES_PUBLIC_LANGUAGES_H_ 7 8// This interface defines the Language enum and functions that depend 9// only on Language values. 10 11// A hash-function for Language, hash<Language>, is defined in 12// i18n/languages/public/languages-hash.h 13 14#ifndef SWIG 15// Language enum defined in languages.proto 16// Also description on how to add languages. 17#include "languages/proto/languages.pb.h" 18 19// We need this for compatibility: 20// - The Language enum in the default namespace. 21// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE 22//using namespace i18n::languages; 23 24#else 25// And we must have a swig-compatible enum. 26// This one is a simple cleaned up version of language.proto, making the enum 27// compatible with C++. 28#include "i18n/languages/internal/languages_proto_wrapper.h" 29 30#endif 31 32// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE 33//#include "util/utf8/proptables/script_enum.h" 34 35const int kNumLanguages = NUM_LANGUAGES; 36 37// Return the default language (ENGLISH). 38Language default_language(); 39 40 41// ******************************************* 42// Language predicates 43// IsValidLanguage() 44// IS_LANGUAGE_UNKNOWN() 45// IsCJKLanguage() 46// IsChineseLanguage() 47// IsNorwegianLanguage() 48// IsPortugueseLanguage() 49// IsRightToLeftLanguage() 50// IsMaybeRightToLeftLanguage() 51// IsSameLanguage() 52// IsScriptRequiringLongerSnippets() 53// ******************************************* 54 55// IsValidLanguage 56// =============== 57// 58// Function to check if the input is within range of the Language enum. If 59// IsValidLanguage(lang) returns true, it is safe to call 60// static_cast<Language>(lang). 
61// 62inline bool IsValidLanguage(int lang) { 63 return ((lang >= 0) && (lang < kNumLanguages)); 64} 65 66// Return true if the language is "unknown". (This function was 67// previously a macro, hence the spelling in all caps.) 68// 69inline bool IS_LANGUAGE_UNKNOWN(Language lang) { 70 return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE; 71} 72 73// IsCJKLanguage 74// ------------- 75// 76// This function returns true if the language is either Chinese 77// (simplified or traditional), Japanese, or Korean. 78bool IsCJKLanguage(Language lang); 79 80// IsChineseLanguage 81// ----------------- 82// 83// This function returns true if the language is either Chinese 84// (simplified or traditional) 85bool IsChineseLanguage(Language lang); 86 87// IsNorwegianLanguage 88// -------------------- 89// 90// This function returns true if the language is any of the Norwegian 91// (regular or Nynorsk). 92bool IsNorwegianLanguage(Language lang); 93 94// IsPortugueseLanguage 95// -------------------- 96// 97// This function returns true if the language is any of the Portuguese 98// languages (regular, Portugal or Brazil) 99bool IsPortugueseLanguage(Language lang); 100 101// IsSameLanguage 102// -------------- 103// 104// WARNING: This function provides only a simple test on the values of 105// the two Language arguments. It returns false if either language is 106// invalid. It returns true if the language arguments are equal, or 107// if they are both Chinese languages, both Norwegian languages, or 108// both Portuguese languages, as defined by IsChineseLanguage, 109// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns 110// false. 111bool IsSameLanguage(Language lang1, Language lang2); 112 113 114// IsRightToLeftLanguage 115// --------------------- 116// 117// This function returns true if the language is only written right-to-left 118// (E.g., Hebrew, Arabic, Persian etc.) 
119// 120// IMPORTANT NOTE: Technically we're talking about scripts, not languages. 121// There are languages that can be written in more than one script. 122// Examples: 123// - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in 124// Latin or Cyrillic script, and right-to-left in Arabic script. 125// - Sindhi and Punjabi are written in different scripts, depending on 126// region and dialect. 127// - Turkmen used an Arabic script historically, but not any more. 128// - Pashto and Uyghur can use Arabic script, but use a Roman script 129// on the Internet. 130// - Kashmiri and Urdu are written either with Arabic or Devanagari script. 131// 132// This function only returns true for languages that are always, unequivocally 133// written in right-to-left script. 134// 135// TODO(benjy): If we want to do anything special with multi-script languages 136// we should create new 'languages' for each language+script, as we do for 137// traditional vs. simplified Chinese. However most such languages are rare in 138// use and even rarer on the web, so this is unlikely to be something we'll 139// be concerned with for a while. 140bool IsRightToLeftLanguage(Language lang); 141 142// IsMaybeRightToLeftLanguage 143// -------------------------- 144// 145// This function returns true if the language may appear on the web in a 146// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.) 147// 148// NOTE: See important notes under IsRightToLeftLanguage(...). 149// 150// This function returns true for languages that *may* appear on the web in a 151// right-to-left script, even if they may also appear in a left-to-right 152// script. 153// 154// This function should typically be used in cases where doing some work on 155// left-to-right text would be OK (usually a no-op), and this function is used 156// just to cut down on unnecessary work on regular, LTR text. 
157bool IsMaybeRightToLeftLanguage(Language lang); 158 159// IsScriptRequiringLongerSnippets 160// -------------------- 161// 162// This function returns true if the script chracteristics require longer 163// snippet length (Devanagari, Bengali, Gurmukhi, 164// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam). 165// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE 166// bool IsScriptRequiringLongerSnippets(UnicodeScript script); 167 168 169// ******************************************* 170// LANGUAGE NAMES 171// 172// This interface defines a standard name for each valid Language, 173// and a standard name for invalid languages. Some language names use all 174// uppercase letters, but others use mixed case. 175// LanguageName() [Language to name] 176// LanguageEnumName() [language to enum name] 177// LanguageFromName() [name to Language] 178// default_language_name() 179// invalid_language_name() 180// ******************************************* 181 182// Given a Language, returns its standard name. 183// Return invalid_language_name() if the language is invalid. 184const char* LanguageName(Language lang); 185 186// Given a Language, return the name of the enum constant for that 187// language. In all but a few cases, this is the same as its standard 188// name. For example, LanguageName(CHINESE) returns "Chinese", but 189// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for 190// code that is generating C++ code, where the enum constant is more 191// useful than its integer value. Return "NUM_LANGUAGES" if 192// the language is invalid. 193const char* LanguageEnumName(Language lang); 194 195// The maximum length of a standard language name. 196const int kMaxLanguageNameSize = 50; 197 198// The standard name for the default language. 199const char* default_language_name(); 200 201// The standard name for all invalid languages. 
202const char* invalid_language_name(); 203 204// If lang_name matches the standard name of a Language, using a 205// case-insensitive comparison, set *language to that Language and 206// return true. 207// Otherwise, set *language to UNKNOWN_LANGUAGE and return false. 208// 209// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name 210// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA. 211// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed 212// as a name for UNKNOWN_LANGUAGE (the return value is true in this case, 213// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for 214// CHINESE_T (i.e., a synonym for "ChineseT"). 215// 216// REQUIRES: language must not be NULL. 217// 218bool LanguageFromName(const char* lang_name, Language *language); 219 220 221 222// ******************************************* 223// LANGUAGE CODES 224// 225// This interface defines a standard code for each valid language, and 226// a standard code for invalid languages. These are derived from ISO codes, 227// with some Google additions. 228// LanguageCode() 229// default_language_code() 230// invalid_language_code() 231// LanguageCodeWithDialects() 232// LanguageCodeISO639_1() 233// LanguageCodeISO639_2() 234// ******************************************* 235 236// Given a Language, return its standard code. There are Google-specific codes: 237// For CHINESE_T, return "zh-TW". 238// For TG_UNKNOWN_LANGUAGE, return "ut". 239// For UNKNOWN_LANGUAGE, return "un". 240// For PORTUGUESE_P, return "pt-PT". 241// For PORTUGUESE_B, return "pt-BR". 242// For LIMBU, return "sit-NP". 243// For CHEROKEE, return "chr". 244// For SYRIAC, return "syr". 245// Otherwise return the ISO 639-1 two-letter language code for lang. 246// If lang is invalid, return invalid_language_code(). 247// 248// NOTE: See the note below about the codes for Chinese languages. 
249// 250const char* LanguageCode(Language lang); 251 252// The maximum length of a language code. 253const int kMaxLanguageCodeSize = 50; 254 255// The standard code for the default language. 256const char* default_language_code(); 257 258// The standard code for all invalid languages. 259const char* invalid_language_code(); 260 261 262// -------------------------------------------- 263// NOTE: CHINESE LANGUAGE CODES 264// 265// There are three functions that return codes for Chinese languages. 266// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here. 267// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h. 268// The following list shows the different results. 269// 270// LanguageCode(CHINESE) returns "zh" 271// LanguageCode(CHINESE_T) returns "zh-TW". 272// 273// LanguageCodeWithDialects(CHINESE) returns "zh-CN". 274// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW". 275// 276// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW". 277// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW". 278// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN". 279// 280// -------------------------------------------- 281 282// LanguageCodeWithDialects 283// ------------------------ 284// 285// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang). 286const char* LanguageCodeWithDialects(Language lang); 287 288// LanguageCodeISO639_1 289// -------------------- 290// 291// Return the ISO 639-1 two-letter language code for lang. 292// Return invalid_language_code() if lang is invalid or does not have 293// an ISO 639-1 two-letter language code. 294const char* LanguageCodeISO639_1(Language lang); 295 296// LanguageCodeISO639_2 297// -------------------- 298// 299// Return the ISO 639-2 three-letter language for lang. 300// Return invalid_language_code() if lang is invalid or does not have 301// an ISO 639-2 three-letter language code. 
302const char* LanguageCodeISO639_2(Language lang); 303 304// LanguageFromCode 305// ---------------- 306// 307// If lang_code matches the code for a Language, using a case-insensitive 308// comparison, set *lang to that Language and return true. 309// Otherwise, set *lang to UNKNOWN_LANGUAGE and return false. 310// 311// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2 312// (three-letter) code, or a Google-specific code (see LanguageCode). 313// 314// Certain language-code aliases are also allowed: 315// For "zh-cn" and "zh_cn", set *lang to CHINESE. 316// For "zh-tw" and "zh_tw", set *lang to CHINESE_T. 317// For "he", set *lang to HEBREW. 318// For "in", set *lang to INDONESIAN. 319// For "ji", set *lang to YIDDISH. 320// For "fil", set *lang to TAGALOG. 321// 322// REQUIRES: 'lang' must not be NULL. 323bool LanguageFromCode(const char* lang_code, Language *language); 324 325 326// LanguageFromCodeOrName 327// ---------------------- 328// 329// If lang_code_or_name is a language code or a language name. 330// set *language to the corresponding Language and return true. 331// Otherwise set *language to UNKNOWN_LANGUAGE and return false. 332// 333bool LanguageFromCodeOrName(const char* lang_code_or_name, 334 Language* language); 335 336// LanguageNameFromCode 337// -------------------- 338// 339// If language_code is the code for a Language (see LanguageFromCode), 340// return the standard name of that language (see LanguageName). 341// Otherwise return invalid_language_name(). 342// 343const char* LanguageNameFromCode(const char* language_code); 344 345 346// Miscellany 347 348// LanguageCodeToUnderscoreForm 349// ---------------------------- 350// 351// Given a language code, convert the dash "-" to underscore "_". 352// 353// Specifically, if result_length <= strlen(lang_code), set result[0] 354// to '\0' and return false. 
Otherwise, copy lang_code to result, 355// converting every dash to an underscore, converting every character 356// before the first dash or underscore to lower case, and converting 357// every character after the first dash or underscore to upper 358// case. If there is no dash or underscore, convert the entire string 359// to lower case. 360// 361// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL. 362 363bool LanguageCodeToUnderscoreForm(const char* lang_code, 364 char* result, 365 int result_length); 366 367// 368// AlwaysPutInExpectedRestrict 369// --------------------------- 370// 371// For Web pages in certain top-level domains, Web Search always 372// applies a "country restrict". If 'tld' matches one of those, using 373// a case-SENSITIVE comparison, set *expected_language to the Language 374// most commonly found in that top-level domain and return true. 375// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false. 376bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language); 377 378 379#endif // LANGUAGES_PUBLIC_LANGUAGES_H_ 380