1// Copyright (c) 2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// This file extends lang_enc.cc with additional languages and extended routines
6// It is current with Unicode 5.1 (beta Jan 2008)
7//
8
9#include <stdlib.h>
10#include <stdio.h>
11#include <string.h>
12
13#include "encodings/compact_lang_det/ext_lang_enc.h"
14#include "encodings/compact_lang_det/win/cld_macros.h"
15#include "encodings/compact_lang_det/win/cld_strtoint.h"
16
// Names of the extended languages, i.e. the ones numbered from
// EXT_LANGUAGE_BASE upward.  Entry i belongs to language EXT_LANGUAGE_BASE + i
// (the offset is shown beside each entry).  Each string is also the identifier
// under which the language is declared in C, so the table serves both as the
// printable name and as the declared name.
static const char* const kExtLanguageName[] = {
  /* +0  */ "X_BORK_BORK_BORK",
  /* +1  */ "X_PIG_LATIN",
  /* +2  */ "X_HACKER",
  /* +3  */ "X_KLINGON",
  /* +4  */ "X_ELMER_FUDD",

  // Pseudo-languages for Unicode scripts that express a single language
  /* +5  */ "X_OGHAM",
  /* +6  */ "X_RUNIC",
  /* +7  */ "X_YI",
  /* +8  */ "X_OLD_ITALIC",
  /* +9  */ "X_GOTHIC",
  /* +10 */ "X_DESERET",
  /* +11 */ "X_HANUNOO",
  /* +12 */ "X_BUHID",
  /* +13 */ "X_TAGBANWA",
  /* +14 */ "X_TAI_LE",
  /* +15 */ "X_LINEAR_B",
  /* +16 */ "X_UGARITIC",
  /* +17 */ "X_SHAVIAN",
  /* +18 */ "X_OSMANYA",
  /* +19 */ "X_CYPRIOT",
  /* +20 */ "X_BUGINESE",
  /* +21 */ "X_COPTIC",
  /* +22 */ "X_NEW_TAI_LUE",
  /* +23 */ "X_GLAGOLITIC",
  /* +24 */ "X_TIFINAGH",
  /* +25 */ "X_SYLOTI_NAGRI",
  /* +26 */ "X_OLD_PERSIAN",
  /* +27 */ "X_KHAROSHTHI",
  /* +28 */ "X_BALINESE",
  /* +29 */ "X_CUNEIFORM",
  /* +30 */ "X_PHOENICIAN",
  /* +31 */ "X_PHAGS_PA",
  /* +32 */ "X_NKO",

  // Unicode 5.1
  /* +33 */ "X_SUDANESE",
  /* +34 */ "X_LEPCHA",
  /* +35 */ "X_OL_CHIKI",
  /* +36 */ "X_VAI",
  /* +37 */ "X_SAURASHTRA",
  /* +38 */ "X_KAYAH_LI",
  /* +39 */ "X_REJANG",
  /* +40 */ "X_LYCIAN",
  /* +41 */ "X_CARIAN",
  /* +42 */ "X_LYDIAN",
  /* +43 */ "X_CHAM",
};
35
36
// C enum spellings of the base languages, for programs that generate C code.
// ExtLanguageDeclaredName() indexes this table directly with the Language
// value, so position matters: the number in front of each entry is its index
// and must equal the numeric value of the language it names.
static const char* const kExtLangDeclaredName[] = {
  /*   0 */ "ENGLISH",
  /*   1 */ "DANISH",
  /*   2 */ "DUTCH",
  /*   3 */ "FINNISH",
  /*   4 */ "FRENCH",
  /*   5 */ "GERMAN",
  /*   6 */ "HEBREW",
  /*   7 */ "ITALIAN",
  /*   8 */ "JAPANESE",
  /*   9 */ "KOREAN",
  /*  10 */ "NORWEGIAN",
  /*  11 */ "POLISH",
  /*  12 */ "PORTUGUESE",
  /*  13 */ "RUSSIAN",
  /*  14 */ "SPANISH",
  /*  15 */ "SWEDISH",
  /*  16 */ "CHINESE",
  /*  17 */ "CZECH",
  /*  18 */ "GREEK",
  /*  19 */ "ICELANDIC",
  /*  20 */ "LATVIAN",
  /*  21 */ "LITHUANIAN",
  /*  22 */ "ROMANIAN",
  /*  23 */ "HUNGARIAN",
  /*  24 */ "ESTONIAN",
  /*  25 */ "TG_UNKNOWN_LANGUAGE",
  /*  26 */ "UNKNOWN_LANGUAGE",
  /*  27 */ "BULGARIAN",
  /*  28 */ "CROATIAN",
  /*  29 */ "SERBIAN",
  /*  30 */ "IRISH",
  /*  31 */ "GALICIAN",
  /*  32 */ "TAGALOG",
  /*  33 */ "TURKISH",
  /*  34 */ "UKRAINIAN",
  /*  35 */ "HINDI",
  /*  36 */ "MACEDONIAN",
  /*  37 */ "BENGALI",
  /*  38 */ "INDONESIAN",
  /*  39 */ "LATIN",
  /*  40 */ "MALAY",
  /*  41 */ "MALAYALAM",
  /*  42 */ "WELSH",
  /*  43 */ "NEPALI",
  /*  44 */ "TELUGU",
  /*  45 */ "ALBANIAN",
  /*  46 */ "TAMIL",
  /*  47 */ "BELARUSIAN",
  /*  48 */ "JAVANESE",
  /*  49 */ "OCCITAN",
  /*  50 */ "URDU",
  /*  51 */ "BIHARI",
  /*  52 */ "GUJARATI",
  /*  53 */ "THAI",
  /*  54 */ "ARABIC",
  /*  55 */ "CATALAN",
  /*  56 */ "ESPERANTO",
  /*  57 */ "BASQUE",
  /*  58 */ "INTERLINGUA",
  /*  59 */ "KANNADA",
  /*  60 */ "PUNJABI",
  /*  61 */ "SCOTS_GAELIC",
  /*  62 */ "SWAHILI",
  /*  63 */ "SLOVENIAN",
  /*  64 */ "MARATHI",
  /*  65 */ "MALTESE",
  /*  66 */ "VIETNAMESE",
  /*  67 */ "FRISIAN",
  /*  68 */ "SLOVAK",
  /*  69 */ "CHINESE_T",
  /*  70 */ "FAROESE",
  /*  71 */ "SUNDANESE",
  /*  72 */ "UZBEK",
  /*  73 */ "AMHARIC",
  /*  74 */ "AZERBAIJANI",
  /*  75 */ "GEORGIAN",
  /*  76 */ "TIGRINYA",
  /*  77 */ "PERSIAN",
  /*  78 */ "BOSNIAN",
  /*  79 */ "SINHALESE",
  /*  80 */ "NORWEGIAN_N",
  /*  81 */ "PORTUGUESE_P",
  /*  82 */ "PORTUGUESE_B",
  /*  83 */ "XHOSA",
  /*  84 */ "ZULU",
  /*  85 */ "GUARANI",
  /*  86 */ "SESOTHO",
  /*  87 */ "TURKMEN",
  /*  88 */ "KYRGYZ",
  /*  89 */ "BRETON",
  /*  90 */ "TWI",
  /*  91 */ "YIDDISH",
  /*  92 */ "SERBO_CROATIAN",
  /*  93 */ "SOMALI",
  /*  94 */ "UIGHUR",
  /*  95 */ "KURDISH",
  /*  96 */ "MONGOLIAN",
  /*  97 */ "ARMENIAN",
  /*  98 */ "LAOTHIAN",
  /*  99 */ "SINDHI",
  /* 100 */ "RHAETO_ROMANCE",
  /* 101 */ "AFRIKAANS",
  /* 102 */ "LUXEMBOURGISH",
  /* 103 */ "BURMESE",
  /* 104 */ "KHMER",
  /* 105 */ "TIBETAN",
  /* 106 */ "DHIVEHI",  // sometimes spelled Divehi; lang of Maldives
  /* 107 */ "CHEROKEE",
  /* 108 */ "SYRIAC",
  /* 109 */ "LIMBU",
  /* 110 */ "ORIYA",
  /* 111 */ "ASSAMESE",
  /* 112 */ "CORSICAN",
  /* 113 */ "INTERLINGUE",
  /* 114 */ "KAZAKH",
  /* 115 */ "LINGALA",
  /* 116 */ "MOLDAVIAN",
  /* 117 */ "PASHTO",
  /* 118 */ "QUECHUA",
  /* 119 */ "SHONA",
  /* 120 */ "TAJIK",
  /* 121 */ "TATAR",
  /* 122 */ "TONGA",
  /* 123 */ "YORUBA",
  /* 124 */ "CREOLES_AND_PIDGINS_ENGLISH_BASED",
  /* 125 */ "CREOLES_AND_PIDGINS_FRENCH_BASED",
  /* 126 */ "CREOLES_AND_PIDGINS_PORTUGUESE_BASED",
  /* 127 */ "CREOLES_AND_PIDGINS_OTHER",
  /* 128 */ "MAORI",
  /* 129 */ "WOLOF",
  /* 130 */ "ABKHAZIAN",
  /* 131 */ "AFAR",
  /* 132 */ "AYMARA",
  /* 133 */ "BASHKIR",
  /* 134 */ "BISLAMA",
  /* 135 */ "DZONGKHA",
  /* 136 */ "FIJIAN",
  /* 137 */ "GREENLANDIC",
  /* 138 */ "HAUSA",
  /* 139 */ "HAITIAN_CREOLE",
  /* 140 */ "INUPIAK",
  /* 141 */ "INUKTITUT",
  /* 142 */ "KASHMIRI",
  /* 143 */ "KINYARWANDA",
  /* 144 */ "MALAGASY",
  /* 145 */ "NAURU",
  /* 146 */ "OROMO",
  /* 147 */ "RUNDI",
  /* 148 */ "SAMOAN",
  /* 149 */ "SANGO",
  /* 150 */ "SANSKRIT",
  /* 151 */ "SISWANT",
  /* 152 */ "TSONGA",
  /* 153 */ "TSWANA",
  /* 154 */ "VOLAPUK",
  /* 155 */ "ZHUANG",
  /* 156 */ "KHASI",
  /* 157 */ "SCOTS",
  /* 158 */ "GANDA",
  /* 159 */ "MANX",
  /* 160 */ "MONTENEGRIN",
  // Add new language declared names just before here
};
202
203COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
204       kExtLangDeclaredName_has_incorrect_length);
205
206
// Language codes of the extended languages, i.e. the ones numbered from
// EXT_LANGUAGE_BASE upward.  Entry i is the code of language
// EXT_LANGUAGE_BASE + i; the language each code stands for is noted beside it.
// Original author's note (dsites): all of these codes are made up, except
// "tlh" for Klingon, which comes from ISO-639-2.
// NOTE: zza is a standard name
static const char* const kExtLanguageCode[] = {
  // All Latin script
  "zzb",      // X_BORK_BORK_BORK
  "zzp",      // X_PIG_LATIN
  "zzh",      // X_HACKER
  "tlh",      // X_KLINGON
  "zze",      // X_ELMER_FUDD

  // Pseudo-languages for Unicode scripts that express a single language
  "xx-Ogam",  // X_OGHAM
  "xx-Runr",  // X_RUNIC
  "xx-Yiii",  // X_YI
  "xx-Ital",  // X_OLD_ITALIC
  "xx-Goth",  // X_GOTHIC
  "xx-Dsrt",  // X_DESERET
  "xx-Hano",  // X_HANUNOO
  "xx-Buhd",  // X_BUHID
  "xx-Tagb",  // X_TAGBANWA
  "xx-Tale",  // X_TAI_LE
  "xx-Linb",  // X_LINEAR_B
  "xx-Ugar",  // X_UGARITIC
  "xx-Shaw",  // X_SHAVIAN
  "xx-Osma",  // X_OSMANYA
  "xx-Cprt",  // X_CYPRIOT
  "xx-Bugi",  // X_BUGINESE
  "xx-Copt",  // X_COPTIC
  "xx-Talu",  // X_NEW_TAI_LUE
  "xx-Glag",  // X_GLAGOLITIC
  "xx-Tfng",  // X_TIFINAGH
  "xx-Sylo",  // X_SYLOTI_NAGRI
  "xx-Xpeo",  // X_OLD_PERSIAN
  "xx-Khar",  // X_KHAROSHTHI
  "xx-Bali",  // X_BALINESE
  "xx-Xsux",  // X_CUNEIFORM
  "xx-Phnx",  // X_PHOENICIAN
  "xx-Phag",  // X_PHAGS_PA
  "xx-Nkoo",  // X_NKO

  // Unicode 5.1
  "xx-Sund",  // X_SUDANESE
  "xx-Lepc",  // X_LEPCHA
  "xx-Olck",  // X_OL_CHIKI
  "xx-Vaii",  // X_VAI
  "xx-Saur",  // X_SAURASHTRA
  "xx-Kali",  // X_KAYAH_LI
  "xx-Rjng",  // X_REJANG
  "xx-Lyci",  // X_LYCIAN
  "xx-Cari",  // X_CARIAN
  "xx-Lydi",  // X_LYDIAN
  "xx-Cham",  // X_CHAM
};
228
229
230// Given the Language, returns its string name used as the output by
231// the lang/enc identifier, e.g. "Korean"
232// "invalid_language" if the input is invalid.
233// TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language,
234// used to subtract out HTML, link farms, DNA strings, and alittle English porn
235const char* ExtLanguageName(const Language lang) {
236  if (lang < 0) {
237    // No-text-at-all result from a Tote
238    return "";
239  }
240  // CompactLanguageDetect extension
241  if (lang == TG_UNKNOWN_LANGUAGE) {
242    return "Ignore";
243  }
244  if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
245    return LanguageName(lang);
246  }
247  if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
248    return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
249  }
250  return invalid_language_name();
251}
252
253
254// Given the Language, returns its Language enum spelling, for use by
255// programs that create C declarations, e.g. "KOREAN"
256// "UNKNOWN_LANGUAGE" if the input is invalid.
257const char* ExtLanguageDeclaredName(const Language lang) {
258  if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
259    return kExtLangDeclaredName[lang];
260  }
261  if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
262    return kExtLanguageName[lang - EXT_LANGUAGE_BASE];
263  }
264  return "UNKNOWN_LANGUAGE";
265}
266
267// Given the Language, return the language code, e.g. "ko"
268const char* ExtLanguageCode(const Language lang) {
269  // Hack for ignore/porn pseudo-language
270  if (lang == TG_UNKNOWN_LANGUAGE) {
271    return "xxx";
272  }
273  if ((0 <= lang) && (lang < NUM_LANGUAGES)) {
274    return LanguageCode(lang);
275  }
276  if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) {
277    return kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
278  }
279  return "??";
280}
281
282
283// Convert "en-Latn-GB" to ENGLISH
284// Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P
285// Consider for later: NORWEGIAN, NORWEGIAN_N
286// Consider for later: SCOTS, SCOTS_GAELIC
287// Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN
288//
289Language GetLanguageFromNumberOrName(const char* src) {
290  if (strspn(src, "0123456789") == strlen(src)) {
291    // All digits
292    return static_cast<Language>(strto32(src, NULL, 10));
293  }
294
295  Language retlang = UNKNOWN_LANGUAGE;
296  size_t len = strlen(src);
297
298  if (true /*FLAGS_mergepairs*/) {
299    // Merge sets of langauges pt-xx en-xx fr-xx, NOT bs/hr/sr
300    if (memcmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
301    if (memcmp(src, "en-", 3) == 0) {return ENGLISH;}
302    if (memcmp(src, "fr-", 3) == 0) {return FRENCH;}
303    // Use NormalizeLanguage instead
304    if (memcmp(src, "bs-", 3) == 0) {return CROATIAN;}
305    if (memcmp(src, "hr-", 3) == 0) {return CROATIAN;}
306    if (memcmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
307    if (memcmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
308    if (memcmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
309    if (memcmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
310  }
311
312  // Extensions
313  if (len >= 3) {
314    // Standin for ignore/porn "language"
315    if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
316
317    if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
318    if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
319    if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
320    if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
321    if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
322  }
323
324  // We have a name like en-Latn-GB or pt-BR
325  // First, get rid of some special cases
326  if (len <= 3) {
327    LanguageFromCode(src, &retlang);
328  } else if (len == 7) {
329    // More Extensions
330    if (memcmp(src, "xx-", 3) == 0) {
331      if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
332      if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
333      if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
334      if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
335      if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
336      if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
337      if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
338      if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
339      if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
340      if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
341      if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
342      if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
343      if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
344      if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
345      if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
346      if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
347      if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
348      if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
349      if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
350      if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
351      if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
352      if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
353      if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
354      if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
355      if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
356      if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
357      if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
358      if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
359
360      // Unicode 5.1
361      if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
362      if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
363      if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
364      if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
365      if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
366      if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
367      if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
368      if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
369      if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
370      if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
371      if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
372    }
373  }
374  // Some other weird ones
375  // Could be Latn or Limb; all our current training data is Latn
376  if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
377  if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
378
379  // Multi-country langauges
380  if (memcmp(src, "zh", 2) == 0) {
381    if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;}
382    if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;}
383    return CHINESE;
384  }
385  if (memcmp(src, "pt", 2) == 0) {
386    if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;}
387    return PORTUGUESE;
388  }
389  if (memcmp(src, "fr", 2) == 0) {
390    if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;}
391    return FRENCH;
392  }
393
394  // None of the special cases matched
395  if (src[2] == '-') {
396    char temp[4];
397    memcpy(temp, src, 4);
398    temp[2] = '\0';
399    LanguageFromCode(temp, &retlang);
400  }
401  if (src[3] == '-') {
402    char temp[4];
403    memcpy(temp, src, 4);
404    temp[3] = '\0';
405    LanguageFromCode(temp, &retlang);
406  }
407  if (retlang != UNKNOWN_LANGUAGE) {
408    return retlang;
409  }
410
411  return retlang;
412}
413
414typedef struct {
415  const char* name;
416  UnicodeLScript lscript;
417} NameScriptPair;
418
419// In alphabetic order for binary search
420static const NameScriptPair kNameScriptPair[] = {
421  // Unicode 5.1 additional scripts
422  {"Arab", ULScript_Arabic},
423  {"Armn", ULScript_Armenian},
424  {"Bali", ULScript_Balinese},
425  {"Beng", ULScript_Bengali},
426  {"Bugi", ULScript_Buginese},
427  {"Buhd", ULScript_Buhid},
428  {"Cans", ULScript_Canadian_Aboriginal},
429  {"Cari", ULScript_Carian},      // Unicode 5.1
430  {"Cham", ULScript_Cham},        // Unicode 5.1
431  {"Cher", ULScript_Cherokee},
432  {"Copt", ULScript_Coptic},
433  {"Cprt", ULScript_Cypriot},
434  {"Cyrl", ULScript_Cyrillic},
435  {"Deva", ULScript_Devanagari},
436  {"Dsrt", ULScript_Deseret},
437  {"Ethi", ULScript_Ethiopic},
438  {"Geor", ULScript_Georgian},
439  {"Glag", ULScript_Glagolitic},
440  {"Goth", ULScript_Gothic},
441  {"Grek", ULScript_Greek},
442  {"Gujr", ULScript_Gujarati},
443  {"Guru", ULScript_Gurmukhi},
444  {"Hani", ULScript_HanCJK},
445  {"Hano", ULScript_Hanunoo},
446  {"Hebr", ULScript_Hebrew},
447  {"Ital", ULScript_Old_Italic},
448  {"Kali", ULScript_Kayah_Li},    // Unicode 5.1
449  {"Khar", ULScript_Kharoshthi},
450  {"Khmr", ULScript_Khmer},
451  {"Knda", ULScript_Kannada},
452  {"Laoo", ULScript_Lao},
453  {"Latn", ULScript_Latin},
454  {"Lepc", ULScript_Lepcha},      // Unicode 5.1
455  {"Limb", ULScript_Limbu},
456  {"Linb", ULScript_Linear_B},
457  {"Lyci", ULScript_Lycian},      // Unicode 5.1
458  {"Lydi", ULScript_Lydian},      // Unicode 5.1
459  {"Mlym", ULScript_Malayalam},
460  {"Mong", ULScript_Mongolian},
461  {"Mymr", ULScript_Myanmar},
462  {"Nkoo", ULScript_Nko},
463  {"Ogam", ULScript_Ogham},
464  {"Olck", ULScript_Ol_Chiki},    // Unicode 5.1
465  {"Orya", ULScript_Oriya},
466  {"Osma", ULScript_Osmanya},
467  {"Phag", ULScript_Phags_Pa},
468  {"Phnx", ULScript_Phoenician},
469  {"Rjng", ULScript_Rejang},      // Unicode 5.1
470  {"Runr", ULScript_Runic},
471  {"Saur", ULScript_Saurashtra},  // Unicode 5.1
472  {"Shaw", ULScript_Shavian},
473  {"Sinh", ULScript_Sinhala},
474  {"Sund", ULScript_Sundanese},   // Unicode 5.1
475  {"Sylo", ULScript_Syloti_Nagri},
476  {"Syrc", ULScript_Syriac},
477  {"Tagb", ULScript_Tagbanwa},
478  {"Tale", ULScript_Tai_Le},
479  {"Talu", ULScript_New_Tai_Lue},
480  {"Taml", ULScript_Tamil},
481  {"Telu", ULScript_Telugu},
482  {"Tfng", ULScript_Tifinagh},
483  {"Tglg", ULScript_Tagalog},
484  {"Thaa", ULScript_Thaana},
485  {"Thai", ULScript_Thai},
486  {"Tibt", ULScript_Tibetan},
487  {"Ugar", ULScript_Ugaritic},
488  {"Vaii", ULScript_Vai},         // Unicode 5.1 // NOTE: apparently 'Vai '
489  {"Xpeo", ULScript_Old_Persian},
490  {"Xsux", ULScript_Cuneiform},
491  {"Yiii", ULScript_Yi},
492  {"Zyyy", ULScript_Common},
493  {"Zzzz", ULScript_Inherited},
494};
495
496// Convert "en-Latn-GB" to ULScript_Latin
497UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
498  if (strspn(src, "0123456789") == strlen(src)) {
499    // All digits
500    return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
501  }
502
503  if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
504  if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
505  if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
506  if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
507  // Could be Latn or Limb; all our current training data is Latn
508  if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
509
510  // Isolate just the script field
511  char temp[5];
512  const char* src2 = strchr(src, '-');
513  if (src2 == NULL) {return ULScript_Latin;}
514  src2 += 1;      // over the -
515  memcpy(temp, src2, 4);
516  temp[4] = '\0';
517
518  int lo = 0;
519  int hi = ULScript_NUM_SCRIPTS;
520  while (lo < hi) {
521    int mid = (lo + hi) >> 1;
522    if (strcmp(temp, kNameScriptPair[mid].name) < 0) {
523      hi = mid;
524    } else if (strcmp(temp, kNameScriptPair[mid].name) > 0) {
525      lo = mid + 1;
526    } else {
527      return kNameScriptPair[mid].lscript;
528    }
529  }
530  return ULScript_Latin;
531}
532
533
534// Merge together some languages, such as bo/hr/sr
535// Croatian Latin and Serbian Cyrillic now.
536Language NormalizeLanguage(Language lang) {
537  if (lang == BOSNIAN) {return CROATIAN;}
538  if (lang == SERBO_CROATIAN) {return SERBIAN;}
539
540  if (lang == PORTUGUESE_P) {return PORTUGUESE;}
541  if (lang == PORTUGUESE_B) {return PORTUGUESE;}
542
543  return lang;
544}
545
546