1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "platform/text/LocaleToScriptMapping.h"
33
34#include "wtf/HashMap.h"
35#include "wtf/HashSet.h"
36#include "wtf/text/StringHash.h"
37
38namespace blink {
39
40UScriptCode scriptNameToCode(const String& scriptName)
41{
42    struct ScriptNameCode {
43        const char* name;
44        UScriptCode code;
45    };
46
47    // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
48    // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
49    // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
50    // using the same font setting.
51    static const ScriptNameCode scriptNameCodeList[] = {
52        { "zyyy", USCRIPT_COMMON },
53        { "qaai", USCRIPT_INHERITED },
54        { "arab", USCRIPT_ARABIC },
55        { "armn", USCRIPT_ARMENIAN },
56        { "beng", USCRIPT_BENGALI },
57        { "bopo", USCRIPT_BOPOMOFO },
58        { "cher", USCRIPT_CHEROKEE },
59        { "copt", USCRIPT_COPTIC },
60        { "cyrl", USCRIPT_CYRILLIC },
61        { "dsrt", USCRIPT_DESERET },
62        { "deva", USCRIPT_DEVANAGARI },
63        { "ethi", USCRIPT_ETHIOPIC },
64        { "geor", USCRIPT_GEORGIAN },
65        { "goth", USCRIPT_GOTHIC },
66        { "grek", USCRIPT_GREEK },
67        { "gujr", USCRIPT_GUJARATI },
68        { "guru", USCRIPT_GURMUKHI },
69        { "hani", USCRIPT_HAN },
70        { "hang", USCRIPT_HANGUL },
71        { "hebr", USCRIPT_HEBREW },
72        { "hira", USCRIPT_KATAKANA_OR_HIRAGANA },
73        { "knda", USCRIPT_KANNADA },
74        { "kana", USCRIPT_KATAKANA_OR_HIRAGANA },
75        { "khmr", USCRIPT_KHMER },
76        { "laoo", USCRIPT_LAO },
77        { "latn", USCRIPT_LATIN },
78        { "mlym", USCRIPT_MALAYALAM },
79        { "mong", USCRIPT_MONGOLIAN },
80        { "mymr", USCRIPT_MYANMAR },
81        { "ogam", USCRIPT_OGHAM },
82        { "ital", USCRIPT_OLD_ITALIC },
83        { "orya", USCRIPT_ORIYA },
84        { "runr", USCRIPT_RUNIC },
85        { "sinh", USCRIPT_SINHALA },
86        { "syrc", USCRIPT_SYRIAC },
87        { "taml", USCRIPT_TAMIL },
88        { "telu", USCRIPT_TELUGU },
89        { "thaa", USCRIPT_THAANA },
90        { "thai", USCRIPT_THAI },
91        { "tibt", USCRIPT_TIBETAN },
92        { "cans", USCRIPT_CANADIAN_ABORIGINAL },
93        { "yiii", USCRIPT_YI },
94        { "tglg", USCRIPT_TAGALOG },
95        { "hano", USCRIPT_HANUNOO },
96        { "buhd", USCRIPT_BUHID },
97        { "tagb", USCRIPT_TAGBANWA },
98        { "brai", USCRIPT_BRAILLE },
99        { "cprt", USCRIPT_CYPRIOT },
100        { "limb", USCRIPT_LIMBU },
101        { "linb", USCRIPT_LINEAR_B },
102        { "osma", USCRIPT_OSMANYA },
103        { "shaw", USCRIPT_SHAVIAN },
104        { "tale", USCRIPT_TAI_LE },
105        { "ugar", USCRIPT_UGARITIC },
106        { "hrkt", USCRIPT_KATAKANA_OR_HIRAGANA },
107        { "bugi", USCRIPT_BUGINESE },
108        { "glag", USCRIPT_GLAGOLITIC },
109        { "khar", USCRIPT_KHAROSHTHI },
110        { "sylo", USCRIPT_SYLOTI_NAGRI },
111        { "talu", USCRIPT_NEW_TAI_LUE },
112        { "tfng", USCRIPT_TIFINAGH },
113        { "xpeo", USCRIPT_OLD_PERSIAN },
114        { "bali", USCRIPT_BALINESE },
115        { "batk", USCRIPT_BATAK },
116        { "blis", USCRIPT_BLISSYMBOLS },
117        { "brah", USCRIPT_BRAHMI },
118        { "cham", USCRIPT_CHAM },
119        { "cirt", USCRIPT_CIRTH },
120        { "cyrs", USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
121        { "egyd", USCRIPT_DEMOTIC_EGYPTIAN },
122        { "egyh", USCRIPT_HIERATIC_EGYPTIAN },
123        { "egyp", USCRIPT_EGYPTIAN_HIEROGLYPHS },
124        { "geok", USCRIPT_KHUTSURI },
125        { "hans", USCRIPT_SIMPLIFIED_HAN },
126        { "hant", USCRIPT_TRADITIONAL_HAN },
127        { "hmng", USCRIPT_PAHAWH_HMONG },
128        { "hung", USCRIPT_OLD_HUNGARIAN },
129        { "inds", USCRIPT_HARAPPAN_INDUS },
130        { "java", USCRIPT_JAVANESE },
131        { "kali", USCRIPT_KAYAH_LI },
132        { "latf", USCRIPT_LATIN_FRAKTUR },
133        { "latg", USCRIPT_LATIN_GAELIC },
134        { "lepc", USCRIPT_LEPCHA },
135        { "lina", USCRIPT_LINEAR_A },
136        { "mand", USCRIPT_MANDAEAN },
137        { "maya", USCRIPT_MAYAN_HIEROGLYPHS },
138        { "mero", USCRIPT_MEROITIC },
139        { "nkoo", USCRIPT_NKO },
140        { "orkh", USCRIPT_ORKHON },
141        { "perm", USCRIPT_OLD_PERMIC },
142        { "phag", USCRIPT_PHAGS_PA },
143        { "phnx", USCRIPT_PHOENICIAN },
144        { "plrd", USCRIPT_PHONETIC_POLLARD },
145        { "roro", USCRIPT_RONGORONGO },
146        { "sara", USCRIPT_SARATI },
147        { "syre", USCRIPT_ESTRANGELO_SYRIAC },
148        { "syrj", USCRIPT_WESTERN_SYRIAC },
149        { "syrn", USCRIPT_EASTERN_SYRIAC },
150        { "teng", USCRIPT_TENGWAR },
151        { "vaii", USCRIPT_VAI },
152        { "visp", USCRIPT_VISIBLE_SPEECH },
153        { "xsux", USCRIPT_CUNEIFORM },
154        { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA },
155        { "kore", USCRIPT_HANGUL },
156        { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES },
157        { "zzzz", USCRIPT_UNKNOWN }
158    };
159
160    typedef HashMap<String, UScriptCode> ScriptNameCodeMap;
161    DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ());
162    if (scriptNameCodeMap.isEmpty()) {
163        for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(scriptNameCodeList[0]); ++i)
164            scriptNameCodeMap.set(scriptNameCodeList[i].name, scriptNameCodeList[i].code);
165    }
166
167    HashMap<String, UScriptCode>::iterator it = scriptNameCodeMap.find(scriptName.lower());
168    if (it != scriptNameCodeMap.end())
169        return it->value;
170    return USCRIPT_INVALID_CODE;
171}
172
173UScriptCode localeToScriptCodeForFontSelection(const String& locale)
174{
175    struct LocaleScript {
176        const char* locale;
177        UScriptCode script;
178    };
179
180    static const LocaleScript localeScriptList[] = {
181        { "aa", USCRIPT_LATIN },
182        { "ab", USCRIPT_CYRILLIC },
183        { "ady", USCRIPT_CYRILLIC },
184        { "af", USCRIPT_LATIN },
185        { "ak", USCRIPT_LATIN },
186        { "am", USCRIPT_ETHIOPIC },
187        { "ar", USCRIPT_ARABIC },
188        { "as", USCRIPT_BENGALI },
189        { "ast", USCRIPT_LATIN },
190        { "av", USCRIPT_CYRILLIC },
191        { "ay", USCRIPT_LATIN },
192        { "az", USCRIPT_LATIN },
193        { "ba", USCRIPT_CYRILLIC },
194        { "be", USCRIPT_CYRILLIC },
195        { "bg", USCRIPT_CYRILLIC },
196        { "bi", USCRIPT_LATIN },
197        { "bn", USCRIPT_BENGALI },
198        { "bo", USCRIPT_TIBETAN },
199        { "bs", USCRIPT_LATIN },
200        { "ca", USCRIPT_LATIN },
201        { "ce", USCRIPT_CYRILLIC },
202        { "ceb", USCRIPT_LATIN },
203        { "ch", USCRIPT_LATIN },
204        { "chk", USCRIPT_LATIN },
205        { "cs", USCRIPT_LATIN },
206        { "cy", USCRIPT_LATIN },
207        { "da", USCRIPT_LATIN },
208        { "de", USCRIPT_LATIN },
209        { "dv", USCRIPT_THAANA },
210        { "dz", USCRIPT_TIBETAN },
211        { "ee", USCRIPT_LATIN },
212        { "efi", USCRIPT_LATIN },
213        { "el", USCRIPT_GREEK },
214        { "en", USCRIPT_LATIN },
215        { "es", USCRIPT_LATIN },
216        { "et", USCRIPT_LATIN },
217        { "eu", USCRIPT_LATIN },
218        { "fa", USCRIPT_ARABIC },
219        { "fi", USCRIPT_LATIN },
220        { "fil", USCRIPT_LATIN },
221        { "fj", USCRIPT_LATIN },
222        { "fo", USCRIPT_LATIN },
223        { "fr", USCRIPT_LATIN },
224        { "fur", USCRIPT_LATIN },
225        { "fy", USCRIPT_LATIN },
226        { "ga", USCRIPT_LATIN },
227        { "gaa", USCRIPT_LATIN },
228        { "gd", USCRIPT_LATIN },
229        { "gil", USCRIPT_LATIN },
230        { "gl", USCRIPT_LATIN },
231        { "gn", USCRIPT_LATIN },
232        { "gsw", USCRIPT_LATIN },
233        { "gu", USCRIPT_GUJARATI },
234        { "ha", USCRIPT_LATIN },
235        { "haw", USCRIPT_LATIN },
236        { "he", USCRIPT_HEBREW },
237        { "hi", USCRIPT_DEVANAGARI },
238        { "hil", USCRIPT_LATIN },
239        { "ho", USCRIPT_LATIN },
240        { "hr", USCRIPT_LATIN },
241        { "ht", USCRIPT_LATIN },
242        { "hu", USCRIPT_LATIN },
243        { "hy", USCRIPT_ARMENIAN },
244        { "id", USCRIPT_LATIN },
245        { "ig", USCRIPT_LATIN },
246        { "ii", USCRIPT_YI },
247        { "ilo", USCRIPT_LATIN },
248        { "inh", USCRIPT_CYRILLIC },
249        { "is", USCRIPT_LATIN },
250        { "it", USCRIPT_LATIN },
251        { "iu", USCRIPT_CANADIAN_ABORIGINAL },
252        { "ja", USCRIPT_KATAKANA_OR_HIRAGANA },
253        { "jv", USCRIPT_LATIN },
254        { "ka", USCRIPT_GEORGIAN },
255        { "kaj", USCRIPT_LATIN },
256        { "kam", USCRIPT_LATIN },
257        { "kbd", USCRIPT_CYRILLIC },
258        { "kha", USCRIPT_LATIN },
259        { "kk", USCRIPT_CYRILLIC },
260        { "kl", USCRIPT_LATIN },
261        { "km", USCRIPT_KHMER },
262        { "kn", USCRIPT_KANNADA },
263        { "ko", USCRIPT_HANGUL },
264        { "kok", USCRIPT_DEVANAGARI },
265        { "kos", USCRIPT_LATIN },
266        { "kpe", USCRIPT_LATIN },
267        { "krc", USCRIPT_CYRILLIC },
268        { "ks", USCRIPT_ARABIC },
269        { "ku", USCRIPT_ARABIC },
270        { "kum", USCRIPT_CYRILLIC },
271        { "ky", USCRIPT_CYRILLIC },
272        { "la", USCRIPT_LATIN },
273        { "lah", USCRIPT_ARABIC },
274        { "lb", USCRIPT_LATIN },
275        { "lez", USCRIPT_CYRILLIC },
276        { "ln", USCRIPT_LATIN },
277        { "lo", USCRIPT_LAO },
278        { "lt", USCRIPT_LATIN },
279        { "lv", USCRIPT_LATIN },
280        { "mai", USCRIPT_DEVANAGARI },
281        { "mdf", USCRIPT_CYRILLIC },
282        { "mg", USCRIPT_LATIN },
283        { "mh", USCRIPT_LATIN },
284        { "mi", USCRIPT_LATIN },
285        { "mk", USCRIPT_CYRILLIC },
286        { "ml", USCRIPT_MALAYALAM },
287        { "mn", USCRIPT_CYRILLIC },
288        { "mr", USCRIPT_DEVANAGARI },
289        { "ms", USCRIPT_LATIN },
290        { "mt", USCRIPT_LATIN },
291        { "my", USCRIPT_MYANMAR },
292        { "myv", USCRIPT_CYRILLIC },
293        { "na", USCRIPT_LATIN },
294        { "nb", USCRIPT_LATIN },
295        { "ne", USCRIPT_DEVANAGARI },
296        { "niu", USCRIPT_LATIN },
297        { "nl", USCRIPT_LATIN },
298        { "nn", USCRIPT_LATIN },
299        { "nr", USCRIPT_LATIN },
300        { "nso", USCRIPT_LATIN },
301        { "ny", USCRIPT_LATIN },
302        { "oc", USCRIPT_LATIN },
303        { "om", USCRIPT_LATIN },
304        { "or", USCRIPT_ORIYA },
305        { "os", USCRIPT_CYRILLIC },
306        { "pa", USCRIPT_GURMUKHI },
307        { "pag", USCRIPT_LATIN },
308        { "pap", USCRIPT_LATIN },
309        { "pau", USCRIPT_LATIN },
310        { "pl", USCRIPT_LATIN },
311        { "pon", USCRIPT_LATIN },
312        { "ps", USCRIPT_ARABIC },
313        { "pt", USCRIPT_LATIN },
314        { "qu", USCRIPT_LATIN },
315        { "rm", USCRIPT_LATIN },
316        { "rn", USCRIPT_LATIN },
317        { "ro", USCRIPT_LATIN },
318        { "ru", USCRIPT_CYRILLIC },
319        { "rw", USCRIPT_LATIN },
320        { "sa", USCRIPT_DEVANAGARI },
321        { "sah", USCRIPT_CYRILLIC },
322        { "sat", USCRIPT_LATIN },
323        { "sd", USCRIPT_ARABIC },
324        { "se", USCRIPT_LATIN },
325        { "sg", USCRIPT_LATIN },
326        { "si", USCRIPT_SINHALA },
327        { "sid", USCRIPT_LATIN },
328        { "sk", USCRIPT_LATIN },
329        { "sl", USCRIPT_LATIN },
330        { "sm", USCRIPT_LATIN },
331        { "so", USCRIPT_LATIN },
332        { "sq", USCRIPT_LATIN },
333        { "sr", USCRIPT_CYRILLIC },
334        { "ss", USCRIPT_LATIN },
335        { "st", USCRIPT_LATIN },
336        { "su", USCRIPT_LATIN },
337        { "sv", USCRIPT_LATIN },
338        { "sw", USCRIPT_LATIN },
339        { "ta", USCRIPT_TAMIL },
340        { "te", USCRIPT_TELUGU },
341        { "tet", USCRIPT_LATIN },
342        { "tg", USCRIPT_CYRILLIC },
343        { "th", USCRIPT_THAI },
344        { "ti", USCRIPT_ETHIOPIC },
345        { "tig", USCRIPT_ETHIOPIC },
346        { "tk", USCRIPT_LATIN },
347        { "tkl", USCRIPT_LATIN },
348        { "tl", USCRIPT_LATIN },
349        { "tn", USCRIPT_LATIN },
350        { "to", USCRIPT_LATIN },
351        { "tpi", USCRIPT_LATIN },
352        { "tr", USCRIPT_LATIN },
353        { "trv", USCRIPT_LATIN },
354        { "ts", USCRIPT_LATIN },
355        { "tt", USCRIPT_CYRILLIC },
356        { "tvl", USCRIPT_LATIN },
357        { "tw", USCRIPT_LATIN },
358        { "ty", USCRIPT_LATIN },
359        { "tyv", USCRIPT_CYRILLIC },
360        { "udm", USCRIPT_CYRILLIC },
361        { "ug", USCRIPT_ARABIC },
362        { "uk", USCRIPT_CYRILLIC },
363        { "und", USCRIPT_LATIN },
364        { "ur", USCRIPT_ARABIC },
365        { "uz", USCRIPT_CYRILLIC },
366        { "ve", USCRIPT_LATIN },
367        { "vi", USCRIPT_LATIN },
368        { "wal", USCRIPT_ETHIOPIC },
369        { "war", USCRIPT_LATIN },
370        { "wo", USCRIPT_LATIN },
371        { "xh", USCRIPT_LATIN },
372        { "yap", USCRIPT_LATIN },
373        { "yo", USCRIPT_LATIN },
374        { "za", USCRIPT_LATIN },
375        { "zh", USCRIPT_SIMPLIFIED_HAN },
376        { "zh_hk", USCRIPT_TRADITIONAL_HAN },
377        { "zh_tw", USCRIPT_TRADITIONAL_HAN },
378        { "zu", USCRIPT_LATIN }
379    };
380
381    typedef HashMap<String, UScriptCode> LocaleScriptMap;
382    DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ());
383    if (localeScriptMap.isEmpty()) {
384        for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(localeScriptList[0]); ++i)
385            localeScriptMap.set(localeScriptList[i].locale, localeScriptList[i].script);
386    }
387
388    String canonicalLocale = locale.lower().replace('-', '_');
389    while (!canonicalLocale.isEmpty()) {
390        HashMap<String, UScriptCode>::iterator it = localeScriptMap.find(canonicalLocale);
391        if (it != localeScriptMap.end())
392            return it->value;
393        size_t pos = canonicalLocale.reverseFind('_');
394        if (pos == kNotFound)
395            break;
396        UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1));
397        if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN)
398            return code;
399        canonicalLocale = canonicalLocale.substring(0, pos);
400    }
401    return USCRIPT_COMMON;
402}
403
404} // namespace blink
405