1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2010-2013, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.impl.locale;
10
11import java.util.ArrayList;
12import java.util.Collections;
13import java.util.HashMap;
14import java.util.List;
15import java.util.Map;
16import java.util.Set;
17
18public class LanguageTag {
19    private static final boolean JDKIMPL = false;
20
21    //
22    // static fields
23    //
24    public static final String SEP = "-";
25    public static final String PRIVATEUSE = "x";
26    public static String UNDETERMINED = "und";
27    public static final String PRIVUSE_VARIANT_PREFIX = "lvariant";
28
29    //
30    // Language subtag fields
31    //
32    private String _language = "";      // language subtag
33    private String _script = "";        // script subtag
34    private String _region = "";        // region subtag
35    private String _privateuse = "";    // privateuse
36
37    private List<String> _extlangs = Collections.emptyList();   // extlang subtags
38    private List<String> _variants = Collections.emptyList();   // variant subtags
39    private List<String> _extensions = Collections.emptyList(); // extensions
40
41    // Map contains grandfathered tags and its preferred mappings from
42    // http://www.ietf.org/rfc/rfc5646.txt
43    private static final Map<AsciiUtil.CaseInsensitiveKey, String[]> GRANDFATHERED =
44        new HashMap<AsciiUtil.CaseInsensitiveKey, String[]>();
45
46    static {
47        // grandfathered = irregular           ; non-redundant tags registered
48        //               / regular             ; during the RFC 3066 era
49        //
50        // irregular     = "en-GB-oed"         ; irregular tags do not match
51        //               / "i-ami"             ; the 'langtag' production and
52        //               / "i-bnn"             ; would not otherwise be
53        //               / "i-default"         ; considered 'well-formed'
54        //               / "i-enochian"        ; These tags are all valid,
55        //               / "i-hak"             ; but most are deprecated
56        //               / "i-klingon"         ; in favor of more modern
57        //               / "i-lux"             ; subtags or subtag
58        //               / "i-mingo"           ; combination
59        //               / "i-navajo"
60        //               / "i-pwn"
61        //               / "i-tao"
62        //               / "i-tay"
63        //               / "i-tsu"
64        //               / "sgn-BE-FR"
65        //               / "sgn-BE-NL"
66        //               / "sgn-CH-DE"
67        //
68        // regular       = "art-lojban"        ; these tags match the 'langtag'
69        //               / "cel-gaulish"       ; production, but their subtags
70        //               / "no-bok"            ; are not extended language
71        //               / "no-nyn"            ; or variant subtags: their meaning
72        //               / "zh-guoyu"          ; is defined by their registration
73        //               / "zh-hakka"          ; and all of these are deprecated
74        //               / "zh-min"            ; in favor of a more modern
75        //               / "zh-min-nan"        ; subtag or sequence of subtags
76        //               / "zh-xiang"
77
78        final String[][] entries = {
79          //{"tag",         "preferred"},
80            {"art-lojban",  "jbo"},
81            {"cel-gaulish", "xtg-x-cel-gaulish"},   // fallback
82            {"en-GB-oed",   "en-GB-x-oed"},         // fallback
83            {"i-ami",       "ami"},
84            {"i-bnn",       "bnn"},
85            {"i-default",   "en-x-i-default"},      // fallback
86            {"i-enochian",  "und-x-i-enochian"},    // fallback
87            {"i-hak",       "hak"},
88            {"i-klingon",   "tlh"},
89            {"i-lux",       "lb"},
90            {"i-mingo",     "see-x-i-mingo"},       // fallback
91            {"i-navajo",    "nv"},
92            {"i-pwn",       "pwn"},
93            {"i-tao",       "tao"},
94            {"i-tay",       "tay"},
95            {"i-tsu",       "tsu"},
96            {"no-bok",      "nb"},
97            {"no-nyn",      "nn"},
98            {"sgn-BE-FR",   "sfb"},
99            {"sgn-BE-NL",   "vgt"},
100            {"sgn-CH-DE",   "sgg"},
101            {"zh-guoyu",    "cmn"},
102            {"zh-hakka",    "hak"},
103            {"zh-min",      "nan-x-zh-min"},        // fallback
104            {"zh-min-nan",  "nan"},
105            {"zh-xiang",    "hsn"},
106        };
107        for (String[] e : entries) {
108            GRANDFATHERED.put(new AsciiUtil.CaseInsensitiveKey(e[0]), e);
109        }
110    }
111
112    private LanguageTag() {
113    }
114
115    /*
116     * BNF in RFC5464
117     *
118     * Language-Tag  = langtag             ; normal language tags
119     *               / privateuse          ; private use tag
120     *               / grandfathered       ; grandfathered tags
121     *
122     *
123     * langtag       = language
124     *                 ["-" script]
125     *                 ["-" region]
126     *                 *("-" variant)
127     *                 *("-" extension)
128     *                 ["-" privateuse]
129     *
130     * language      = 2*3ALPHA            ; shortest ISO 639 code
131     *                 ["-" extlang]       ; sometimes followed by
132     *                                     ; extended language subtags
133     *               / 4ALPHA              ; or reserved for future use
134     *               / 5*8ALPHA            ; or registered language subtag
135     *
136     * extlang       = 3ALPHA              ; selected ISO 639 codes
137     *                 *2("-" 3ALPHA)      ; permanently reserved
138     *
139     * script        = 4ALPHA              ; ISO 15924 code
140     *
141     * region        = 2ALPHA              ; ISO 3166-1 code
142     *               / 3DIGIT              ; UN M.49 code
143     *
144     * variant       = 5*8alphanum         ; registered variants
145     *               / (DIGIT 3alphanum)
146     *
147     * extension     = singleton 1*("-" (2*8alphanum))
148     *
149     *                                     ; Single alphanumerics
150     *                                     ; "x" reserved for private use
151     * singleton     = DIGIT               ; 0 - 9
152     *               / %x41-57             ; A - W
153     *               / %x59-5A             ; Y - Z
154     *               / %x61-77             ; a - w
155     *               / %x79-7A             ; y - z
156     *
157     * privateuse    = "x" 1*("-" (1*8alphanum))
158     *
159     */
160    public static LanguageTag parse(String languageTag, ParseStatus sts) {
161        if (sts == null) {
162            sts = new ParseStatus();
163        } else {
164            sts.reset();
165        }
166
167        StringTokenIterator itr;
168        boolean isGrandfathered = false;
169
170        // Check if the tag is grandfathered
171        String[] gfmap = GRANDFATHERED.get(new AsciiUtil.CaseInsensitiveKey(languageTag));
172        if (gfmap != null) {
173            // use preferred mapping
174            itr = new StringTokenIterator(gfmap[1], SEP);
175            isGrandfathered = true;
176        } else {
177            itr = new StringTokenIterator(languageTag, SEP);
178        }
179
180        LanguageTag tag = new LanguageTag();
181
182        // langtag must start with either language or privateuse
183        if (tag.parseLanguage(itr, sts)) {
184            tag.parseExtlangs(itr, sts);
185            tag.parseScript(itr, sts);
186            tag.parseRegion(itr, sts);
187            tag.parseVariants(itr, sts);
188            tag.parseExtensions(itr, sts);
189        }
190        tag.parsePrivateuse(itr, sts);
191
192        if (isGrandfathered) {
193            // Grandfathered tag is replaced with a well-formed tag above.
194            // However, the parsed length must be the original tag length.
195            assert (itr.isDone());
196            assert (!sts.isError());
197            sts._parseLength = languageTag.length();
198        } else if (!itr.isDone() && !sts.isError()) {
199            String s = itr.current();
200            sts._errorIndex = itr.currentStart();
201            if (s.length() == 0) {
202                sts._errorMsg = "Empty subtag";
203            } else {
204                sts._errorMsg = "Invalid subtag: " + s;
205            }
206        }
207
208        return tag;
209    }
210
211    //
212    // Language subtag parsers
213    //
214
215    private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) {
216        if (itr.isDone() || sts.isError()) {
217            return false;
218        }
219
220        boolean found = false;
221
222        String s = itr.current();
223        if (isLanguage(s)) {
224            found = true;
225            _language = s;
226            sts._parseLength = itr.currentEnd();
227            itr.next();
228        }
229
230        return found;
231    }
232
233    private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) {
234        if (itr.isDone() || sts.isError()) {
235            return false;
236        }
237
238        boolean found = false;
239
240        while (!itr.isDone()) {
241            String s = itr.current();
242            if (!isExtlang(s)) {
243                break;
244            }
245            found = true;
246            if (_extlangs.isEmpty()) {
247                _extlangs = new ArrayList<String>(3);
248            }
249            _extlangs.add(s);
250            sts._parseLength = itr.currentEnd();
251            itr.next();
252
253            if (_extlangs.size() == 3) {
254                // Maximum 3 extlangs
255                break;
256            }
257        }
258
259        return found;
260    }
261
262    private boolean parseScript(StringTokenIterator itr, ParseStatus sts) {
263        if (itr.isDone() || sts.isError()) {
264            return false;
265        }
266
267        boolean found = false;
268
269        String s = itr.current();
270        if (isScript(s)) {
271            found = true;
272            _script = s;
273            sts._parseLength = itr.currentEnd();
274            itr.next();
275        }
276
277        return found;
278    }
279
280    private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) {
281        if (itr.isDone() || sts.isError()) {
282            return false;
283        }
284
285        boolean found = false;
286
287        String s = itr.current();
288        if (isRegion(s)) {
289            found = true;
290            _region = s;
291            sts._parseLength = itr.currentEnd();
292            itr.next();
293        }
294
295        return found;
296    }
297
298    private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) {
299        if (itr.isDone() || sts.isError()) {
300            return false;
301        }
302
303        boolean found = false;
304
305        while (!itr.isDone()) {
306            String s = itr.current();
307            if (!isVariant(s)) {
308                break;
309            }
310            found = true;
311            if (_variants.isEmpty()) {
312                _variants = new ArrayList<String>(3);
313            }
314            _variants.add(s);
315            sts._parseLength = itr.currentEnd();
316            itr.next();
317        }
318
319        return found;
320    }
321
322    private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) {
323        if (itr.isDone() || sts.isError()) {
324            return false;
325        }
326
327        boolean found = false;
328
329        while (!itr.isDone()) {
330            String s = itr.current();
331            if (isExtensionSingleton(s)) {
332                int start = itr.currentStart();
333                String singleton = s;
334                StringBuilder sb = new StringBuilder(singleton);
335
336                itr.next();
337                while (!itr.isDone()) {
338                    s = itr.current();
339                    if (isExtensionSubtag(s)) {
340                        sb.append(SEP).append(s);
341                        sts._parseLength = itr.currentEnd();
342                    } else {
343                        break;
344                    }
345                    itr.next();
346                }
347
348                if (sts._parseLength <= start) {
349                    sts._errorIndex = start;
350                    sts._errorMsg = "Incomplete extension '" + singleton + "'";
351                    break;
352                }
353
354                if (_extensions.size() == 0) {
355                    _extensions = new ArrayList<String>(4);
356                }
357                _extensions.add(sb.toString());
358                found = true;
359            } else {
360                break;
361            }
362        }
363        return found;
364    }
365
366    private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) {
367        if (itr.isDone() || sts.isError()) {
368            return false;
369        }
370
371        boolean found = false;
372
373        String s = itr.current();
374        if (isPrivateusePrefix(s)) {
375            int start = itr.currentStart();
376            StringBuilder sb = new StringBuilder(s);
377
378            itr.next();
379            while (!itr.isDone()) {
380                s = itr.current();
381                if (!isPrivateuseSubtag(s)) {
382                    break;
383                }
384                sb.append(SEP).append(s);
385                sts._parseLength = itr.currentEnd();
386
387                itr.next();
388            }
389
390            if (sts._parseLength <= start) {
391                // need at least 1 private subtag
392                sts._errorIndex = start;
393                sts._errorMsg = "Incomplete privateuse";
394            } else {
395                _privateuse = sb.toString();
396                found = true;
397            }
398        }
399
400        return found;
401    }
402
403    public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) {
404        LanguageTag tag = new LanguageTag();
405
406        String language = baseLocale.getLanguage();
407        String script = baseLocale.getScript();
408        String region = baseLocale.getRegion();
409        String variant = baseLocale.getVariant();
410
411        boolean hasSubtag = false;
412
413        String privuseVar = null;   // store ill-formed variant subtags
414
415        if (language.length() > 0 && isLanguage(language)) {
416            // Convert a deprecated language code used by Java to
417            // a new code
418            if (language.equals("iw")) {
419                language = "he";
420            } else if (language.equals("ji")) {
421                language = "yi";
422            } else if (language.equals("in")) {
423                language = "id";
424            }
425            tag._language = language;
426        }
427
428        if (script.length() > 0 && isScript(script)) {
429            tag._script = canonicalizeScript(script);
430            hasSubtag = true;
431        }
432
433        if (region.length() > 0 && isRegion(region)) {
434            tag._region = canonicalizeRegion(region);
435            hasSubtag = true;
436        }
437
438        if (JDKIMPL) {
439            // Special handling for no_NO_NY - use nn_NO for language tag
440            if (tag._language.equals("no") && tag._region.equals("NO") && variant.equals("NY")) {
441                tag._language = "nn";
442                variant = "";
443            }
444        }
445
446        if (variant.length() > 0) {
447            List<String> variants = null;
448            StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP);
449            while (!varitr.isDone()) {
450                String var = varitr.current();
451                if (!isVariant(var)) {
452                    break;
453                }
454                if (variants == null) {
455                    variants = new ArrayList<String>();
456                }
457                if (JDKIMPL) {
458                    variants.add(var);  // Do not canonicalize!
459                } else {
460                    variants.add(canonicalizeVariant(var));
461                }
462                varitr.next();
463            }
464            if (variants != null) {
465                tag._variants = variants;
466                hasSubtag = true;
467            }
468            if (!varitr.isDone()) {
469                // ill-formed variant subtags
470                StringBuilder buf = new StringBuilder();
471                while (!varitr.isDone()) {
472                    String prvv = varitr.current();
473                    if (!isPrivateuseSubtag(prvv)) {
474                        // cannot use private use subtag - truncated
475                        break;
476                    }
477                    if (buf.length() > 0) {
478                        buf.append(SEP);
479                    }
480                    if (!JDKIMPL) {
481                        prvv = AsciiUtil.toLowerString(prvv);
482                    }
483                    buf.append(prvv);
484                    varitr.next();
485                }
486                if (buf.length() > 0) {
487                    privuseVar = buf.toString();
488                }
489            }
490        }
491
492        List<String> extensions = null;
493        String privateuse = null;
494
495        Set<Character> locextKeys = localeExtensions.getKeys();
496        for (Character locextKey : locextKeys) {
497            Extension ext = localeExtensions.getExtension(locextKey);
498            if (isPrivateusePrefixChar(locextKey.charValue())) {
499                privateuse = ext.getValue();
500            } else {
501                if (extensions == null) {
502                    extensions = new ArrayList<String>();
503                }
504                extensions.add(locextKey.toString() + SEP + ext.getValue());
505            }
506        }
507
508        if (extensions != null) {
509            tag._extensions = extensions;
510            hasSubtag = true;
511        }
512
513        // append ill-formed variant subtags to private use
514        if (privuseVar != null) {
515            if (privateuse == null) {
516                privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar;
517            } else {
518                privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX + SEP + privuseVar.replace(BaseLocale.SEP, SEP);
519            }
520        }
521
522        if (privateuse != null) {
523            tag._privateuse = privateuse;
524        }
525
526        if (tag._language.length() == 0 && (hasSubtag || privateuse == null)) {
527            // use lang "und" when 1) no language is available AND
528            // 2) any of other subtags other than private use are available or
529            // no private use tag is available
530            tag._language = UNDETERMINED;
531        }
532
533        return tag;
534    }
535
536    //
537    // Getter methods for language subtag fields
538    //
539
540    public String getLanguage() {
541        return _language;
542    }
543
544    public List<String> getExtlangs() {
545        return Collections.unmodifiableList(_extlangs);
546    }
547
548    public String getScript() {
549        return _script;
550    }
551
552    public String getRegion() {
553        return _region;
554    }
555
556    public List<String> getVariants() {
557        return Collections.unmodifiableList(_variants);
558    }
559
560    public List<String> getExtensions() {
561        return Collections.unmodifiableList(_extensions);
562    }
563
564    public String getPrivateuse() {
565        return _privateuse;
566    }
567
568    //
569    // Language subtag syntax checking methods
570    //
571
572    public static boolean isLanguage(String s) {
573        // language      = 2*3ALPHA            ; shortest ISO 639 code
574        //                 ["-" extlang]       ; sometimes followed by
575        //                                     ;   extended language subtags
576        //               / 4ALPHA              ; or reserved for future use
577        //               / 5*8ALPHA            ; or registered language subtag
578        return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaString(s);
579    }
580
581    public static boolean isExtlang(String s) {
582        // extlang       = 3ALPHA              ; selected ISO 639 codes
583        //                 *2("-" 3ALPHA)      ; permanently reserved
584        return (s.length() == 3) && AsciiUtil.isAlphaString(s);
585    }
586
587    public static boolean isScript(String s) {
588        // script        = 4ALPHA              ; ISO 15924 code
589        return (s.length() == 4) && AsciiUtil.isAlphaString(s);
590    }
591
592    public static boolean isRegion(String s) {
593        // region        = 2ALPHA              ; ISO 3166-1 code
594        //               / 3DIGIT              ; UN M.49 code
595        return ((s.length() == 2) && AsciiUtil.isAlphaString(s))
596                || ((s.length() == 3) && AsciiUtil.isNumericString(s));
597    }
598
599    public static boolean isVariant(String s) {
600        // variant       = 5*8alphanum         ; registered variants
601        //               / (DIGIT 3alphanum)
602        int len = s.length();
603        if (len >= 5 && len <= 8) {
604            return AsciiUtil.isAlphaNumericString(s);
605        }
606        if (len == 4) {
607            return AsciiUtil.isNumeric(s.charAt(0))
608                    && AsciiUtil.isAlphaNumeric(s.charAt(1))
609                    && AsciiUtil.isAlphaNumeric(s.charAt(2))
610                    && AsciiUtil.isAlphaNumeric(s.charAt(3));
611        }
612        return false;
613    }
614
615    public static boolean isExtensionSingleton(String s) {
616        // singleton     = DIGIT               ; 0 - 9
617        //               / %x41-57             ; A - W
618        //               / %x59-5A             ; Y - Z
619        //               / %x61-77             ; a - w
620        //               / %x79-7A             ; y - z
621
622        return (s.length() == 1)
623                && AsciiUtil.isAlphaString(s)
624                && !AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
625    }
626
627    public static boolean isExtensionSingletonChar(char c) {
628        return isExtensionSingleton(String.valueOf(c));
629    }
630
631    public static boolean isExtensionSubtag(String s) {
632        // extension     = singleton 1*("-" (2*8alphanum))
633        return (s.length() >= 2) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s);
634    }
635
636    public static boolean isPrivateusePrefix(String s) {
637        // privateuse    = "x" 1*("-" (1*8alphanum))
638        return (s.length() == 1)
639                && AsciiUtil.caseIgnoreMatch(PRIVATEUSE, s);
640    }
641
642    public static boolean isPrivateusePrefixChar(char c) {
643        return (AsciiUtil.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c)));
644    }
645
646    public static boolean isPrivateuseSubtag(String s) {
647        // privateuse    = "x" 1*("-" (1*8alphanum))
648        return (s.length() >= 1) && (s.length() <= 8) && AsciiUtil.isAlphaNumericString(s);
649    }
650
651    //
652    // Language subtag canonicalization methods
653    //
654
655    public static String canonicalizeLanguage(String s) {
656        return AsciiUtil.toLowerString(s);
657    }
658
659    public static String canonicalizeExtlang(String s) {
660        return AsciiUtil.toLowerString(s);
661    }
662
663    public static String canonicalizeScript(String s) {
664        return AsciiUtil.toTitleString(s);
665    }
666
667    public static String canonicalizeRegion(String s) {
668        return AsciiUtil.toUpperString(s);
669    }
670
671    public static String canonicalizeVariant(String s) {
672        return AsciiUtil.toLowerString(s);
673    }
674
675    public static String canonicalizeExtension(String s) {
676        return AsciiUtil.toLowerString(s);
677    }
678
679    public static String canonicalizeExtensionSingleton(String s) {
680        return AsciiUtil.toLowerString(s);
681    }
682
683    public static String canonicalizeExtensionSubtag(String s) {
684        return AsciiUtil.toLowerString(s);
685    }
686
687    public static String canonicalizePrivateuse(String s) {
688        return AsciiUtil.toLowerString(s);
689    }
690
691    public static String canonicalizePrivateuseSubtag(String s) {
692        return AsciiUtil.toLowerString(s);
693    }
694
695    @Override
696    public String toString() {
697        StringBuilder sb = new StringBuilder();
698
699        if (_language.length() > 0) {
700            sb.append(_language);
701
702            for (String extlang : _extlangs) {
703                sb.append(SEP).append(extlang);
704            }
705
706            if (_script.length() > 0) {
707                sb.append(SEP).append(_script);
708            }
709
710            if (_region.length() > 0) {
711                sb.append(SEP).append(_region);
712            }
713
714            for (String variant : _variants) {
715                sb.append(SEP).append(variant);
716            }
717
718            for (String extension : _extensions) {
719                sb.append(SEP).append(extension);
720            }
721        }
722        if (_privateuse.length() > 0) {
723            if (sb.length() > 0) {
724                sb.append(SEP);
725            }
726            sb.append(_privateuse);
727        }
728
729        return sb.toString();
730    }
731}
732