1/*
2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.  Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26/*
27 *******************************************************************************
28 * Copyright (C) 2010, International Business Machines Corporation and         *
29 * others. All Rights Reserved.                                                *
30 *******************************************************************************
31 */
32package sun.util.locale;
33
34import java.util.ArrayList;
35import java.util.Collections;
36import java.util.HashMap;
37import java.util.List;
38import java.util.Map;
39import java.util.Set;
40
41public class LanguageTag {
42    //
43    // static fields
44    //
45    public static final String SEP = "-";
46    public static final String PRIVATEUSE = "x";
47    public static final String UNDETERMINED = "und";
48    public static final String PRIVUSE_VARIANT_PREFIX = "lvariant";
49
50    //
51    // Language subtag fields
52    //
53    private String language = "";      // language subtag
54    private String script = "";        // script subtag
55    private String region = "";        // region subtag
56    private String privateuse = "";    // privateuse
57
58    private List<String> extlangs = Collections.emptyList();   // extlang subtags
59    private List<String> variants = Collections.emptyList();   // variant subtags
60    private List<String> extensions = Collections.emptyList(); // extensions
61
62    // Map contains grandfathered tags and its preferred mappings from
63    // http://www.ietf.org/rfc/rfc5646.txt
64    // Keys are lower-case strings.
65    private static final Map<String, String[]> GRANDFATHERED = new HashMap<>();
66
67    static {
68        // grandfathered = irregular           ; non-redundant tags registered
69        //               / regular             ; during the RFC 3066 era
70        //
71        // irregular     = "en-GB-oed"         ; irregular tags do not match
72        //               / "i-ami"             ; the 'langtag' production and
73        //               / "i-bnn"             ; would not otherwise be
74        //               / "i-default"         ; considered 'well-formed'
75        //               / "i-enochian"        ; These tags are all valid,
76        //               / "i-hak"             ; but most are deprecated
77        //               / "i-klingon"         ; in favor of more modern
78        //               / "i-lux"             ; subtags or subtag
79        //               / "i-mingo"           ; combination
80        //               / "i-navajo"
81        //               / "i-pwn"
82        //               / "i-tao"
83        //               / "i-tay"
84        //               / "i-tsu"
85        //               / "sgn-BE-FR"
86        //               / "sgn-BE-NL"
87        //               / "sgn-CH-DE"
88        //
89        // regular       = "art-lojban"        ; these tags match the 'langtag'
90        //               / "cel-gaulish"       ; production, but their subtags
91        //               / "no-bok"            ; are not extended language
92        //               / "no-nyn"            ; or variant subtags: their meaning
93        //               / "zh-guoyu"          ; is defined by their registration
94        //               / "zh-hakka"          ; and all of these are deprecated
95        //               / "zh-min"            ; in favor of a more modern
96        //               / "zh-min-nan"        ; subtag or sequence of subtags
97        //               / "zh-xiang"
98
99        final String[][] entries = {
100          //{"tag",         "preferred"},
101            {"art-lojban",  "jbo"},
102            {"cel-gaulish", "xtg-x-cel-gaulish"},   // fallback
103            {"en-GB-oed",   "en-GB-x-oed"},         // fallback
104            {"i-ami",       "ami"},
105            {"i-bnn",       "bnn"},
106            {"i-default",   "en-x-i-default"},      // fallback
107            {"i-enochian",  "und-x-i-enochian"},    // fallback
108            {"i-hak",       "hak"},
109            {"i-klingon",   "tlh"},
110            {"i-lux",       "lb"},
111            {"i-mingo",     "see-x-i-mingo"},       // fallback
112            {"i-navajo",    "nv"},
113            {"i-pwn",       "pwn"},
114            {"i-tao",       "tao"},
115            {"i-tay",       "tay"},
116            {"i-tsu",       "tsu"},
117            {"no-bok",      "nb"},
118            {"no-nyn",      "nn"},
119            {"sgn-BE-FR",   "sfb"},
120            {"sgn-BE-NL",   "vgt"},
121            {"sgn-CH-DE",   "sgg"},
122            {"zh-guoyu",    "cmn"},
123            {"zh-hakka",    "hak"},
124            {"zh-min",      "nan-x-zh-min"},        // fallback
125            {"zh-min-nan",  "nan"},
126            {"zh-xiang",    "hsn"},
127        };
128        for (String[] e : entries) {
129            GRANDFATHERED.put(LocaleUtils.toLowerString(e[0]), e);
130        }
131    }
132
133    private LanguageTag() {
134    }
135
136    /*
137     * BNF in RFC5646
138     *
139     * Language-Tag  = langtag             ; normal language tags
140     *               / privateuse          ; private use tag
141     *               / grandfathered       ; grandfathered tags
142     *
143     *
144     * langtag       = language
145     *                 ["-" script]
146     *                 ["-" region]
147     *                 *("-" variant)
148     *                 *("-" extension)
149     *                 ["-" privateuse]
150     *
151     * language      = 2*3ALPHA            ; shortest ISO 639 code
152     *                 ["-" extlang]       ; sometimes followed by
153     *                                     ; extended language subtags
154     *               / 4ALPHA              ; or reserved for future use
155     *               / 5*8ALPHA            ; or registered language subtag
156     *
157     * extlang       = 3ALPHA              ; selected ISO 639 codes
158     *                 *2("-" 3ALPHA)      ; permanently reserved
159     *
160     * script        = 4ALPHA              ; ISO 15924 code
161     *
162     * region        = 2ALPHA              ; ISO 3166-1 code
163     *               / 3DIGIT              ; UN M.49 code
164     *
165     * variant       = 5*8alphanum         ; registered variants
166     *               / (DIGIT 3alphanum)
167     *
168     * extension     = singleton 1*("-" (2*8alphanum))
169     *
170     *                                     ; Single alphanumerics
171     *                                     ; "x" reserved for private use
172     * singleton     = DIGIT               ; 0 - 9
173     *               / %x41-57             ; A - W
174     *               / %x59-5A             ; Y - Z
175     *               / %x61-77             ; a - w
176     *               / %x79-7A             ; y - z
177     *
178     * privateuse    = "x" 1*("-" (1*8alphanum))
179     *
180     */
181    public static LanguageTag parse(String languageTag, ParseStatus sts) {
182        if (sts == null) {
183            sts = new ParseStatus();
184        } else {
185            sts.reset();
186        }
187
188        StringTokenIterator itr;
189
190        // Check if the tag is grandfathered
191        String[] gfmap = GRANDFATHERED.get(LocaleUtils.toLowerString(languageTag));
192        if (gfmap != null) {
193            // use preferred mapping
194            itr = new StringTokenIterator(gfmap[1], SEP);
195        } else {
196            itr = new StringTokenIterator(languageTag, SEP);
197        }
198
199        LanguageTag tag = new LanguageTag();
200
201        // langtag must start with either language or privateuse
202        if (tag.parseLanguage(itr, sts)) {
203            tag.parseExtlangs(itr, sts);
204            tag.parseScript(itr, sts);
205            tag.parseRegion(itr, sts);
206            tag.parseVariants(itr, sts);
207            tag.parseExtensions(itr, sts);
208        }
209        tag.parsePrivateuse(itr, sts);
210
211        if (!itr.isDone() && !sts.isError()) {
212            String s = itr.current();
213            sts.errorIndex = itr.currentStart();
214            if (s.length() == 0) {
215                sts.errorMsg = "Empty subtag";
216            } else {
217                sts.errorMsg = "Invalid subtag: " + s;
218            }
219        }
220
221        return tag;
222    }
223
224    //
225    // Language subtag parsers
226    //
227
228    private boolean parseLanguage(StringTokenIterator itr, ParseStatus sts) {
229        if (itr.isDone() || sts.isError()) {
230            return false;
231        }
232
233        boolean found = false;
234
235        String s = itr.current();
236        if (isLanguage(s)) {
237            found = true;
238            language = s;
239            sts.parseLength = itr.currentEnd();
240            itr.next();
241        }
242
243        return found;
244    }
245
246    private boolean parseExtlangs(StringTokenIterator itr, ParseStatus sts) {
247        if (itr.isDone() || sts.isError()) {
248            return false;
249        }
250
251        boolean found = false;
252
253        while (!itr.isDone()) {
254            String s = itr.current();
255            if (!isExtlang(s)) {
256                break;
257            }
258            found = true;
259            if (extlangs.isEmpty()) {
260                extlangs = new ArrayList<>(3);
261            }
262            extlangs.add(s);
263            sts.parseLength = itr.currentEnd();
264            itr.next();
265
266            if (extlangs.size() == 3) {
267                // Maximum 3 extlangs
268                break;
269            }
270        }
271
272        return found;
273    }
274
275    private boolean parseScript(StringTokenIterator itr, ParseStatus sts) {
276        if (itr.isDone() || sts.isError()) {
277            return false;
278        }
279
280        boolean found = false;
281
282        String s = itr.current();
283        if (isScript(s)) {
284            found = true;
285            script = s;
286            sts.parseLength = itr.currentEnd();
287            itr.next();
288        }
289
290        return found;
291    }
292
293    private boolean parseRegion(StringTokenIterator itr, ParseStatus sts) {
294        if (itr.isDone() || sts.isError()) {
295            return false;
296        }
297
298        boolean found = false;
299
300        String s = itr.current();
301        if (isRegion(s)) {
302            found = true;
303            region = s;
304            sts.parseLength = itr.currentEnd();
305            itr.next();
306        }
307
308        return found;
309    }
310
311    private boolean parseVariants(StringTokenIterator itr, ParseStatus sts) {
312        if (itr.isDone() || sts.isError()) {
313            return false;
314        }
315
316        boolean found = false;
317
318        while (!itr.isDone()) {
319            String s = itr.current();
320            if (!isVariant(s)) {
321                break;
322            }
323            found = true;
324            if (variants.isEmpty()) {
325                variants = new ArrayList<>(3);
326            }
327            variants.add(s);
328            sts.parseLength = itr.currentEnd();
329            itr.next();
330        }
331
332        return found;
333    }
334
335    private boolean parseExtensions(StringTokenIterator itr, ParseStatus sts) {
336        if (itr.isDone() || sts.isError()) {
337            return false;
338        }
339
340        boolean found = false;
341
342        while (!itr.isDone()) {
343            String s = itr.current();
344            if (isExtensionSingleton(s)) {
345                int start = itr.currentStart();
346                String singleton = s;
347                StringBuilder sb = new StringBuilder(singleton);
348
349                itr.next();
350                while (!itr.isDone()) {
351                    s = itr.current();
352                    if (isExtensionSubtag(s)) {
353                        sb.append(SEP).append(s);
354                        sts.parseLength = itr.currentEnd();
355                    } else {
356                        break;
357                    }
358                    itr.next();
359                }
360
361                if (sts.parseLength <= start) {
362                    sts.errorIndex = start;
363                    sts.errorMsg = "Incomplete extension '" + singleton + "'";
364                    break;
365                }
366
367                if (extensions.isEmpty()) {
368                    extensions = new ArrayList<>(4);
369                }
370                extensions.add(sb.toString());
371                found = true;
372            } else {
373                break;
374            }
375        }
376        return found;
377    }
378
379    private boolean parsePrivateuse(StringTokenIterator itr, ParseStatus sts) {
380        if (itr.isDone() || sts.isError()) {
381            return false;
382        }
383
384        boolean found = false;
385
386        String s = itr.current();
387        if (isPrivateusePrefix(s)) {
388            int start = itr.currentStart();
389            StringBuilder sb = new StringBuilder(s);
390
391            itr.next();
392            while (!itr.isDone()) {
393                s = itr.current();
394                if (!isPrivateuseSubtag(s)) {
395                    break;
396                }
397                sb.append(SEP).append(s);
398                sts.parseLength = itr.currentEnd();
399
400                itr.next();
401            }
402
403            if (sts.parseLength <= start) {
404                // need at least 1 private subtag
405                sts.errorIndex = start;
406                sts.errorMsg = "Incomplete privateuse";
407            } else {
408                privateuse = sb.toString();
409                found = true;
410            }
411        }
412
413        return found;
414    }
415
416    public static LanguageTag parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) {
417        LanguageTag tag = new LanguageTag();
418
419        String language = baseLocale.getLanguage();
420        String script = baseLocale.getScript();
421        String region = baseLocale.getRegion();
422        String variant = baseLocale.getVariant();
423
424        boolean hasSubtag = false;
425
426        String privuseVar = null;   // store ill-formed variant subtags
427
428        if (isLanguage(language)) {
429            // Convert a deprecated language code to its new code
430            if (language.equals("iw")) {
431                language = "he";
432            } else if (language.equals("ji")) {
433                language = "yi";
434            } else if (language.equals("in")) {
435                language = "id";
436            }
437            tag.language = language;
438        }
439
440        if (isScript(script)) {
441            tag.script = canonicalizeScript(script);
442            hasSubtag = true;
443        }
444
445        if (isRegion(region)) {
446            tag.region = canonicalizeRegion(region);
447            hasSubtag = true;
448        }
449
450        // Special handling for no_NO_NY - use nn_NO for language tag
451        if (tag.language.equals("no") && tag.region.equals("NO") && variant.equals("NY")) {
452            tag.language = "nn";
453            variant = "";
454        }
455
456        if (variant.length() > 0) {
457            List<String> variants = null;
458            StringTokenIterator varitr = new StringTokenIterator(variant, BaseLocale.SEP);
459            while (!varitr.isDone()) {
460                String var = varitr.current();
461                if (!isVariant(var)) {
462                    break;
463                }
464                if (variants == null) {
465                    variants = new ArrayList<>();
466                }
467                variants.add(var);  // Do not canonicalize!
468                varitr.next();
469            }
470            if (variants != null) {
471                tag.variants = variants;
472                hasSubtag = true;
473            }
474            if (!varitr.isDone()) {
475                // ill-formed variant subtags
476                StringBuilder buf = new StringBuilder();
477                while (!varitr.isDone()) {
478                    String prvv = varitr.current();
479                    if (!isPrivateuseSubtag(prvv)) {
480                        // cannot use private use subtag - truncated
481                        break;
482                    }
483                    if (buf.length() > 0) {
484                        buf.append(SEP);
485                    }
486                    buf.append(prvv);
487                    varitr.next();
488                }
489                if (buf.length() > 0) {
490                    privuseVar = buf.toString();
491                }
492            }
493        }
494
495        List<String> extensions = null;
496        String privateuse = null;
497
498        if (localeExtensions != null) {
499            Set<Character> locextKeys = localeExtensions.getKeys();
500            for (Character locextKey : locextKeys) {
501                Extension ext = localeExtensions.getExtension(locextKey);
502                if (isPrivateusePrefixChar(locextKey)) {
503                    privateuse = ext.getValue();
504                } else {
505                    if (extensions == null) {
506                        extensions = new ArrayList<>();
507                    }
508                    extensions.add(locextKey.toString() + SEP + ext.getValue());
509                }
510            }
511        }
512
513        if (extensions != null) {
514            tag.extensions = extensions;
515            hasSubtag = true;
516        }
517
518        // append ill-formed variant subtags to private use
519        if (privuseVar != null) {
520            if (privateuse == null) {
521                privateuse = PRIVUSE_VARIANT_PREFIX + SEP + privuseVar;
522            } else {
523                privateuse = privateuse + SEP + PRIVUSE_VARIANT_PREFIX
524                             + SEP + privuseVar.replace(BaseLocale.SEP, SEP);
525            }
526        }
527
528        if (privateuse != null) {
529            tag.privateuse = privateuse;
530        }
531
532        if (tag.language.length() == 0 && (hasSubtag || privateuse == null)) {
533            // use lang "und" when 1) no language is available AND
534            // 2) any of other subtags other than private use are available or
535            // no private use tag is available
536            tag.language = UNDETERMINED;
537        }
538
539        return tag;
540    }
541
542    //
543    // Getter methods for language subtag fields
544    //
545
546    public String getLanguage() {
547        return language;
548    }
549
550    public List<String> getExtlangs() {
551        if (extlangs.isEmpty()) {
552            return Collections.emptyList();
553        }
554        return Collections.unmodifiableList(extlangs);
555    }
556
557    public String getScript() {
558        return script;
559    }
560
561    public String getRegion() {
562        return region;
563    }
564
565    public List<String> getVariants() {
566        if (variants.isEmpty()) {
567            return Collections.emptyList();
568        }
569        return Collections.unmodifiableList(variants);
570    }
571
572    public List<String> getExtensions() {
573        if (extensions.isEmpty()) {
574            return Collections.emptyList();
575        }
576        return Collections.unmodifiableList(extensions);
577    }
578
579    public String getPrivateuse() {
580        return privateuse;
581    }
582
583    //
584    // Language subtag syntax checking methods
585    //
586
587    public static boolean isLanguage(String s) {
588        // language      = 2*3ALPHA            ; shortest ISO 639 code
589        //                 ["-" extlang]       ; sometimes followed by
590        //                                     ;   extended language subtags
591        //               / 4ALPHA              ; or reserved for future use
592        //               / 5*8ALPHA            ; or registered language subtag
593        int len = s.length();
594        return (len >= 2) && (len <= 8) && LocaleUtils.isAlphaString(s);
595    }
596
597    public static boolean isExtlang(String s) {
598        // extlang       = 3ALPHA              ; selected ISO 639 codes
599        //                 *2("-" 3ALPHA)      ; permanently reserved
600        return (s.length() == 3) && LocaleUtils.isAlphaString(s);
601    }
602
603    public static boolean isScript(String s) {
604        // script        = 4ALPHA              ; ISO 15924 code
605        return (s.length() == 4) && LocaleUtils.isAlphaString(s);
606    }
607
608    public static boolean isRegion(String s) {
609        // region        = 2ALPHA              ; ISO 3166-1 code
610        //               / 3DIGIT              ; UN M.49 code
611        return ((s.length() == 2) && LocaleUtils.isAlphaString(s))
612                || ((s.length() == 3) && LocaleUtils.isNumericString(s));
613    }
614
615    public static boolean isVariant(String s) {
616        // variant       = 5*8alphanum         ; registered variants
617        //               / (DIGIT 3alphanum)
618        int len = s.length();
619        if (len >= 5 && len <= 8) {
620            return LocaleUtils.isAlphaNumericString(s);
621        }
622        if (len == 4) {
623            return LocaleUtils.isNumeric(s.charAt(0))
624                    && LocaleUtils.isAlphaNumeric(s.charAt(1))
625                    && LocaleUtils.isAlphaNumeric(s.charAt(2))
626                    && LocaleUtils.isAlphaNumeric(s.charAt(3));
627        }
628        return false;
629    }
630
631    public static boolean isExtensionSingleton(String s) {
632        // singleton     = DIGIT               ; 0 - 9
633        //               / %x41-57             ; A - W
634        //               / %x59-5A             ; Y - Z
635        //               / %x61-77             ; a - w
636        //               / %x79-7A             ; y - z
637
638        return (s.length() == 1)
639                && LocaleUtils.isAlphaString(s)
640                && !LocaleUtils.caseIgnoreMatch(PRIVATEUSE, s);
641    }
642
643    public static boolean isExtensionSingletonChar(char c) {
644        return isExtensionSingleton(String.valueOf(c));
645    }
646
647    public static boolean isExtensionSubtag(String s) {
648        // extension     = singleton 1*("-" (2*8alphanum))
649        int len = s.length();
650        return (len >= 2) && (len <= 8) && LocaleUtils.isAlphaNumericString(s);
651    }
652
653    public static boolean isPrivateusePrefix(String s) {
654        // privateuse    = "x" 1*("-" (1*8alphanum))
655        return (s.length() == 1)
656                && LocaleUtils.caseIgnoreMatch(PRIVATEUSE, s);
657    }
658
659    public static boolean isPrivateusePrefixChar(char c) {
660        return (LocaleUtils.caseIgnoreMatch(PRIVATEUSE, String.valueOf(c)));
661    }
662
663    public static boolean isPrivateuseSubtag(String s) {
664        // privateuse    = "x" 1*("-" (1*8alphanum))
665        int len = s.length();
666        return (len >= 1) && (len <= 8) && LocaleUtils.isAlphaNumericString(s);
667    }
668
669    //
670    // Language subtag canonicalization methods
671    //
672
673    public static String canonicalizeLanguage(String s) {
674        return LocaleUtils.toLowerString(s);
675    }
676
677    public static String canonicalizeExtlang(String s) {
678        return LocaleUtils.toLowerString(s);
679    }
680
681    public static String canonicalizeScript(String s) {
682        return LocaleUtils.toTitleString(s);
683    }
684
685    public static String canonicalizeRegion(String s) {
686        return LocaleUtils.toUpperString(s);
687    }
688
689    public static String canonicalizeVariant(String s) {
690        return LocaleUtils.toLowerString(s);
691    }
692
693    public static String canonicalizeExtension(String s) {
694        return LocaleUtils.toLowerString(s);
695    }
696
697    public static String canonicalizeExtensionSingleton(String s) {
698        return LocaleUtils.toLowerString(s);
699    }
700
701    public static String canonicalizeExtensionSubtag(String s) {
702        return LocaleUtils.toLowerString(s);
703    }
704
705    public static String canonicalizePrivateuse(String s) {
706        return LocaleUtils.toLowerString(s);
707    }
708
709    public static String canonicalizePrivateuseSubtag(String s) {
710        return LocaleUtils.toLowerString(s);
711    }
712
713    @Override
714    public String toString() {
715        StringBuilder sb = new StringBuilder();
716
717        if (language.length() > 0) {
718            sb.append(language);
719
720            for (String extlang : extlangs) {
721                sb.append(SEP).append(extlang);
722            }
723
724            if (script.length() > 0) {
725                sb.append(SEP).append(script);
726            }
727
728            if (region.length() > 0) {
729                sb.append(SEP).append(region);
730            }
731
732            for (String variant : variants) {
733                sb.append(SEP).append(variant);
734            }
735
736            for (String extension : extensions) {
737                sb.append(SEP).append(extension);
738            }
739        }
740        if (privateuse.length() > 0) {
741            if (sb.length() > 0) {
742                sb.append(SEP);
743            }
744            sb.append(privateuse);
745        }
746
747        return sb.toString();
748    }
749}
750