1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4******************************************************************************
5* Copyright (C) 2003-2011, International Business Machines Corporation and   *
6* others. All Rights Reserved.                                               *
7******************************************************************************
8*/
9
10package com.ibm.icu.impl;
11
12import java.util.Collections;
13import java.util.Comparator;
14import java.util.Iterator;
15import java.util.Map;
16import java.util.TreeMap;
17
18import com.ibm.icu.impl.locale.AsciiUtil;
19
20/**
21 * Utility class to parse and normalize locale ids (including POSIX style)
22 */
23public final class LocaleIDParser {
24
25    /**
26     * Char array representing the locale ID.
27     */
28    private char[] id;
29
30    /**
31     * Current position in {@link #id} (while parsing).
32     */
33    private int index;
34
35    /**
36     * Temporary buffer for parsed sections of data.
37     */
38    private StringBuilder buffer;
39
40    // um, don't handle POSIX ids unless we request it.  why not?  well... because.
41    private boolean canonicalize;
42    private boolean hadCountry;
43
44    // used when canonicalizing
45    Map<String, String> keywords;
46    String baseName;
47
48    /**
49     * Parsing constants.
50     */
51    private static final char KEYWORD_SEPARATOR     = '@';
52    private static final char HYPHEN                = '-';
53    private static final char KEYWORD_ASSIGN        = '=';
54    private static final char COMMA                 = ',';
55    private static final char ITEM_SEPARATOR        = ';';
56    private static final char DOT                   = '.';
57    private static final char UNDERSCORE            = '_';
58
59    public LocaleIDParser(String localeID) {
60        this(localeID, false);
61    }
62
63    public LocaleIDParser(String localeID, boolean canonicalize) {
64        id = localeID.toCharArray();
65        index = 0;
66        buffer = new StringBuilder(id.length + 5);
67        this.canonicalize = canonicalize;
68    }
69
70    private void reset() {
71        index = 0;
72        buffer = new StringBuilder(id.length + 5);
73    }
74
75    // utilities for working on text in the buffer
76
77    /**
78     * Append c to the buffer.
79     */
80    private void append(char c) {
81        buffer.append(c);
82    }
83
84    private void addSeparator() {
85        append(UNDERSCORE);
86    }
87
88    /**
89     * Returns the text in the buffer from start to blen as a String.
90     */
91    private String getString(int start) {
92        return buffer.substring(start);
93    }
94
95    /**
96     * Set the length of the buffer to pos, then append the string.
97     */
98    private void set(int pos, String s) {
99        buffer.delete(pos, buffer.length());
100        buffer.insert(pos, s);
101    }
102
103    /**
104     * Append the string to the buffer.
105     */
106    private void append(String s) {
107        buffer.append(s);
108    }
109
110    // utilities for parsing text out of the id
111
112    /**
113     * Character to indicate no more text is available in the id.
114     */
115    private static final char DONE = '\uffff';
116
117    /**
118     * Returns the character at index in the id, and advance index.  The returned character
119     * is DONE if index was at the limit of the buffer.  The index is advanced regardless
120     * so that decrementing the index will always 'unget' the last character returned.
121     */
122    private char next() {
123        if (index == id.length) {
124            index++;
125            return DONE;
126        }
127
128        return id[index++];
129    }
130
131    /**
132     * Advance index until the next terminator or id separator, and leave it there.
133     */
134    private void skipUntilTerminatorOrIDSeparator() {
135        while (!isTerminatorOrIDSeparator(next()));
136        --index;
137    }
138
139    /**
140     * Returns true if the character at index in the id is a terminator.
141     */
142    private boolean atTerminator() {
143        return index >= id.length || isTerminator(id[index]);
144    }
145
146    /**
147     * Returns true if the character is a terminator (keyword separator, dot, or DONE).
148     * Dot is a terminator because of the POSIX form, where dot precedes the codepage.
149     */
150    private boolean isTerminator(char c) {
151        // always terminate at DOT, even if not handling POSIX.  It's an error...
152        return c == KEYWORD_SEPARATOR || c == DONE || c == DOT;
153    }
154
155    /**
156     * Returns true if the character is a terminator or id separator.
157     */
158    private boolean isTerminatorOrIDSeparator(char c) {
159        return c == UNDERSCORE || c == HYPHEN || isTerminator(c);
160    }
161
162    /**
163     * Returns true if the start of the buffer has an experimental or private language
164     * prefix, the pattern '[ixIX][-_].' shows the syntax checked.
165     */
166    private boolean haveExperimentalLanguagePrefix() {
167        if (id.length > 2) {
168            char c = id[1];
169            if (c == HYPHEN || c == UNDERSCORE) {
170                c = id[0];
171                return c == 'x' || c == 'X' || c == 'i' || c == 'I';
172            }
173        }
174        return false;
175    }
176
177    /**
178     * Returns true if a value separator occurs at or after index.
179     */
180    private boolean haveKeywordAssign() {
181        // assume it is safe to start from index
182        for (int i = index; i < id.length; ++i) {
183            if (id[i] == KEYWORD_ASSIGN) {
184                return true;
185            }
186        }
187        return false;
188    }
189
190    /**
191     * Advance index past language, and accumulate normalized language code in buffer.
192     * Index must be at 0 when this is called.  Index is left at a terminator or id
193     * separator.  Returns the start of the language code in the buffer.
194     */
195    private int parseLanguage() {
196        int startLength = buffer.length();
197
198        if (haveExperimentalLanguagePrefix()) {
199            append(AsciiUtil.toLower(id[0]));
200            append(HYPHEN);
201            index = 2;
202        }
203
204        char c;
205        while(!isTerminatorOrIDSeparator(c = next())) {
206            append(AsciiUtil.toLower(c));
207        }
208        --index; // unget
209
210        if (buffer.length() - startLength == 3) {
211            String lang = LocaleIDs.threeToTwoLetterLanguage(getString(0));
212            if (lang != null) {
213                set(0, lang);
214            }
215        }
216
217        return 0;
218    }
219
220    /**
221     * Advance index past language.  Index must be at 0 when this is called.  Index
222     * is left at a terminator or id separator.
223     */
224    private void skipLanguage() {
225        if (haveExperimentalLanguagePrefix()) {
226            index = 2;
227        }
228        skipUntilTerminatorOrIDSeparator();
229    }
230
231    /**
232     * Advance index past script, and accumulate normalized script in buffer.
233     * Index must be immediately after the language.
234     * If the item at this position is not a script (is not four characters
235     * long) leave index and buffer unchanged.  Otherwise index is left at
236     * a terminator or id separator.  Returns the start of the script code
237     * in the buffer (this may be equal to the buffer length, if there is no
238     * script).
239     */
240    private int parseScript() {
241        if (!atTerminator()) {
242            int oldIndex = index; // save original index
243            ++index;
244
245            int oldBlen = buffer.length(); // get before append hyphen, if we truncate everything is undone
246            char c;
247            boolean firstPass = true;
248            while(!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c)) {
249                if (firstPass) {
250                    addSeparator();
251                    append(AsciiUtil.toUpper(c));
252                    firstPass = false;
253                } else {
254                    append(AsciiUtil.toLower(c));
255                }
256            }
257            --index; // unget
258
259            /* If it's not exactly 4 characters long, then it's not a script. */
260            if (index - oldIndex != 5) { // +1 to account for separator
261                index = oldIndex;
262                buffer.delete(oldBlen, buffer.length());
263            } else {
264                oldBlen++; // index past hyphen, for clients who want to extract just the script
265            }
266
267            return oldBlen;
268        }
269        return buffer.length();
270    }
271
272    /**
273     * Advance index past script.
274     * Index must be immediately after the language and IDSeparator.
275     * If the item at this position is not a script (is not four characters
276     * long) leave index.  Otherwise index is left at a terminator or
277     * id separator.
278     */
279    private void skipScript() {
280        if (!atTerminator()) {
281            int oldIndex = index;
282            ++index;
283
284            char c;
285            while (!isTerminatorOrIDSeparator(c = next()) && AsciiUtil.isAlpha(c));
286            --index;
287
288            if (index - oldIndex != 5) { // +1 to account for separator
289                index = oldIndex;
290            }
291        }
292    }
293
294    /**
295     * Advance index past country, and accumulate normalized country in buffer.
296     * Index must be immediately after the script (if there is one, else language)
297     * and IDSeparator.  Return the start of the country code in the buffer.
298     */
299    private int parseCountry() {
300        if (!atTerminator()) {
301            int oldIndex = index;
302            ++index;
303
304            int oldBlen = buffer.length();
305            char c;
306            boolean firstPass = true;
307            while (!isTerminatorOrIDSeparator(c = next())) {
308                if (firstPass) { // first, add hyphen
309                    hadCountry = true; // we have a country, let variant parsing know
310                    addSeparator();
311                    ++oldBlen; // increment past hyphen
312                    firstPass = false;
313                }
314                append(AsciiUtil.toUpper(c));
315            }
316            --index; // unget
317
318            int charsAppended = buffer.length() - oldBlen;
319
320            if (charsAppended == 0) {
321                // Do nothing.
322            }
323            else if (charsAppended < 2 || charsAppended > 3) {
324                // It's not a country, so return index and blen to
325                // their previous values.
326                index = oldIndex;
327                --oldBlen;
328                buffer.delete(oldBlen, buffer.length());
329                hadCountry = false;
330            }
331            else if (charsAppended == 3) {
332                String region = LocaleIDs.threeToTwoLetterRegion(getString(oldBlen));
333                if (region != null) {
334                    set(oldBlen, region);
335                }
336            }
337
338            return oldBlen;
339        }
340
341        return buffer.length();
342    }
343
344    /**
345     * Advance index past country.
346     * Index must be immediately after the script (if there is one, else language)
347     * and IDSeparator.
348     */
349    private void skipCountry() {
350        if (!atTerminator()) {
351            if (id[index] == UNDERSCORE || id[index] == HYPHEN) {
352                ++index;
353            }
354            /*
355             * Save the index point after the separator, since the format
356             * requires two separators if the country is not present.
357             */
358            int oldIndex = index;
359
360            skipUntilTerminatorOrIDSeparator();
361            int charsSkipped = index - oldIndex;
362            if (charsSkipped < 2 || charsSkipped > 3) {
363                index = oldIndex;
364            }
365        }
366    }
367
368    /**
369     * Advance index past variant, and accumulate normalized variant in buffer.  This ignores
370     * the codepage information from POSIX ids.  Index must be immediately after the country
371     * or script.  Index is left at the keyword separator or at the end of the text.  Return
372     * the start of the variant code in the buffer.
373     *
374     * In standard form, we can have the following forms:
375     * ll__VVVV
376     * ll_CC_VVVV
377     * ll_Ssss_VVVV
378     * ll_Ssss_CC_VVVV
379     *
380     * This also handles POSIX ids, which can have the following forms (pppp is code page id):
381     * ll_CC.pppp          --> ll_CC
382     * ll_CC.pppp@VVVV     --> ll_CC_VVVV
383     * ll_CC@VVVV          --> ll_CC_VVVV
384     *
385     * We identify this use of '@' in POSIX ids by looking for an '=' following
386     * the '@'.  If there is one, we consider '@' to start a keyword list, instead of
387     * being part of a POSIX id.
388     *
389     * Note:  since it was decided that we want an option to not handle POSIX ids, this
390     * becomes a bit more complex.
391     */
392    private int parseVariant() {
393        int oldBlen = buffer.length();
394
395        boolean start = true;
396        boolean needSeparator = true;
397        boolean skipping = false;
398        char c;
399        boolean firstPass = true;
400
401        while ((c = next()) != DONE) {
402            if (c == DOT) {
403                start = false;
404                skipping = true;
405            } else if (c == KEYWORD_SEPARATOR) {
406                if (haveKeywordAssign()) {
407                    break;
408                }
409                skipping = false;
410                start = false;
411                needSeparator = true; // add another underscore if we have more text
412            } else if (start) {
413                start = false;
414                if (c != UNDERSCORE && c != HYPHEN) {
415                    index--;
416                }
417            } else if (!skipping) {
418                if (needSeparator) {
419                    needSeparator = false;
420                    if (firstPass && !hadCountry) { // no country, we'll need two
421                        addSeparator();
422                        ++oldBlen; // for sure
423                    }
424                    addSeparator();
425                    if (firstPass) { // only for the first separator
426                        ++oldBlen;
427                        firstPass = false;
428                    }
429                }
430                c = AsciiUtil.toUpper(c);
431                if (c == HYPHEN || c == COMMA) {
432                    c = UNDERSCORE;
433                }
434                append(c);
435            }
436        }
437        --index; // unget
438
439        return oldBlen;
440    }
441
442    // no need for skipvariant, to get the keywords we'll just scan directly for
443    // the keyword separator
444
445    /**
446     * Returns the normalized language id, or the empty string.
447     */
448    public String getLanguage() {
449        reset();
450        return getString(parseLanguage());
451    }
452
453    /**
454     * Returns the normalized script id, or the empty string.
455     */
456    public String getScript() {
457        reset();
458        skipLanguage();
459        return getString(parseScript());
460    }
461
462    /**
463     * return the normalized country id, or the empty string.
464     */
465    public String getCountry() {
466        reset();
467        skipLanguage();
468        skipScript();
469        return getString(parseCountry());
470    }
471
472    /**
473     * Returns the normalized variant id, or the empty string.
474     */
475    public String getVariant() {
476        reset();
477        skipLanguage();
478        skipScript();
479        skipCountry();
480        return getString(parseVariant());
481    }
482
483    /**
484     * Returns the language, script, country, and variant as separate strings.
485     */
486    public String[] getLanguageScriptCountryVariant() {
487        reset();
488        return new String[] {
489                getString(parseLanguage()),
490                getString(parseScript()),
491                getString(parseCountry()),
492                getString(parseVariant())
493        };
494    }
495
496    public void setBaseName(String baseName) {
497        this.baseName = baseName;
498    }
499
500    public void parseBaseName() {
501        if (baseName != null) {
502            set(0, baseName);
503        } else {
504            reset();
505            parseLanguage();
506            parseScript();
507            parseCountry();
508            parseVariant();
509
510            // catch unwanted trailing underscore after country if there was no variant
511            int len = buffer.length();
512            if (len > 0 && buffer.charAt(len - 1) == UNDERSCORE) {
513                buffer.deleteCharAt(len - 1);
514            }
515        }
516    }
517
518    /**
519     * Returns the normalized base form of the locale id.  The base
520     * form does not include keywords.
521     */
522    public String getBaseName() {
523        if (baseName != null) {
524            return baseName;
525        }
526        parseBaseName();
527        return getString(0);
528    }
529
530    /**
531     * Returns the normalized full form of the locale id.  The full
532     * form includes keywords if they are present.
533     */
534    public String getName() {
535        parseBaseName();
536        parseKeywords();
537        return getString(0);
538    }
539
540    // keyword utilities
541
542    /**
543     * If we have keywords, advance index to the start of the keywords and return true,
544     * otherwise return false.
545     */
546    private boolean setToKeywordStart() {
547        for (int i = index; i < id.length; ++i) {
548            if (id[i] == KEYWORD_SEPARATOR) {
549                if (canonicalize) {
550                    for (int j = ++i; j < id.length; ++j) { // increment i past separator for return
551                        if (id[j] == KEYWORD_ASSIGN) {
552                            index = i;
553                            return true;
554                        }
555                    }
556                } else {
557                    if (++i < id.length) {
558                        index = i;
559                        return true;
560                    }
561                }
562                break;
563            }
564        }
565        return false;
566    }
567
568    private static boolean isDoneOrKeywordAssign(char c) {
569        return c == DONE || c == KEYWORD_ASSIGN;
570    }
571
572    private static boolean isDoneOrItemSeparator(char c) {
573        return c == DONE || c == ITEM_SEPARATOR;
574    }
575
576    private String getKeyword() {
577        int start = index;
578        while (!isDoneOrKeywordAssign(next())) {
579        }
580        --index;
581        return AsciiUtil.toLowerString(new String(id, start, index-start).trim());
582    }
583
584    private String getValue() {
585        int start = index;
586        while (!isDoneOrItemSeparator(next())) {
587        }
588        --index;
589        return new String(id, start, index-start).trim(); // leave case alone
590    }
591
592    private Comparator<String> getKeyComparator() {
593        final Comparator<String> comp = new Comparator<String>() {
594            @Override
595            public int compare(String lhs, String rhs) {
596                return lhs.compareTo(rhs);
597            }
598        };
599        return comp;
600    }
601
602    /**
603     * Returns a map of the keywords and values, or null if there are none.
604     */
605    public Map<String, String> getKeywordMap() {
606        if (keywords == null) {
607            TreeMap<String, String> m = null;
608            if (setToKeywordStart()) {
609                // trim spaces and convert to lower case, both keywords and values.
610                do {
611                    String key = getKeyword();
612                    if (key.length() == 0) {
613                        break;
614                    }
615                    char c = next();
616                    if (c != KEYWORD_ASSIGN) {
617                        // throw new IllegalArgumentException("key '" + key + "' missing a value.");
618                        if (c == DONE) {
619                            break;
620                        } else {
621                            continue;
622                        }
623                    }
624                    String value = getValue();
625                    if (value.length() == 0) {
626                        // throw new IllegalArgumentException("key '" + key + "' missing a value.");
627                        continue;
628                    }
629                    if (m == null) {
630                        m = new TreeMap<String, String>(getKeyComparator());
631                    } else if (m.containsKey(key)) {
632                        // throw new IllegalArgumentException("key '" + key + "' already has a value.");
633                        continue;
634                    }
635                    m.put(key, value);
636                } while (next() == ITEM_SEPARATOR);
637            }
638            keywords = m != null ? m : Collections.<String, String>emptyMap();
639        }
640
641        return keywords;
642    }
643
644
645    /**
646     * Parse the keywords and return start of the string in the buffer.
647     */
648    private int parseKeywords() {
649        int oldBlen = buffer.length();
650        Map<String, String> m = getKeywordMap();
651        if (!m.isEmpty()) {
652            boolean first = true;
653            for (Map.Entry<String, String> e : m.entrySet()) {
654                append(first ? KEYWORD_SEPARATOR : ITEM_SEPARATOR);
655                first = false;
656                append(e.getKey());
657                append(KEYWORD_ASSIGN);
658                append(e.getValue());
659            }
660            if (first == false) {
661                ++oldBlen;
662            }
663        }
664        return oldBlen;
665    }
666
667    /**
668     * Returns an iterator over the keywords, or null if we have an empty map.
669     */
670    public Iterator<String> getKeywords() {
671        Map<String, String> m = getKeywordMap();
672        return m.isEmpty() ? null : m.keySet().iterator();
673    }
674
675    /**
676     * Returns the value for the named keyword, or null if the keyword is not
677     * present.
678     */
679    public String getKeywordValue(String keywordName) {
680        Map<String, String> m = getKeywordMap();
681        return m.isEmpty() ? null : m.get(AsciiUtil.toLowerString(keywordName.trim()));
682    }
683
684    /**
685     * Set the keyword value only if it is not already set to something else.
686     */
687    public void defaultKeywordValue(String keywordName, String value) {
688        setKeywordValue(keywordName, value, false);
689    }
690
691    /**
692     * Set the value for the named keyword, or unset it if value is null.  If
693     * keywordName itself is null, unset all keywords.  If keywordName is not null,
694     * value must not be null.
695     */
696    public void setKeywordValue(String keywordName, String value) {
697        setKeywordValue(keywordName, value, true);
698    }
699
700    /**
701     * Set the value for the named keyword, or unset it if value is null.  If
702     * keywordName itself is null, unset all keywords.  If keywordName is not null,
703     * value must not be null.  If reset is true, ignore any previous value for
704     * the keyword, otherwise do not change the keyword (including removal of
705     * one or all keywords).
706     */
707    private void setKeywordValue(String keywordName, String value, boolean reset) {
708        if (keywordName == null) {
709            if (reset) {
710                // force new map, ignore value
711                keywords = Collections.<String, String>emptyMap();
712            }
713        } else {
714            keywordName = AsciiUtil.toLowerString(keywordName.trim());
715            if (keywordName.length() == 0) {
716                throw new IllegalArgumentException("keyword must not be empty");
717            }
718            if (value != null) {
719                value = value.trim();
720                if (value.length() == 0) {
721                    throw new IllegalArgumentException("value must not be empty");
722                }
723            }
724            Map<String, String> m = getKeywordMap();
725            if (m.isEmpty()) { // it is EMPTY_MAP
726                if (value != null) {
727                    // force new map
728                    keywords = new TreeMap<String, String>(getKeyComparator());
729                    keywords.put(keywordName, value.trim());
730                }
731            } else {
732                if (reset || !m.containsKey(keywordName)) {
733                    if (value != null) {
734                        m.put(keywordName, value);
735                    } else {
736                        m.remove(keywordName);
737                        if (m.isEmpty()) {
738                            // force new map
739                            keywords = Collections.<String, String>emptyMap();
740                        }
741                    }
742                }
743            }
744        }
745    }
746}
747