1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2013-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* CollationRuleParser.java, ported from collationruleparser.h/.cpp
9*
10* C++ version created on: 2013apr10
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import java.text.ParseException;
17import java.util.ArrayList;
18
19import com.ibm.icu.impl.IllegalIcuArgumentException;
20import com.ibm.icu.impl.PatternProps;
21import com.ibm.icu.lang.UCharacter;
22import com.ibm.icu.lang.UProperty;
23import com.ibm.icu.text.Collator;
24import com.ibm.icu.text.Normalizer2;
25import com.ibm.icu.text.UTF16;
26import com.ibm.icu.text.UnicodeSet;
27import com.ibm.icu.util.ULocale;
28
29public final class CollationRuleParser {
    /**
     * Special reset positions.
     *
     * The declaration order is significant: parseSpecialPosition() encodes a position
     * as POS_BASE plus an index, using either the index of the matching entry in the
     * parallel "positions" name array or the ordinal() of one of these constants.
     */
    enum Position {
        FIRST_TERTIARY_IGNORABLE,
        LAST_TERTIARY_IGNORABLE,
        FIRST_SECONDARY_IGNORABLE,
        LAST_SECONDARY_IGNORABLE,
        FIRST_PRIMARY_IGNORABLE,
        LAST_PRIMARY_IGNORABLE,
        FIRST_VARIABLE,
        LAST_VARIABLE,
        FIRST_REGULAR,
        LAST_REGULAR,
        FIRST_IMPLICIT,
        LAST_IMPLICIT,
        FIRST_TRAILING,
        LAST_TRAILING
    }
    /** All Position constants in declaration order, obtained once from Position.values(). */
    static final Position[] POSITION_VALUES = Position.values();
48
    /**
     * First character of contractions that encode special reset positions.
     * U+FFFE cannot be tailored via rule syntax.
     *
     * The second contraction character is POS_BASE + Position.
     * @see #POS_BASE
     */
    static final char POS_LEAD = 0xfffe;
    /**
     * Base for the second character of contractions that encode special reset positions.
     * Braille characters U+28xx are printable and normalization-inert.
     * @see #POS_LEAD
     */
    static final char POS_BASE = 0x2800;
62
    /**
     * Receiver for what the parser finds in a rule string: resets, relations,
     * and the sets given in [suppressContractions set] and [optimize set].
     * The parser wraps exceptions thrown by these callbacks into parse errors.
     */
    static abstract class Sink {
        /**
         * Adds a reset.
         * strength=UCOL_IDENTICAL for &str.
         * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
         */
        abstract void addReset(int strength, CharSequence str);
        /**
         * Adds a relation with strength and prefix | str / extension.
         */
        abstract void addRelation(int strength, CharSequence prefix,
                CharSequence str, CharSequence extension);

        /**
         * Called with the set parsed from [suppressContractions set].
         * The default implementation does nothing.
         */
        void suppressContractions(UnicodeSet set) {}

        /**
         * Called with the set parsed from [optimize set].
         * The default implementation does nothing.
         */
        void optimize(UnicodeSet set) {}
    }
80
    /**
     * Source of rule strings for the [import langTag] option.
     */
    interface Importer {
        /**
         * Returns the rule string to be parsed in place of an [import langTag] option.
         * The parser passes the base name of the locale built from the language tag
         * (keywords removed) and the value of its "collation" keyword,
         * or "standard" when the tag does not specify one.
         */
        String getRules(String localeID, String collationType);
    }
84
85    /**
86     * Constructor.
87     * The Sink must be set before parsing.
88     * The Importer can be set, otherwise [import locale] syntax is not supported.
89     */
90    CollationRuleParser(CollationData base) {
91        baseData = base;
92    }
93
94    /**
95     * Sets the pointer to a Sink object.
96     * The pointer is aliased: Pointer copy without cloning or taking ownership.
97     */
98    void setSink(Sink sinkAlias) {
99        sink = sinkAlias;
100    }
101
102    /**
103     * Sets the pointer to an Importer object.
104     * The pointer is aliased: Pointer copy without cloning or taking ownership.
105     */
106    void setImporter(Importer importerAlias) {
107        importer = importerAlias;
108    }
109
110    void parse(String ruleString, CollationSettings outSettings) throws ParseException {
111        settings = outSettings;
112        parse(ruleString);
113    }
114
    // Attribute values. UCOL_DEFAULT doubles as the "nothing recognized" result
    // of getOnOffValue() and parseRelationOperator().
    private static final int UCOL_DEFAULT = -1;
    private static final int UCOL_OFF = 0;
    private static final int UCOL_ON = 1;

    // Layout of the int returned by parseRelationOperator():
    // strength in the low bits, STARRED_FLAG if the operator is followed by '*',
    // and the operator length in chars shifted left by OFFSET_SHIFT.
    /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
    private static final int STRENGTH_MASK = 0xf;
    private static final int STARRED_FLAG = 0x10;
    private static final int OFFSET_SHIFT = 8;

    /** Keyword that starts a reset-before position: &[before n] */
    private static final String BEFORE = "[before";

    // In C++, we parse into temporary UnicodeString objects named "raw" or "str".
    // In Java, we reuse this StringBuilder.
    private final StringBuilder rawBuilder = new StringBuilder();
129
130    private void parse(String ruleString) throws ParseException {
131        rules = ruleString;
132        ruleIndex = 0;
133
134        while(ruleIndex < rules.length()) {
135            char c = rules.charAt(ruleIndex);
136            if(PatternProps.isWhiteSpace(c)) {
137                ++ruleIndex;
138                continue;
139            }
140            switch(c) {
141            case 0x26:  // '&'
142                parseRuleChain();
143                break;
144            case 0x5b:  // '['
145                parseSetting();
146                break;
147            case 0x23:  // '#' starts a comment, until the end of the line
148                ruleIndex = skipComment(ruleIndex + 1);
149                break;
150            case 0x40:  // '@' is equivalent to [backwards 2]
151                settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
152                ++ruleIndex;
153                break;
154            case 0x21:  // '!' used to turn on Thai/Lao character reversal
155                // Accept but ignore. The root collator has contractions
156                // that are equivalent to the character reversal, where appropriate.
157                ++ruleIndex;
158                break;
159            default:
160                setParseError("expected a reset or setting or comment");
161                break;
162            }
163        }
164    }
165
166    private void parseRuleChain() throws ParseException {
167        int resetStrength = parseResetAndPosition();
168        boolean isFirstRelation = true;
169        for(;;) {
170            int result = parseRelationOperator();
171            if(result < 0) {
172                if(ruleIndex < rules.length() && rules.charAt(ruleIndex) == 0x23) {
173                    // '#' starts a comment, until the end of the line
174                    ruleIndex = skipComment(ruleIndex + 1);
175                    continue;
176                }
177                if(isFirstRelation) {
178                    setParseError("reset not followed by a relation");
179                }
180                return;
181            }
182            int strength = result & STRENGTH_MASK;
183            if(resetStrength < Collator.IDENTICAL) {
184                // reset-before rule chain
185                if(isFirstRelation) {
186                    if(strength != resetStrength) {
187                        setParseError("reset-before strength differs from its first relation");
188                        return;
189                    }
190                } else {
191                    if(strength < resetStrength) {
192                        setParseError("reset-before strength followed by a stronger relation");
193                        return;
194                    }
195                }
196            }
197            int i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
198            if((result & STARRED_FLAG) == 0) {
199                parseRelationStrings(strength, i);
200            } else {
201                parseStarredCharacters(strength, i);
202            }
203            isFirstRelation = false;
204        }
205    }
206
207    private int parseResetAndPosition() throws ParseException {
208        int i = skipWhiteSpace(ruleIndex + 1);
209        int j;
210        char c;
211        int resetStrength;
212        if(rules.regionMatches(i, BEFORE, 0, BEFORE.length()) &&
213                (j = i + BEFORE.length()) < rules.length() &&
214                PatternProps.isWhiteSpace(rules.charAt(j)) &&
215                ((j = skipWhiteSpace(j + 1)) + 1) < rules.length() &&
216                0x31 <= (c = rules.charAt(j)) && c <= 0x33 &&
217                rules.charAt(j + 1) == 0x5d) {
218            // &[before n] with n=1 or 2 or 3
219            resetStrength = Collator.PRIMARY + (c - 0x31);
220            i = skipWhiteSpace(j + 2);
221        } else {
222            resetStrength = Collator.IDENTICAL;
223        }
224        if(i >= rules.length()) {
225            setParseError("reset without position");
226            return UCOL_DEFAULT;
227        }
228        if(rules.charAt(i) == 0x5b) {  // '['
229            i = parseSpecialPosition(i, rawBuilder);
230        } else {
231            i = parseTailoringString(i, rawBuilder);
232        }
233        try {
234            sink.addReset(resetStrength, rawBuilder);
235        } catch(Exception e) {
236            setParseError("adding reset failed", e);
237            return UCOL_DEFAULT;
238        }
239        ruleIndex = i;
240        return resetStrength;
241    }
242
243    private int parseRelationOperator() {
244        ruleIndex = skipWhiteSpace(ruleIndex);
245        if(ruleIndex >= rules.length()) { return UCOL_DEFAULT; }
246        int strength;
247        int i = ruleIndex;
248        char c = rules.charAt(i++);
249        switch(c) {
250        case 0x3c:  // '<'
251            if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<
252                ++i;
253                if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<
254                    ++i;
255                    if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<<
256                        ++i;
257                        strength = Collator.QUATERNARY;
258                    } else {
259                        strength = Collator.TERTIARY;
260                    }
261                } else {
262                    strength = Collator.SECONDARY;
263                }
264            } else {
265                strength = Collator.PRIMARY;
266            }
267            if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
268                ++i;
269                strength |= STARRED_FLAG;
270            }
271            break;
272        case 0x3b:  // ';' same as <<
273            strength = Collator.SECONDARY;
274            break;
275        case 0x2c:  // ',' same as <<<
276            strength = Collator.TERTIARY;
277            break;
278        case 0x3d:  // '='
279            strength = Collator.IDENTICAL;
280            if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
281                ++i;
282                strength |= STARRED_FLAG;
283            }
284            break;
285        default:
286            return UCOL_DEFAULT;
287        }
288        return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
289    }
290
291    private void parseRelationStrings(int strength, int i) throws ParseException {
292        // Parse
293        //     prefix | str / extension
294        // where prefix and extension are optional.
295        String prefix = "";
296        CharSequence extension = "";
297        i = parseTailoringString(i, rawBuilder);
298        char next = (i < rules.length()) ? rules.charAt(i) : 0;
299        if(next == 0x7c) {  // '|' separates the context prefix from the string.
300            prefix = rawBuilder.toString();
301            i = parseTailoringString(i + 1, rawBuilder);
302            next = (i < rules.length()) ? rules.charAt(i) : 0;
303        }
304        // str = rawBuilder (do not modify rawBuilder any more in this function)
305        if(next == 0x2f) {  // '/' separates the string from the extension.
306            StringBuilder extBuilder = new StringBuilder();
307            i = parseTailoringString(i + 1, extBuilder);
308            extension = extBuilder;
309        }
310        if(prefix.length() != 0) {
311            int prefix0 = prefix.codePointAt(0);
312            int c = rawBuilder.codePointAt(0);
313            if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
314                setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary");
315                return;
316            }
317        }
318        try {
319            sink.addRelation(strength, prefix, rawBuilder, extension);
320        } catch(Exception e) {
321            setParseError("adding relation failed", e);
322            return;
323        }
324        ruleIndex = i;
325    }
326
    /**
     * Parses the operand of a starred relation (such as &lt;* or =*): a string of
     * characters, optionally with '-' ranges, where every code point becomes its own
     * relation of the given strength with empty prefix and extension.
     * On success ruleIndex is moved past the operand and trailing white space.
     *
     * @param strength the relation strength applied to each code point
     * @param i rule index just after the starred relation operator
     * @throws ParseException if the string is missing, contains a code point that is
     *         not NFD-inert, has a malformed range, or if the Sink rejects a relation
     */
    private void parseStarredCharacters(int strength, int i) throws ParseException {
        String empty = "";
        i = parseString(skipWhiteSpace(i), rawBuilder);
        if(rawBuilder.length() == 0) {
            setParseError("missing starred-relation string");
            return;
        }
        // prev: last code point added from the current string piece, or -1 if none
        // (it is the potential start of a following range).
        // j: index into rawBuilder of the next code point to add.
        int prev = -1;
        int j = 0;
        for(;;) {
            // Add each remaining code point of the current string piece individually.
            while(j < rawBuilder.length()) {
                int c = rawBuilder.codePointAt(j);
                if(!nfd.isInert(c)) {
                    setParseError("starred-relation string is not all NFD-inert");
                    return;
                }
                try {
                    sink.addRelation(strength, empty, UTF16.valueOf(c), empty);
                } catch(Exception e) {
                    setParseError("adding relation failed", e);
                    return;
                }
                j += Character.charCount(c);
                prev = c;
            }
            // Only a '-' directly after the piece continues the operand with a range.
            if(i >= rules.length() || rules.charAt(i) != 0x2d) {  // '-'
                break;
            }
            if(prev < 0) {
                setParseError("range without start in starred-relation string");
                return;
            }
            // Parse the next piece; its first code point is the range end.
            i = parseString(i + 1, rawBuilder);
            if(rawBuilder.length() == 0) {
                setParseError("range without end in starred-relation string");
                return;
            }
            int c = rawBuilder.codePointAt(0);
            if(c < prev) {
                setParseError("range start greater than end in starred-relation string");
                return;
            }
            // range prev-c
            // prev itself was already added above, so start with prev+1 and include c.
            while(++prev <= c) {
                if(!nfd.isInert(prev)) {
                    setParseError("starred-relation string range is not all NFD-inert");
                    return;
                }
                if(isSurrogate(prev)) {
                    setParseError("starred-relation string range contains a surrogate");
                    return;
                }
                if(0xfffd <= prev && prev <= 0xffff) {
                    setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF");
                    return;
                }
                try {
                    sink.addRelation(strength, empty, UTF16.valueOf(prev), empty);
                } catch(Exception e) {
                    setParseError("adding relation failed", e);
                    return;
                }
            }
            // The range end cannot start another range, and it was already added:
            // continue with the code point after it in the new piece.
            prev = -1;
            j = Character.charCount(c);
        }
        ruleIndex = skipWhiteSpace(i);
    }
395
396    private int parseTailoringString(int i, StringBuilder raw) throws ParseException {
397        i = parseString(skipWhiteSpace(i), raw);
398        if(raw.length() == 0) {
399            setParseError("missing relation string");
400        }
401        return skipWhiteSpace(i);
402    }
403
    /**
     * Parses one string of the rule syntax starting exactly at i (no white space is skipped).
     * Handles 'quoted literal text', '' as a single apostrophe, and backslash escapes
     * that take the following code point literally. The string ends at unquoted
     * white space, at any other syntax character, or at the end of the rules.
     *
     * @param i rule index where the string starts
     * @param raw receives the unquoted/unescaped string; its previous contents are discarded
     * @return rule index after the string (at the terminating character, if any)
     * @throws ParseException for an unterminated quote, a trailing backslash,
     *         an unpaired surrogate, or U+FFFD..U+FFFF in the string
     */
    private int parseString(int i, StringBuilder raw) throws ParseException {
        raw.setLength(0);
        while(i < rules.length()) {
            char c = rules.charAt(i++);
            if(isSyntaxChar(c)) {
                if(c == 0x27) {  // apostrophe
                    if(i < rules.length() && rules.charAt(i) == 0x27) {
                        // Double apostrophe, encodes a single one.
                        raw.append((char)0x27);
                        ++i;
                        continue;
                    }
                    // Quote literal text until the next single apostrophe.
                    for(;;) {
                        if(i == rules.length()) {
                            setParseError("quoted literal text missing terminating apostrophe");
                            return i;
                        }
                        c = rules.charAt(i++);
                        if(c == 0x27) {
                            if(i < rules.length() && rules.charAt(i) == 0x27) {
                                // Double apostrophe inside quoted literal text,
                                // still encodes a single apostrophe.
                                ++i;
                            } else {
                                break;
                            }
                        }
                        raw.append(c);
                    }
                } else if(c == 0x5c) {  // backslash
                    if(i == rules.length()) {
                        setParseError("backslash escape at the end of the rule string");
                        return i;
                    }
                    // The escaped code point is taken literally, whatever it is.
                    int cp = rules.codePointAt(i);
                    raw.appendCodePoint(cp);
                    i += Character.charCount(cp);
                } else {
                    // Any other syntax character terminates a string.
                    // Step back so that the caller sees the terminator.
                    --i;
                    break;
                }
            } else if(PatternProps.isWhiteSpace(c)) {
                // Unquoted white space terminates a string.
                --i;
                break;
            } else {
                raw.append(c);
            }
        }
        // Validate the result: codePointAt() returns a surrogate value only for an unpaired one.
        for(int j = 0; j < raw.length();) {
            int c = raw.codePointAt(j);
            if(isSurrogate(c)) {
                setParseError("string contains an unpaired surrogate");
                return i;
            }
            if(0xfffd <= c && c <= 0xffff) {
                setParseError("string contains U+FFFD, U+FFFE or U+FFFF");
                return i;
            }
            j += Character.charCount(c);
        }
        return i;
    }
469
470    // TODO: Widen UTF16.isSurrogate(char16) to take an int.
471    private static final boolean isSurrogate(int c) {
472        return (c & 0xfffff800) == 0xd800;
473    }
474
    /**
     * Names of the special reset positions as they are written between [brackets],
     * with words separated by single spaces.
     * parseSpecialPosition() adds the array index of the matching name to POS_BASE,
     * so the order of these entries is significant.
     */
    private static final String[] positions = {
        "first tertiary ignorable",
        "last tertiary ignorable",
        "first secondary ignorable",
        "last secondary ignorable",
        "first primary ignorable",
        "last primary ignorable",
        "first variable",
        "last variable",
        "first regular",
        "last regular",
        "first implicit",
        "last implicit",
        "first trailing",
        "last trailing"
    };
491
492    /**
493     * Sets str to a contraction of U+FFFE and (U+2800 + Position).
494     * @return rule index after the special reset position
495     * @throws ParseException
496     */
497    private int parseSpecialPosition(int i, StringBuilder str) throws ParseException {
498        int j = readWords(i + 1, rawBuilder);
499        if(j > i && rules.charAt(j) == 0x5d && rawBuilder.length() != 0) {  // words end with ]
500            ++j;
501            String raw = rawBuilder.toString();
502            str.setLength(0);
503            for(int pos = 0; pos < positions.length; ++pos) {
504                if(raw.equals(positions[pos])) {
505                    str.append(POS_LEAD).append((char)(POS_BASE + pos));
506                    return j;
507                }
508            }
509            if(raw.equals("top")) {
510                str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_REGULAR.ordinal()));
511                return j;
512            }
513            if(raw.equals("variable top")) {
514                str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_VARIABLE.ordinal()));
515                return j;
516            }
517        }
518        setParseError("not a valid special reset position");
519        return i;
520    }
521
522    private void parseSetting() throws ParseException {
523        int i = ruleIndex + 1;
524        int j = readWords(i, rawBuilder);
525        if(j <= i || rawBuilder.length() == 0) {
526            setParseError("expected a setting/option at '['");
527        }
528        // startsWith() etc. are available for String but not CharSequence/StringBuilder.
529        String raw = rawBuilder.toString();
530        if(rules.charAt(j) == 0x5d) {  // words end with ]
531            ++j;
532            if(raw.startsWith("reorder") &&
533                    (raw.length() == 7 || raw.charAt(7) == 0x20)) {
534                parseReordering(raw);
535                ruleIndex = j;
536                return;
537            }
538            if(raw.equals("backwards 2")) {
539                settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
540                ruleIndex = j;
541                return;
542            }
543            String v;
544            int valueIndex = raw.lastIndexOf(0x20);
545            if(valueIndex >= 0) {
546                v = raw.substring(valueIndex + 1);
547                raw = raw.substring(0, valueIndex);
548            } else {
549                v = "";
550            }
551            if(raw.equals("strength") && v.length() == 1) {
552                int value = UCOL_DEFAULT;
553                char c = v.charAt(0);
554                if(0x31 <= c && c <= 0x34) {  // 1..4
555                    value = Collator.PRIMARY + (c - 0x31);
556                } else if(c == 0x49) {  // 'I'
557                    value = Collator.IDENTICAL;
558                }
559                if(value != UCOL_DEFAULT) {
560                    settings.setStrength(value);
561                    ruleIndex = j;
562                    return;
563                }
564            } else if(raw.equals("alternate")) {
565                int value = UCOL_DEFAULT;
566                if(v.equals("non-ignorable")) {
567                    value = 0;  // UCOL_NON_IGNORABLE
568                } else if(v.equals("shifted")) {
569                    value = 1;  // UCOL_SHIFTED
570                }
571                if(value != UCOL_DEFAULT) {
572                    settings.setAlternateHandlingShifted(value > 0);
573                    ruleIndex = j;
574                    return;
575                }
576            } else if(raw.equals("maxVariable")) {
577                int value = UCOL_DEFAULT;
578                if(v.equals("space")) {
579                    value = CollationSettings.MAX_VAR_SPACE;
580                } else if(v.equals("punct")) {
581                    value = CollationSettings.MAX_VAR_PUNCT;
582                } else if(v.equals("symbol")) {
583                    value = CollationSettings.MAX_VAR_SYMBOL;
584                } else if(v.equals("currency")) {
585                    value = CollationSettings.MAX_VAR_CURRENCY;
586                }
587                if(value != UCOL_DEFAULT) {
588                    settings.setMaxVariable(value, 0);
589                    settings.variableTop = baseData.getLastPrimaryForGroup(
590                        Collator.ReorderCodes.FIRST + value);
591                    assert(settings.variableTop != 0);
592                    ruleIndex = j;
593                    return;
594                }
595            } else if(raw.equals("caseFirst")) {
596                int value = UCOL_DEFAULT;
597                if(v.equals("off")) {
598                    value = UCOL_OFF;
599                } else if(v.equals("lower")) {
600                    value = CollationSettings.CASE_FIRST;  // UCOL_LOWER_FIRST
601                } else if(v.equals("upper")) {
602                    value = CollationSettings.CASE_FIRST_AND_UPPER_MASK;  // UCOL_UPPER_FIRST
603                }
604                if(value != UCOL_DEFAULT) {
605                    settings.setCaseFirst(value);
606                    ruleIndex = j;
607                    return;
608                }
609            } else if(raw.equals("caseLevel")) {
610                int value = getOnOffValue(v);
611                if(value != UCOL_DEFAULT) {
612                    settings.setFlag(CollationSettings.CASE_LEVEL, value > 0);
613                    ruleIndex = j;
614                    return;
615                }
616            } else if(raw.equals("normalization")) {
617                int value = getOnOffValue(v);
618                if(value != UCOL_DEFAULT) {
619                    settings.setFlag(CollationSettings.CHECK_FCD, value > 0);
620                    ruleIndex = j;
621                    return;
622                }
623            } else if(raw.equals("numericOrdering")) {
624                int value = getOnOffValue(v);
625                if(value != UCOL_DEFAULT) {
626                    settings.setFlag(CollationSettings.NUMERIC, value > 0);
627                    ruleIndex = j;
628                    return;
629                }
630            } else if(raw.equals("hiraganaQ")) {
631                int value = getOnOffValue(v);
632                if(value != UCOL_DEFAULT) {
633                    if(value == UCOL_ON) {
634                        setParseError("[hiraganaQ on] is not supported");
635                    }
636                    ruleIndex = j;
637                    return;
638                }
639            } else if(raw.equals("import")) {
640                // BCP 47 language tag -> ICU locale ID
641                ULocale localeID;
642                try {
643                    localeID = new ULocale.Builder().setLanguageTag(v).build();
644                } catch(Exception e) {
645                    setParseError("expected language tag in [import langTag]", e);
646                    return;
647                }
648                // localeID minus all keywords
649                String baseID = localeID.getBaseName();
650                // @collation=type, or length=0 if not specified
651                String collationType = localeID.getKeywordValue("collation");
652                if(importer == null) {
653                    setParseError("[import langTag] is not supported");
654                } else {
655                    String importedRules;
656                    try {
657                        importedRules =
658                            importer.getRules(baseID,
659                                    collationType != null ? collationType : "standard");
660                    } catch(Exception e) {
661                        setParseError("[import langTag] failed", e);
662                        return;
663                    }
664                    String outerRules = rules;
665                    int outerRuleIndex = ruleIndex;
666                    try {
667                        parse(importedRules);
668                    } catch(Exception e) {
669                        ruleIndex = outerRuleIndex;  // Restore the original index for error reporting.
670                        setParseError("parsing imported rules failed", e);
671                    }
672                    rules = outerRules;
673                    ruleIndex = j;
674                }
675                return;
676            }
677        } else if(rules.charAt(j) == 0x5b) {  // words end with [
678            UnicodeSet set = new UnicodeSet();
679            j = parseUnicodeSet(j, set);
680            if(raw.equals("optimize")) {
681                try {
682                    sink.optimize(set);
683                } catch(Exception e) {
684                    setParseError("[optimize set] failed", e);
685                }
686                ruleIndex = j;
687                return;
688            } else if(raw.equals("suppressContractions")) {
689                try {
690                    sink.suppressContractions(set);
691                } catch(Exception e) {
692                    setParseError("[suppressContractions set] failed", e);
693                }
694                ruleIndex = j;
695                return;
696            }
697        }
698        setParseError("not a valid setting/option");
699    }
700
701    private void parseReordering(CharSequence raw) throws ParseException {
702        int i = 7;  // after "reorder"
703        if(i == raw.length()) {
704            // empty [reorder] with no codes
705            settings.resetReordering();
706            return;
707        }
708        // Parse the codes in [reorder aa bb cc].
709        ArrayList<Integer> reorderCodes = new ArrayList<Integer>();
710        while(i < raw.length()) {
711            ++i;  // skip the word-separating space
712            int limit = i;
713            while(limit < raw.length() && raw.charAt(limit) != ' ') { ++limit; }
714            String word = raw.subSequence(i, limit).toString();
715            int code = getReorderCode(word);
716            if(code < 0) {
717                setParseError("unknown script or reorder code");
718                return;
719            }
720            reorderCodes.add(code);
721            i = limit;
722        }
723        if(reorderCodes.isEmpty()) {
724            settings.resetReordering();
725        } else {
726            int[] codes = new int[reorderCodes.size()];
727            int j = 0;
728            for(Integer code : reorderCodes) { codes[j++] = code; }
729            settings.setReordering(baseData, codes);
730        }
731    }
732
    /**
     * Names of the special (non-script) reorder groups.
     * getReorderCode() maps the name at index i to Collator.ReorderCodes.FIRST + i,
     * so the order of these entries is significant.
     */
    private static final String[] gSpecialReorderCodes = {
        "space", "punct", "symbol", "currency", "digit"
    };
736
737    /**
738     * Gets a script or reorder code from its string representation.
739     * @return the script/reorder code, or
740     * -1 if not recognized
741     */
742    public static int getReorderCode(String word) {
743        for(int i = 0; i < gSpecialReorderCodes.length; ++i) {
744            if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) {
745                return Collator.ReorderCodes.FIRST + i;
746            }
747        }
748        try {
749            int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word);
750            if(script >= 0) {
751                return script;
752            }
753        } catch (IllegalIcuArgumentException e) {
754            // fall through
755        }
756        if(word.equalsIgnoreCase("others")) {
757            return Collator.ReorderCodes.OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
758        }
759        return -1;
760    }
761
762    private static int getOnOffValue(String s) {
763        if(s.equals("on")) {
764            return UCOL_ON;
765        } else if(s.equals("off")) {
766            return UCOL_OFF;
767        } else {
768            return UCOL_DEFAULT;
769        }
770    }
771
772    private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException {
773        // Collect a UnicodeSet pattern between a balanced pair of [brackets].
774        int level = 0;
775        int j = i;
776        for(;;) {
777            if(j == rules.length()) {
778                setParseError("unbalanced UnicodeSet pattern brackets");
779                return j;
780            }
781            char c = rules.charAt(j++);
782            if(c == 0x5b) {  // '['
783                ++level;
784            } else if(c == 0x5d) {  // ']'
785                if(--level == 0) { break; }
786            }
787        }
788        try {
789            set.applyPattern(rules.substring(i, j));
790        } catch(Exception e) {
791            setParseError("not a valid UnicodeSet pattern: " + e.getMessage());
792        }
793        j = skipWhiteSpace(j);
794        if(j == rules.length() || rules.charAt(j) != 0x5d) {
795            setParseError("missing option-terminating ']' after UnicodeSet pattern");
796            return j;
797        }
798        return ++j;
799    }
800
801    private int readWords(int i, StringBuilder raw) {
802        raw.setLength(0);
803        i = skipWhiteSpace(i);
804        for(;;) {
805            if(i >= rules.length()) { return 0; }
806            char c = rules.charAt(i);
807            if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
808                if(raw.length() == 0) { return i; }
809                int lastIndex = raw.length() - 1;
810                if(raw.charAt(lastIndex) == ' ') {  // remove trailing space
811                    raw.setLength(lastIndex);
812                }
813                return i;
814            }
815            if(PatternProps.isWhiteSpace(c)) {
816                raw.append(' ');
817                i = skipWhiteSpace(i + 1);
818            } else {
819                raw.append(c);
820                ++i;
821            }
822        }
823    }
824
825    private int skipComment(int i) {
826        // skip to past the newline
827        while(i < rules.length()) {
828            char c = rules.charAt(i++);
829            // LF or FF or CR or NEL or LS or PS
830            if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
831                // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
832                // NLF (new line function) = CR or LF or CR+LF or NEL.
833                // No need to collect all of CR+LF because a following LF will be ignored anyway.
834                break;
835            }
836        }
837        return i;
838    }
839
840    private void setParseError(String reason) throws ParseException {
841        throw makeParseException(reason);
842    }
843
844    private void setParseError(String reason, Exception e) throws ParseException {
845        ParseException newExc = makeParseException(reason + ": " + e.getMessage());
846        newExc.initCause(e);
847        throw newExc;
848    }
849
850    private ParseException makeParseException(String reason) {
851        return new ParseException(appendErrorContext(reason), ruleIndex);
852    }
853
854    private static final int U_PARSE_CONTEXT_LEN = 16;
855
856    // C++ setErrorContext()
857    private String appendErrorContext(String reason) {
858        // Note: This relies on the calling code maintaining the ruleIndex
859        // at a position that is useful for debugging.
860        // For example, at the beginning of a reset or relation etc.
861        StringBuilder msg = new StringBuilder(reason);
862        msg.append(" at index ").append(ruleIndex);
863        // We are not counting line numbers.
864
865        msg.append(" near \"");
866        // before ruleIndex
867        int start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
868        if(start < 0) {
869            start = 0;
870        } else if(start > 0 && Character.isLowSurrogate(rules.charAt(start))) {
871            ++start;
872        }
873        msg.append(rules, start, ruleIndex);
874
875        msg.append('!');
876        // starting from ruleIndex
877        int length = rules.length() - ruleIndex;
878        if(length >= U_PARSE_CONTEXT_LEN) {
879            length = U_PARSE_CONTEXT_LEN - 1;
880            if(Character.isHighSurrogate(rules.charAt(ruleIndex + length - 1))) {
881                --length;
882            }
883        }
884        msg.append(rules, ruleIndex, ruleIndex + length);
885        return msg.append('\"').toString();
886    }
887
888    /**
889     * ASCII [:P:] and [:S:]:
890     * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
891     */
892    private static boolean isSyntaxChar(int c) {
893        return 0x21 <= c && c <= 0x7e &&
894                (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
895                (0x5b <= c && c <= 0x60) || (0x7b <= c));
896    }
897
898    private int skipWhiteSpace(int i) {
899        while(i < rules.length() && PatternProps.isWhiteSpace(rules.charAt(i))) {
900            ++i;
901        }
902        return i;
903    }
904
905    private Normalizer2 nfd = Normalizer2.getNFDInstance();
906    private Normalizer2 nfc = Normalizer2.getNFCInstance();
907
908    private String rules;
909    private final CollationData baseData;
910    private CollationSettings settings;
911
912    private Sink sink;
913    private Importer importer;
914
915    private int ruleIndex;
916}
917