1/*
2*******************************************************************************
3* Copyright (C) 2013-2015, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* CollationRuleParser.java, ported from collationruleparser.h/.cpp
7*
8* C++ version created on: 2013apr10
9* created by: Markus W. Scherer
10*/
11
12package com.ibm.icu.impl.coll;
13
14import java.text.ParseException;
15import java.util.ArrayList;
16
17import com.ibm.icu.impl.IllegalIcuArgumentException;
18import com.ibm.icu.impl.PatternProps;
19import com.ibm.icu.lang.UCharacter;
20import com.ibm.icu.lang.UProperty;
21import com.ibm.icu.text.Collator;
22import com.ibm.icu.text.Normalizer2;
23import com.ibm.icu.text.UTF16;
24import com.ibm.icu.text.UnicodeSet;
25import com.ibm.icu.util.ULocale;
26
27public final class CollationRuleParser {
28    /** Special reset positions. */
    enum Position {
        // The declaration order is significant: a position is encoded as
        // POS_BASE + ordinal, and the order matches the names in the positions array.
        FIRST_TERTIARY_IGNORABLE,
        LAST_TERTIARY_IGNORABLE,
        FIRST_SECONDARY_IGNORABLE,
        LAST_SECONDARY_IGNORABLE,
        FIRST_PRIMARY_IGNORABLE,
        LAST_PRIMARY_IGNORABLE,
        FIRST_VARIABLE,
        LAST_VARIABLE,
        FIRST_REGULAR,
        LAST_REGULAR,
        FIRST_IMPLICIT,
        LAST_IMPLICIT,
        FIRST_TRAILING,
        LAST_TRAILING
    }
    /** All Position constants in declaration order, obtained once from Position.values(). */
    static final Position[] POSITION_VALUES = Position.values();
46
47    /**
48     * First character of contractions that encode special reset positions.
49     * U+FFFE cannot be tailored via rule syntax.
50     *
51     * The second contraction character is POS_BASE + Position.
52     */
53    static final char POS_LEAD = 0xfffe;
54    /**
55     * Base for the second character of contractions that encode special reset positions.
56     * Braille characters U+28xx are printable and normalization-inert.
57     * @see POS_LEAD
58     */
59    static final char POS_BASE = 0x2800;
60
    /**
     * Receiver of the parse results: the parser calls these methods as it
     * recognizes resets, relations and the set-valued options.
     */
    static abstract class Sink {
        /**
         * Adds a reset.
         * strength=UCOL_IDENTICAL for &str.
         * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
         */
        abstract void addReset(int strength, CharSequence str);
        /**
         * Adds a relation with strength and prefix | str / extension.
         */
        abstract void addRelation(int strength, CharSequence prefix,
                CharSequence str, CharSequence extension);

        /**
         * Called for a [suppressContractions set] option with the parsed set.
         * This default implementation does nothing.
         */
        void suppressContractions(UnicodeSet set) {}

        /**
         * Called for an [optimize set] option with the parsed set.
         * This default implementation does nothing.
         */
        void optimize(UnicodeSet set) {}
    }
78
    /**
     * Supplier of rule strings for the [import langTag] option.
     */
    interface Importer {
        /**
         * Returns the rule string for a locale and collation type.
         * The parser passes the base name of the imported locale and the value of its
         * "collation" keyword, or "standard" if that keyword is absent.
         */
        String getRules(String localeID, String collationType);
    }
82
83    /**
84     * Constructor.
85     * The Sink must be set before parsing.
86     * The Importer can be set, otherwise [import locale] syntax is not supported.
87     */
88    CollationRuleParser(CollationData base) {
89        baseData = base;
90    }
91
92    /**
93     * Sets the pointer to a Sink object.
94     * The pointer is aliased: Pointer copy without cloning or taking ownership.
95     */
96    void setSink(Sink sinkAlias) {
97        sink = sinkAlias;
98    }
99
100    /**
101     * Sets the pointer to an Importer object.
102     * The pointer is aliased: Pointer copy without cloning or taking ownership.
103     */
104    void setImporter(Importer importerAlias) {
105        importer = importerAlias;
106    }
107
108    void parse(String ruleString, CollationSettings outSettings) throws ParseException {
109        settings = outSettings;
110        parse(ruleString);
111    }
112
    // Attribute values. UCOL_DEFAULT (negative) also serves as the
    // "no value" / "no operator" marker returned by several parse helpers.
    private static final int UCOL_DEFAULT = -1;
    private static final int UCOL_OFF = 0;
    private static final int UCOL_ON = 1;

    // Layout of the int returned by parseRelationOperator():
    // bits 3..0 = strength, bit 4 = starred operator, bits 8.. = operator length.
    /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
    private static final int STRENGTH_MASK = 0xf;
    private static final int STARRED_FLAG = 0x10;
    private static final int OFFSET_SHIFT = 8;

    /** Start of the reset-before syntax "&[before n]". */
    private static final String BEFORE = "[before";

    // In C++, we parse into temporary UnicodeString objects named "raw" or "str".
    // In Java, we reuse this StringBuilder.
    private final StringBuilder rawBuilder = new StringBuilder();
127
128    private void parse(String ruleString) throws ParseException {
129        rules = ruleString;
130        ruleIndex = 0;
131
132        while(ruleIndex < rules.length()) {
133            char c = rules.charAt(ruleIndex);
134            if(PatternProps.isWhiteSpace(c)) {
135                ++ruleIndex;
136                continue;
137            }
138            switch(c) {
139            case 0x26:  // '&'
140                parseRuleChain();
141                break;
142            case 0x5b:  // '['
143                parseSetting();
144                break;
145            case 0x23:  // '#' starts a comment, until the end of the line
146                ruleIndex = skipComment(ruleIndex + 1);
147                break;
148            case 0x40:  // '@' is equivalent to [backwards 2]
149                settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
150                ++ruleIndex;
151                break;
152            case 0x21:  // '!' used to turn on Thai/Lao character reversal
153                // Accept but ignore. The root collator has contractions
154                // that are equivalent to the character reversal, where appropriate.
155                ++ruleIndex;
156                break;
157            default:
158                setParseError("expected a reset or setting or comment");
159                break;
160            }
161        }
162    }
163
164    private void parseRuleChain() throws ParseException {
165        int resetStrength = parseResetAndPosition();
166        boolean isFirstRelation = true;
167        for(;;) {
168            int result = parseRelationOperator();
169            if(result < 0) {
170                if(ruleIndex < rules.length() && rules.charAt(ruleIndex) == 0x23) {
171                    // '#' starts a comment, until the end of the line
172                    ruleIndex = skipComment(ruleIndex + 1);
173                    continue;
174                }
175                if(isFirstRelation) {
176                    setParseError("reset not followed by a relation");
177                }
178                return;
179            }
180            int strength = result & STRENGTH_MASK;
181            if(resetStrength < Collator.IDENTICAL) {
182                // reset-before rule chain
183                if(isFirstRelation) {
184                    if(strength != resetStrength) {
185                        setParseError("reset-before strength differs from its first relation");
186                        return;
187                    }
188                } else {
189                    if(strength < resetStrength) {
190                        setParseError("reset-before strength followed by a stronger relation");
191                        return;
192                    }
193                }
194            }
195            int i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
196            if((result & STARRED_FLAG) == 0) {
197                parseRelationStrings(strength, i);
198            } else {
199                parseStarredCharacters(strength, i);
200            }
201            isFirstRelation = false;
202        }
203    }
204
205    private int parseResetAndPosition() throws ParseException {
206        int i = skipWhiteSpace(ruleIndex + 1);
207        int j;
208        char c;
209        int resetStrength;
210        if(rules.regionMatches(i, BEFORE, 0, BEFORE.length()) &&
211                (j = i + BEFORE.length()) < rules.length() &&
212                PatternProps.isWhiteSpace(rules.charAt(j)) &&
213                ((j = skipWhiteSpace(j + 1)) + 1) < rules.length() &&
214                0x31 <= (c = rules.charAt(j)) && c <= 0x33 &&
215                rules.charAt(j + 1) == 0x5d) {
216            // &[before n] with n=1 or 2 or 3
217            resetStrength = Collator.PRIMARY + (c - 0x31);
218            i = skipWhiteSpace(j + 2);
219        } else {
220            resetStrength = Collator.IDENTICAL;
221        }
222        if(i >= rules.length()) {
223            setParseError("reset without position");
224            return UCOL_DEFAULT;
225        }
226        if(rules.charAt(i) == 0x5b) {  // '['
227            i = parseSpecialPosition(i, rawBuilder);
228        } else {
229            i = parseTailoringString(i, rawBuilder);
230        }
231        try {
232            sink.addReset(resetStrength, rawBuilder);
233        } catch(Exception e) {
234            setParseError("adding reset failed", e);
235            return UCOL_DEFAULT;
236        }
237        ruleIndex = i;
238        return resetStrength;
239    }
240
241    private int parseRelationOperator() {
242        ruleIndex = skipWhiteSpace(ruleIndex);
243        if(ruleIndex >= rules.length()) { return UCOL_DEFAULT; }
244        int strength;
245        int i = ruleIndex;
246        char c = rules.charAt(i++);
247        switch(c) {
248        case 0x3c:  // '<'
249            if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<
250                ++i;
251                if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<
252                    ++i;
253                    if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<<
254                        ++i;
255                        strength = Collator.QUATERNARY;
256                    } else {
257                        strength = Collator.TERTIARY;
258                    }
259                } else {
260                    strength = Collator.SECONDARY;
261                }
262            } else {
263                strength = Collator.PRIMARY;
264            }
265            if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
266                ++i;
267                strength |= STARRED_FLAG;
268            }
269            break;
270        case 0x3b:  // ';' same as <<
271            strength = Collator.SECONDARY;
272            break;
273        case 0x2c:  // ',' same as <<<
274            strength = Collator.TERTIARY;
275            break;
276        case 0x3d:  // '='
277            strength = Collator.IDENTICAL;
278            if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
279                ++i;
280                strength |= STARRED_FLAG;
281            }
282            break;
283        default:
284            return UCOL_DEFAULT;
285        }
286        return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
287    }
288
289    private void parseRelationStrings(int strength, int i) throws ParseException {
290        // Parse
291        //     prefix | str / extension
292        // where prefix and extension are optional.
293        String prefix = "";
294        CharSequence extension = "";
295        i = parseTailoringString(i, rawBuilder);
296        char next = (i < rules.length()) ? rules.charAt(i) : 0;
297        if(next == 0x7c) {  // '|' separates the context prefix from the string.
298            prefix = rawBuilder.toString();
299            i = parseTailoringString(i + 1, rawBuilder);
300            next = (i < rules.length()) ? rules.charAt(i) : 0;
301        }
302        // str = rawBuilder (do not modify rawBuilder any more in this function)
303        if(next == 0x2f) {  // '/' separates the string from the extension.
304            StringBuilder extBuilder = new StringBuilder();
305            i = parseTailoringString(i + 1, extBuilder);
306            extension = extBuilder;
307        }
308        if(prefix.length() != 0) {
309            int prefix0 = prefix.codePointAt(0);
310            int c = rawBuilder.codePointAt(0);
311            if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
312                setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary");
313                return;
314            }
315        }
316        try {
317            sink.addRelation(strength, prefix, rawBuilder, extension);
318        } catch(Exception e) {
319            setParseError("adding relation failed", e);
320            return;
321        }
322        ruleIndex = i;
323    }
324
    /**
     * Parses the operand of a starred relation and reports one relation per code point.
     * The operand is a string, optionally followed by one or more
     * '-' + string continuations: the '-' denotes a code point range from the last
     * code point before it to the first code point after it.
     * On success ruleIndex is moved past the operand and following white space.
     *
     * @param strength the relation strength
     * @param i the rule index just after the starred relation operator
     * @throws ParseException if the string is missing, a code point is not NFD-inert,
     *         a range is malformed or contains a surrogate or U+FFFD..U+FFFF,
     *         or the sink rejects a relation
     */
    private void parseStarredCharacters(int strength, int i) throws ParseException {
        String empty = "";
        i = parseString(skipWhiteSpace(i), rawBuilder);
        if(rawBuilder.length() == 0) {
            setParseError("missing starred-relation string");
            return;
        }
        // prev = last code point added from the current string (a possible range start),
        // or -1 if there is none.
        int prev = -1;
        // j = index in rawBuilder of the next code point to add.
        int j = 0;
        for(;;) {
            // Add each remaining code point of the current string as its own relation.
            while(j < rawBuilder.length()) {
                int c = rawBuilder.codePointAt(j);
                if(!nfd.isInert(c)) {
                    setParseError("starred-relation string is not all NFD-inert");
                    return;
                }
                try {
                    sink.addRelation(strength, empty, UTF16.valueOf(c), empty);
                } catch(Exception e) {
                    setParseError("adding relation failed", e);
                    return;
                }
                j += Character.charCount(c);
                prev = c;
            }
            // A '-' right after the string continues with a range; anything else ends the operand.
            if(i >= rules.length() || rules.charAt(i) != 0x2d) {  // '-'
                break;
            }
            if(prev < 0) {
                setParseError("range without start in starred-relation string");
                return;
            }
            i = parseString(i + 1, rawBuilder);
            if(rawBuilder.length() == 0) {
                setParseError("range without end in starred-relation string");
                return;
            }
            // The range end is the first code point of the string after the '-'.
            int c = rawBuilder.codePointAt(0);
            if(c < prev) {
                setParseError("range start greater than end in starred-relation string");
                return;
            }
            // range prev-c
            // prev itself was already added above, so start with prev+1; c is included.
            while(++prev <= c) {
                if(!nfd.isInert(prev)) {
                    setParseError("starred-relation string range is not all NFD-inert");
                    return;
                }
                if(isSurrogate(prev)) {
                    setParseError("starred-relation string range contains a surrogate");
                    return;
                }
                if(0xfffd <= prev && prev <= 0xffff) {
                    setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF");
                    return;
                }
                try {
                    sink.addRelation(strength, empty, UTF16.valueOf(prev), empty);
                } catch(Exception e) {
                    setParseError("adding relation failed", e);
                    return;
                }
            }
            // A range end cannot also start the next range.
            prev = -1;
            // The range loop already added c; continue with the rest of this string.
            j = Character.charCount(c);
        }
        ruleIndex = skipWhiteSpace(i);
    }
393
394    private int parseTailoringString(int i, StringBuilder raw) throws ParseException {
395        i = parseString(skipWhiteSpace(i), raw);
396        if(raw.length() == 0) {
397            setParseError("missing relation string");
398        }
399        return skipWhiteSpace(i);
400    }
401
    /**
     * Parses a string starting at rule index i into raw, resolving
     * apostrophe-quoted literal text and backslash escapes.
     * The string ends at the end of the rules, at unquoted white space,
     * or at a character for which isSyntaxChar() is true,
     * other than an apostrophe or a backslash.
     *
     * @param i the rule index of the first character of the string
     * @param raw is cleared and then receives the parsed string; may end up empty
     * @return the rule index of the first character that is not part of the string
     * @throws ParseException if quoted text is not terminated, a backslash is the last
     *         character of the rules, or the result contains an unpaired surrogate
     *         or one of U+FFFD..U+FFFF
     */
    private int parseString(int i, StringBuilder raw) throws ParseException {
        raw.setLength(0);
        while(i < rules.length()) {
            char c = rules.charAt(i++);
            if(isSyntaxChar(c)) {
                if(c == 0x27) {  // apostrophe
                    if(i < rules.length() && rules.charAt(i) == 0x27) {
                        // Double apostrophe, encodes a single one.
                        raw.append((char)0x27);
                        ++i;
                        continue;
                    }
                    // Quote literal text until the next single apostrophe.
                    for(;;) {
                        if(i == rules.length()) {
                            setParseError("quoted literal text missing terminating apostrophe");
                            return i;
                        }
                        c = rules.charAt(i++);
                        if(c == 0x27) {
                            if(i < rules.length() && rules.charAt(i) == 0x27) {
                                // Double apostrophe inside quoted literal text,
                                // still encodes a single apostrophe.
                                ++i;
                            } else {
                                break;
                            }
                        }
                        raw.append(c);
                    }
                } else if(c == 0x5c) {  // backslash
                    if(i == rules.length()) {
                        setParseError("backslash escape at the end of the rule string");
                        return i;
                    }
                    // The code point after the backslash is taken literally.
                    int cp = rules.codePointAt(i);
                    raw.appendCodePoint(cp);
                    i += Character.charCount(cp);
                } else {
                    // Any other syntax character terminates a string.
                    // Back up so that the caller sees the terminating character.
                    --i;
                    break;
                }
            } else if(PatternProps.isWhiteSpace(c)) {
                // Unquoted white space terminates a string.
                --i;
                break;
            } else {
                raw.append(c);
            }
        }
        // Validate the result by code point:
        // codePointAt() returns a surrogate value only for an unpaired surrogate.
        for(int j = 0; j < raw.length();) {
            int c = raw.codePointAt(j);
            if(isSurrogate(c)) {
                setParseError("string contains an unpaired surrogate");
                return i;
            }
            if(0xfffd <= c && c <= 0xffff) {
                setParseError("string contains U+FFFD, U+FFFE or U+FFFF");
                return i;
            }
            j += Character.charCount(c);
        }
        return i;
    }
467
468    // TODO: Widen UTF16.isSurrogate(char16) to take an int.
469    private static final boolean isSurrogate(int c) {
470        return (c & 0xfffff800) == 0xd800;
471    }
472
    /**
     * Rule syntax names of the special reset positions, as written inside [brackets].
     * The array index of a name is added to POS_BASE when the position is encoded,
     * so the order must match the declaration order of the Position enum.
     */
    private static final String[] positions = {
        "first tertiary ignorable",
        "last tertiary ignorable",
        "first secondary ignorable",
        "last secondary ignorable",
        "first primary ignorable",
        "last primary ignorable",
        "first variable",
        "last variable",
        "first regular",
        "last regular",
        "first implicit",
        "last implicit",
        "first trailing",
        "last trailing"
    };
489
490    /**
491     * Sets str to a contraction of U+FFFE and (U+2800 + Position).
492     * @return rule index after the special reset position
493     * @throws ParseException
494     */
495    private int parseSpecialPosition(int i, StringBuilder str) throws ParseException {
496        int j = readWords(i + 1, rawBuilder);
497        if(j > i && rules.charAt(j) == 0x5d && rawBuilder.length() != 0) {  // words end with ]
498            ++j;
499            String raw = rawBuilder.toString();
500            str.setLength(0);
501            for(int pos = 0; pos < positions.length; ++pos) {
502                if(raw.equals(positions[pos])) {
503                    str.append(POS_LEAD).append((char)(POS_BASE + pos));
504                    return j;
505                }
506            }
507            if(raw.equals("top")) {
508                str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_REGULAR.ordinal()));
509                return j;
510            }
511            if(raw.equals("variable top")) {
512                str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_VARIABLE.ordinal()));
513                return j;
514            }
515        }
516        setParseError("not a valid special reset position");
517        return i;
518    }
519
520    private void parseSetting() throws ParseException {
521        int i = ruleIndex + 1;
522        int j = readWords(i, rawBuilder);
523        if(j <= i || rawBuilder.length() == 0) {
524            setParseError("expected a setting/option at '['");
525        }
526        // startsWith() etc. are available for String but not CharSequence/StringBuilder.
527        String raw = rawBuilder.toString();
528        if(rules.charAt(j) == 0x5d) {  // words end with ]
529            ++j;
530            if(raw.startsWith("reorder") &&
531                    (raw.length() == 7 || raw.charAt(7) == 0x20)) {
532                parseReordering(raw);
533                ruleIndex = j;
534                return;
535            }
536            if(raw.equals("backwards 2")) {
537                settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
538                ruleIndex = j;
539                return;
540            }
541            String v;
542            int valueIndex = raw.lastIndexOf(0x20);
543            if(valueIndex >= 0) {
544                v = raw.substring(valueIndex + 1);
545                raw = raw.substring(0, valueIndex);
546            } else {
547                v = "";
548            }
549            if(raw.equals("strength") && v.length() == 1) {
550                int value = UCOL_DEFAULT;
551                char c = v.charAt(0);
552                if(0x31 <= c && c <= 0x34) {  // 1..4
553                    value = Collator.PRIMARY + (c - 0x31);
554                } else if(c == 0x49) {  // 'I'
555                    value = Collator.IDENTICAL;
556                }
557                if(value != UCOL_DEFAULT) {
558                    settings.setStrength(value);
559                    ruleIndex = j;
560                    return;
561                }
562            } else if(raw.equals("alternate")) {
563                int value = UCOL_DEFAULT;
564                if(v.equals("non-ignorable")) {
565                    value = 0;  // UCOL_NON_IGNORABLE
566                } else if(v.equals("shifted")) {
567                    value = 1;  // UCOL_SHIFTED
568                }
569                if(value != UCOL_DEFAULT) {
570                    settings.setAlternateHandlingShifted(value > 0);
571                    ruleIndex = j;
572                    return;
573                }
574            } else if(raw.equals("maxVariable")) {
575                int value = UCOL_DEFAULT;
576                if(v.equals("space")) {
577                    value = CollationSettings.MAX_VAR_SPACE;
578                } else if(v.equals("punct")) {
579                    value = CollationSettings.MAX_VAR_PUNCT;
580                } else if(v.equals("symbol")) {
581                    value = CollationSettings.MAX_VAR_SYMBOL;
582                } else if(v.equals("currency")) {
583                    value = CollationSettings.MAX_VAR_CURRENCY;
584                }
585                if(value != UCOL_DEFAULT) {
586                    settings.setMaxVariable(value, 0);
587                    settings.variableTop = baseData.getLastPrimaryForGroup(
588                        Collator.ReorderCodes.FIRST + value);
589                    assert(settings.variableTop != 0);
590                    ruleIndex = j;
591                    return;
592                }
593            } else if(raw.equals("caseFirst")) {
594                int value = UCOL_DEFAULT;
595                if(v.equals("off")) {
596                    value = UCOL_OFF;
597                } else if(v.equals("lower")) {
598                    value = CollationSettings.CASE_FIRST;  // UCOL_LOWER_FIRST
599                } else if(v.equals("upper")) {
600                    value = CollationSettings.CASE_FIRST_AND_UPPER_MASK;  // UCOL_UPPER_FIRST
601                }
602                if(value != UCOL_DEFAULT) {
603                    settings.setCaseFirst(value);
604                    ruleIndex = j;
605                    return;
606                }
607            } else if(raw.equals("caseLevel")) {
608                int value = getOnOffValue(v);
609                if(value != UCOL_DEFAULT) {
610                    settings.setFlag(CollationSettings.CASE_LEVEL, value > 0);
611                    ruleIndex = j;
612                    return;
613                }
614            } else if(raw.equals("normalization")) {
615                int value = getOnOffValue(v);
616                if(value != UCOL_DEFAULT) {
617                    settings.setFlag(CollationSettings.CHECK_FCD, value > 0);
618                    ruleIndex = j;
619                    return;
620                }
621            } else if(raw.equals("numericOrdering")) {
622                int value = getOnOffValue(v);
623                if(value != UCOL_DEFAULT) {
624                    settings.setFlag(CollationSettings.NUMERIC, value > 0);
625                    ruleIndex = j;
626                    return;
627                }
628            } else if(raw.equals("hiraganaQ")) {
629                int value = getOnOffValue(v);
630                if(value != UCOL_DEFAULT) {
631                    if(value == UCOL_ON) {
632                        setParseError("[hiraganaQ on] is not supported");
633                    }
634                    ruleIndex = j;
635                    return;
636                }
637            } else if(raw.equals("import")) {
638                // BCP 47 language tag -> ICU locale ID
639                ULocale localeID;
640                try {
641                    localeID = new ULocale.Builder().setLanguageTag(v).build();
642                } catch(Exception e) {
643                    setParseError("expected language tag in [import langTag]", e);
644                    return;
645                }
646                // localeID minus all keywords
647                String baseID = localeID.getBaseName();
648                // @collation=type, or length=0 if not specified
649                String collationType = localeID.getKeywordValue("collation");
650                if(importer == null) {
651                    setParseError("[import langTag] is not supported");
652                } else {
653                    String importedRules;
654                    try {
655                        importedRules =
656                            importer.getRules(baseID,
657                                    collationType != null ? collationType : "standard");
658                    } catch(Exception e) {
659                        setParseError("[import langTag] failed", e);
660                        return;
661                    }
662                    String outerRules = rules;
663                    int outerRuleIndex = ruleIndex;
664                    try {
665                        parse(importedRules);
666                    } catch(Exception e) {
667                        ruleIndex = outerRuleIndex;  // Restore the original index for error reporting.
668                        setParseError("parsing imported rules failed", e);
669                    }
670                    rules = outerRules;
671                    ruleIndex = j;
672                }
673                return;
674            }
675        } else if(rules.charAt(j) == 0x5b) {  // words end with [
676            UnicodeSet set = new UnicodeSet();
677            j = parseUnicodeSet(j, set);
678            if(raw.equals("optimize")) {
679                try {
680                    sink.optimize(set);
681                } catch(Exception e) {
682                    setParseError("[optimize set] failed", e);
683                }
684                ruleIndex = j;
685                return;
686            } else if(raw.equals("suppressContractions")) {
687                try {
688                    sink.suppressContractions(set);
689                } catch(Exception e) {
690                    setParseError("[suppressContractions set] failed", e);
691                }
692                ruleIndex = j;
693                return;
694            }
695        }
696        setParseError("not a valid setting/option");
697    }
698
699    private void parseReordering(CharSequence raw) throws ParseException {
700        int i = 7;  // after "reorder"
701        if(i == raw.length()) {
702            // empty [reorder] with no codes
703            settings.resetReordering();
704            return;
705        }
706        // Parse the codes in [reorder aa bb cc].
707        ArrayList<Integer> reorderCodes = new ArrayList<Integer>();
708        while(i < raw.length()) {
709            ++i;  // skip the word-separating space
710            int limit = i;
711            while(limit < raw.length() && raw.charAt(limit) != ' ') { ++limit; }
712            String word = raw.subSequence(i, limit).toString();
713            int code = getReorderCode(word);
714            if(code < 0) {
715                setParseError("unknown script or reorder code");
716                return;
717            }
718            reorderCodes.add(code);
719            i = limit;
720        }
721        if(reorderCodes.isEmpty()) {
722            settings.resetReordering();
723        } else {
724            int[] codes = new int[reorderCodes.size()];
725            int j = 0;
726            for(Integer code : reorderCodes) { codes[j++] = code; }
727            settings.setReordering(baseData, codes);
728        }
729    }
730
    /**
     * Names of the special reorder groups.
     * The array index of a name is its offset from Collator.ReorderCodes.FIRST.
     */
    private static final String[] gSpecialReorderCodes = {
        "space", "punct", "symbol", "currency", "digit"
    };
734
735    /**
736     * Gets a script or reorder code from its string representation.
737     * @return the script/reorder code, or
738     * -1 if not recognized
739     */
740    public static int getReorderCode(String word) {
741        for(int i = 0; i < gSpecialReorderCodes.length; ++i) {
742            if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) {
743                return Collator.ReorderCodes.FIRST + i;
744            }
745        }
746        try {
747            int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word);
748            if(script >= 0) {
749                return script;
750            }
751        } catch (IllegalIcuArgumentException e) {
752            // fall through
753        }
754        if(word.equalsIgnoreCase("others")) {
755            return Collator.ReorderCodes.OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
756        }
757        return -1;
758    }
759
760    private static int getOnOffValue(String s) {
761        if(s.equals("on")) {
762            return UCOL_ON;
763        } else if(s.equals("off")) {
764            return UCOL_OFF;
765        } else {
766            return UCOL_DEFAULT;
767        }
768    }
769
770    private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException {
771        // Collect a UnicodeSet pattern between a balanced pair of [brackets].
772        int level = 0;
773        int j = i;
774        for(;;) {
775            if(j == rules.length()) {
776                setParseError("unbalanced UnicodeSet pattern brackets");
777                return j;
778            }
779            char c = rules.charAt(j++);
780            if(c == 0x5b) {  // '['
781                ++level;
782            } else if(c == 0x5d) {  // ']'
783                if(--level == 0) { break; }
784            }
785        }
786        try {
787            set.applyPattern(rules.substring(i, j));
788        } catch(Exception e) {
789            setParseError("not a valid UnicodeSet pattern: " + e.getMessage());
790        }
791        j = skipWhiteSpace(j);
792        if(j == rules.length() || rules.charAt(j) != 0x5d) {
793            setParseError("missing option-terminating ']' after UnicodeSet pattern");
794            return j;
795        }
796        return ++j;
797    }
798
799    private int readWords(int i, StringBuilder raw) {
800        raw.setLength(0);
801        i = skipWhiteSpace(i);
802        for(;;) {
803            if(i >= rules.length()) { return 0; }
804            char c = rules.charAt(i);
805            if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
806                if(raw.length() == 0) { return i; }
807                int lastIndex = raw.length() - 1;
808                if(raw.charAt(lastIndex) == ' ') {  // remove trailing space
809                    raw.setLength(lastIndex);
810                }
811                return i;
812            }
813            if(PatternProps.isWhiteSpace(c)) {
814                raw.append(' ');
815                i = skipWhiteSpace(i + 1);
816            } else {
817                raw.append(c);
818                ++i;
819            }
820        }
821    }
822
823    private int skipComment(int i) {
824        // skip to past the newline
825        while(i < rules.length()) {
826            char c = rules.charAt(i++);
827            // LF or FF or CR or NEL or LS or PS
828            if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
829                // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
830                // NLF (new line function) = CR or LF or CR+LF or NEL.
831                // No need to collect all of CR+LF because a following LF will be ignored anyway.
832                break;
833            }
834        }
835        return i;
836    }
837
838    private void setParseError(String reason) throws ParseException {
839        throw makeParseException(reason);
840    }
841
842    private void setParseError(String reason, Exception e) throws ParseException {
843        ParseException newExc = makeParseException(reason + ": " + e.getMessage());
844        newExc.initCause(e);
845        throw newExc;
846    }
847
848    private ParseException makeParseException(String reason) {
849        return new ParseException(appendErrorContext(reason), ruleIndex);
850    }
851
852    private static final int U_PARSE_CONTEXT_LEN = 16;
853
854    // C++ setErrorContext()
855    private String appendErrorContext(String reason) {
856        // Note: This relies on the calling code maintaining the ruleIndex
857        // at a position that is useful for debugging.
858        // For example, at the beginning of a reset or relation etc.
859        StringBuilder msg = new StringBuilder(reason);
860        msg.append(" at index ").append(ruleIndex);
861        // We are not counting line numbers.
862
863        msg.append(" near \"");
864        // before ruleIndex
865        int start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
866        if(start < 0) {
867            start = 0;
868        } else if(start > 0 && Character.isLowSurrogate(rules.charAt(start))) {
869            ++start;
870        }
871        msg.append(rules, start, ruleIndex);
872
873        msg.append('!');
874        // starting from ruleIndex
875        int length = rules.length() - ruleIndex;
876        if(length >= U_PARSE_CONTEXT_LEN) {
877            length = U_PARSE_CONTEXT_LEN - 1;
878            if(Character.isHighSurrogate(rules.charAt(ruleIndex + length - 1))) {
879                --length;
880            }
881        }
882        msg.append(rules, ruleIndex, ruleIndex + length);
883        return msg.append('\"').toString();
884    }
885
886    /**
887     * ASCII [:P:] and [:S:]:
888     * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
889     */
890    private static boolean isSyntaxChar(int c) {
891        return 0x21 <= c && c <= 0x7e &&
892                (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
893                (0x5b <= c && c <= 0x60) || (0x7b <= c));
894    }
895
896    private int skipWhiteSpace(int i) {
897        while(i < rules.length() && PatternProps.isWhiteSpace(rules.charAt(i))) {
898            ++i;
899        }
900        return i;
901    }
902
903    private Normalizer2 nfd = Normalizer2.getNFDInstance();
904    private Normalizer2 nfc = Normalizer2.getNFCInstance();
905
906    private String rules;
907    private final CollationData baseData;
908    private CollationSettings settings;
909
910    private Sink sink;
911    private Importer importer;
912
913    private int ruleIndex;
914}
915