1/*
2*******************************************************************************
3* Copyright (C) 2013-2015, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationruleparser.cpp
7*
8* (replaced the former ucol_tok.cpp)
9*
10* created on: 2013apr10
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/normalizer2.h"
19#include "unicode/parseerr.h"
20#include "unicode/uchar.h"
21#include "unicode/ucol.h"
22#include "unicode/uloc.h"
23#include "unicode/unistr.h"
24#include "unicode/utf16.h"
25#include "charstr.h"
26#include "cmemory.h"
27#include "collation.h"
28#include "collationdata.h"
29#include "collationruleparser.h"
30#include "collationsettings.h"
31#include "collationtailoring.h"
32#include "cstring.h"
33#include "patternprops.h"
34#include "uassert.h"
35#include "uvectr32.h"
36
37U_NAMESPACE_BEGIN
38
39namespace {
40
41static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
42const int32_t BEFORE_LENGTH = 7;
43
44}  // namespace
45
46CollationRuleParser::Sink::~Sink() {}
47
48void
49CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
50
51void
52CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
53
54CollationRuleParser::Importer::~Importer() {}
55
56CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
57        : nfd(*Normalizer2::getNFDInstance(errorCode)),
58          nfc(*Normalizer2::getNFCInstance(errorCode)),
59          rules(NULL), baseData(base), settings(NULL),
60          parseError(NULL), errorReason(NULL),
61          sink(NULL), importer(NULL),
62          ruleIndex(0) {
63}
64
65CollationRuleParser::~CollationRuleParser() {
66}
67
68void
69CollationRuleParser::parse(const UnicodeString &ruleString,
70                           CollationSettings &outSettings,
71                           UParseError *outParseError,
72                           UErrorCode &errorCode) {
73    if(U_FAILURE(errorCode)) { return; }
74    settings = &outSettings;
75    parseError = outParseError;
76    if(parseError != NULL) {
77        parseError->line = 0;
78        parseError->offset = -1;
79        parseError->preContext[0] = 0;
80        parseError->postContext[0] = 0;
81    }
82    errorReason = NULL;
83    parse(ruleString, errorCode);
84}
85
86void
87CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
88    if(U_FAILURE(errorCode)) { return; }
89    rules = &ruleString;
90    ruleIndex = 0;
91
92    while(ruleIndex < rules->length()) {
93        UChar c = rules->charAt(ruleIndex);
94        if(PatternProps::isWhiteSpace(c)) {
95            ++ruleIndex;
96            continue;
97        }
98        switch(c) {
99        case 0x26:  // '&'
100            parseRuleChain(errorCode);
101            break;
102        case 0x5b:  // '['
103            parseSetting(errorCode);
104            break;
105        case 0x23:  // '#' starts a comment, until the end of the line
106            ruleIndex = skipComment(ruleIndex + 1);
107            break;
108        case 0x40:  // '@' is equivalent to [backwards 2]
109            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
110                              UCOL_ON, 0, errorCode);
111            ++ruleIndex;
112            break;
113        case 0x21:  // '!' used to turn on Thai/Lao character reversal
114            // Accept but ignore. The root collator has contractions
115            // that are equivalent to the character reversal, where appropriate.
116            ++ruleIndex;
117            break;
118        default:
119            setParseError("expected a reset or setting or comment", errorCode);
120            break;
121        }
122        if(U_FAILURE(errorCode)) { return; }
123    }
124}
125
126void
127CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
128    int32_t resetStrength = parseResetAndPosition(errorCode);
129    UBool isFirstRelation = TRUE;
130    for(;;) {
131        int32_t result = parseRelationOperator(errorCode);
132        if(U_FAILURE(errorCode)) { return; }
133        if(result < 0) {
134            if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
135                // '#' starts a comment, until the end of the line
136                ruleIndex = skipComment(ruleIndex + 1);
137                continue;
138            }
139            if(isFirstRelation) {
140                setParseError("reset not followed by a relation", errorCode);
141            }
142            return;
143        }
144        int32_t strength = result & STRENGTH_MASK;
145        if(resetStrength < UCOL_IDENTICAL) {
146            // reset-before rule chain
147            if(isFirstRelation) {
148                if(strength != resetStrength) {
149                    setParseError("reset-before strength differs from its first relation", errorCode);
150                    return;
151                }
152            } else {
153                if(strength < resetStrength) {
154                    setParseError("reset-before strength followed by a stronger relation", errorCode);
155                    return;
156                }
157            }
158        }
159        int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
160        if((result & STARRED_FLAG) == 0) {
161            parseRelationStrings(strength, i, errorCode);
162        } else {
163            parseStarredCharacters(strength, i, errorCode);
164        }
165        if(U_FAILURE(errorCode)) { return; }
166        isFirstRelation = FALSE;
167    }
168}
169
170int32_t
171CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
172    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
173    int32_t i = skipWhiteSpace(ruleIndex + 1);
174    int32_t j;
175    UChar c;
176    int32_t resetStrength;
177    if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
178            (j = i + BEFORE_LENGTH) < rules->length() &&
179            PatternProps::isWhiteSpace(rules->charAt(j)) &&
180            ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
181            0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
182            rules->charAt(j + 1) == 0x5d) {
183        // &[before n] with n=1 or 2 or 3
184        resetStrength = UCOL_PRIMARY + (c - 0x31);
185        i = skipWhiteSpace(j + 2);
186    } else {
187        resetStrength = UCOL_IDENTICAL;
188    }
189    if(i >= rules->length()) {
190        setParseError("reset without position", errorCode);
191        return UCOL_DEFAULT;
192    }
193    UnicodeString str;
194    if(rules->charAt(i) == 0x5b) {  // '['
195        i = parseSpecialPosition(i, str, errorCode);
196    } else {
197        i = parseTailoringString(i, str, errorCode);
198    }
199    sink->addReset(resetStrength, str, errorReason, errorCode);
200    if(U_FAILURE(errorCode)) { setErrorContext(); }
201    ruleIndex = i;
202    return resetStrength;
203}
204
205int32_t
206CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
207    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
208    ruleIndex = skipWhiteSpace(ruleIndex);
209    if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
210    int32_t strength;
211    int32_t i = ruleIndex;
212    UChar c = rules->charAt(i++);
213    switch(c) {
214    case 0x3c:  // '<'
215        if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
216            ++i;
217            if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
218                ++i;
219                if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
220                    ++i;
221                    strength = UCOL_QUATERNARY;
222                } else {
223                    strength = UCOL_TERTIARY;
224                }
225            } else {
226                strength = UCOL_SECONDARY;
227            }
228        } else {
229            strength = UCOL_PRIMARY;
230        }
231        if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
232            ++i;
233            strength |= STARRED_FLAG;
234        }
235        break;
236    case 0x3b:  // ';' same as <<
237        strength = UCOL_SECONDARY;
238        break;
239    case 0x2c:  // ',' same as <<<
240        strength = UCOL_TERTIARY;
241        break;
242    case 0x3d:  // '='
243        strength = UCOL_IDENTICAL;
244        if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
245            ++i;
246            strength |= STARRED_FLAG;
247        }
248        break;
249    default:
250        return UCOL_DEFAULT;
251    }
252    return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
253}
254
255void
256CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
257    // Parse
258    //     prefix | str / extension
259    // where prefix and extension are optional.
260    UnicodeString prefix, str, extension;
261    i = parseTailoringString(i, str, errorCode);
262    if(U_FAILURE(errorCode)) { return; }
263    UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
264    if(next == 0x7c) {  // '|' separates the context prefix from the string.
265        prefix = str;
266        i = parseTailoringString(i + 1, str, errorCode);
267        if(U_FAILURE(errorCode)) { return; }
268        next = (i < rules->length()) ? rules->charAt(i) : 0;
269    }
270    if(next == 0x2f) {  // '/' separates the string from the extension.
271        i = parseTailoringString(i + 1, extension, errorCode);
272    }
273    if(!prefix.isEmpty()) {
274        UChar32 prefix0 = prefix.char32At(0);
275        UChar32 c = str.char32At(0);
276        if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
277            setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
278                          errorCode);
279            return;
280        }
281    }
282    sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
283    if(U_FAILURE(errorCode)) { setErrorContext(); }
284    ruleIndex = i;
285}
286
287void
288CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
289    UnicodeString empty, raw;
290    i = parseString(skipWhiteSpace(i), raw, errorCode);
291    if(U_FAILURE(errorCode)) { return; }
292    if(raw.isEmpty()) {
293        setParseError("missing starred-relation string", errorCode);
294        return;
295    }
296    UChar32 prev = -1;
297    int32_t j = 0;
298    for(;;) {
299        while(j < raw.length()) {
300            UChar32 c = raw.char32At(j);
301            if(!nfd.isInert(c)) {
302                setParseError("starred-relation string is not all NFD-inert", errorCode);
303                return;
304            }
305            sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
306            if(U_FAILURE(errorCode)) {
307                setErrorContext();
308                return;
309            }
310            j += U16_LENGTH(c);
311            prev = c;
312        }
313        if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
314            break;
315        }
316        if(prev < 0) {
317            setParseError("range without start in starred-relation string", errorCode);
318            return;
319        }
320        i = parseString(i + 1, raw, errorCode);
321        if(U_FAILURE(errorCode)) { return; }
322        if(raw.isEmpty()) {
323            setParseError("range without end in starred-relation string", errorCode);
324            return;
325        }
326        UChar32 c = raw.char32At(0);
327        if(c < prev) {
328            setParseError("range start greater than end in starred-relation string", errorCode);
329            return;
330        }
331        // range prev-c
332        UnicodeString s;
333        while(++prev <= c) {
334            if(!nfd.isInert(prev)) {
335                setParseError("starred-relation string range is not all NFD-inert", errorCode);
336                return;
337            }
338            if(U_IS_SURROGATE(prev)) {
339                setParseError("starred-relation string range contains a surrogate", errorCode);
340                return;
341            }
342            if(0xfffd <= prev && prev <= 0xffff) {
343                setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
344                return;
345            }
346            s.setTo(prev);
347            sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
348            if(U_FAILURE(errorCode)) {
349                setErrorContext();
350                return;
351            }
352        }
353        prev = -1;
354        j = U16_LENGTH(c);
355    }
356    ruleIndex = skipWhiteSpace(i);
357}
358
359int32_t
360CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
361    i = parseString(skipWhiteSpace(i), raw, errorCode);
362    if(U_SUCCESS(errorCode) && raw.isEmpty()) {
363        setParseError("missing relation string", errorCode);
364    }
365    return skipWhiteSpace(i);
366}
367
368int32_t
369CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
370    if(U_FAILURE(errorCode)) { return i; }
371    raw.remove();
372    while(i < rules->length()) {
373        UChar32 c = rules->charAt(i++);
374        if(isSyntaxChar(c)) {
375            if(c == 0x27) {  // apostrophe
376                if(i < rules->length() && rules->charAt(i) == 0x27) {
377                    // Double apostrophe, encodes a single one.
378                    raw.append((UChar)0x27);
379                    ++i;
380                    continue;
381                }
382                // Quote literal text until the next single apostrophe.
383                for(;;) {
384                    if(i == rules->length()) {
385                        setParseError("quoted literal text missing terminating apostrophe", errorCode);
386                        return i;
387                    }
388                    c = rules->charAt(i++);
389                    if(c == 0x27) {
390                        if(i < rules->length() && rules->charAt(i) == 0x27) {
391                            // Double apostrophe inside quoted literal text,
392                            // still encodes a single apostrophe.
393                            ++i;
394                        } else {
395                            break;
396                        }
397                    }
398                    raw.append((UChar)c);
399                }
400            } else if(c == 0x5c) {  // backslash
401                if(i == rules->length()) {
402                    setParseError("backslash escape at the end of the rule string", errorCode);
403                    return i;
404                }
405                c = rules->char32At(i);
406                raw.append(c);
407                i += U16_LENGTH(c);
408            } else {
409                // Any other syntax character terminates a string.
410                --i;
411                break;
412            }
413        } else if(PatternProps::isWhiteSpace(c)) {
414            // Unquoted white space terminates a string.
415            --i;
416            break;
417        } else {
418            raw.append((UChar)c);
419        }
420    }
421    for(int32_t j = 0; j < raw.length();) {
422        UChar32 c = raw.char32At(j);
423        if(U_IS_SURROGATE(c)) {
424            setParseError("string contains an unpaired surrogate", errorCode);
425            return i;
426        }
427        if(0xfffd <= c && c <= 0xffff) {
428            setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
429            return i;
430        }
431        j += U16_LENGTH(c);
432    }
433    return i;
434}
435
namespace {

// Names of the special reset positions, as written between [brackets].
// parseSpecialPosition() adds the array index of the matching name to POS_BASE,
// so the order of these entries matters.
const char *const positions[] = {
    // ignorables
    "first tertiary ignorable",
    "last tertiary ignorable",
    "first secondary ignorable",
    "last secondary ignorable",
    "first primary ignorable",
    "last primary ignorable",
    // variable and regular
    "first variable",
    "last variable",
    "first regular",
    "last regular",
    // implicit and trailing
    "first implicit",
    "last implicit",
    "first trailing",
    "last trailing"
};

}  // namespace
456
457int32_t
458CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
459    if(U_FAILURE(errorCode)) { return 0; }
460    UnicodeString raw;
461    int32_t j = readWords(i + 1, raw);
462    if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
463        ++j;
464        for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
465            if(raw == UnicodeString(positions[pos], -1, US_INV)) {
466                str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
467                return j;
468            }
469        }
470        if(raw == UNICODE_STRING_SIMPLE("top")) {
471            str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
472            return j;
473        }
474        if(raw == UNICODE_STRING_SIMPLE("variable top")) {
475            str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
476            return j;
477        }
478    }
479    setParseError("not a valid special reset position", errorCode);
480    return i;
481}
482
483void
484CollationRuleParser::parseSetting(UErrorCode &errorCode) {
485    if(U_FAILURE(errorCode)) { return; }
486    UnicodeString raw;
487    int32_t i = ruleIndex + 1;
488    int32_t j = readWords(i, raw);
489    if(j <= i || raw.isEmpty()) {
490        setParseError("expected a setting/option at '['", errorCode);
491    }
492    if(rules->charAt(j) == 0x5d) {  // words end with ]
493        ++j;
494        if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
495                (raw.length() == 7 || raw.charAt(7) == 0x20)) {
496            parseReordering(raw, errorCode);
497            ruleIndex = j;
498            return;
499        }
500        if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
501            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
502                              UCOL_ON, 0, errorCode);
503            ruleIndex = j;
504            return;
505        }
506        UnicodeString v;
507        int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
508        if(valueIndex >= 0) {
509            v.setTo(raw, valueIndex + 1);
510            raw.truncate(valueIndex);
511        }
512        if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
513            int32_t value = UCOL_DEFAULT;
514            UChar c = v.charAt(0);
515            if(0x31 <= c && c <= 0x34) {  // 1..4
516                value = UCOL_PRIMARY + (c - 0x31);
517            } else if(c == 0x49) {  // 'I'
518                value = UCOL_IDENTICAL;
519            }
520            if(value != UCOL_DEFAULT) {
521                settings->setStrength(value, 0, errorCode);
522                ruleIndex = j;
523                return;
524            }
525        } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
526            UColAttributeValue value = UCOL_DEFAULT;
527            if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
528                value = UCOL_NON_IGNORABLE;
529            } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
530                value = UCOL_SHIFTED;
531            }
532            if(value != UCOL_DEFAULT) {
533                settings->setAlternateHandling(value, 0, errorCode);
534                ruleIndex = j;
535                return;
536            }
537        } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
538            int32_t value = UCOL_DEFAULT;
539            if(v == UNICODE_STRING_SIMPLE("space")) {
540                value = CollationSettings::MAX_VAR_SPACE;
541            } else if(v == UNICODE_STRING_SIMPLE("punct")) {
542                value = CollationSettings::MAX_VAR_PUNCT;
543            } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
544                value = CollationSettings::MAX_VAR_SYMBOL;
545            } else if(v == UNICODE_STRING_SIMPLE("currency")) {
546                value = CollationSettings::MAX_VAR_CURRENCY;
547            }
548            if(value != UCOL_DEFAULT) {
549                settings->setMaxVariable(value, 0, errorCode);
550                settings->variableTop = baseData->getLastPrimaryForGroup(
551                    UCOL_REORDER_CODE_FIRST + value);
552                U_ASSERT(settings->variableTop != 0);
553                ruleIndex = j;
554                return;
555            }
556        } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
557            UColAttributeValue value = UCOL_DEFAULT;
558            if(v == UNICODE_STRING_SIMPLE("off")) {
559                value = UCOL_OFF;
560            } else if(v == UNICODE_STRING_SIMPLE("lower")) {
561                value = UCOL_LOWER_FIRST;
562            } else if(v == UNICODE_STRING_SIMPLE("upper")) {
563                value = UCOL_UPPER_FIRST;
564            }
565            if(value != UCOL_DEFAULT) {
566                settings->setCaseFirst(value, 0, errorCode);
567                ruleIndex = j;
568                return;
569            }
570        } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
571            UColAttributeValue value = getOnOffValue(v);
572            if(value != UCOL_DEFAULT) {
573                settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
574                ruleIndex = j;
575                return;
576            }
577        } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
578            UColAttributeValue value = getOnOffValue(v);
579            if(value != UCOL_DEFAULT) {
580                settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
581                ruleIndex = j;
582                return;
583            }
584        } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
585            UColAttributeValue value = getOnOffValue(v);
586            if(value != UCOL_DEFAULT) {
587                settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
588                ruleIndex = j;
589                return;
590            }
591        } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
592            UColAttributeValue value = getOnOffValue(v);
593            if(value != UCOL_DEFAULT) {
594                if(value == UCOL_ON) {
595                    setParseError("[hiraganaQ on] is not supported", errorCode);
596                }
597                ruleIndex = j;
598                return;
599            }
600        } else if(raw == UNICODE_STRING_SIMPLE("import")) {
601            CharString lang;
602            lang.appendInvariantChars(v, errorCode);
603            if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
604            // BCP 47 language tag -> ICU locale ID
605            char localeID[ULOC_FULLNAME_CAPACITY];
606            int32_t parsedLength;
607            int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
608                                                 &parsedLength, &errorCode);
609            if(U_FAILURE(errorCode) ||
610                    parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
611                errorCode = U_ZERO_ERROR;
612                setParseError("expected language tag in [import langTag]", errorCode);
613                return;
614            }
615            // localeID minus all keywords
616            char baseID[ULOC_FULLNAME_CAPACITY];
617            length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
618            if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
619                errorCode = U_ZERO_ERROR;
620                setParseError("expected language tag in [import langTag]", errorCode);
621                return;
622            }
623            if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
624                uprv_strcpy(baseID, "root");
625            }
626            // @collation=type, or length=0 if not specified
627            char collationType[ULOC_KEYWORDS_CAPACITY];
628            length = uloc_getKeywordValue(localeID, "collation",
629                                          collationType, ULOC_KEYWORDS_CAPACITY,
630                                          &errorCode);
631            if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
632                errorCode = U_ZERO_ERROR;
633                setParseError("expected language tag in [import langTag]", errorCode);
634                return;
635            }
636            if(importer == NULL) {
637                setParseError("[import langTag] is not supported", errorCode);
638            } else {
639                UnicodeString importedRules;
640                importer->getRules(baseID, length > 0 ? collationType : "standard",
641                                   importedRules, errorReason, errorCode);
642                if(U_FAILURE(errorCode)) {
643                    if(errorReason == NULL) {
644                        errorReason = "[import langTag] failed";
645                    }
646                    setErrorContext();
647                    return;
648                }
649                const UnicodeString *outerRules = rules;
650                int32_t outerRuleIndex = ruleIndex;
651                parse(importedRules, errorCode);
652                if(U_FAILURE(errorCode)) {
653                    if(parseError != NULL) {
654                        parseError->offset = outerRuleIndex;
655                    }
656                }
657                rules = outerRules;
658                ruleIndex = j;
659            }
660            return;
661        }
662    } else if(rules->charAt(j) == 0x5b) {  // words end with [
663        UnicodeSet set;
664        j = parseUnicodeSet(j, set, errorCode);
665        if(U_FAILURE(errorCode)) { return; }
666        if(raw == UNICODE_STRING_SIMPLE("optimize")) {
667            sink->optimize(set, errorReason, errorCode);
668            if(U_FAILURE(errorCode)) { setErrorContext(); }
669            ruleIndex = j;
670            return;
671        } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
672            sink->suppressContractions(set, errorReason, errorCode);
673            if(U_FAILURE(errorCode)) { setErrorContext(); }
674            ruleIndex = j;
675            return;
676        }
677    }
678    setParseError("not a valid setting/option", errorCode);
679}
680
681void
682CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
683    if(U_FAILURE(errorCode)) { return; }
684    int32_t i = 7;  // after "reorder"
685    if(i == raw.length()) {
686        // empty [reorder] with no codes
687        settings->resetReordering();
688        return;
689    }
690    // Parse the codes in [reorder aa bb cc].
691    UVector32 reorderCodes(errorCode);
692    if(U_FAILURE(errorCode)) { return; }
693    CharString word;
694    while(i < raw.length()) {
695        ++i;  // skip the word-separating space
696        int32_t limit = raw.indexOf((UChar)0x20, i);
697        if(limit < 0) { limit = raw.length(); }
698        word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
699        if(U_FAILURE(errorCode)) { return; }
700        int32_t code = getReorderCode(word.data());
701        if(code < 0) {
702            setParseError("unknown script or reorder code", errorCode);
703            return;
704        }
705        reorderCodes.addElement(code, errorCode);
706        if(U_FAILURE(errorCode)) { return; }
707        i = limit;
708    }
709    settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
710}
711
// Names of the special reorder groups.
// getReorderCode() maps the name at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order of these entries matters.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
715
716int32_t
717CollationRuleParser::getReorderCode(const char *word) {
718    for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
719        if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
720            return UCOL_REORDER_CODE_FIRST + i;
721        }
722    }
723    int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
724    if(script >= 0) {
725        return script;
726    }
727    if(uprv_stricmp(word, "others") == 0) {
728        return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
729    }
730    return -1;
731}
732
733UColAttributeValue
734CollationRuleParser::getOnOffValue(const UnicodeString &s) {
735    if(s == UNICODE_STRING_SIMPLE("on")) {
736        return UCOL_ON;
737    } else if(s == UNICODE_STRING_SIMPLE("off")) {
738        return UCOL_OFF;
739    } else {
740        return UCOL_DEFAULT;
741    }
742}
743
744int32_t
745CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
746    // Collect a UnicodeSet pattern between a balanced pair of [brackets].
747    int32_t level = 0;
748    int32_t j = i;
749    for(;;) {
750        if(j == rules->length()) {
751            setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
752            return j;
753        }
754        UChar c = rules->charAt(j++);
755        if(c == 0x5b) {  // '['
756            ++level;
757        } else if(c == 0x5d) {  // ']'
758            if(--level == 0) { break; }
759        }
760    }
761    set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
762    if(U_FAILURE(errorCode)) {
763        errorCode = U_ZERO_ERROR;
764        setParseError("not a valid UnicodeSet pattern", errorCode);
765        return j;
766    }
767    j = skipWhiteSpace(j);
768    if(j == rules->length() || rules->charAt(j) != 0x5d) {
769        setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
770        return j;
771    }
772    return ++j;
773}
774
775int32_t
776CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
777    static const UChar sp = 0x20;
778    raw.remove();
779    i = skipWhiteSpace(i);
780    for(;;) {
781        if(i >= rules->length()) { return 0; }
782        UChar c = rules->charAt(i);
783        if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
784            if(raw.isEmpty()) { return i; }
785            if(raw.endsWith(&sp, 1)) {  // remove trailing space
786                raw.truncate(raw.length() - 1);
787            }
788            return i;
789        }
790        if(PatternProps::isWhiteSpace(c)) {
791            raw.append(0x20);
792            i = skipWhiteSpace(i + 1);
793        } else {
794            raw.append(c);
795            ++i;
796        }
797    }
798}
799
800int32_t
801CollationRuleParser::skipComment(int32_t i) const {
802    // skip to past the newline
803    while(i < rules->length()) {
804        UChar c = rules->charAt(i++);
805        // LF or FF or CR or NEL or LS or PS
806        if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
807            // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
808            // NLF (new line function) = CR or LF or CR+LF or NEL.
809            // No need to collect all of CR+LF because a following LF will be ignored anyway.
810            break;
811        }
812    }
813    return i;
814}
815
816void
817CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
818    if(U_FAILURE(errorCode)) { return; }
819    // Error code consistent with the old parser (from ca. 2001),
820    // rather than U_PARSE_ERROR;
821    errorCode = U_INVALID_FORMAT_ERROR;
822    errorReason = reason;
823    if(parseError != NULL) { setErrorContext(); }
824}
825
826void
827CollationRuleParser::setErrorContext() {
828    if(parseError == NULL) { return; }
829
830    // Note: This relies on the calling code maintaining the ruleIndex
831    // at a position that is useful for debugging.
832    // For example, at the beginning of a reset or relation etc.
833    parseError->offset = ruleIndex;
834    parseError->line = 0;  // We are not counting line numbers.
835
836    // before ruleIndex
837    int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
838    if(start < 0) {
839        start = 0;
840    } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
841        ++start;
842    }
843    int32_t length = ruleIndex - start;
844    rules->extract(start, length, parseError->preContext);
845    parseError->preContext[length] = 0;
846
847    // starting from ruleIndex
848    length = rules->length() - ruleIndex;
849    if(length >= U_PARSE_CONTEXT_LEN) {
850        length = U_PARSE_CONTEXT_LEN - 1;
851        if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
852            --length;
853        }
854    }
855    rules->extract(ruleIndex, length, parseError->postContext);
856    parseError->postContext[length] = 0;
857}
858
859UBool
860CollationRuleParser::isSyntaxChar(UChar32 c) {
861    return 0x21 <= c && c <= 0x7e &&
862            (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
863            (0x5b <= c && c <= 0x60) || (0x7b <= c));
864}
865
866int32_t
867CollationRuleParser::skipWhiteSpace(int32_t i) const {
868    while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
869        ++i;
870    }
871    return i;
872}
873
874U_NAMESPACE_END
875
876#endif  // !UCONFIG_NO_COLLATION
877