1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2013-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* collationruleparser.cpp
9*
10* (replaced the former ucol_tok.cpp)
11*
12* created on: 2013apr10
13* created by: Markus W. Scherer
14*/
15
16#include "unicode/utypes.h"
17
18#if !UCONFIG_NO_COLLATION
19
20#include "unicode/normalizer2.h"
21#include "unicode/parseerr.h"
22#include "unicode/uchar.h"
23#include "unicode/ucol.h"
24#include "unicode/uloc.h"
25#include "unicode/unistr.h"
26#include "unicode/utf16.h"
27#include "charstr.h"
28#include "cmemory.h"
29#include "collation.h"
30#include "collationdata.h"
31#include "collationruleparser.h"
32#include "collationsettings.h"
33#include "collationtailoring.h"
34#include "cstring.h"
35#include "patternprops.h"
36#include "uassert.h"
37#include "uvectr32.h"
38
39U_NAMESPACE_BEGIN
40
41namespace {
42
43static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
44const int32_t BEFORE_LENGTH = 7;
45
46}  // namespace
47
48CollationRuleParser::Sink::~Sink() {}
49
50void
51CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52
53void
54CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55
56CollationRuleParser::Importer::~Importer() {}
57
58CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59        : nfd(*Normalizer2::getNFDInstance(errorCode)),
60          nfc(*Normalizer2::getNFCInstance(errorCode)),
61          rules(NULL), baseData(base), settings(NULL),
62          parseError(NULL), errorReason(NULL),
63          sink(NULL), importer(NULL),
64          ruleIndex(0) {
65}
66
67CollationRuleParser::~CollationRuleParser() {
68}
69
70void
71CollationRuleParser::parse(const UnicodeString &ruleString,
72                           CollationSettings &outSettings,
73                           UParseError *outParseError,
74                           UErrorCode &errorCode) {
75    if(U_FAILURE(errorCode)) { return; }
76    settings = &outSettings;
77    parseError = outParseError;
78    if(parseError != NULL) {
79        parseError->line = 0;
80        parseError->offset = -1;
81        parseError->preContext[0] = 0;
82        parseError->postContext[0] = 0;
83    }
84    errorReason = NULL;
85    parse(ruleString, errorCode);
86}
87
88void
89CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90    if(U_FAILURE(errorCode)) { return; }
91    rules = &ruleString;
92    ruleIndex = 0;
93
94    while(ruleIndex < rules->length()) {
95        UChar c = rules->charAt(ruleIndex);
96        if(PatternProps::isWhiteSpace(c)) {
97            ++ruleIndex;
98            continue;
99        }
100        switch(c) {
101        case 0x26:  // '&'
102            parseRuleChain(errorCode);
103            break;
104        case 0x5b:  // '['
105            parseSetting(errorCode);
106            break;
107        case 0x23:  // '#' starts a comment, until the end of the line
108            ruleIndex = skipComment(ruleIndex + 1);
109            break;
110        case 0x40:  // '@' is equivalent to [backwards 2]
111            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112                              UCOL_ON, 0, errorCode);
113            ++ruleIndex;
114            break;
115        case 0x21:  // '!' used to turn on Thai/Lao character reversal
116            // Accept but ignore. The root collator has contractions
117            // that are equivalent to the character reversal, where appropriate.
118            ++ruleIndex;
119            break;
120        default:
121            setParseError("expected a reset or setting or comment", errorCode);
122            break;
123        }
124        if(U_FAILURE(errorCode)) { return; }
125    }
126}
127
128void
129CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130    int32_t resetStrength = parseResetAndPosition(errorCode);
131    UBool isFirstRelation = TRUE;
132    for(;;) {
133        int32_t result = parseRelationOperator(errorCode);
134        if(U_FAILURE(errorCode)) { return; }
135        if(result < 0) {
136            if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137                // '#' starts a comment, until the end of the line
138                ruleIndex = skipComment(ruleIndex + 1);
139                continue;
140            }
141            if(isFirstRelation) {
142                setParseError("reset not followed by a relation", errorCode);
143            }
144            return;
145        }
146        int32_t strength = result & STRENGTH_MASK;
147        if(resetStrength < UCOL_IDENTICAL) {
148            // reset-before rule chain
149            if(isFirstRelation) {
150                if(strength != resetStrength) {
151                    setParseError("reset-before strength differs from its first relation", errorCode);
152                    return;
153                }
154            } else {
155                if(strength < resetStrength) {
156                    setParseError("reset-before strength followed by a stronger relation", errorCode);
157                    return;
158                }
159            }
160        }
161        int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
162        if((result & STARRED_FLAG) == 0) {
163            parseRelationStrings(strength, i, errorCode);
164        } else {
165            parseStarredCharacters(strength, i, errorCode);
166        }
167        if(U_FAILURE(errorCode)) { return; }
168        isFirstRelation = FALSE;
169    }
170}
171
172int32_t
173CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175    int32_t i = skipWhiteSpace(ruleIndex + 1);
176    int32_t j;
177    UChar c;
178    int32_t resetStrength;
179    if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180            (j = i + BEFORE_LENGTH) < rules->length() &&
181            PatternProps::isWhiteSpace(rules->charAt(j)) &&
182            ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183            0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184            rules->charAt(j + 1) == 0x5d) {
185        // &[before n] with n=1 or 2 or 3
186        resetStrength = UCOL_PRIMARY + (c - 0x31);
187        i = skipWhiteSpace(j + 2);
188    } else {
189        resetStrength = UCOL_IDENTICAL;
190    }
191    if(i >= rules->length()) {
192        setParseError("reset without position", errorCode);
193        return UCOL_DEFAULT;
194    }
195    UnicodeString str;
196    if(rules->charAt(i) == 0x5b) {  // '['
197        i = parseSpecialPosition(i, str, errorCode);
198    } else {
199        i = parseTailoringString(i, str, errorCode);
200    }
201    sink->addReset(resetStrength, str, errorReason, errorCode);
202    if(U_FAILURE(errorCode)) { setErrorContext(); }
203    ruleIndex = i;
204    return resetStrength;
205}
206
207int32_t
208CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210    ruleIndex = skipWhiteSpace(ruleIndex);
211    if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212    int32_t strength;
213    int32_t i = ruleIndex;
214    UChar c = rules->charAt(i++);
215    switch(c) {
216    case 0x3c:  // '<'
217        if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
218            ++i;
219            if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
220                ++i;
221                if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
222                    ++i;
223                    strength = UCOL_QUATERNARY;
224                } else {
225                    strength = UCOL_TERTIARY;
226                }
227            } else {
228                strength = UCOL_SECONDARY;
229            }
230        } else {
231            strength = UCOL_PRIMARY;
232        }
233        if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
234            ++i;
235            strength |= STARRED_FLAG;
236        }
237        break;
238    case 0x3b:  // ';' same as <<
239        strength = UCOL_SECONDARY;
240        break;
241    case 0x2c:  // ',' same as <<<
242        strength = UCOL_TERTIARY;
243        break;
244    case 0x3d:  // '='
245        strength = UCOL_IDENTICAL;
246        if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
247            ++i;
248            strength |= STARRED_FLAG;
249        }
250        break;
251    default:
252        return UCOL_DEFAULT;
253    }
254    return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
255}
256
257void
258CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259    // Parse
260    //     prefix | str / extension
261    // where prefix and extension are optional.
262    UnicodeString prefix, str, extension;
263    i = parseTailoringString(i, str, errorCode);
264    if(U_FAILURE(errorCode)) { return; }
265    UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266    if(next == 0x7c) {  // '|' separates the context prefix from the string.
267        prefix = str;
268        i = parseTailoringString(i + 1, str, errorCode);
269        if(U_FAILURE(errorCode)) { return; }
270        next = (i < rules->length()) ? rules->charAt(i) : 0;
271    }
272    if(next == 0x2f) {  // '/' separates the string from the extension.
273        i = parseTailoringString(i + 1, extension, errorCode);
274    }
275    if(!prefix.isEmpty()) {
276        UChar32 prefix0 = prefix.char32At(0);
277        UChar32 c = str.char32At(0);
278        if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279            setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280                          errorCode);
281            return;
282        }
283    }
284    sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285    if(U_FAILURE(errorCode)) { setErrorContext(); }
286    ruleIndex = i;
287}
288
289void
290CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291    UnicodeString empty, raw;
292    i = parseString(skipWhiteSpace(i), raw, errorCode);
293    if(U_FAILURE(errorCode)) { return; }
294    if(raw.isEmpty()) {
295        setParseError("missing starred-relation string", errorCode);
296        return;
297    }
298    UChar32 prev = -1;
299    int32_t j = 0;
300    for(;;) {
301        while(j < raw.length()) {
302            UChar32 c = raw.char32At(j);
303            if(!nfd.isInert(c)) {
304                setParseError("starred-relation string is not all NFD-inert", errorCode);
305                return;
306            }
307            sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308            if(U_FAILURE(errorCode)) {
309                setErrorContext();
310                return;
311            }
312            j += U16_LENGTH(c);
313            prev = c;
314        }
315        if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
316            break;
317        }
318        if(prev < 0) {
319            setParseError("range without start in starred-relation string", errorCode);
320            return;
321        }
322        i = parseString(i + 1, raw, errorCode);
323        if(U_FAILURE(errorCode)) { return; }
324        if(raw.isEmpty()) {
325            setParseError("range without end in starred-relation string", errorCode);
326            return;
327        }
328        UChar32 c = raw.char32At(0);
329        if(c < prev) {
330            setParseError("range start greater than end in starred-relation string", errorCode);
331            return;
332        }
333        // range prev-c
334        UnicodeString s;
335        while(++prev <= c) {
336            if(!nfd.isInert(prev)) {
337                setParseError("starred-relation string range is not all NFD-inert", errorCode);
338                return;
339            }
340            if(U_IS_SURROGATE(prev)) {
341                setParseError("starred-relation string range contains a surrogate", errorCode);
342                return;
343            }
344            if(0xfffd <= prev && prev <= 0xffff) {
345                setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346                return;
347            }
348            s.setTo(prev);
349            sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350            if(U_FAILURE(errorCode)) {
351                setErrorContext();
352                return;
353            }
354        }
355        prev = -1;
356        j = U16_LENGTH(c);
357    }
358    ruleIndex = skipWhiteSpace(i);
359}
360
361int32_t
362CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363    i = parseString(skipWhiteSpace(i), raw, errorCode);
364    if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365        setParseError("missing relation string", errorCode);
366    }
367    return skipWhiteSpace(i);
368}
369
370int32_t
371CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372    if(U_FAILURE(errorCode)) { return i; }
373    raw.remove();
374    while(i < rules->length()) {
375        UChar32 c = rules->charAt(i++);
376        if(isSyntaxChar(c)) {
377            if(c == 0x27) {  // apostrophe
378                if(i < rules->length() && rules->charAt(i) == 0x27) {
379                    // Double apostrophe, encodes a single one.
380                    raw.append((UChar)0x27);
381                    ++i;
382                    continue;
383                }
384                // Quote literal text until the next single apostrophe.
385                for(;;) {
386                    if(i == rules->length()) {
387                        setParseError("quoted literal text missing terminating apostrophe", errorCode);
388                        return i;
389                    }
390                    c = rules->charAt(i++);
391                    if(c == 0x27) {
392                        if(i < rules->length() && rules->charAt(i) == 0x27) {
393                            // Double apostrophe inside quoted literal text,
394                            // still encodes a single apostrophe.
395                            ++i;
396                        } else {
397                            break;
398                        }
399                    }
400                    raw.append((UChar)c);
401                }
402            } else if(c == 0x5c) {  // backslash
403                if(i == rules->length()) {
404                    setParseError("backslash escape at the end of the rule string", errorCode);
405                    return i;
406                }
407                c = rules->char32At(i);
408                raw.append(c);
409                i += U16_LENGTH(c);
410            } else {
411                // Any other syntax character terminates a string.
412                --i;
413                break;
414            }
415        } else if(PatternProps::isWhiteSpace(c)) {
416            // Unquoted white space terminates a string.
417            --i;
418            break;
419        } else {
420            raw.append((UChar)c);
421        }
422    }
423    for(int32_t j = 0; j < raw.length();) {
424        UChar32 c = raw.char32At(j);
425        if(U_IS_SURROGATE(c)) {
426            setParseError("string contains an unpaired surrogate", errorCode);
427            return i;
428        }
429        if(0xfffd <= c && c <= 0xffff) {
430            setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431            return i;
432        }
433        j += U16_LENGTH(c);
434    }
435    return i;
436}
437
namespace {

// Names of the special reset positions, as written between [brackets].
// The order matters: parseSpecialPosition() encodes a position
// as POS_BASE plus its index in this table.
const char *const positions[] = {
    "first tertiary ignorable",
    "last tertiary ignorable",
    "first secondary ignorable",
    "last secondary ignorable",
    "first primary ignorable",
    "last primary ignorable",
    "first variable",
    "last variable",
    "first regular",
    "last regular",
    "first implicit",
    "last implicit",
    "first trailing",
    "last trailing"
};

}  // namespace
458
459int32_t
460CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461    if(U_FAILURE(errorCode)) { return 0; }
462    UnicodeString raw;
463    int32_t j = readWords(i + 1, raw);
464    if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
465        ++j;
466        for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
467            if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468                str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469                return j;
470            }
471        }
472        if(raw == UNICODE_STRING_SIMPLE("top")) {
473            str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474            return j;
475        }
476        if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477            str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478            return j;
479        }
480    }
481    setParseError("not a valid special reset position", errorCode);
482    return i;
483}
484
485void
486CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487    if(U_FAILURE(errorCode)) { return; }
488    UnicodeString raw;
489    int32_t i = ruleIndex + 1;
490    int32_t j = readWords(i, raw);
491    if(j <= i || raw.isEmpty()) {
492        setParseError("expected a setting/option at '['", errorCode);
493    }
494    if(rules->charAt(j) == 0x5d) {  // words end with ]
495        ++j;
496        if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497                (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498            parseReordering(raw, errorCode);
499            ruleIndex = j;
500            return;
501        }
502        if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503            settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504                              UCOL_ON, 0, errorCode);
505            ruleIndex = j;
506            return;
507        }
508        UnicodeString v;
509        int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510        if(valueIndex >= 0) {
511            v.setTo(raw, valueIndex + 1);
512            raw.truncate(valueIndex);
513        }
514        if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515            int32_t value = UCOL_DEFAULT;
516            UChar c = v.charAt(0);
517            if(0x31 <= c && c <= 0x34) {  // 1..4
518                value = UCOL_PRIMARY + (c - 0x31);
519            } else if(c == 0x49) {  // 'I'
520                value = UCOL_IDENTICAL;
521            }
522            if(value != UCOL_DEFAULT) {
523                settings->setStrength(value, 0, errorCode);
524                ruleIndex = j;
525                return;
526            }
527        } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528            UColAttributeValue value = UCOL_DEFAULT;
529            if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530                value = UCOL_NON_IGNORABLE;
531            } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532                value = UCOL_SHIFTED;
533            }
534            if(value != UCOL_DEFAULT) {
535                settings->setAlternateHandling(value, 0, errorCode);
536                ruleIndex = j;
537                return;
538            }
539        } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540            int32_t value = UCOL_DEFAULT;
541            if(v == UNICODE_STRING_SIMPLE("space")) {
542                value = CollationSettings::MAX_VAR_SPACE;
543            } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544                value = CollationSettings::MAX_VAR_PUNCT;
545            } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546                value = CollationSettings::MAX_VAR_SYMBOL;
547            } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548                value = CollationSettings::MAX_VAR_CURRENCY;
549            }
550            if(value != UCOL_DEFAULT) {
551                settings->setMaxVariable(value, 0, errorCode);
552                settings->variableTop = baseData->getLastPrimaryForGroup(
553                    UCOL_REORDER_CODE_FIRST + value);
554                U_ASSERT(settings->variableTop != 0);
555                ruleIndex = j;
556                return;
557            }
558        } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559            UColAttributeValue value = UCOL_DEFAULT;
560            if(v == UNICODE_STRING_SIMPLE("off")) {
561                value = UCOL_OFF;
562            } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563                value = UCOL_LOWER_FIRST;
564            } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565                value = UCOL_UPPER_FIRST;
566            }
567            if(value != UCOL_DEFAULT) {
568                settings->setCaseFirst(value, 0, errorCode);
569                ruleIndex = j;
570                return;
571            }
572        } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573            UColAttributeValue value = getOnOffValue(v);
574            if(value != UCOL_DEFAULT) {
575                settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576                ruleIndex = j;
577                return;
578            }
579        } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580            UColAttributeValue value = getOnOffValue(v);
581            if(value != UCOL_DEFAULT) {
582                settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583                ruleIndex = j;
584                return;
585            }
586        } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587            UColAttributeValue value = getOnOffValue(v);
588            if(value != UCOL_DEFAULT) {
589                settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590                ruleIndex = j;
591                return;
592            }
593        } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594            UColAttributeValue value = getOnOffValue(v);
595            if(value != UCOL_DEFAULT) {
596                if(value == UCOL_ON) {
597                    setParseError("[hiraganaQ on] is not supported", errorCode);
598                }
599                ruleIndex = j;
600                return;
601            }
602        } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603            CharString lang;
604            lang.appendInvariantChars(v, errorCode);
605            if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606            // BCP 47 language tag -> ICU locale ID
607            char localeID[ULOC_FULLNAME_CAPACITY];
608            int32_t parsedLength;
609            int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610                                                 &parsedLength, &errorCode);
611            if(U_FAILURE(errorCode) ||
612                    parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613                errorCode = U_ZERO_ERROR;
614                setParseError("expected language tag in [import langTag]", errorCode);
615                return;
616            }
617            // localeID minus all keywords
618            char baseID[ULOC_FULLNAME_CAPACITY];
619            length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620            if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621                errorCode = U_ZERO_ERROR;
622                setParseError("expected language tag in [import langTag]", errorCode);
623                return;
624            }
625            if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
626                uprv_strcpy(baseID, "root");
627            }
628            // @collation=type, or length=0 if not specified
629            char collationType[ULOC_KEYWORDS_CAPACITY];
630            length = uloc_getKeywordValue(localeID, "collation",
631                                          collationType, ULOC_KEYWORDS_CAPACITY,
632                                          &errorCode);
633            if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
634                errorCode = U_ZERO_ERROR;
635                setParseError("expected language tag in [import langTag]", errorCode);
636                return;
637            }
638            if(importer == NULL) {
639                setParseError("[import langTag] is not supported", errorCode);
640            } else {
641                UnicodeString importedRules;
642                importer->getRules(baseID, length > 0 ? collationType : "standard",
643                                   importedRules, errorReason, errorCode);
644                if(U_FAILURE(errorCode)) {
645                    if(errorReason == NULL) {
646                        errorReason = "[import langTag] failed";
647                    }
648                    setErrorContext();
649                    return;
650                }
651                const UnicodeString *outerRules = rules;
652                int32_t outerRuleIndex = ruleIndex;
653                parse(importedRules, errorCode);
654                if(U_FAILURE(errorCode)) {
655                    if(parseError != NULL) {
656                        parseError->offset = outerRuleIndex;
657                    }
658                }
659                rules = outerRules;
660                ruleIndex = j;
661            }
662            return;
663        }
664    } else if(rules->charAt(j) == 0x5b) {  // words end with [
665        UnicodeSet set;
666        j = parseUnicodeSet(j, set, errorCode);
667        if(U_FAILURE(errorCode)) { return; }
668        if(raw == UNICODE_STRING_SIMPLE("optimize")) {
669            sink->optimize(set, errorReason, errorCode);
670            if(U_FAILURE(errorCode)) { setErrorContext(); }
671            ruleIndex = j;
672            return;
673        } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
674            sink->suppressContractions(set, errorReason, errorCode);
675            if(U_FAILURE(errorCode)) { setErrorContext(); }
676            ruleIndex = j;
677            return;
678        }
679    }
680    setParseError("not a valid setting/option", errorCode);
681}
682
683void
684CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
685    if(U_FAILURE(errorCode)) { return; }
686    int32_t i = 7;  // after "reorder"
687    if(i == raw.length()) {
688        // empty [reorder] with no codes
689        settings->resetReordering();
690        return;
691    }
692    // Parse the codes in [reorder aa bb cc].
693    UVector32 reorderCodes(errorCode);
694    if(U_FAILURE(errorCode)) { return; }
695    CharString word;
696    while(i < raw.length()) {
697        ++i;  // skip the word-separating space
698        int32_t limit = raw.indexOf((UChar)0x20, i);
699        if(limit < 0) { limit = raw.length(); }
700        word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
701        if(U_FAILURE(errorCode)) { return; }
702        int32_t code = getReorderCode(word.data());
703        if(code < 0) {
704            setParseError("unknown script or reorder code", errorCode);
705            return;
706        }
707        reorderCodes.addElement(code, errorCode);
708        if(U_FAILURE(errorCode)) { return; }
709        i = limit;
710    }
711    settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
712}
713
714static const char *const gSpecialReorderCodes[] = {
715    "space", "punct", "symbol", "currency", "digit"
716};
717
718int32_t
719CollationRuleParser::getReorderCode(const char *word) {
720    for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
721        if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
722            return UCOL_REORDER_CODE_FIRST + i;
723        }
724    }
725    int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
726    if(script >= 0) {
727        return script;
728    }
729    if(uprv_stricmp(word, "others") == 0) {
730        return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
731    }
732    return -1;
733}
734
735UColAttributeValue
736CollationRuleParser::getOnOffValue(const UnicodeString &s) {
737    if(s == UNICODE_STRING_SIMPLE("on")) {
738        return UCOL_ON;
739    } else if(s == UNICODE_STRING_SIMPLE("off")) {
740        return UCOL_OFF;
741    } else {
742        return UCOL_DEFAULT;
743    }
744}
745
746int32_t
747CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
748    // Collect a UnicodeSet pattern between a balanced pair of [brackets].
749    int32_t level = 0;
750    int32_t j = i;
751    for(;;) {
752        if(j == rules->length()) {
753            setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
754            return j;
755        }
756        UChar c = rules->charAt(j++);
757        if(c == 0x5b) {  // '['
758            ++level;
759        } else if(c == 0x5d) {  // ']'
760            if(--level == 0) { break; }
761        }
762    }
763    set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
764    if(U_FAILURE(errorCode)) {
765        errorCode = U_ZERO_ERROR;
766        setParseError("not a valid UnicodeSet pattern", errorCode);
767        return j;
768    }
769    j = skipWhiteSpace(j);
770    if(j == rules->length() || rules->charAt(j) != 0x5d) {
771        setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
772        return j;
773    }
774    return ++j;
775}
776
777int32_t
778CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
779    static const UChar sp = 0x20;
780    raw.remove();
781    i = skipWhiteSpace(i);
782    for(;;) {
783        if(i >= rules->length()) { return 0; }
784        UChar c = rules->charAt(i);
785        if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
786            if(raw.isEmpty()) { return i; }
787            if(raw.endsWith(&sp, 1)) {  // remove trailing space
788                raw.truncate(raw.length() - 1);
789            }
790            return i;
791        }
792        if(PatternProps::isWhiteSpace(c)) {
793            raw.append(sp);
794            i = skipWhiteSpace(i + 1);
795        } else {
796            raw.append(c);
797            ++i;
798        }
799    }
800}
801
802int32_t
803CollationRuleParser::skipComment(int32_t i) const {
804    // skip to past the newline
805    while(i < rules->length()) {
806        UChar c = rules->charAt(i++);
807        // LF or FF or CR or NEL or LS or PS
808        if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
809            // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
810            // NLF (new line function) = CR or LF or CR+LF or NEL.
811            // No need to collect all of CR+LF because a following LF will be ignored anyway.
812            break;
813        }
814    }
815    return i;
816}
817
818void
819CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
820    if(U_FAILURE(errorCode)) { return; }
821    // Error code consistent with the old parser (from ca. 2001),
822    // rather than U_PARSE_ERROR;
823    errorCode = U_INVALID_FORMAT_ERROR;
824    errorReason = reason;
825    if(parseError != NULL) { setErrorContext(); }
826}
827
828void
829CollationRuleParser::setErrorContext() {
830    if(parseError == NULL) { return; }
831
832    // Note: This relies on the calling code maintaining the ruleIndex
833    // at a position that is useful for debugging.
834    // For example, at the beginning of a reset or relation etc.
835    parseError->offset = ruleIndex;
836    parseError->line = 0;  // We are not counting line numbers.
837
838    // before ruleIndex
839    int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
840    if(start < 0) {
841        start = 0;
842    } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
843        ++start;
844    }
845    int32_t length = ruleIndex - start;
846    rules->extract(start, length, parseError->preContext);
847    parseError->preContext[length] = 0;
848
849    // starting from ruleIndex
850    length = rules->length() - ruleIndex;
851    if(length >= U_PARSE_CONTEXT_LEN) {
852        length = U_PARSE_CONTEXT_LEN - 1;
853        if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
854            --length;
855        }
856    }
857    rules->extract(ruleIndex, length, parseError->postContext);
858    parseError->postContext[length] = 0;
859}
860
861UBool
862CollationRuleParser::isSyntaxChar(UChar32 c) {
863    return 0x21 <= c && c <= 0x7e &&
864            (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
865            (0x5b <= c && c <= 0x60) || (0x7b <= c));
866}
867
868int32_t
869CollationRuleParser::skipWhiteSpace(int32_t i) const {
870    while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
871        ++i;
872    }
873    return i;
874}
875
876U_NAMESPACE_END
877
878#endif  // !UCONFIG_NO_COLLATION
879