/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationruleparser.h
*
* created on: 2013apr10
* created by: Markus W. Scherer
*/
11
12#ifndef __COLLATIONRULEPARSER_H__
13#define __COLLATIONRULEPARSER_H__
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_COLLATION
18
19#include "unicode/ucol.h"
20#include "unicode/uniset.h"
21#include "unicode/unistr.h"
22
// Forward declaration only: this header refers to UParseError exclusively
// through pointers, so the full definition is not needed here.
struct UParseError;
24
25U_NAMESPACE_BEGIN
26
// Forward declarations. Within this header these types are used only via
// pointers or references, so their definitions are not required here.

// Used by the CollationRuleParser constructor and its baseData member.
struct CollationData;
// Not referenced by any declaration visible in this header.
struct CollationTailoring;

// Not referenced by any declaration visible in this header.
class Locale;
// Used for the nfd/nfc reference members of CollationRuleParser.
class Normalizer2;

// Used by CollationRuleParser::parse() and the settings member.
struct CollationSettings;
34
35class U_I18N_API CollationRuleParser : public UMemory {
36public:
37    /** Special reset positions. */
38    enum Position {
39        FIRST_TERTIARY_IGNORABLE,
40        LAST_TERTIARY_IGNORABLE,
41        FIRST_SECONDARY_IGNORABLE,
42        LAST_SECONDARY_IGNORABLE,
43        FIRST_PRIMARY_IGNORABLE,
44        LAST_PRIMARY_IGNORABLE,
45        FIRST_VARIABLE,
46        LAST_VARIABLE,
47        FIRST_REGULAR,
48        LAST_REGULAR,
49        FIRST_IMPLICIT,
50        LAST_IMPLICIT,
51        FIRST_TRAILING,
52        LAST_TRAILING
53    };
54
55    /**
56     * First character of contractions that encode special reset positions.
57     * U+FFFE cannot be tailored via rule syntax.
58     *
59     * The second contraction character is POS_BASE + Position.
60     */
61    static const UChar POS_LEAD = 0xfffe;
62    /**
63     * Base for the second character of contractions that encode special reset positions.
64     * Braille characters U+28xx are printable and normalization-inert.
65     * @see POS_LEAD
66     */
67    static const UChar POS_BASE = 0x2800;
68
69    class U_I18N_API Sink : public UObject {
70    public:
71        virtual ~Sink();
72        /**
73         * Adds a reset.
74         * strength=UCOL_IDENTICAL for &str.
75         * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
76         */
77        virtual void addReset(int32_t strength, const UnicodeString &str,
78                              const char *&errorReason, UErrorCode &errorCode) = 0;
79        /**
80         * Adds a relation with strength and prefix | str / extension.
81         */
82        virtual void addRelation(int32_t strength, const UnicodeString &prefix,
83                                 const UnicodeString &str, const UnicodeString &extension,
84                                 const char *&errorReason, UErrorCode &errorCode) = 0;
85
86        virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
87                                          UErrorCode &errorCode);
88
89        virtual void optimize(const UnicodeSet &set, const char *&errorReason,
90                              UErrorCode &errorCode);
91    };
92
93    class U_I18N_API Importer : public UObject {
94    public:
95        virtual ~Importer();
96        virtual const UnicodeString *getRules(
97                const char *localeID, const char *collationType,
98                const char *&errorReason, UErrorCode &errorCode) = 0;
99    };
100
101    /**
102     * Constructor.
103     * The Sink must be set before parsing.
104     * The Importer can be set, otherwise [import locale] syntax is not supported.
105     */
106    CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
107    ~CollationRuleParser();
108
109    /**
110     * Sets the pointer to a Sink object.
111     * The pointer is aliased: Pointer copy without cloning or taking ownership.
112     */
113    void setSink(Sink *sinkAlias) {
114        sink = sinkAlias;
115    }
116
117    /**
118     * Sets the pointer to an Importer object.
119     * The pointer is aliased: Pointer copy without cloning or taking ownership.
120     */
121    void setImporter(Importer *importerAlias) {
122        importer = importerAlias;
123    }
124
125    void parse(const UnicodeString &ruleString,
126               CollationSettings &outSettings,
127               UParseError *outParseError,
128               UErrorCode &errorCode);
129
130    const char *getErrorReason() const { return errorReason; }
131
132    /**
133     * Gets a script or reorder code from its string representation.
134     * @return the script/reorder code, or
135     * -1==UCOL_REORDER_CODE_DEFAULT, or
136     * -2 if not recognized
137     */
138    static int32_t getReorderCode(const char *word);
139
140private:
141    /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
142    static const int32_t STRENGTH_MASK = 0xf;
143    static const int32_t STARRED_FLAG = 0x10;
144    static const int32_t OFFSET_SHIFT = 8;
145
146    void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
147    void parseRuleChain(UErrorCode &errorCode);
148    int32_t parseResetAndPosition(UErrorCode &errorCode);
149    int32_t parseRelationOperator(UErrorCode &errorCode);
150    void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
151    void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
152    int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
153    int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
154
155    /**
156     * Sets str to a contraction of U+FFFE and (U+2800 + Position).
157     * @return rule index after the special reset position
158     */
159    int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
160    void parseSetting(UErrorCode &errorCode);
161    void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
162    static UColAttributeValue getOnOffValue(const UnicodeString &s);
163
164    int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
165    int32_t readWords(int32_t i, UnicodeString &raw) const;
166    int32_t skipComment(int32_t i) const;
167
168    void setParseError(const char *reason, UErrorCode &errorCode);
169    void setErrorContext();
170
171    /**
172     * ASCII [:P:] and [:S:]:
173     * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
174     */
175    static UBool isSyntaxChar(UChar32 c);
176    int32_t skipWhiteSpace(int32_t i) const;
177
178    const Normalizer2 &nfd, &nfc;
179
180    const UnicodeString *rules;
181    const CollationData *const baseData;
182    CollationSettings *settings;
183    UParseError *parseError;
184    const char *errorReason;
185
186    Sink *sink;
187    Importer *importer;
188
189    int32_t ruleIndex;
190};
191
192U_NAMESPACE_END
193
194#endif  // !UCONFIG_NO_COLLATION
195#endif  // __COLLATIONRULEPARSER_H__
196