1/*
2**********************************************************************
3* Copyright (C) 1999-2011, International Business Machines Corporation
4* and others. All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   11/17/99    aliu        Creation.
8**********************************************************************
9*/
10#ifndef RBT_PARS_H
11#define RBT_PARS_H
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16#ifdef __cplusplus
17
18#include "unicode/uobject.h"
19#include "unicode/parseerr.h"
20#include "unicode/unorm.h"
21#include "rbt.h"
22#include "hash.h"
23#include "uvector.h"
24
25U_NAMESPACE_BEGIN
26
27class TransliterationRuleData;  // type of curData; dataVector is documented as holding these
28class UnicodeFunctor;           // adopted by generateStandInFor()
29class ParseData;                // type of the temporary parseData symbol table
30class RuleHalf;                 // declared a friend of TransliteratorParser
31class ParsePosition;            // passed by reference to parseSet()
32class StringMatcher;            // adopted by setSegmentObject()
33
34class TransliteratorParser : public UMemory {
35    // Rule-set parser: after parse() returns, results are read from the public data members below.
36 public:
37
38    /**
39     * PUBLIC data member.  A Vector of TransliterationRuleData objects, one
40     * for each discrete group of rules in the rule set.
41     */
42    UVector dataVector;
43
44    /**
45     * PUBLIC data member.
46     * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
47     */
48    UVector idBlockVector;
49
50    /**
51     * PUBLIC data member containing the parsed compound filter, if any.
52     */
53    UnicodeSet* compoundFilter;
54
55 private:
56
57    /**
58     * The current data object for which we are parsing rules.
59     */
60    TransliterationRuleData* curData;
61
62    UTransDirection direction;  // presumably the direction given to parse() -- confirm in the implementation
63
64    /**
65     * Parse error information.
66     */
67    UParseError parseError;
68
69    /**
70     * Temporary symbol table used during parsing.
71     */
72    ParseData* parseData;
73
74    /**
75     * Temporary vector of matcher variables.  When parsing is complete, this
76     * is copied into the array data.variables.  As with data.variables,
77     * element 0 corresponds to character data.variablesBase.
78     */
79    UVector variablesVector;
80
81    /**
82     * Temporary table of variable names.  When parsing is complete, this is
83     * copied into data.variableNames.
84     */
85    Hashtable variableNames;
86
87    /**
88     * String of standins for segments.  Used during the parsing of a single
89     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
90     * to StringMatcher object segmentObjects.elementAt(0), etc.
91     */
92    UnicodeString segmentStandins;
93
94    /**
95     * Vector of StringMatcher objects for segments.  Used during the
96     * parsing of a single rule.
97     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
98     * to StringMatcher object segmentObjects.elementAt(0), etc.
99     */
100    UVector segmentObjects;
101
102    /**
103     * The next available stand-in for variables.  This starts at some point in
104     * the private use area (discovered dynamically) and increments up toward
105     * <code>variableLimit</code>.  At any point during parsing, available
106     * variables are <code>variableNext..variableLimit-1</code>.
107     */
108    UChar variableNext;
109
110    /**
111     * The exclusive upper limit of the stand-ins available for variables.
112     * This is discovered dynamically.  At any point during parsing,
113     * available variables are <code>variableNext..variableLimit-1</code>.
114     */
115    UChar variableLimit;
116
117    /**
118     * When we encounter an undefined variable, we do not immediately signal
119     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
120     * Instead, we save the name of the undefined variable, and substitute
121     * in the placeholder char variableLimit - 1, and decrement
122     * variableLimit.
123     */
124    UnicodeString undefinedVariableName;
125
126    /**
127     * The stand-in character for the 'dot' set, represented by '.' in
128     * patterns.  This is allocated the first time it is needed, and
129     * reused thereafter.
130     */
131    UChar dotStandIn;
132
133public:
134
135    /**
136     * Constructor.
137     */
138    TransliteratorParser(UErrorCode &statusReturn);
139
140    /**
141     * Destructor.
142     */
143    ~TransliteratorParser();
144
145    /**
146     * Parse the given rule string, in the given direction.  Any previous
147     * rules are discarded.  Typically this method is called exactly once
148     * after construction.
149     * NOTE(review): @param rules gives ';' as the separator; whether a newline
150     * ('\n') also ends a rule is not visible in this header -- confirm.
151     *
152     * After this call returns, query the public data members (dataVector,
153     * idBlockVector, compoundFilter) for results.  The caller owns the rule data
154     * (presumably the dataVector elements -- confirm) and 'compoundFilter'.
155     * @param rules      rules, separated by ';'
156     * @param direction  either FORWARD or REVERSE.
157     * @param pe         Struct to receive information on position
158     *                   of error if an error is encountered
159     * @param ec         Output param set to success/failure code.
160     */
161    void parse(const UnicodeString& rules,
162               UTransDirection direction,
163               UParseError& pe,
164               UErrorCode& ec);
165
166    /**
167     * Return the compound filter parsed by parse().  Caller owns result.
168     * @return the compound filter parsed by parse().
169     */
170    UnicodeSet* orphanCompoundFilter();
171
172private:
173
174    /**
175     * Presumably the internal worker for parse(): parses 'rules' in the given direction -- confirm.
176     * @param rules      the rules to parse; a const input, not an output param.
177     * @param direction  either FORWARD or REVERSE.
178     */
179    void parseRules(const UnicodeString& rules,
180                    UTransDirection direction,
181                    UErrorCode& status);
182
183    /**
184     * MAIN PARSER.  Parse the next rule in the given rule string, starting
185     * at pos.  Return the index after the last character parsed.  Do not
186     * parse characters at or after limit.
187     *
188     * Important:  The character at pos must be a non-whitespace character
189     * that is not the comment character.
190     *
191     * This method handles quoting, escaping, and whitespace removal.  It
192     * parses the end-of-rule character.  It recognizes context and cursor
193     * indicators.  Once it does a lexical breakdown of the rule at pos, it
194     * creates a rule object and adds it to our rule list.
195     * @param rule       the rule string to parse (input).
196     * @param pos        the starting position.
197     * @param limit      index just past the last character of the rule.
198     * @return           the index after the last character parsed.
199     */
200    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
201
202    /**
203     * Set the variable range to [start, end] (inclusive).
204     * @param start    the start value of the range.
205     * @param end      the end value of the range.
206     */
207    void setVariableRange(int32_t start, int32_t end, UErrorCode& status);
208
209    /**
210     * Assert that the given character is NOT within the variable range.
211     * If it is, return FALSE.  This is necessary to ensure that the
212     * variable range does not overlap characters used in a rule.
213     * @param ch     the given character.
214     * @return       True, if the given character is NOT within the variable range.
215     */
216    UBool checkVariableRange(UChar32 ch) const;
217
218    /**
219     * Set the maximum backup to 'backup', in response to a pragma
220     * statement.
221     * @param backup    the new value to be set.
222     */
223    void pragmaMaximumBackup(int32_t backup);
224
225    /**
226     * Begin normalizing all rules using the given mode, in response
227     * to a pragma statement.
228     * @param mode    the given mode.
229     */
230    void pragmaNormalizeRules(UNormalizationMode mode);
231
232    /**
233     * Return true if the given rule looks like a pragma.
234     * @param pos offset to the first non-whitespace character
235     * of the rule.
236     * @param limit index just past the last character of the rule.
237     * @return true if the given rule looks like a pragma.
238     */
239    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);
240
241    /**
242     * Parse a pragma.  This method assumes resemblesPragma() has
243     * already returned true.
244     * @param pos offset to the first non-whitespace character
245     * of the rule.
246     * @param limit index just past the last character of the rule.
247     * @return the position index after the final ';' of the pragma,
248     * or -1 on failure.
249     */
250    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
251
252    /**
253     * Called by main parser upon syntax error.  Search the rule string
254     * for the probable end of the rule.  Of course, if the error is that
255     * the end of rule marker is missing, then the rule end will not be found.
256     * In any case the rule start will be correctly reported.
257     * @param parseErrorCode error code.
258     * @param msg error description (this parameter is unnamed in the declaration).
259     * @param start position of first character of current rule.
260     * @return start position of first character of current rule.
261     */
262    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
263                        UErrorCode& status);
264
265    /**
266     * Parse a UnicodeSet out, store it, and return the stand-in character
267     * used to represent it.
268     *
269     * @param rule    the rule for UnicodeSet.
270     * @param pos     the position in pattern at which to start parsing.
271     * @return        the stand-in character used to represent it.
272     */
273    UChar parseSet(const UnicodeString& rule,
274                   ParsePosition& pos,
275                   UErrorCode& status);
276
277    /**
278     * Generate and return a stand-in for a new UnicodeFunctor.  Store
279     * the matcher (adopt it).
280     * @param adopted the UnicodeFunctor to be adopted.
281     * @return        a stand-in for a new UnicodeFunctor.
282     */
283    UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);
284
285    /**
286     * Return the standin for segment seg (1-based).
287     * @param seg    the given segment.
288     * @return       the standIn character for the given segment.
289     */
290    UChar getSegmentStandin(int32_t seg, UErrorCode& status);
291
292    /**
293     * Set the object for segment seg (1-based).
294     * @param seg      the given segment.
295     * @param adopted  the StringMatcher to be adopted.
296     */
297    void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);
298
299    /**
300     * Return the stand-in for the dot set.  It is allocated the first
301     * time and reused thereafter.
302     * @return    the stand-in for the dot set.
303     */
304    UChar getDotStandIn(UErrorCode& status);
305
306    /**
307     * Append the value of the given variable name to the given
308     * UnicodeString.
309     * @param name    the name of the variable whose value is appended.
310     * @param buf     the given UnicodeString to append to.
311     */
312    void appendVariableDef(const UnicodeString& name,
313                           UnicodeString& buf,
314                           UErrorCode& status);
315
316    /**
317     * Glue method to get around access restrictions in C++.
318     */
319    /*static Transliterator* createBasicInstance(const UnicodeString& id,
320                                               const UnicodeString* canonID);*/
321
322    friend class RuleHalf;
323
324    // Disallowed methods; no impl.
325    /**
326     * Copy constructor; private and unimplemented, so copying is disallowed.
327     */
328    TransliteratorParser(const TransliteratorParser&);
329
330    /**
331     * Assignment operator; private and unimplemented, so assignment is disallowed.
332     */
333    TransliteratorParser& operator=(const TransliteratorParser&);
334};
335
336U_NAMESPACE_END
337
338#endif /* #ifdef __cplusplus */
339
340/**
341 * Strip/convert the following from the transliterator rules:
342 * comments
343 * newlines
344 * white space at the beginning and end of a line
345 * unescape \u notation
346 * NOTE(review): the return value is presumably the length written to target -- confirm.
347 * The target buffer must be the same size as the source.
348 * @internal
349 */
350U_CAPI int32_t
351utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);
352
353#endif /* #if !UCONFIG_NO_TRANSLITERATION */
354
355#endif
356