/*
 **************************************************************************
 *   Copyright (c) 2002-2010, International Business Machines Corporation *
 *   and others.  All Rights Reserved.                                    *
 **************************************************************************
 *   Date        Name        Description                                  *
 *   01/28/2002  aliu        Creation.                                    *
 **************************************************************************
 */
10#ifndef TRIDPARS_H
11#define TRIDPARS_H
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
17#include "unicode/uobject.h"
18#include "unicode/unistr.h"
19
20U_NAMESPACE_BEGIN
21
22class Transliterator;
23class UnicodeSet;
24class UVector;
25
/**
 * Parsing component for transliterator IDs.  This class contains only
 * static members; it cannot be instantiated.  Methods in this class
 * parse various ID formats, including the following:
 *
 * A basic ID, which contains source, target, and variant, but no
 * filter and no explicit inverse.  Examples include
 * "Latin-Greek/UNGEGN" and "Null".
 *
 * A single ID, which is a basic ID plus optional filter and optional
 * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
 * "Lower (Upper)".
 *
 * A compound ID, which is a sequence of one or more single IDs,
 * separated by semicolons, with optional forward and reverse global
 * filters.  The global filters are UnicodeSet patterns prepended or
 * appended to the IDs, separated by semicolons.  An appended filter
 * must be enclosed in parentheses and applies in the reverse
 * direction.
 *
 * @author Alan Liu
 */
48class TransliteratorIDParser /* not : public UObject because all methods are static */ {
49
50 public:
51
52    /**
53     * A structure containing the parsed data of a filtered ID, that
54     * is, a basic ID optionally with a filter.
55     *
56     * 'source' and 'target' will always be non-null.  The 'variant'
57     * will be non-null only if a non-empty variant was parsed.
58     *
59     * 'sawSource' is true if there was an explicit source in the
60     * parsed id.  If there was no explicit source, then an implied
61     * source of ANY is returned and 'sawSource' is set to false.
62     *
63     * 'filter' is the parsed filter pattern, or null if there was no
64     * filter.
65     */
66    class Specs : public UMemory {
67    public:
68        UnicodeString source; // not null
69        UnicodeString target; // not null
70        UnicodeString variant; // may be null
71        UnicodeString filter; // may be null
72        UBool sawSource;
73        Specs(const UnicodeString& s, const UnicodeString& t,
74              const UnicodeString& v, UBool sawS,
75              const UnicodeString& f);
76
77    private:
78
79        Specs(const Specs &other); // forbid copying of this class
80        Specs &operator=(const Specs &other); // forbid copying of this class
81    };
82
83    /**
84     * A structure containing the canonicalized data of a filtered ID,
85     * that is, a basic ID optionally with a filter.
86     *
87     * 'canonID' is always non-null.  It may be the empty string "".
88     * It is the id that should be assigned to the created
89     * transliterator.  It _cannot_ be instantiated directly.
90     *
91     * 'basicID' is always non-null and non-empty.  It is always of
92     * the form S-T or S-T/V.  It is designed to be fed to low-level
93     * instantiation code that only understands these two formats.
94     *
95     * 'filter' may be null, if there is none, or non-null and
96     * non-empty.
97     */
98    class SingleID : public UMemory {
99    public:
100        UnicodeString canonID;
101        UnicodeString basicID;
102        UnicodeString filter;
103        SingleID(const UnicodeString& c, const UnicodeString& b,
104                 const UnicodeString& f);
105        SingleID(const UnicodeString& c, const UnicodeString& b);
106        Transliterator* createInstance();
107
108    private:
109
110        SingleID(const SingleID &other); // forbid copying of this class
111        SingleID &operator=(const SingleID &other); // forbid copying of this class
112    };
113
114    /**
115     * Parse a filter ID, that is, an ID of the general form
116     * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
117     * @param id the id to be parsed
118     * @param pos INPUT-OUTPUT parameter.  On input, the position of
119     * the first character to parse.  On output, the position after
120     * the last character parsed.
121     * @return a SingleID object or null if the parse fails
122     */
123    static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
124
125    /**
126     * Parse a single ID, that is, an ID of the general form
127     * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
128     * optional, the filters optional, and the variants optional.
129     * @param id the id to be parsed
130     * @param pos INPUT-OUTPUT parameter.  On input, the position of
131     * the first character to parse.  On output, the position after
132     * the last character parsed.
133     * @param dir the direction.  If the direction is REVERSE then the
134     * SingleID is constructed for the reverse direction.
135     * @return a SingleID object or null
136     */
137    static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
138                                  int32_t dir, UErrorCode& status);
139
140    /**
141     * Parse a global filter of the form "[f]" or "([f])", depending
142     * on 'withParens'.
143     * @param id the pattern the parse
144     * @param pos INPUT-OUTPUT parameter.  On input, the position of
145     * the first character to parse.  On output, the position after
146     * the last character parsed.
147     * @param dir the direction.
148     * @param withParens INPUT-OUTPUT parameter.  On entry, if
149     * withParens[0] is 0, then parens are disallowed.  If it is 1,
150     * then parens are required.  If it is -1, then parens are
151     * optional, and the return result will be set to 0 or 1.
152     * @param canonID OUTPUT parameter.  The pattern for the filter
153     * added to the canonID, either at the end, if dir is FORWARD, or
154     * at the start, if dir is REVERSE.  The pattern will be enclosed
155     * in parentheses if appropriate, and will be suffixed with an
156     * ID_DELIM character.  May be null.
157     * @return a UnicodeSet object or null.  A non-null results
158     * indicates a successful parse, regardless of whether the filter
159     * applies to the given direction.  The caller should discard it
160     * if withParens != (dir == REVERSE).
161     */
162    static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
163                                         int32_t dir,
164                                         int32_t& withParens,
165                                         UnicodeString* canonID);
166
167    /**
168     * Parse a compound ID, consisting of an optional forward global
169     * filter, a separator, one or more single IDs delimited by
170     * separators, an an optional reverse global filter.  The
171     * separator is a semicolon.  The global filters are UnicodeSet
172     * patterns.  The reverse global filter must be enclosed in
173     * parentheses.
174     * @param id the pattern the parse
175     * @param dir the direction.
176     * @param canonID OUTPUT parameter that receives the canonical ID,
177     * consisting of canonical IDs for all elements, as returned by
178     * parseSingleID(), separated by semicolons.  Previous contents
179     * are discarded.
180     * @param list OUTPUT parameter that receives a list of SingleID
181     * objects representing the parsed IDs.  Previous contents are
182     * discarded.
183     * @param globalFilter OUTPUT parameter that receives a pointer to
184     * a newly created global filter for this ID in this direction, or
185     * null if there is none.
186     * @return true if the parse succeeds, that is, if the entire
187     * id is consumed without syntax error.
188     */
189    static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
190                                 UnicodeString& canonID,
191                                 UVector& list,
192                                 UnicodeSet*& globalFilter);
193
194    /**
195     * Convert the elements of the 'list' vector, which are SingleID
196     * objects, into actual Transliterator objects.  In the course of
197     * this, some (or all) entries may be removed.  If all entries
198     * are removed, the Null transliterator will be added.
199     *
200     * Delete entries with empty basicIDs; these are generated by
201     * elements like "(A)" in the forward direction, or "A()" in
202     * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
203     * SingleID entries to actual transliterators.
204     *
205     * @param list vector of SingleID objects.  On exit, vector
206     * of one or more Transliterators.
207     * @param ec Output param to receive a success or an error code.
208     * @return new value of insertIndex.  The index will shift if
209     * there are empty items, like "(Lower)", with indices less than
210     * insertIndex.
211     */
212    static void instantiateList(UVector& list,
213                                UErrorCode& ec);
214
215    /**
216     * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
217     * S-T/V, or S/V-T.  If the source is missing, return a source of
218     * ANY.
219     * @param id the id string, in any of several forms
220     * @param source          the given source.
221     * @param target          the given target.
222     * @param variant         the given variant
223     * @param isSourcePresent If TRUE then the source is present.
224     *                        If the source is not present, ANY will be
225     *                        given as the source, and isSourcePresent will be null
226     * @return an array of 4 strings: source, target, variant, and
227     * isSourcePresent.  If the source is not present, ANY will be
228     * given as the source, and isSourcePresent will be null.  Otherwise
229     * isSourcePresent will be non-null.  The target may be empty if the
230     * id is not well-formed.  The variant may be empty.
231     */
232    static void IDtoSTV(const UnicodeString& id,
233                        UnicodeString& source,
234                        UnicodeString& target,
235                        UnicodeString& variant,
236                        UBool& isSourcePresent);
237
238    /**
239     * Given source, target, and variant strings, concatenate them into a
240     * full ID.  If the source is empty, then "Any" will be used for the
241     * source, so the ID will always be of the form s-t/v or s-t.
242     */
243    static void STVtoID(const UnicodeString& source,
244                        const UnicodeString& target,
245                        const UnicodeString& variant,
246                        UnicodeString& id);
247
248    /**
249     * Register two targets as being inverses of one another.  For
250     * example, calling registerSpecialInverse("NFC", "NFD", true) causes
251     * Transliterator to form the following inverse relationships:
252     *
253     * <pre>NFC => NFD
254     * Any-NFC => Any-NFD
255     * NFD => NFC
256     * Any-NFD => Any-NFC</pre>
257     *
258     * (Without the special inverse registration, the inverse of NFC
259     * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
260     * that the presence or absence of "Any-" is preserved.
261     *
262     * <p>The relationship is symmetrical; registering (a, b) is
263     * equivalent to registering (b, a).
264     *
265     * <p>The relevant IDs must still be registered separately as
266     * factories or classes.
267     *
268     * <p>Only the targets are specified.  Special inverses always
269     * have the form Any-Target1 <=> Any-Target2.  The target should
270     * have canonical casing (the casing desired to be produced when
271     * an inverse is formed) and should contain no whitespace or other
272     * extraneous characters.
273     *
274     * @param target the target against which to register the inverse
275     * @param inverseTarget the inverse of target, that is
276     * Any-target.getInverse() => Any-inverseTarget
277     * @param bidirectional if true, register the reverse relation
278     * as well, that is, Any-inverseTarget.getInverse() => Any-target
279     */
280    static void registerSpecialInverse(const UnicodeString& target,
281                                       const UnicodeString& inverseTarget,
282                                       UBool bidirectional,
283                                       UErrorCode &status);
284
285    /**
286     * Free static memory.
287     */
288    static void cleanup();
289
290 private:
291    //----------------------------------------------------------------
292    // Private implementation
293    //----------------------------------------------------------------
294
295    // forbid instantiation
296    TransliteratorIDParser();
297
298    /**
299     * Parse an ID into component pieces.  Take IDs of the form T,
300     * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
301     * source of ANY.
302     * @param id the id string, in any of several forms
303     * @param pos INPUT-OUTPUT parameter.  On input, pos[0] is the
304     * offset of the first character to parse in id.  On output,
305     * pos[0] is the offset after the last parsed character.  If the
306     * parse failed, pos[0] will be unchanged.
307     * @param allowFilter if true, a UnicodeSet pattern is allowed
308     * at any location between specs or delimiters, and is returned
309     * as the fifth string in the array.
310     * @return a Specs object, or null if the parse failed.  If
311     * neither source nor target was seen in the parsed id, then the
312     * parse fails.  If allowFilter is true, then the parsed filter
313     * pattern is returned in the Specs object, otherwise the returned
314     * filter reference is null.  If the parse fails for any reason
315     * null is returned.
316     */
317    static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
318                                UBool allowFilter);
319
320    /**
321     * Givens a Specs object, convert it to a SingleID object.  The
322     * Spec object is a more unprocessed parse result.  The SingleID
323     * object contains information about canonical and basic IDs.
324     * @param specs the given Specs object.
325     * @param dir   either FORWARD or REVERSE.
326     * @return a SingleID; never returns null.  Returned object always
327     * has 'filter' field of null.
328     */
329    static SingleID* specsToID(const Specs* specs, int32_t dir);
330
331    /**
332     * Given a Specs object, return a SingleID representing the
333     * special inverse of that ID.  If there is no special inverse
334     * then return null.
335     * @param specs the given Specs.
336     * @return a SingleID or null.  Returned object always has
337     * 'filter' field of null.
338     */
339    static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
340
341    /**
342     * Glue method to get around access problems in C++.
343     * @param id the id string for the transliterator, in any of several forms
344     * @param canonID the given canonical ID
345     */
346    static Transliterator* createBasicInstance(const UnicodeString& id,
347                                               const UnicodeString* canonID);
348
349    /**
350     * Initialize static memory.
351     */
352    static void init(UErrorCode &status);
353
354    friend class SingleID;
355};
356
357U_NAMESPACE_END
358
359#endif /* #if !UCONFIG_NO_TRANSLITERATION */
360
361#endif
362