1/*
2* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
3**********************************************************************
4*   Date        Name        Description
5*   11/17/99    aliu        Creation.
6**********************************************************************
7*/
8#ifndef RBT_RULE_H
9#define RBT_RULE_H
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "unicode/uobject.h"
16#include "unicode/unistr.h"
17#include "unicode/utrans.h"
18#include "unicode/unimatch.h"
19
20U_NAMESPACE_BEGIN
21
22class Replaceable;
23class TransliterationRuleData;
24class StringMatcher;
25class UnicodeFunctor;
26
27/**
28 * A transliteration rule used by
29 * <code>RuleBasedTransliterator</code>.
30 * <code>TransliterationRule</code> is an immutable object.
31 *
32 * <p>A rule consists of an input pattern and an output string.  When
33 * the input pattern is matched, the output string is emitted.  The
34 * input pattern consists of zero or more characters which are matched
35 * exactly (the key) and optional context.  Context must match if it
36 * is specified.  Context may be specified before the key, after the
37 * key, or both.  The key, preceding context, and following context
38 * may contain variables.  Variables represent a set of Unicode
39 * characters, such as the letters <i>a</i> through <i>z</i>.
40 * Variables are detected by looking up each character in a supplied
41 * variable list to see if it has been so defined.
42 *
43 * <p>A rule may contain segments in its input string and segment
44 * references in its output string.  A segment is a substring of the
45 * input pattern, indicated by an offset and limit.  The segment may
46 * be in the preceding or following context.  It may not span a
47 * context boundary.  A segment reference is a special character in
48 * the output string that causes a segment of the input string (not
49 * the input pattern) to be copied to the output string.  The range of
50 * special characters that represent segment references is defined by
51 * RuleBasedTransliterator.Data.
52 *
53 * @author Alan Liu
54 */
55class TransliterationRule : public UMemory {
56
57private:
58
59    // TODO Eliminate the pattern and keyLength data members.  They
60    // are used only by masks() and getIndexValue() which are called
61    // only during build time, not during run-time.  Perhaps these
62    // methods and pattern/keyLength can be isolated into a separate
63    // object.
64
65    /**
66     * The match that must occur before the key, or null if there is no
67     * preceding context.
68     */
69    StringMatcher *anteContext;
70
71    /**
72     * The matcher object for the key.  If null, then the key is empty.
73     */
74    StringMatcher *key;
75
76    /**
77     * The match that must occur after the key, or null if there is no
78     * following context.
79     */
80    StringMatcher *postContext;
81
82    /**
83     * The object that performs the replacement if the key,
84     * anteContext, and postContext are matched.  Never null.
85     */
86    UnicodeFunctor* output;
87
88    /**
89     * The string that must be matched, consisting of the anteContext, key,
90     * and postContext, concatenated together, in that order.  Some components
91     * may be empty (zero length).
92     * @see anteContextLength
93     * @see keyLength
94     */
95    UnicodeString pattern;
96
97    /**
98     * An array of matcher objects corresponding to the input pattern
99     * segments.  If there are no segments this is null.  N.B. This is
100     * a UnicodeMatcher for generality, but in practice it is always a
101     * StringMatcher.  In the future we may generalize this, but for
102     * now we sometimes cast down to StringMatcher.
103     *
104     * The array is owned, but the pointers within it are not.
105     */
106    UnicodeFunctor** segments;
107
108    /**
109     * The number of elements in segments[] or zero if segments is NULL.
110     */
111    int32_t segmentsCount;
112
113    /**
114     * The length of the string that must match before the key.  If
115     * zero, then there is no matching requirement before the key.
116     * Substring [0,anteContextLength) of pattern is the anteContext.
117     */
118    int32_t anteContextLength;
119
120    /**
121     * The length of the key.  Substring [anteContextLength,
122     * anteContextLength + keyLength) is the key.
123
124     */
125    int32_t keyLength;
126
127    /**
128     * Miscellaneous attributes.
129     */
130    int8_t flags;
131
132    /**
133     * Flag attributes.
134     */
135    enum {
136        ANCHOR_START = 1,
137        ANCHOR_END   = 2
138    };
139
140    /**
141     * An alias pointer to the data for this rule.  The data provides
142     * lookup services for matchers and segments.
143     */
144    const TransliterationRuleData* data;
145
146public:
147
148    /**
149     * Construct a new rule with the given input, output text, and other
150     * attributes.  A cursor position may be specified for the output text.
151     * @param input          input string, including key and optional ante and
152     *                       post context.
153     * @param anteContextPos offset into input to end of ante context, or -1 if
154     *                       none.  Must be <= input.length() if not -1.
155     * @param postContextPos offset into input to start of post context, or -1
156     *                       if none.  Must be <= input.length() if not -1, and must be >=
157     *                       anteContextPos.
158     * @param outputStr      output string.
159     * @param cursorPosition offset into output at which cursor is located, or -1 if
160     *                       none.  If less than zero, then the cursor is placed after the
161     *                       <code>output</code>; that is, -1 is equivalent to
162     *                       <code>output.length()</code>.  If greater than
163     *                       <code>output.length()</code> then an exception is thrown.
164     * @param cursorOffset   an offset to be added to cursorPos to position the
165     *                       cursor either in the ante context, if < 0, or in the post context, if >
166     *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
167     *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset
168     *                       of -3.
169     * @param segs           array of UnicodeMatcher corresponding to input pattern
170     *                       segments, or null if there are none.  The array itself is adopted,
171     *                       but the pointers within it are not.
172     * @param segsCount      number of elements in segs[].
173     * @param anchorStart    TRUE if the the rule is anchored on the left to
174     *                       the context start.
175     * @param anchorEnd      TRUE if the rule is anchored on the right to the
176     *                       context limit.
177     * @param data           the rule data.
178     * @param status         Output parameter filled in with success or failure status.
179     */
180    TransliterationRule(const UnicodeString& input,
181                        int32_t anteContextPos, int32_t postContextPos,
182                        const UnicodeString& outputStr,
183                        int32_t cursorPosition, int32_t cursorOffset,
184                        UnicodeFunctor** segs,
185                        int32_t segsCount,
186                        UBool anchorStart, UBool anchorEnd,
187                        const TransliterationRuleData* data,
188                        UErrorCode& status);
189
190    /**
191     * Copy constructor.
192     * @param other    the object to be copied.
193     */
194    TransliterationRule(TransliterationRule& other);
195
196    /**
197     * Destructor.
198     */
199    virtual ~TransliterationRule();
200
201    /**
202     * Change the data object that this rule belongs to.  Used
203     * internally by the TransliterationRuleData copy constructor.
204     * @param data    the new data value to be set.
205     */
206    void setData(const TransliterationRuleData* data);
207
208    /**
209     * Return the preceding context length.  This method is needed to
210     * support the <code>Transliterator</code> method
211     * <code>getMaximumContextLength()</code>.  Internally, this is
212     * implemented as the anteContextLength, optionally plus one if
213     * there is a start anchor.  The one character anchor gap is
214     * needed to make repeated incremental transliteration with
215     * anchors work.
216     * @return    the preceding context length.
217     */
218    virtual int32_t getContextLength(void) const;
219
220    /**
221     * Internal method.  Returns 8-bit index value for this rule.
222     * This is the low byte of the first character of the key,
223     * unless the first character of the key is a set.  If it's a
224     * set, or otherwise can match multiple keys, the index value is -1.
225     * @return    8-bit index value for this rule.
226     */
227    int16_t getIndexValue() const;
228
229    /**
230     * Internal method.  Returns true if this rule matches the given
231     * index value.  The index value is an 8-bit integer, 0..255,
232     * representing the low byte of the first character of the key.
233     * It matches this rule if it matches the first character of the
234     * key, or if the first character of the key is a set, and the set
235     * contains any character with a low byte equal to the index
236     * value.  If the rule contains only ante context, as in foo)>bar,
237     * then it will match any key.
238     * @param v    the given index value.
239     * @return     true if this rule matches the given index value.
240     */
241    UBool matchesIndexValue(uint8_t v) const;
242
243    /**
244     * Return true if this rule masks another rule.  If r1 masks r2 then
245     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
246     * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
247     * "[c]a>x" masks "[dc]a>y".
248     * @param r2  the given rule to be compared with.
249     * @return    true if this rule masks 'r2'
250     */
251    virtual UBool masks(const TransliterationRule& r2) const;
252
253    /**
254     * Attempt a match and replacement at the given position.  Return
255     * the degree of match between this rule and the given text.  The
256     * degree of match may be mismatch, a partial match, or a full
257     * match.  A mismatch means at least one character of the text
258     * does not match the context or key.  A partial match means some
259     * context and key characters match, but the text is not long
260     * enough to match all of them.  A full match means all context
261     * and key characters match.
262     *
263     * If a full match is obtained, perform a replacement, update pos,
264     * and return U_MATCH.  Otherwise both text and pos are unchanged.
265     *
266     * @param text the text
267     * @param pos the position indices
268     * @param incremental if TRUE, test for partial matches that may
269     * be completed by additional text inserted at pos.limit.
270     * @return one of <code>U_MISMATCH</code>,
271     * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
272     * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
273     */
274    UMatchDegree matchAndReplace(Replaceable& text,
275                                 UTransPosition& pos,
276                                 UBool incremental) const;
277
278    /**
279     * Create a rule string that represents this rule object.  Append
280     * it to the given string.
281     */
282    virtual UnicodeString& toRule(UnicodeString& pat,
283                                  UBool escapeUnprintable) const;
284
285    /**
286     * Union the set of all characters that may be modified by this rule
287     * into the given set.
288     */
289    void addSourceSetTo(UnicodeSet& toUnionTo) const;
290
291    /**
292     * Union the set of all characters that may be emitted by this rule
293     * into the given set.
294     */
295    void addTargetSetTo(UnicodeSet& toUnionTo) const;
296
297 private:
298
299    friend class StringMatcher;
300
301    TransliterationRule &operator=(const TransliterationRule &other); // forbid copying of this class
302};
303
304U_NAMESPACE_END
305
306#endif /* #if !UCONFIG_NO_TRANSLITERATION */
307
308#endif
309