1/*
2 * Copyright (C) 2001-2011, International Business Machines Corporation
3 * and others. All Rights Reserved.
4 **********************************************************************
5 *   Date        Name        Description
6 *   07/23/01    aliu        Creation.
7 **********************************************************************
8 */
9#ifndef STRMATCH_H
10#define STRMATCH_H
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_TRANSLITERATION
15
16#include "unicode/unistr.h"
17#include "unicode/unifunct.h"
18#include "unicode/unimatch.h"
19#include "unicode/unirepl.h"
20
21U_NAMESPACE_BEGIN
22
23class TransliterationRuleData;
24
25/**
26 * An object that matches a fixed input string, implementing the
27 * UnicodeMatcher API.  This object also implements the
28 * UnicodeReplacer API, allowing it to emit the matched text as
29 * output.  Since the match text may contain flexible match elements,
30 * such as UnicodeSets, the emitted text is not the match pattern, but
31 * instead a substring of the actual matched text.  Following
32 * convention, the output text is the leftmost match seen up to this
33 * point.
34 *
35 * A StringMatcher may represent a segment, in which case it has a
36 * positive segment number.  This affects how the matcher converts
37 * itself to a pattern but does not otherwise affect its function.
38 *
39 * A StringMatcher that is not a segment should not be used as a
40 * UnicodeReplacer.
41 */
42class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
43
44 public:
45
46    /**
47     * Construct a matcher that matches the given pattern string.
48     * @param string the pattern to be matched, possibly containing
49     * stand-ins that represent nested UnicodeMatcher objects.
50     * @param start inclusive start index of text to be replaced
51     * @param limit exclusive end index of text to be replaced;
52     * must be greater than or equal to start
53     * @param segmentNum the segment number from 1..n, or 0 if this is
54     * not a segment.
55     * @param data context object mapping stand-ins to
56     * UnicodeMatcher objects.
57     */
58    StringMatcher(const UnicodeString& string,
59                  int32_t start,
60                  int32_t limit,
61                  int32_t segmentNum,
62                  const TransliterationRuleData& data);
63
64    /**
65     * Copy constructor
66     * @param o  the object to be copied.
67     */
68    StringMatcher(const StringMatcher& o);
69
70    /**
71     * Destructor
72     */
73    virtual ~StringMatcher();
74
75    /**
76     * Implement UnicodeFunctor
77     * @return a copy of the object.
78     */
79    virtual UnicodeFunctor* clone() const;
80
81    /**
82     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
83     * and return the pointer.
84     * @return the UnicodeMatcher point.
85     */
86    virtual UnicodeMatcher* toMatcher() const;
87
88    /**
89     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
90     * and return the pointer.
91     * @return the UnicodeReplacer pointer.
92     */
93    virtual UnicodeReplacer* toReplacer() const;
94
95    /**
96     * Implement UnicodeMatcher
97     * @param text the text to be matched
98     * @param offset on input, the index into text at which to begin
99     * matching.  On output, the limit of the matched text.  The
100     * number of matched characters is the output value of offset
101     * minus the input value.  Offset should always point to the
102     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
103     * both on entry and upon return.
104     * @param limit the limit index of text to be matched.  Greater
105     * than offset for a forward direction match, less than offset for
106     * a backward direction match.  The last character to be
107     * considered for matching will be text.charAt(limit-1) in the
108     * forward direction or text.charAt(limit+1) in the backward
109     * direction.
110     * @param incremental  if TRUE, then assume further characters may
111     * be inserted at limit and check for partial matching.  Otherwise
112     * assume the text as given is complete.
113     * @return a match degree value indicating a full match, a partial
114     * match, or a mismatch.  If incremental is FALSE then
115     * U_PARTIAL_MATCH should never be returned.
116     */
117    virtual UMatchDegree matches(const Replaceable& text,
118                                 int32_t& offset,
119                                 int32_t limit,
120                                 UBool incremental);
121
122    /**
123     * Implement UnicodeMatcher
124     * @param result            Output param to receive the pattern.
125     * @param escapeUnprintable if True then escape the unprintable characters.
126     * @return                  A reference to 'result'.
127     */
128    virtual UnicodeString& toPattern(UnicodeString& result,
129                                     UBool escapeUnprintable = FALSE) const;
130
131    /**
132     * Implement UnicodeMatcher
133     * Returns TRUE if this matcher will match a character c, where c
134     * & 0xFF == v, at offset, in the forward direction (with limit >
135     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
136     * indexing.
137     * @param v    the given value
138     * @return     TRUE if this matcher will match a character c,
139     *             where c & 0xFF == v
140     */
141    virtual UBool matchesIndexValue(uint8_t v) const;
142
143    /**
144     * Implement UnicodeMatcher
145     */
146    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
147
148    /**
149     * Implement UnicodeFunctor
150     */
151    virtual void setData(const TransliterationRuleData*);
152
153    /**
154     * Replace characters in 'text' from 'start' to 'limit' with the
155     * output text of this object.  Update the 'cursor' parameter to
156     * give the cursor position and return the length of the
157     * replacement text.
158     *
159     * @param text the text to be matched
160     * @param start inclusive start index of text to be replaced
161     * @param limit exclusive end index of text to be replaced;
162     * must be greater than or equal to start
163     * @param cursor output parameter for the cursor position.
164     * Not all replacer objects will update this, but in a complete
165     * tree of replacer objects, representing the entire output side
166     * of a transliteration rule, at least one must update it.
167     * @return the number of 16-bit code units in the text replacing
168     * the characters at offsets start..(limit-1) in text
169     */
170    virtual int32_t replace(Replaceable& text,
171                            int32_t start,
172                            int32_t limit,
173                            int32_t& cursor);
174
175    /**
176     * Returns a string representation of this replacer.  If the
177     * result of calling this function is passed to the appropriate
178     * parser, typically TransliteratorParser, it will produce another
179     * replacer that is equal to this one.
180     * @param result the string to receive the pattern.  Previous
181     * contents will be deleted.
182     * @param escapeUnprintable if TRUE then convert unprintable
183     * character to their hex escape representations, \\uxxxx or
184     * \\Uxxxxxxxx.  Unprintable characters are defined by
185     * Utility.isUnprintable().
186     * @return a reference to 'result'.
187     */
188    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
189                                             UBool escapeUnprintable) const;
190
191    /**
192     * Remove any match data.  This must be called before performing a
193     * set of matches with this segment.
194     */
195    void resetMatch();
196
197    /**
198     * ICU "poor man's RTTI", returns a UClassID for the actual class.
199     */
200    virtual UClassID getDynamicClassID() const;
201
202    /**
203     * ICU "poor man's RTTI", returns a UClassID for this class.
204     */
205    static UClassID U_EXPORT2 getStaticClassID();
206
207    /**
208     * Union the set of all characters that may output by this object
209     * into the given set.
210     * @param toUnionTo the set into which to union the output characters
211     */
212    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
213
214 private:
215
216    /**
217     * The text to be matched.
218     */
219    UnicodeString pattern;
220
221    /**
222     * Context object that maps stand-ins to matcher and replacer
223     * objects.
224     */
225    const TransliterationRuleData* data;
226
227    /**
228     * The segment number, 1-based, or 0 if not a segment.
229     */
230    int32_t segmentNumber;
231
232    /**
233     * Start offset, in the match text, of the <em>rightmost</em>
234     * match.
235     */
236    int32_t matchStart;
237
238    /**
239     * Limit offset, in the match text, of the <em>rightmost</em>
240     * match.
241     */
242    int32_t matchLimit;
243
244};
245
246U_NAMESPACE_END
247
248#endif /* #if !UCONFIG_NO_TRANSLITERATION */
249
250#endif
251