/*
 * Copyright (C) 2001-2011, International Business Machines Corporation
 * and others. All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef STRMATCH_H
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define STRMATCH_H
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_TRANSLITERATION
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unistr.h"
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unifunct.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unimatch.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unirepl.h"
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass TransliterationRuleData;
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/**
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * An object that matches a fixed input string, implementing the
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeMatcher API.  This object also implements the
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeReplacer API, allowing it to emit the matched text as
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * output.  Since the match text may contain flexible match elements,
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * such as UnicodeSets, the emitted text is not the match pattern, but
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * instead a substring of the actual matched text.  Following
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * convention, the output text is the leftmost match seen up to this
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * point.
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * A StringMatcher may represent a segment, in which case it has a
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * positive segment number.  This affects how the matcher converts
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * itself to a pattern but does not otherwise affect its function.
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * A StringMatcher that is not a segment should not be used as a
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * UnicodeReplacer.
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru public:
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Construct a matcher that matches the given pattern string.
48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param string the pattern to be matched, possibly containing
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * stand-ins that represent nested UnicodeMatcher objects.
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param start inclusive start index of text to be replaced
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param limit exclusive end index of text to be replaced;
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * must be greater than or equal to start
53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param segmentNum the segment number from 1..n, or 0 if this is
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * not a segment.
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param data context object mapping stand-ins to
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * UnicodeMatcher objects.
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    StringMatcher(const UnicodeString& string,
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                  int32_t start,
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                  int32_t limit,
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                  int32_t segmentNum,
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                  const TransliterationRuleData& data);
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Copy constructor
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param o  the object to be copied.
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    StringMatcher(const StringMatcher& o);
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Destructor
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual ~StringMatcher();
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Implement UnicodeFunctor
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @return a copy of the object.
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual UnicodeFunctor* clone() const;
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * and return the pointer.
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @return the UnicodeMatcher point.
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual UnicodeMatcher* toMatcher() const;
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * and return the pointer.
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @return the UnicodeReplacer pointer.
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual UnicodeReplacer* toReplacer() const;
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Implement UnicodeMatcher
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param text the text to be matched
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param offset on input, the index into text at which to begin
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * matching.  On output, the limit of the matched text.  The
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * number of matched characters is the output value of offset
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * minus the input value.  Offset should always point to the
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * both on entry and upon return.
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param limit the limit index of text to be matched.  Greater
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * than offset for a forward direction match, less than offset for
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * a backward direction match.  The last character to be
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * considered for matching will be text.charAt(limit-1) in the
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * forward direction or text.charAt(limit+1) in the backward
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * direction.
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param incremental  if TRUE, then assume further characters may
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * be inserted at limit and check for partial matching.  Otherwise
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * assume the text as given is complete.
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @return a match degree value indicating a full match, a partial
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * match, or a mismatch.  If incremental is FALSE then
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * U_PARTIAL_MATCH should never be returned.
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual UMatchDegree matches(const Replaceable& text,
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 int32_t& offset,
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 int32_t limit,
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                 UBool incremental);
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Implement UnicodeMatcher
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * @param result            Output param to receive the pattern.
125     * @param escapeUnprintable if True then escape the unprintable characters.
126     * @return                  A reference to 'result'.
127     */
128    virtual UnicodeString& toPattern(UnicodeString& result,
129                                     UBool escapeUnprintable = FALSE) const;
130
131    /**
132     * Implement UnicodeMatcher
133     * Returns TRUE if this matcher will match a character c, where c
134     * & 0xFF == v, at offset, in the forward direction (with limit >
135     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
136     * indexing.
137     * @param v    the given value
138     * @return     TRUE if this matcher will match a character c,
139     *             where c & 0xFF == v
140     */
141    virtual UBool matchesIndexValue(uint8_t v) const;
142
143    /**
144     * Implement UnicodeMatcher
145     */
146    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
147
148    /**
149     * Implement UnicodeFunctor
150     */
151    virtual void setData(const TransliterationRuleData*);
152
153    /**
154     * Replace characters in 'text' from 'start' to 'limit' with the
155     * output text of this object.  Update the 'cursor' parameter to
156     * give the cursor position and return the length of the
157     * replacement text.
158     *
159     * @param text the text to be matched
160     * @param start inclusive start index of text to be replaced
161     * @param limit exclusive end index of text to be replaced;
162     * must be greater than or equal to start
163     * @param cursor output parameter for the cursor position.
164     * Not all replacer objects will update this, but in a complete
165     * tree of replacer objects, representing the entire output side
166     * of a transliteration rule, at least one must update it.
167     * @return the number of 16-bit code units in the text replacing
168     * the characters at offsets start..(limit-1) in text
169     */
170    virtual int32_t replace(Replaceable& text,
171                            int32_t start,
172                            int32_t limit,
173                            int32_t& cursor);
174
175    /**
176     * Returns a string representation of this replacer.  If the
177     * result of calling this function is passed to the appropriate
178     * parser, typically TransliteratorParser, it will produce another
179     * replacer that is equal to this one.
180     * @param result the string to receive the pattern.  Previous
181     * contents will be deleted.
182     * @param escapeUnprintable if TRUE then convert unprintable
183     * character to their hex escape representations, \\uxxxx or
184     * \\Uxxxxxxxx.  Unprintable characters are defined by
185     * Utility.isUnprintable().
186     * @return a reference to 'result'.
187     */
188    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
189                                             UBool escapeUnprintable) const;
190
191    /**
192     * Remove any match data.  This must be called before performing a
193     * set of matches with this segment.
194     */
195    void resetMatch();
196
197    /**
198     * ICU "poor man's RTTI", returns a UClassID for the actual class.
199     */
200    virtual UClassID getDynamicClassID() const;
201
202    /**
203     * ICU "poor man's RTTI", returns a UClassID for this class.
204     */
205    static UClassID U_EXPORT2 getStaticClassID();
206
207    /**
208     * Union the set of all characters that may output by this object
209     * into the given set.
210     * @param toUnionTo the set into which to union the output characters
211     */
212    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
213
214 private:
215
216    /**
217     * The text to be matched.
218     */
219    UnicodeString pattern;
220
221    /**
222     * Context object that maps stand-ins to matcher and replacer
223     * objects.
224     */
225    const TransliterationRuleData* data;
226
227    /**
228     * The segment number, 1-based, or 0 if not a segment.
229     */
230    int32_t segmentNumber;
231
232    /**
233     * Start offset, in the match text, of the <em>rightmost</em>
234     * match.
235     */
236    int32_t matchStart;
237
238    /**
239     * Limit offset, in the match text, of the <em>rightmost</em>
240     * match.
241     */
242    int32_t matchLimit;
243
244};
245
246U_NAMESPACE_END
247
248#endif /* #if !UCONFIG_NO_TRANSLITERATION */
249
250#endif
251