1/*
2**********************************************************************
3*   Copyright (c) 2001-2004, International Business Machines Corporation
4*   and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   07/23/01    aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "strmatch.h"
16#include "rbt_data.h"
17#include "util.h"
18#include "unicode/uniset.h"
19
20U_NAMESPACE_BEGIN
21
22static const UChar EMPTY[] = { 0 }; // empty string: ""
23
24UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
25
26StringMatcher::StringMatcher(const UnicodeString& theString,
27                             int32_t start,
28                             int32_t limit,
29                             int32_t segmentNum,
30                             const TransliterationRuleData& theData) :
31    data(&theData),
32    segmentNumber(segmentNum),
33    matchStart(-1),
34    matchLimit(-1)
35{
36    theString.extractBetween(start, limit, pattern);
37}
38
39StringMatcher::StringMatcher(const StringMatcher& o) :
40    UnicodeFunctor(o),
41    UnicodeMatcher(o),
42    UnicodeReplacer(o),
43    pattern(o.pattern),
44    data(o.data),
45    segmentNumber(o.segmentNumber),
46    matchStart(o.matchStart),
47    matchLimit(o.matchLimit)
48{
49}
50
51/**
52 * Destructor
53 */
54StringMatcher::~StringMatcher() {
55}
56
57/**
58 * Implement UnicodeFunctor
59 */
60UnicodeFunctor* StringMatcher::clone() const {
61    return new StringMatcher(*this);
62}
63
64/**
65 * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
66 * and return the pointer.
67 */
68UnicodeMatcher* StringMatcher::toMatcher() const {
69    return (UnicodeMatcher*) this;
70}
71
72/**
73 * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
74 * and return the pointer.
75 */
76UnicodeReplacer* StringMatcher::toReplacer() const {
77    return (UnicodeReplacer*) this;
78}
79
80/**
81 * Implement UnicodeMatcher
82 */
83UMatchDegree StringMatcher::matches(const Replaceable& text,
84                                    int32_t& offset,
85                                    int32_t limit,
86                                    UBool incremental) {
87    int32_t i;
88    int32_t cursor = offset;
89    if (limit < cursor) {
90        // Match in the reverse direction
91        for (i=pattern.length()-1; i>=0; --i) {
92            UChar keyChar = pattern.charAt(i);
93            UnicodeMatcher* subm = data->lookupMatcher(keyChar);
94            if (subm == 0) {
95                if (cursor > limit &&
96                    keyChar == text.charAt(cursor)) {
97                    --cursor;
98                } else {
99                    return U_MISMATCH;
100                }
101            } else {
102                UMatchDegree m =
103                    subm->matches(text, cursor, limit, incremental);
104                if (m != U_MATCH) {
105                    return m;
106                }
107            }
108        }
109        // Record the match position, but adjust for a normal
110        // forward start, limit, and only if a prior match does not
111        // exist -- we want the rightmost match.
112        if (matchStart < 0) {
113            matchStart = cursor+1;
114            matchLimit = offset+1;
115        }
116    } else {
117        for (i=0; i<pattern.length(); ++i) {
118            if (incremental && cursor == limit) {
119                // We've reached the context limit without a mismatch and
120                // without completing our match.
121                return U_PARTIAL_MATCH;
122            }
123            UChar keyChar = pattern.charAt(i);
124            UnicodeMatcher* subm = data->lookupMatcher(keyChar);
125            if (subm == 0) {
126                // Don't need the cursor < limit check if
127                // incremental is TRUE (because it's done above); do need
128                // it otherwise.
129                if (cursor < limit &&
130                    keyChar == text.charAt(cursor)) {
131                    ++cursor;
132                } else {
133                    return U_MISMATCH;
134                }
135            } else {
136                UMatchDegree m =
137                    subm->matches(text, cursor, limit, incremental);
138                if (m != U_MATCH) {
139                    return m;
140                }
141            }
142        }
143        // Record the match position
144        matchStart = offset;
145        matchLimit = cursor;
146    }
147
148    offset = cursor;
149    return U_MATCH;
150}
151
152/**
153 * Implement UnicodeMatcher
154 */
155UnicodeString& StringMatcher::toPattern(UnicodeString& result,
156                                        UBool escapeUnprintable) const
157{
158    result.truncate(0);
159    UnicodeString str, quoteBuf;
160    if (segmentNumber > 0) {
161        result.append((UChar)40); /*(*/
162    }
163    for (int32_t i=0; i<pattern.length(); ++i) {
164        UChar keyChar = pattern.charAt(i);
165        const UnicodeMatcher* m = data->lookupMatcher(keyChar);
166        if (m == 0) {
167            ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
168        } else {
169            ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
170                         TRUE, escapeUnprintable, quoteBuf);
171        }
172    }
173    if (segmentNumber > 0) {
174        result.append((UChar)41); /*)*/
175    }
176    // Flush quoteBuf out to result
177    ICU_Utility::appendToRule(result, -1,
178                              TRUE, escapeUnprintable, quoteBuf);
179    return result;
180}
181
182/**
183 * Implement UnicodeMatcher
184 */
185UBool StringMatcher::matchesIndexValue(uint8_t v) const {
186    if (pattern.length() == 0) {
187        return TRUE;
188    }
189    UChar32 c = pattern.char32At(0);
190    const UnicodeMatcher *m = data->lookupMatcher(c);
191    return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
192}
193
194/**
195 * Implement UnicodeMatcher
196 */
197void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
198    UChar32 ch;
199    for (int32_t i=0; i<pattern.length(); i+=UTF_CHAR_LENGTH(ch)) {
200        ch = pattern.char32At(i);
201        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
202        if (matcher == NULL) {
203            toUnionTo.add(ch);
204        } else {
205            matcher->addMatchSetTo(toUnionTo);
206        }
207    }
208}
209
210/**
211 * UnicodeReplacer API
212 */
213int32_t StringMatcher::replace(Replaceable& text,
214                               int32_t start,
215                               int32_t limit,
216                               int32_t& /*cursor*/) {
217
218    int32_t outLen = 0;
219
220    // Copy segment with out-of-band data
221    int32_t dest = limit;
222    // If there was no match, that means that a quantifier
223    // matched zero-length.  E.g., x (a)* y matched "xy".
224    if (matchStart >= 0) {
225        if (matchStart != matchLimit) {
226            text.copy(matchStart, matchLimit, dest);
227            outLen = matchLimit - matchStart;
228        }
229    }
230
231    text.handleReplaceBetween(start, limit, EMPTY); // delete original text
232
233    return outLen;
234}
235
236/**
237 * UnicodeReplacer API
238 */
239UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
240                                                UBool /*escapeUnprintable*/) const {
241    // assert(segmentNumber > 0);
242    rule.truncate(0);
243    rule.append((UChar)0x0024 /*$*/);
244    ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
245    return rule;
246}
247
248/**
249 * Remove any match info.  This must be called before performing a
250 * set of matches with this segment.
251 */
252 void StringMatcher::resetMatch() {
253    matchStart = matchLimit = -1;
254}
255
256/**
257 * Union the set of all characters that may output by this object
258 * into the given set.
259 * @param toUnionTo the set into which to union the output characters
260 */
261void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
262    // The output of this replacer varies; it is the source text between
263    // matchStart and matchLimit.  Since this varies depending on the
264    // input text, we can't compute it here.  We can either do nothing
265    // or we can add ALL characters to the set.  It's probably more useful
266    // to do nothing.
267}
268
269/**
270 * Implement UnicodeFunctor
271 */
272void StringMatcher::setData(const TransliterationRuleData* d) {
273    data = d;
274    int32_t i = 0;
275    while (i<pattern.length()) {
276        UChar32 c = pattern.char32At(i);
277        UnicodeFunctor* f = data->lookup(c);
278        if (f != NULL) {
279            f->setData(data);
280        }
281        i += UTF_CHAR_LENGTH(c);
282    }
283}
284
285U_NAMESPACE_END
286
287#endif /* #if !UCONFIG_NO_TRANSLITERATION */
288
289//eof
290