1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2001-2004, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.text;
10import com.ibm.icu.impl.Utility;
11
12/**
13 * An object that matches a fixed input string, implementing the
14 * UnicodeMatcher API.  This object also implements the
15 * UnicodeReplacer API, allowing it to emit the matched text as
16 * output.  Since the match text may contain flexible match elements,
17 * such as UnicodeSets, the emitted text is not the match pattern, but
18 * instead a substring of the actual matched text.  Following
19 * convention, the output text is the leftmost match seen up to this
20 * point.
21 *
22 * A StringMatcher may represent a segment, in which case it has a
23 * positive segment number.  This affects how the matcher converts
24 * itself to a pattern but does not otherwise affect its function.
25 *
26 * A StringMatcher that is not a segment should not be used as a
27 * UnicodeReplacer.
28 */
29class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
30
31    /**
32     * The text to be matched.
33     */
34    private String pattern;
35
36    /**
37     * Start offset, in the match text, of the <em>rightmost</em>
38     * match.
39     */
40    private int matchStart;
41
42    /**
43     * Limit offset, in the match text, of the <em>rightmost</em>
44     * match.
45     */
46    private int matchLimit;
47
48    /**
49     * The segment number, 1-based, or 0 if not a segment.
50     */
51    private int segmentNumber;
52
53    /**
54     * Context object that maps stand-ins to matcher and replacer
55     * objects.
56     */
57    private final RuleBasedTransliterator.Data data;
58
59    /**
60     * Construct a matcher that matches the given pattern string.
61     * @param theString the pattern to be matched, possibly containing
62     * stand-ins that represent nested UnicodeMatcher objects.
63     * @param segmentNum the segment number from 1..n, or 0 if this is
64     * not a segment.
65     * @param theData context object mapping stand-ins to
66     * UnicodeMatcher objects.
67     */
68    public StringMatcher(String theString,
69                         int segmentNum,
70                         RuleBasedTransliterator.Data theData) {
71        data = theData;
72        pattern = theString;
73        matchStart = matchLimit = -1;
74        segmentNumber = segmentNum;
75    }
76
77    /**
78     * Construct a matcher that matches a substring of the given
79     * pattern string.
80     * @param theString the pattern to be matched, possibly containing
81     * stand-ins that represent nested UnicodeMatcher objects.
82     * @param start first character of theString to be matched
83     * @param limit index after the last character of theString to be
84     * matched.
85     * @param segmentNum the segment number from 1..n, or 0 if this is
86     * not a segment.
87     * @param theData context object mapping stand-ins to
88     * UnicodeMatcher objects.
89     */
90    public StringMatcher(String theString,
91                         int start,
92                         int limit,
93                         int segmentNum,
94                         RuleBasedTransliterator.Data theData) {
95        this(theString.substring(start, limit), segmentNum, theData);
96    }
97
98    /**
99     * Implement UnicodeMatcher
100     */
101    @Override
102    public int matches(Replaceable text,
103                       int[] offset,
104                       int limit,
105                       boolean incremental) {
106        // Note (1): We process text in 16-bit code units, rather than
107        // 32-bit code points.  This works because stand-ins are
108        // always in the BMP and because we are doing a literal match
109        // operation, which can be done 16-bits at a time.
110        int i;
111        int[] cursor = new int[] { offset[0] };
112        if (limit < cursor[0]) {
113            // Match in the reverse direction
114            for (i=pattern.length()-1; i>=0; --i) {
115                char keyChar = pattern.charAt(i); // OK; see note (1) above
116                UnicodeMatcher subm = data.lookupMatcher(keyChar);
117                if (subm == null) {
118                    if (cursor[0] > limit &&
119                        keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
120                        --cursor[0];
121                    } else {
122                        return U_MISMATCH;
123                    }
124                } else {
125                    int m =
126                        subm.matches(text, cursor, limit, incremental);
127                    if (m != U_MATCH) {
128                        return m;
129                    }
130                }
131            }
132            // Record the match position, but adjust for a normal
133            // forward start, limit, and only if a prior match does not
134            // exist -- we want the rightmost match.
135            if (matchStart < 0) {
136                matchStart = cursor[0]+1;
137                matchLimit = offset[0]+1;
138            }
139        } else {
140            for (i=0; i<pattern.length(); ++i) {
141                if (incremental && cursor[0] == limit) {
142                    // We've reached the context limit without a mismatch and
143                    // without completing our match.
144                    return U_PARTIAL_MATCH;
145                }
146                char keyChar = pattern.charAt(i); // OK; see note (1) above
147                UnicodeMatcher subm = data.lookupMatcher(keyChar);
148                if (subm == null) {
149                    // Don't need the cursor < limit check if
150                    // incremental is true (because it's done above); do need
151                    // it otherwise.
152                    if (cursor[0] < limit &&
153                        keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
154                        ++cursor[0];
155                    } else {
156                        return U_MISMATCH;
157                    }
158                } else {
159                    int m =
160                        subm.matches(text, cursor, limit, incremental);
161                    if (m != U_MATCH) {
162                        return m;
163                    }
164                }
165            }
166            // Record the match position
167            matchStart = offset[0];
168            matchLimit = cursor[0];
169        }
170
171        offset[0] = cursor[0];
172        return U_MATCH;
173    }
174
175    /**
176     * Implement UnicodeMatcher
177     */
178    @Override
179    public String toPattern(boolean escapeUnprintable) {
180        StringBuffer result = new StringBuffer();
181        StringBuffer quoteBuf = new StringBuffer();
182        if (segmentNumber > 0) { // i.e., if this is a segment
183            result.append('(');
184        }
185        for (int i=0; i<pattern.length(); ++i) {
186            char keyChar = pattern.charAt(i); // OK; see note (1) above
187            UnicodeMatcher m = data.lookupMatcher(keyChar);
188            if (m == null) {
189                Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
190            } else {
191                Utility.appendToRule(result, m.toPattern(escapeUnprintable),
192                                     true, escapeUnprintable, quoteBuf);
193            }
194        }
195        if (segmentNumber > 0) { // i.e., if this is a segment
196            result.append(')');
197        }
198        // Flush quoteBuf out to result
199        Utility.appendToRule(result, -1,
200                             true, escapeUnprintable, quoteBuf);
201        return result.toString();
202    }
203
204    /**
205     * Implement UnicodeMatcher
206     */
207    @Override
208    public boolean matchesIndexValue(int v) {
209        if (pattern.length() == 0) {
210            return true;
211        }
212        int c = UTF16.charAt(pattern, 0);
213        UnicodeMatcher m = data.lookupMatcher(c);
214        return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
215    }
216
217    /**
218     * Implementation of UnicodeMatcher API.  Union the set of all
219     * characters that may be matched by this object into the given
220     * set.
221     * @param toUnionTo the set into which to union the source characters
222     */
223    @Override
224    public void addMatchSetTo(UnicodeSet toUnionTo) {
225        int ch;
226        for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {
227            ch = UTF16.charAt(pattern, i);
228            UnicodeMatcher matcher = data.lookupMatcher(ch);
229            if (matcher == null) {
230                toUnionTo.add(ch);
231            } else {
232                matcher.addMatchSetTo(toUnionTo);
233            }
234        }
235    }
236
237    /**
238     * UnicodeReplacer API
239     */
240    @Override
241    public int replace(Replaceable text,
242                       int start,
243                       int limit,
244                       int[] cursor) {
245
246        int outLen = 0;
247
248        // Copy segment with out-of-band data
249        int dest = limit;
250        // If there was no match, that means that a quantifier
251        // matched zero-length.  E.g., x (a)* y matched "xy".
252        if (matchStart >= 0) {
253            if (matchStart != matchLimit) {
254                text.copy(matchStart, matchLimit, dest);
255                outLen = matchLimit - matchStart;
256            }
257        }
258
259        text.replace(start, limit, ""); // delete original text
260
261        return outLen;
262    }
263
264    /**
265     * UnicodeReplacer API
266     */
267    @Override
268    public String toReplacerPattern(boolean escapeUnprintable) {
269        // assert(segmentNumber > 0);
270        StringBuffer rule = new StringBuffer("$");
271        Utility.appendNumber(rule, segmentNumber, 10, 1);
272        return rule.toString();
273    }
274
275    /**
276     * Remove any match data.  This must be called before performing a
277     * set of matches with this segment.
278     */
279    public void resetMatch() {
280        matchStart = matchLimit = -1;
281    }
282
283    /**
284     * Union the set of all characters that may output by this object
285     * into the given set.
286     * @param toUnionTo the set into which to union the output characters
287     */
288    @Override
289    public void addReplacementSetTo(UnicodeSet toUnionTo) {
290        // The output of this replacer varies; it is the source text between
291        // matchStart and matchLimit.  Since this varies depending on the
292        // input text, we can't compute it here.  We can either do nothing
293        // or we can add ALL characters to the set.  It's probably more useful
294        // to do nothing.
295    }
296}
297
298//eof
299