1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 2014-2016, International Business Machines Corporation and
7 * others. All Rights Reserved.
8 *******************************************************************************
9 */
10package android.icu.text;
11
12import android.icu.text.UnicodeSet.SpanCondition;
13import android.icu.util.OutputInt;
14
15/**
16 * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
17 * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
18 * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with
19 * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on.
20 * For the inverse, the following mapping is used:
21 * <ul>
22 * <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li>
23 * <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li>
24 * <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}</li>
25 * </ul>
26 * These are actually not complete inverses. However, the alternating works because there are no gaps.
27 * For example, with [a{ab}{bc}], you get the following behavior when scanning forward:
28 *
29 * <table border="1">
30 * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr>
31 * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr>
32 * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr>
33 * </table>
34 * <p>So here is what happens when you alternate:
35 *
36 * <table border="1">
37 * <tr><th>start</th><td>|xxxabcyyy</td></tr>
38 * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr>
39 * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr>
40 * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr>
41 * </table>
42 * <p>The entire string is traversed.
43 */
44public class UnicodeSetSpanner {
45
46    private final UnicodeSet unicodeSet;
47
48    /**
49     * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class
50     * can be used with a non-frozen version to avoid the cost of freezing.
51     *
52     * @param source
53     *            the original UnicodeSet
54     */
55    public UnicodeSetSpanner(UnicodeSet source) {
56        unicodeSet = source;
57    }
58
59    /**
60     * Returns the UnicodeSet used for processing. It is frozen iff the original was.
61     *
62     * @return the construction set.
63     */
64    public UnicodeSet getUnicodeSet() {
65        return unicodeSet;
66    }
67
68
69    /**
70     * {@inheritDoc}
71     */
72    @Override
73    public boolean equals(Object other) {
74        return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet);
75    }
76
77    /**
78     * {@inheritDoc}
79     */
80    @Override
81    public int hashCode() {
82        return unicodeSet.hashCode();
83    }
84
85    /**
86     * Options for replaceFrom and countIn to control how to treat each matched span.
87     * It is similar to whether one is replacing [abc] by x, or [abc]* by x.
88     */
89    public enum CountMethod {
90        /**
91         * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate
92         * set elements.
93         */
94        WHOLE_SPAN,
95        /**
96         * Use the smallest number of elements in the spanned range for counting and modification,
97         * based on the {@link UnicodeSet.SpanCondition}.
98         * If the set has no strings, this will be the same as the number of spanned code points.
99         * <p>For example, in the string "abab" with SpanCondition.SIMPLE:
100         * <ul>
101         * <li>spanning with [ab] will count four MIN_ELEMENTS.</li>
102         * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li>
103         * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li>
104         * </ul>
105         */
106        MIN_ELEMENTS,
107        // Note: could in the future have an additional option MAX_ELEMENTS
108    }
109
110    /**
111     * Returns the number of matching characters found in a character sequence,
112     * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
113     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
114     * @param sequence
115     *            the sequence to count characters in
116     * @return the count. Zero if there are none.
117     */
118    public int countIn(CharSequence sequence) {
119        return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
120    }
121
122    /**
123     * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE.
124     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
125     * @param sequence
126     *            the sequence to count characters in
127     * @param countMethod
128     *            whether to treat an entire span as a match, or individual elements as matches
129     * @return the count. Zero if there are none.
130     */
131    public int countIn(CharSequence sequence, CountMethod countMethod) {
132        return countIn(sequence, countMethod, SpanCondition.SIMPLE);
133    }
134
135    /**
136     * Returns the number of matching characters found in a character sequence.
137     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
138     * @param sequence
139     *            the sequence to count characters in
140     * @param countMethod
141     *            whether to treat an entire span as a match, or individual elements as matches
142     * @param spanCondition
143     *            the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span;
144     *            NOT_CONTAINED is the reverse.
145     *            <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases.
146     * @return the count. Zero if there are none.
147     */
148    public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) {
149        int count = 0;
150        int start = 0;
151        SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
152                : SpanCondition.NOT_CONTAINED;
153        final int length = sequence.length();
154        OutputInt spanCount = null;
155        while (start != length) {
156            int endOfSpan = unicodeSet.span(sequence, start, skipSpan);
157            if (endOfSpan == length) {
158                break;
159            }
160            if (countMethod == CountMethod.WHOLE_SPAN) {
161                start = unicodeSet.span(sequence, endOfSpan, spanCondition);
162                count += 1;
163            } else {
164                if (spanCount == null) {
165                    spanCount = new OutputInt();
166                }
167                start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount);
168                count += spanCount.value;
169            }
170        }
171        return count;
172    }
173
174    /**
175     * Delete all the matching spans in sequence, using SpanCondition.SIMPLE
176     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
177     * @param sequence
178     *            charsequence to replace matching spans in.
179     * @return modified string.
180     */
181    public String deleteFrom(CharSequence sequence) {
182        return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE);
183    }
184
185    /**
186     * Delete all matching spans in sequence, according to the spanCondition.
187     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
188     * @param sequence
189     *            charsequence to replace matching spans in.
190     * @param spanCondition
191     *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED)
192     * @return modified string.
193     */
194    public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) {
195        return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition);
196    }
197
198    /**
199     * Replace all matching spans in sequence by the replacement,
200     * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
201     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
202     * @param sequence
203     *            charsequence to replace matching spans in.
204     * @param replacement
205     *            replacement sequence. To delete, use ""
206     * @return modified string.
207     */
208    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
209        return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
210    }
211
212    /**
213     * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE.
214     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
215     *
216     * @param sequence
217     *            charsequence to replace matching spans in.
218     * @param replacement
219     *            replacement sequence. To delete, use ""
220     * @param countMethod
221     *            whether to treat an entire span as a match, or individual elements as matches
222     * @return modified string.
223     */
224    public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) {
225        return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE);
226    }
227
228    /**
229     * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition.
230     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
231     * @param sequence
232     *            charsequence to replace matching spans in.
233     * @param replacement
234     *            replacement sequence. To delete, use ""
235     * @param countMethod
236     *            whether to treat an entire span as a match, or individual elements as matches
237     * @param spanCondition
238     *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching
239     *            (NOT_CONTAINED)
240     * @return modified string.
241     */
242    public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod,
243            SpanCondition spanCondition) {
244        SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
245                : SpanCondition.NOT_CONTAINED;
246        final boolean remove = replacement.length() == 0;
247        StringBuilder result = new StringBuilder();
248        // TODO, we can optimize this to
249        // avoid this allocation unless needed
250
251        final int length = sequence.length();
252        OutputInt spanCount = null;
253        for (int endCopy = 0; endCopy != length;) {
254            int endModify;
255            if (countMethod == CountMethod.WHOLE_SPAN) {
256                endModify = unicodeSet.span(sequence, endCopy, spanCondition);
257            } else {
258                if (spanCount == null) {
259                    spanCount = new OutputInt();
260                }
261                endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount);
262            }
263            if (remove || endModify == 0) {
264                // do nothing
265            } else if (countMethod == CountMethod.WHOLE_SPAN) {
266                result.append(replacement);
267            } else {
268                for (int i = spanCount.value; i > 0; --i) {
269                    result.append(replacement);
270                }
271            }
272            if (endModify == length) {
273                break;
274            }
275            endCopy = unicodeSet.span(sequence, endModify, copySpan);
276            result.append(sequence.subSequence(endModify, endCopy));
277        }
278        return result.toString();
279    }
280
281    /**
282     * Options for the trim() method
283     */
284    public enum TrimOption {
285        /**
286         * Trim leading spans.
287         */
288        LEADING,
289        /**
290         * Trim leading and trailing spans.
291         */
292        BOTH,
293        /**
294         * Trim trailing spans.
295         */
296        TRAILING;
297    }
298
299    /**
300     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and
301     * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example:
302     *
303     * <pre>
304     * {@code
305     *
306     *   new UnicodeSet("[ab]").trim("abacatbab")}
307     * </pre>
308     *
309     * ... returns {@code "cat"}.
310     * @param sequence
311     *            the sequence to trim
312     * @return a subsequence
313     */
314    public CharSequence trim(CharSequence sequence) {
315        return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE);
316    }
317
318    /**
319     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or
320     * end of the string, using the trimOption and SpanCondition.SIMPLE. For example:
321     *
322     * <pre>
323     * {@code
324     *
325     *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)}
326     * </pre>
327     *
328     * ... returns {@code "catbab"}.
329     *
330     * @param sequence
331     *            the sequence to trim
332     * @param trimOption
333     *            LEADING, TRAILING, or BOTH
334     * @return a subsequence
335     */
336    public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
337        return trim(sequence, trimOption, SpanCondition.SIMPLE);
338    }
339
340    /**
341     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or
342     * end of the string, depending on the trimOption and spanCondition. For example:
343     *
344     * <pre>
345     * {@code
346     *
347     *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)}
348     * </pre>
349     *
350     * ... returns {@code "catbab"}.
351     *
352     * @param sequence
353     *            the sequence to trim
354     * @param trimOption
355     *            LEADING, TRAILING, or BOTH
356     * @param spanCondition
357     *            SIMPLE, CONTAINED or NOT_CONTAINED
358     * @return a subsequence
359     */
360    public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) {
361        int endLeadContained, startTrailContained;
362        final int length = sequence.length();
363        if (trimOption != TrimOption.TRAILING) {
364            endLeadContained = unicodeSet.span(sequence, spanCondition);
365            if (endLeadContained == length) {
366                return "";
367            }
368        } else {
369            endLeadContained = 0;
370        }
371        if (trimOption != TrimOption.LEADING) {
372            startTrailContained = unicodeSet.spanBack(sequence, spanCondition);
373        } else {
374            startTrailContained = length;
375        }
376        return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence(
377                endLeadContained, startTrailContained);
378    }
379
380}
381