1/*
2 *******************************************************************************
3 * Copyright (C) 2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7package com.ibm.icu.text;
8
9import com.ibm.icu.text.UnicodeSet.SpanCondition;
10import com.ibm.icu.util.OutputInt;
11
12/**
13 * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches.
14 * An instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
15 * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link SpanCondition} with
16 * its inverse. That is, the code spans, then spans for the inverse, then spans, and so on.
17 * For the inverse, the following mapping is used:</p>
18 * <ul>
19 * <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li>
20 * <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}</li>
21 * <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}</li>
22 * </ul>
23 * These are actually not complete inverses. However, the alternating works because there are no gaps.
24 * For example, with [a{ab}{bc}], you get the following behavior when scanning forward:
25 * <p>
26 * <table border="1">
27 * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr>
28 * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr>
29 * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr>
30 * </table>
31 * <p>So here is what happens when you alternate:
32 * <p>
33 * <table border="1">
34 * <tr><th>start</th><td>|xxxabcyyy</td></tr>
35 * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr>
36 * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr>
37 * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr>
38 * </table>
39 * </p>The entire string is traversed.
40 *
41 * @draft ICU 54
42 * @provisional This is a draft API and might change in a future release of ICU.
43 */
44public class UnicodeSetSpanner {
45
46    private final UnicodeSet unicodeSet;
47
48    /**
49     * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen. However, this class
50     * can be used with a non-frozen version to avoid the cost of freezing.
51     *
52     * @param source
53     *            the original UnicodeSet
54     *
55     * @draft ICU 54
56     * @provisional This is a draft API and might change in a future release of ICU.
57     */
58    public UnicodeSetSpanner(UnicodeSet source) {
59        unicodeSet = source;
60    }
61
62    /**
63     * Returns the UnicodeSet used for processing. It is frozen iff the original was.
64     *
65     * @return the construction set.
66     *
67     * @draft ICU 54
68     * @provisional This is a draft API and might change in a future release of ICU.
69     */
70    public UnicodeSet getUnicodeSet() {
71        return unicodeSet;
72    }
73
74
75    /**
76     * {@inheritDoc}
77     *
78     * @draft ICU 54
79     * @provisional This is a draft API and might change in a future release of ICU.
80     */
81    @Override
82    public boolean equals(Object other) {
83        return other instanceof UnicodeSetSpanner && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet);
84    }
85
86    /**
87     * {@inheritDoc}
88     *
89     * @draft ICU 54
90     * @provisional This is a draft API and might change in a future release of ICU.
91     */
92    @Override
93    public int hashCode() {
94        return unicodeSet.hashCode();
95    }
96
97    /**
98     * Options for replaceFrom and countIn to control how to treat each matched span.
99     * It is similar to whether one is replacing [abc] by x, or [abc]* by x.
100     *
101     * @draft ICU 54
102     * @provisional This is a draft API and might change in a future release of ICU.
103     */
104    public enum CountMethod {
105        /**
106         * Collapse spans. That is, modify/count the entire matching span as a single item, instead of separate
107         * set elements.
108         *
109         * @draft ICU 54
110         * @provisional This is a draft API and might change in a future release of ICU.
111         */
112        WHOLE_SPAN,
113        /**
114         * Use the smallest number of elements in the spanned range for counting and modification,
115         * based on the {@link UnicodeSet.SpanCondition}.
116         * If the set has no strings, this will be the same as the number of spanned code points.
117         * <p>For example, in the string "abab" with SpanCondition.SIMPLE:
118         * <ul>
119         * <li>spanning with [ab] will count four MIN_ELEMENTS.</li>
120         * <li>spanning with [{ab}] will count two MIN_ELEMENTS.</li>
121         * <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.</li>
122         * </ul>
123         *
124         * @draft ICU 54
125         * @provisional This is a draft API and might change in a future release of ICU.
126         */
127        MIN_ELEMENTS,
128        // Note: could in the future have an additional option MAX_ELEMENTS
129    }
130
131    /**
132     * Returns the number of matching characters found in a character sequence,
133     * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
134     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
135     * @param sequence
136     *            the sequence to count characters in
137     * @return the count. Zero if there are none.
138     *
139     * @draft ICU 54
140     * @provisional This is a draft API and might change in a future release of ICU.
141     */
142    public int countIn(CharSequence sequence) {
143        return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
144    }
145
146    /**
147     * Returns the number of matching characters found in a character sequence, using SpanCondition.SIMPLE.
148     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
149     * @param sequence
150     *            the sequence to count characters in
151     * @param countMethod
152     *            whether to treat an entire span as a match, or individual elements as matches
153     * @return the count. Zero if there are none.
154     *
155     * @draft ICU 54
156     * @provisional This is a draft API and might change in a future release of ICU.
157     */
158    public int countIn(CharSequence sequence, CountMethod countMethod) {
159        return countIn(sequence, countMethod, SpanCondition.SIMPLE);
160    }
161
162    /**
163     * Returns the number of matching characters found in a character sequence.
164     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
165     * @param sequence
166     *            the sequence to count characters in
167     * @param countMethod
168     *            whether to treat an entire span as a match, or individual elements as matches
169     * @param spanCondition
170     *            the spanCondition to use. SIMPLE or CONTAINED means only count the elements in the span;
171     *            NOT_CONTAINED is the reverse.
172     *            <br><b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in edge cases.
173     * @return the count. Zero if there are none.
174     *
175     * @draft ICU 54
176     * @provisional This is a draft API and might change in a future release of ICU.
177     */
178    public int countIn(CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) {
179        int count = 0;
180        int start = 0;
181        SpanCondition skipSpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
182                : SpanCondition.NOT_CONTAINED;
183        final int length = sequence.length();
184        OutputInt spanCount = null;
185        while (start != length) {
186            int endOfSpan = unicodeSet.span(sequence, start, skipSpan);
187            if (endOfSpan == length) {
188                break;
189            }
190            if (countMethod == CountMethod.WHOLE_SPAN) {
191                start = unicodeSet.span(sequence, endOfSpan, spanCondition);
192                count += 1;
193            } else {
194                if (spanCount == null) {
195                    spanCount = new OutputInt();
196                }
197                start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount);
198                count += spanCount.value;
199            }
200        }
201        return count;
202    }
203
204    /**
205     * Delete all the matching spans in sequence, using SpanCondition.SIMPLE
206     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
207     * @param sequence
208     *            charsequence to replace matching spans in.
209     * @return modified string.
210     *
211     * @draft ICU 54
212     * @provisional This is a draft API and might change in a future release of ICU.
213     */
214    public String deleteFrom(CharSequence sequence) {
215        return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE);
216    }
217
218    /**
219     * Delete all matching spans in sequence, according to the spanCondition.
220     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
221     * @param sequence
222     *            charsequence to replace matching spans in.
223     * @param spanCondition
224     *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching (NOT_CONTAINED)
225     * @return modified string.
226     *
227     * @draft ICU 54
228     * @provisional This is a draft API and might change in a future release of ICU.
229     */
230    public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) {
231        return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition);
232    }
233
234    /**
235     * Replace all matching spans in sequence by the replacement,
236     * counting by CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE.
237     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
238     * @param sequence
239     *            charsequence to replace matching spans in.
240     * @param replacement
241     *            replacement sequence. To delete, use ""
242     * @return modified string.
243     *
244     * @draft ICU 54
245     * @provisional This is a draft API and might change in a future release of ICU.
246     */
247    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
248        return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
249    }
250
251    /**
252     * Replace all matching spans in sequence by replacement, according to the CountMethod, using SpanCondition.SIMPLE.
253     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
254     *
255     * @param sequence
256     *            charsequence to replace matching spans in.
257     * @param replacement
258     *            replacement sequence. To delete, use ""
259     * @param countMethod
260     *            whether to treat an entire span as a match, or individual elements as matches
261     * @return modified string.
262     *
263     * @draft ICU 54
264     * @provisional This is a draft API and might change in a future release of ICU.
265     */
266    public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod) {
267        return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE);
268    }
269
270    /**
271     * Replace all matching spans in sequence by replacement, according to the countMethod and spanCondition.
272     * The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
273     * @param sequence
274     *            charsequence to replace matching spans in.
275     * @param replacement
276     *            replacement sequence. To delete, use ""
277     * @param countMethod
278     *            whether to treat an entire span as a match, or individual elements as matches
279     * @param spanCondition
280     *            specify whether to modify the matching spans (CONTAINED or SIMPLE) or the non-matching
281     *            (NOT_CONTAINED)
282     * @return modified string.
283     *
284     * @draft ICU 54
285     * @provisional This is a draft API and might change in a future release of ICU.
286     */
287    public String replaceFrom(CharSequence sequence, CharSequence replacement, CountMethod countMethod,
288            SpanCondition spanCondition) {
289        SpanCondition copySpan = spanCondition == SpanCondition.NOT_CONTAINED ? SpanCondition.SIMPLE
290                : SpanCondition.NOT_CONTAINED;
291        final boolean remove = replacement.length() == 0;
292        StringBuilder result = new StringBuilder();
293        // TODO, we can optimize this to
294        // avoid this allocation unless needed
295
296        final int length = sequence.length();
297        OutputInt spanCount = null;
298        for (int endCopy = 0; endCopy != length;) {
299            int endModify;
300            if (countMethod == CountMethod.WHOLE_SPAN) {
301                endModify = unicodeSet.span(sequence, endCopy, spanCondition);
302            } else {
303                if (spanCount == null) {
304                    spanCount = new OutputInt();
305                }
306                endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount);
307            }
308            if (remove || endModify == 0) {
309                // do nothing
310            } else if (countMethod == CountMethod.WHOLE_SPAN) {
311                result.append(replacement);
312            } else {
313                for (int i = spanCount.value; i > 0; --i) {
314                    result.append(replacement);
315                }
316            }
317            if (endModify == length) {
318                break;
319            }
320            endCopy = unicodeSet.span(sequence, endModify, copySpan);
321            result.append(sequence.subSequence(endModify, endCopy));
322        }
323        return result.toString();
324    }
325
326    /**
327     * Options for the trim() method
328     *
329     * @draft ICU 54
330     * @provisional This is a draft API and might change in a future release of ICU.
331     */
332    public enum TrimOption {
333        /**
334         * Trim leading spans.
335         *
336         * @draft ICU 54
337         * @provisional This is a draft API and might change in a future release of ICU.
338         */
339        LEADING,
340        /**
341         * Trim leading and trailing spans.
342         *
343         * @draft ICU 54
344         * @provisional This is a draft API and might change in a future release of ICU.
345         */
346        BOTH,
347        /**
348         * Trim trailing spans.
349         *
350         * @draft ICU 54
351         * @provisional This is a draft API and might change in a future release of ICU.
352         */
353        TRAILING;
354    }
355
356    /**
357     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start and
358     * end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For example:
359     *
360     * <pre>
361     * {@code
362     *
363     *   new UnicodeSet("[ab]").trim("abacatbab")}
364     * </pre>
365     *
366     * ... returns {@code "cat"}.
367     * @param sequence
368     *            the sequence to trim
369     * @return a subsequence
370     *
371     * @draft ICU 54
372     * @provisional This is a draft API and might change in a future release of ICU.
373     */
374    public CharSequence trim(CharSequence sequence) {
375        return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE);
376    }
377
378    /**
379     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or
380     * end of the string, using the trimOption and SpanCondition.SIMPLE. For example:
381     *
382     * <pre>
383     * {@code
384     *
385     *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)}
386     * </pre>
387     *
388     * ... returns {@code "catbab"}.
389     *
390     * @param sequence
391     *            the sequence to trim
392     * @param trimOption
393     *            LEADING, TRAILING, or BOTH
394     * @return a subsequence
395     *
396     * @draft ICU 54
397     * @provisional This is a draft API and might change in a future release of ICU.
398     */
399    public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
400        return trim(sequence, trimOption, SpanCondition.SIMPLE);
401    }
402
403    /**
404     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements at the start or
405     * end of the string, depending on the trimOption and spanCondition. For example:
406     *
407     * <pre>
408     * {@code
409     *
410     *   new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)}
411     * </pre>
412     *
413     * ... returns {@code "catbab"}.
414     *
415     * @param sequence
416     *            the sequence to trim
417     * @param trimOption
418     *            LEADING, TRAILING, or BOTH
419     * @param spanCondition
420     *            SIMPLE, CONTAINED or NOT_CONTAINED
421     * @return a subsequence
422     *
423     * @draft ICU 54
424     * @provisional This is a draft API and might change in a future release of ICU.
425     */
426    public CharSequence trim(CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) {
427        int endLeadContained, startTrailContained;
428        final int length = sequence.length();
429        if (trimOption != TrimOption.TRAILING) {
430            endLeadContained = unicodeSet.span(sequence, spanCondition);
431            if (endLeadContained == length) {
432                return "";
433            }
434        } else {
435            endLeadContained = 0;
436        }
437        if (trimOption != TrimOption.LEADING) {
438            startTrailContained = unicodeSet.spanBack(sequence, spanCondition);
439        } else {
440            startTrailContained = length;
441        }
442        return endLeadContained == 0 && startTrailContained == length ? sequence : sequence.subSequence(
443                endLeadContained, startTrailContained);
444    }
445
446}
447