1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.text.method;
18
19import android.annotation.NonNull;
20import android.icu.lang.UCharacter;
21import android.icu.lang.UProperty;
22import android.icu.text.BreakIterator;
23import android.text.CharSequenceCharacterIterator;
24import android.text.Selection;
25
26import java.util.Locale;
27
28/**
29 * Walks through cursor positions at word boundaries. Internally uses
30 * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
31 * for performance reasons.
32 *
33 * Also provides methods to determine word boundaries.
34 * {@hide}
35 */
36public class WordIterator implements Selection.PositionIterator {
37    // Size of the window for the word iterator, should be greater than the longest word's length
38    private static final int WINDOW_WIDTH = 50;
39
40    private int mStart, mEnd;
41    private CharSequence mCharSeq;
42    private final BreakIterator mIterator;
43
44    /**
45     * Constructs a WordIterator using the default locale.
46     */
47    public WordIterator() {
48        this(Locale.getDefault());
49    }
50
51    /**
52     * Constructs a new WordIterator for the specified locale.
53     * @param locale The locale to be used for analyzing the text.
54     */
55    public WordIterator(Locale locale) {
56        mIterator = BreakIterator.getWordInstance(locale);
57    }
58
59    public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
60        if (0 <= start && end <= charSequence.length()) {
61            mCharSeq = charSequence;
62            mStart = Math.max(0, start - WINDOW_WIDTH);
63            mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
64            mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
65        } else {
66            throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
67        }
68    }
69
70    /** {@inheritDoc} */
71    public int preceding(int offset) {
72        checkOffsetIsValid(offset);
73        while (true) {
74            offset = mIterator.preceding(offset);
75            if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
76                return offset;
77            }
78        }
79    }
80
81    /** {@inheritDoc} */
82    public int following(int offset) {
83        checkOffsetIsValid(offset);
84        while (true) {
85            offset = mIterator.following(offset);
86            if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
87                return offset;
88            }
89        }
90    }
91
92    /** {@inheritDoc} */
93    public boolean isBoundary(int offset) {
94        checkOffsetIsValid(offset);
95        return mIterator.isBoundary(offset);
96    }
97
98    /**
99     * Returns the position of next boundary after the given offset. Returns
100     * {@code DONE} if there is no boundary after the given offset.
101     *
102     * @param offset the given start position to search from.
103     * @return the position of the last boundary preceding the given offset.
104     */
105    public int nextBoundary(int offset) {
106        checkOffsetIsValid(offset);
107        return mIterator.following(offset);
108    }
109
110    /**
111     * Returns the position of boundary preceding the given offset or
112     * {@code DONE} if the given offset specifies the starting position.
113     *
114     * @param offset the given start position to search from.
115     * @return the position of the last boundary preceding the given offset.
116     */
117    public int prevBoundary(int offset) {
118        checkOffsetIsValid(offset);
119        return mIterator.preceding(offset);
120    }
121
122    /** If <code>offset</code> is within a word, returns the index of the first character of that
123     * word, otherwise returns BreakIterator.DONE.
124     *
125     * The offsets that are considered to be part of a word are the indexes of its characters,
126     * <i>as well as</i> the index of its last character plus one.
127     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
128     *
129     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
130     * The returned value is within [0..offset] or BreakIterator.DONE.
131     *
132     * @throws IllegalArgumentException is offset is not valid.
133     */
134    public int getBeginning(int offset) {
135        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
136        // so this method can be removed.
137        return getBeginning(offset, false);
138    }
139
140    /**
141     * If <code>offset</code> is within a word, returns the index of the last character of that
142     * word plus one, otherwise returns BreakIterator.DONE.
143     *
144     * The offsets that are considered to be part of a word are the indexes of its characters,
145     * <i>as well as</i> the index of its last character plus one.
146     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
147     *
148     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
149     * The returned value is within [offset..textLength] or BreakIterator.DONE.
150     *
151     * @throws IllegalArgumentException is offset is not valid.
152     */
153    public int getEnd(int offset) {
154        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
155        // so this method can be removed.
156        return getEnd(offset, false);
157    }
158
159    /**
160     * If the <code>offset</code> is within a word or on a word boundary that can only be
161     * considered the start of a word (e.g. _word where "_" is any character that would not
162     * be considered part of the word) then this returns the index of the first character of
163     * that word.
164     *
165     * If the offset is on a word boundary that can be considered the start and end of a
166     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
167     * between AA and BB, this would return the start of the previous word, AA.
168     *
169     * Returns BreakIterator.DONE if there is no previous boundary.
170     *
171     * @throws IllegalArgumentException is offset is not valid.
172     */
173    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
174        return getBeginning(offset, true);
175    }
176
177    /**
178     * If the <code>offset</code> is within a word or on a word boundary that can only be
179     * considered the end of a word (e.g. word_ where "_" is any character that would not
180     * be considered part of the word) then this returns the index of the last character
181     * plus one of that word.
182     *
183     * If the offset is on a word boundary that can be considered the start and end of a
184     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
185     * between AA and BB, this would return the end of the next word, BB.
186     *
187     * Returns BreakIterator.DONE if there is no next boundary.
188     *
189     * @throws IllegalArgumentException is offset is not valid.
190     */
191    public int getNextWordEndOnTwoWordBoundary(int offset) {
192        return getEnd(offset, true);
193    }
194
195    /**
196     * If the <code>offset</code> is within a word or on a word boundary that can only be
197     * considered the start of a word (e.g. _word where "_" is any character that would not
198     * be considered part of the word) then this returns the index of the first character of
199     * that word.
200     *
201     * If the offset is on a word boundary that can be considered the start and end of a
202     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
203     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
204     * return the start of the previous word, AA. Otherwise it would return the current offset,
205     * the start of BB.
206     *
207     * Returns BreakIterator.DONE if there is no previous boundary.
208     *
209     * @throws IllegalArgumentException is offset is not valid.
210     */
211    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
212        checkOffsetIsValid(offset);
213
214        if (isOnLetterOrDigit(offset)) {
215            if (mIterator.isBoundary(offset)
216                    && (!isAfterLetterOrDigit(offset)
217                            || !getPrevWordBeginningOnTwoWordsBoundary)) {
218                return offset;
219            } else {
220                return mIterator.preceding(offset);
221            }
222        } else {
223            if (isAfterLetterOrDigit(offset)) {
224                return mIterator.preceding(offset);
225            }
226        }
227        return BreakIterator.DONE;
228    }
229
230    /**
231     * If the <code>offset</code> is within a word or on a word boundary that can only be
232     * considered the end of a word (e.g. word_ where "_" is any character that would not be
233     * considered part of the word) then this returns the index of the last character plus one
234     * of that word.
235     *
236     * If the offset is on a word boundary that can be considered the start and end of a
237     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
238     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
239     * the end of the next word, BB. Otherwise it would return the current offset, the end
240     * of AA.
241     *
242     * Returns BreakIterator.DONE if there is no next boundary.
243     *
244     * @throws IllegalArgumentException is offset is not valid.
245     */
246    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
247        checkOffsetIsValid(offset);
248
249        if (isAfterLetterOrDigit(offset)) {
250            if (mIterator.isBoundary(offset)
251                    && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
252                return offset;
253            } else {
254                return mIterator.following(offset);
255            }
256        } else {
257            if (isOnLetterOrDigit(offset)) {
258                return mIterator.following(offset);
259            }
260        }
261        return BreakIterator.DONE;
262    }
263
264    /**
265     * If <code>offset</code> is within a group of punctuation as defined
266     * by {@link #isPunctuation(int)}, returns the index of the first character
267     * of that group, otherwise returns BreakIterator.DONE.
268     *
269     * @param offset the offset to search from.
270     */
271    public int getPunctuationBeginning(int offset) {
272        checkOffsetIsValid(offset);
273        while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
274            offset = prevBoundary(offset);
275        }
276        // No need to shift offset, prevBoundary handles that.
277        return offset;
278    }
279
280    /**
281     * If <code>offset</code> is within a group of punctuation as defined
282     * by {@link #isPunctuation(int)}, returns the index of the last character
283     * of that group plus one, otherwise returns BreakIterator.DONE.
284     *
285     * @param offset the offset to search from.
286     */
287    public int getPunctuationEnd(int offset) {
288        checkOffsetIsValid(offset);
289        while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
290            offset = nextBoundary(offset);
291        }
292        // No need to shift offset, nextBoundary handles that.
293        return offset;
294    }
295
296    /**
297     * Indicates if the provided offset is after a punctuation character
298     * as defined by {@link #isPunctuation(int)}.
299     *
300     * @param offset the offset to check from.
301     * @return Whether the offset is after a punctuation character.
302     */
303    public boolean isAfterPunctuation(int offset) {
304        if (mStart < offset && offset <= mEnd) {
305            final int codePoint = Character.codePointBefore(mCharSeq, offset);
306            return isPunctuation(codePoint);
307        }
308        return false;
309    }
310
311    /**
312     * Indicates if the provided offset is at a punctuation character
313     * as defined by {@link #isPunctuation(int)}.
314     *
315     * @param offset the offset to check from.
316     * @return Whether the offset is at a punctuation character.
317     */
318    public boolean isOnPunctuation(int offset) {
319        if (mStart <= offset && offset < mEnd) {
320            final int codePoint = Character.codePointAt(mCharSeq, offset);
321            return isPunctuation(codePoint);
322        }
323        return false;
324    }
325
326    /**
327     * Indicates if the codepoint is a mid-word-only punctuation.
328     *
329     * At the moment, this is locale-independent, and includes all the characters in
330     * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
331     * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
332     * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
333     * in the middle of a word, but they become word breaks if they happen at the end of a word
334     * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise).
335     *
336     * @param locale the locale to consider the codepoint in. Presently ignored.
337     * @param codePoint the codepoint to check.
338     * @return True if the codepoint is a mid-word punctuation.
339     */
340    public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
341        final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
342        return (wb == UCharacter.WordBreak.MIDLETTER
343                || wb == UCharacter.WordBreak.MIDNUMLET
344                || wb == UCharacter.WordBreak.SINGLE_QUOTE);
345    }
346
347    private boolean isPunctuationStartBoundary(int offset) {
348        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
349    }
350
351    private boolean isPunctuationEndBoundary(int offset) {
352        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
353    }
354
355    private static boolean isPunctuation(int cp) {
356        final int type = Character.getType(cp);
357        return (type == Character.CONNECTOR_PUNCTUATION
358                || type == Character.DASH_PUNCTUATION
359                || type == Character.END_PUNCTUATION
360                || type == Character.FINAL_QUOTE_PUNCTUATION
361                || type == Character.INITIAL_QUOTE_PUNCTUATION
362                || type == Character.OTHER_PUNCTUATION
363                || type == Character.START_PUNCTUATION);
364    }
365
366    private boolean isAfterLetterOrDigit(int offset) {
367        if (mStart < offset && offset <= mEnd) {
368            final int codePoint = Character.codePointBefore(mCharSeq, offset);
369            if (Character.isLetterOrDigit(codePoint)) return true;
370        }
371        return false;
372    }
373
374    private boolean isOnLetterOrDigit(int offset) {
375        if (mStart <= offset && offset < mEnd) {
376            final int codePoint = Character.codePointAt(mCharSeq, offset);
377            if (Character.isLetterOrDigit(codePoint)) return true;
378        }
379        return false;
380    }
381
382    private void checkOffsetIsValid(int offset) {
383        if (!(mStart <= offset && offset <= mEnd)) {
384            throw new IllegalArgumentException("Invalid offset: " + (offset) +
385                    ". Valid range is [" + mStart + ", " + mEnd + "]");
386        }
387    }
388}
389