1
/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
18package android.text.method;
19
20import android.text.Selection;
21import android.text.SpannableStringBuilder;
22
23import android.icu.text.BreakIterator;
24import java.util.Locale;
25
26/**
27 * Walks through cursor positions at word boundaries. Internally uses
28 * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
29 * for performance reasons.
30 *
31 * Also provides methods to determine word boundaries.
32 * {@hide}
33 */
34public class WordIterator implements Selection.PositionIterator {
35    // Size of the window for the word iterator, should be greater than the longest word's length
36    private static final int WINDOW_WIDTH = 50;
37
38    private String mString;
39    private int mOffsetShift;
40
41    private BreakIterator mIterator;
42
43    /**
44     * Constructs a WordIterator using the default locale.
45     */
46    public WordIterator() {
47        this(Locale.getDefault());
48    }
49
50    /**
51     * Constructs a new WordIterator for the specified locale.
52     * @param locale The locale to be used when analysing the text.
53     */
54    public WordIterator(Locale locale) {
55        mIterator = BreakIterator.getWordInstance(locale);
56    }
57
58    public void setCharSequence(CharSequence charSequence, int start, int end) {
59        mOffsetShift = Math.max(0, start - WINDOW_WIDTH);
60        final int windowEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
61
62        if (charSequence instanceof SpannableStringBuilder) {
63            mString = ((SpannableStringBuilder) charSequence).substring(mOffsetShift, windowEnd);
64        } else {
65            mString = charSequence.subSequence(mOffsetShift, windowEnd).toString();
66        }
67        mIterator.setText(mString);
68    }
69
70    /** {@inheritDoc} */
71    public int preceding(int offset) {
72        int shiftedOffset = offset - mOffsetShift;
73        do {
74            shiftedOffset = mIterator.preceding(shiftedOffset);
75            if (shiftedOffset == BreakIterator.DONE) {
76                return BreakIterator.DONE;
77            }
78            if (isOnLetterOrDigit(shiftedOffset)) {
79                return shiftedOffset + mOffsetShift;
80            }
81        } while (true);
82    }
83
84    /** {@inheritDoc} */
85    public int following(int offset) {
86        int shiftedOffset = offset - mOffsetShift;
87        do {
88            shiftedOffset = mIterator.following(shiftedOffset);
89            if (shiftedOffset == BreakIterator.DONE) {
90                return BreakIterator.DONE;
91            }
92            if (isAfterLetterOrDigit(shiftedOffset)) {
93                return shiftedOffset + mOffsetShift;
94            }
95        } while (true);
96    }
97
98    /** {@inheritDoc} */
99    public boolean isBoundary(int offset) {
100        int shiftedOffset = offset - mOffsetShift;
101        checkOffsetIsValid(shiftedOffset);
102        return mIterator.isBoundary(shiftedOffset);
103    }
104
105    /**
106     * Returns the position of next boundary after the given offset. Returns
107     * {@code DONE} if there is no boundary after the given offset.
108     *
109     * @param offset the given start position to search from.
110     * @return the position of the last boundary preceding the given offset.
111     */
112    public int nextBoundary(int offset) {
113        int shiftedOffset = offset - mOffsetShift;
114        shiftedOffset = mIterator.following(shiftedOffset);
115        if (shiftedOffset == BreakIterator.DONE) {
116            return BreakIterator.DONE;
117        }
118        return shiftedOffset + mOffsetShift;
119    }
120
121    /**
122     * Returns the position of boundary preceding the given offset or
123     * {@code DONE} if the given offset specifies the starting position.
124     *
125     * @param offset the given start position to search from.
126     * @return the position of the last boundary preceding the given offset.
127     */
128    public int prevBoundary(int offset) {
129        int shiftedOffset = offset - mOffsetShift;
130        shiftedOffset = mIterator.preceding(shiftedOffset);
131        if (shiftedOffset == BreakIterator.DONE) {
132            return BreakIterator.DONE;
133        }
134        return shiftedOffset + mOffsetShift;
135    }
136
137    /** If <code>offset</code> is within a word, returns the index of the first character of that
138     * word, otherwise returns BreakIterator.DONE.
139     *
140     * The offsets that are considered to be part of a word are the indexes of its characters,
141     * <i>as well as</i> the index of its last character plus one.
142     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
143     *
144     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
145     * The returned value is within [0..offset] or BreakIterator.DONE.
146     *
147     * @throws IllegalArgumentException is offset is not valid.
148     */
149    public int getBeginning(int offset) {
150        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
151        // so this method can be removed.
152        return getBeginning(offset, false);
153    }
154
155    /**
156     * If <code>offset</code> is within a word, returns the index of the last character of that
157     * word plus one, otherwise returns BreakIterator.DONE.
158     *
159     * The offsets that are considered to be part of a word are the indexes of its characters,
160     * <i>as well as</i> the index of its last character plus one.
161     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
162     *
163     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
164     * The returned value is within [offset..textLength] or BreakIterator.DONE.
165     *
166     * @throws IllegalArgumentException is offset is not valid.
167     */
168    public int getEnd(int offset) {
169        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
170        // so this method can be removed.
171        return getEnd(offset, false);
172    }
173
174    /**
175     * If the <code>offset</code> is within a word or on a word boundary that can only be
176     * considered the start of a word (e.g. _word where "_" is any character that would not
177     * be considered part of the word) then this returns the index of the first character of
178     * that word.
179     *
180     * If the offset is on a word boundary that can be considered the start and end of a
181     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
182     * between AA and BB, this would return the start of the previous word, AA.
183     *
184     * Returns BreakIterator.DONE if there is no previous boundary.
185     *
186     * @throws IllegalArgumentException is offset is not valid.
187     */
188    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
189        return getBeginning(offset, true);
190    }
191
192    /**
193     * If the <code>offset</code> is within a word or on a word boundary that can only be
194     * considered the end of a word (e.g. word_ where "_" is any character that would not
195     * be considered part of the word) then this returns the index of the last character
196     * plus one of that word.
197     *
198     * If the offset is on a word boundary that can be considered the start and end of a
199     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
200     * between AA and BB, this would return the end of the next word, BB.
201     *
202     * Returns BreakIterator.DONE if there is no next boundary.
203     *
204     * @throws IllegalArgumentException is offset is not valid.
205     */
206    public int getNextWordEndOnTwoWordBoundary(int offset) {
207        return getEnd(offset, true);
208    }
209
210    /**
211     * If the <code>offset</code> is within a word or on a word boundary that can only be
212     * considered the start of a word (e.g. _word where "_" is any character that would not
213     * be considered part of the word) then this returns the index of the first character of
214     * that word.
215     *
216     * If the offset is on a word boundary that can be considered the start and end of a
217     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
218     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
219     * return the start of the previous word, AA. Otherwise it would return the current offset,
220     * the start of BB.
221     *
222     * Returns BreakIterator.DONE if there is no previous boundary.
223     *
224     * @throws IllegalArgumentException is offset is not valid.
225     */
226    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
227        final int shiftedOffset = offset - mOffsetShift;
228        checkOffsetIsValid(shiftedOffset);
229
230        if (isOnLetterOrDigit(shiftedOffset)) {
231            if (mIterator.isBoundary(shiftedOffset)
232                    && (!isAfterLetterOrDigit(shiftedOffset)
233                            || !getPrevWordBeginningOnTwoWordsBoundary)) {
234                return shiftedOffset + mOffsetShift;
235            } else {
236                return mIterator.preceding(shiftedOffset) + mOffsetShift;
237            }
238        } else {
239            if (isAfterLetterOrDigit(shiftedOffset)) {
240                return mIterator.preceding(shiftedOffset) + mOffsetShift;
241            }
242        }
243        return BreakIterator.DONE;
244    }
245
246    /**
247     * If the <code>offset</code> is within a word or on a word boundary that can only be
248     * considered the end of a word (e.g. word_ where "_" is any character that would not be
249     * considered part of the word) then this returns the index of the last character plus one
250     * of that word.
251     *
252     * If the offset is on a word boundary that can be considered the start and end of a
253     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
254     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
255     * the end of the next word, BB. Otherwise it would return the current offset, the end
256     * of AA.
257     *
258     * Returns BreakIterator.DONE if there is no next boundary.
259     *
260     * @throws IllegalArgumentException is offset is not valid.
261     */
262    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
263        final int shiftedOffset = offset - mOffsetShift;
264        checkOffsetIsValid(shiftedOffset);
265
266        if (isAfterLetterOrDigit(shiftedOffset)) {
267            if (mIterator.isBoundary(shiftedOffset)
268                    && (!isOnLetterOrDigit(shiftedOffset) || !getNextWordEndOnTwoWordBoundary)) {
269                return shiftedOffset + mOffsetShift;
270            } else {
271                return mIterator.following(shiftedOffset) + mOffsetShift;
272            }
273        } else {
274            if (isOnLetterOrDigit(shiftedOffset)) {
275                return mIterator.following(shiftedOffset) + mOffsetShift;
276            }
277        }
278        return BreakIterator.DONE;
279    }
280
281    /**
282     * If <code>offset</code> is within a group of punctuation as defined
283     * by {@link #isPunctuation(int)}, returns the index of the first character
284     * of that group, otherwise returns BreakIterator.DONE.
285     *
286     * @param offset the offset to search from.
287     */
288    public int getPunctuationBeginning(int offset) {
289        while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
290            offset = prevBoundary(offset);
291        }
292        // No need to shift offset, prevBoundary handles that.
293        return offset;
294    }
295
296    /**
297     * If <code>offset</code> is within a group of punctuation as defined
298     * by {@link #isPunctuation(int)}, returns the index of the last character
299     * of that group plus one, otherwise returns BreakIterator.DONE.
300     *
301     * @param offset the offset to search from.
302     */
303    public int getPunctuationEnd(int offset) {
304        while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
305            offset = nextBoundary(offset);
306        }
307        // No need to shift offset, nextBoundary handles that.
308        return offset;
309    }
310
311    /**
312     * Indicates if the provided offset is after a punctuation character
313     * as defined by {@link #isPunctuation(int)}.
314     *
315     * @param offset the offset to check from.
316     * @return Whether the offset is after a punctuation character.
317     */
318    public boolean isAfterPunctuation(int offset) {
319        final int shiftedOffset = offset - mOffsetShift;
320        if (shiftedOffset >= 1 && shiftedOffset <= mString.length()) {
321            final int codePoint = mString.codePointBefore(shiftedOffset);
322            return isPunctuation(codePoint);
323        }
324        return false;
325    }
326
327    /**
328     * Indicates if the provided offset is at a punctuation character
329     * as defined by {@link #isPunctuation(int)}.
330     *
331     * @param offset the offset to check from.
332     * @return Whether the offset is at a punctuation character.
333     */
334    public boolean isOnPunctuation(int offset) {
335        final int shiftedOffset = offset - mOffsetShift;
336        if (shiftedOffset >= 0 && shiftedOffset < mString.length()) {
337            final int codePoint = mString.codePointAt(shiftedOffset);
338            return isPunctuation(codePoint);
339        }
340        return false;
341    }
342
343    private boolean isPunctuationStartBoundary(int offset) {
344        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
345    }
346
347    private boolean isPunctuationEndBoundary(int offset) {
348        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
349    }
350
351    private boolean isPunctuation(int cp) {
352        int type = Character.getType(cp);
353        return (type == Character.CONNECTOR_PUNCTUATION ||
354                type == Character.DASH_PUNCTUATION ||
355                type == Character.END_PUNCTUATION ||
356                type == Character.FINAL_QUOTE_PUNCTUATION ||
357                type == Character.INITIAL_QUOTE_PUNCTUATION ||
358                type == Character.OTHER_PUNCTUATION ||
359                type == Character.START_PUNCTUATION);
360    }
361
362    private boolean isAfterLetterOrDigit(int shiftedOffset) {
363        if (shiftedOffset >= 1 && shiftedOffset <= mString.length()) {
364            final int codePoint = mString.codePointBefore(shiftedOffset);
365            if (Character.isLetterOrDigit(codePoint)) return true;
366        }
367        return false;
368    }
369
370    private boolean isOnLetterOrDigit(int shiftedOffset) {
371        if (shiftedOffset >= 0 && shiftedOffset < mString.length()) {
372            final int codePoint = mString.codePointAt(shiftedOffset);
373            if (Character.isLetterOrDigit(codePoint)) return true;
374        }
375        return false;
376    }
377
378    private void checkOffsetIsValid(int shiftedOffset) {
379        if (shiftedOffset < 0 || shiftedOffset > mString.length()) {
380            throw new IllegalArgumentException("Invalid offset: " + (shiftedOffset + mOffsetShift) +
381                    ". Valid range is [" + mOffsetShift + ", " + (mString.length() + mOffsetShift) +
382                    "]");
383        }
384    }
385}
386