/* * Copyright (C) 2011 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package android.text.method; import android.annotation.NonNull; import android.icu.lang.UCharacter; import android.icu.lang.UProperty; import android.icu.text.BreakIterator; import android.text.CharSequenceCharacterIterator; import android.text.Selection; import java.util.Locale; /** * Walks through cursor positions at word boundaries. Internally uses * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence} * for performance reasons. * * Also provides methods to determine word boundaries. * {@hide} */ public class WordIterator implements Selection.PositionIterator { // Size of the window for the word iterator, should be greater than the longest word's length private static final int WINDOW_WIDTH = 50; private int mStart, mEnd; private CharSequence mCharSeq; private final BreakIterator mIterator; /** * Constructs a WordIterator using the default locale. */ public WordIterator() { this(Locale.getDefault()); } /** * Constructs a new WordIterator for the specified locale. * @param locale The locale to be used for analyzing the text. */ public WordIterator(Locale locale) { mIterator = BreakIterator.getWordInstance(locale); } public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) { if (0 <= start && end <= charSequence.length()) { mCharSeq = charSequence; mStart = Math.max(0, start - WINDOW_WIDTH); mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH); mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd)); } else { throw new IndexOutOfBoundsException("input indexes are outside the CharSequence"); } } /** {@inheritDoc} */ public int preceding(int offset) { checkOffsetIsValid(offset); while (true) { offset = mIterator.preceding(offset); if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) { return offset; } } } /** {@inheritDoc} */ public int following(int offset) { checkOffsetIsValid(offset); while (true) { offset = mIterator.following(offset); if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) { return offset; } } } /** {@inheritDoc} */ public boolean isBoundary(int offset) { checkOffsetIsValid(offset); return mIterator.isBoundary(offset); } /** * Returns the position of next boundary after the given offset. Returns * {@code DONE} if there is no boundary after the given offset. * * @param offset the given start position to search from. * @return the position of the last boundary preceding the given offset. */ public int nextBoundary(int offset) { checkOffsetIsValid(offset); return mIterator.following(offset); } /** * Returns the position of boundary preceding the given offset or * {@code DONE} if the given offset specifies the starting position. * * @param offset the given start position to search from. * @return the position of the last boundary preceding the given offset. */ public int prevBoundary(int offset) { checkOffsetIsValid(offset); return mIterator.preceding(offset); } /** If offset is within a word, returns the index of the first character of that * word, otherwise returns BreakIterator.DONE. * * The offsets that are considered to be part of a word are the indexes of its characters, * as well as the index of its last character plus one. * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned. * * Valid range for offset is [0..textLength] (note the inclusive upper bound). * The returned value is within [0..offset] or BreakIterator.DONE. * * @throws IllegalArgumentException is offset is not valid. */ public int getBeginning(int offset) { // TODO: Check if usage of this can be updated to getBeginning(offset, true) if // so this method can be removed. return getBeginning(offset, false); } /** * If offset is within a word, returns the index of the last character of that * word plus one, otherwise returns BreakIterator.DONE. * * The offsets that are considered to be part of a word are the indexes of its characters, * as well as the index of its last character plus one. * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned. * * Valid range for offset is [0..textLength] (note the inclusive upper bound). * The returned value is within [offset..textLength] or BreakIterator.DONE. * * @throws IllegalArgumentException is offset is not valid. */ public int getEnd(int offset) { // TODO: Check if usage of this can be updated to getEnd(offset, true), if // so this method can be removed. return getEnd(offset, false); } /** * If the offset is within a word or on a word boundary that can only be * considered the start of a word (e.g. _word where "_" is any character that would not * be considered part of the word) then this returns the index of the first character of * that word. * * If the offset is on a word boundary that can be considered the start and end of a * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary * between AA and BB, this would return the start of the previous word, AA. * * Returns BreakIterator.DONE if there is no previous boundary. * * @throws IllegalArgumentException is offset is not valid. */ public int getPrevWordBeginningOnTwoWordsBoundary(int offset) { return getBeginning(offset, true); } /** * If the offset is within a word or on a word boundary that can only be * considered the end of a word (e.g. word_ where "_" is any character that would not * be considered part of the word) then this returns the index of the last character * plus one of that word. * * If the offset is on a word boundary that can be considered the start and end of a * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary * between AA and BB, this would return the end of the next word, BB. * * Returns BreakIterator.DONE if there is no next boundary. * * @throws IllegalArgumentException is offset is not valid. */ public int getNextWordEndOnTwoWordBoundary(int offset) { return getEnd(offset, true); } /** * If the offset is within a word or on a word boundary that can only be * considered the start of a word (e.g. _word where "_" is any character that would not * be considered part of the word) then this returns the index of the first character of * that word. * * If the offset is on a word boundary that can be considered the start and end of a * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would * return the start of the previous word, AA. Otherwise it would return the current offset, * the start of BB. * * Returns BreakIterator.DONE if there is no previous boundary. * * @throws IllegalArgumentException is offset is not valid. */ private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) { checkOffsetIsValid(offset); if (isOnLetterOrDigit(offset)) { if (mIterator.isBoundary(offset) && (!isAfterLetterOrDigit(offset) || !getPrevWordBeginningOnTwoWordsBoundary)) { return offset; } else { return mIterator.preceding(offset); } } else { if (isAfterLetterOrDigit(offset)) { return mIterator.preceding(offset); } } return BreakIterator.DONE; } /** * If the offset is within a word or on a word boundary that can only be * considered the end of a word (e.g. word_ where "_" is any character that would not be * considered part of the word) then this returns the index of the last character plus one * of that word. * * If the offset is on a word boundary that can be considered the start and end of a * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return * the end of the next word, BB. Otherwise it would return the current offset, the end * of AA. * * Returns BreakIterator.DONE if there is no next boundary. * * @throws IllegalArgumentException is offset is not valid. */ private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) { checkOffsetIsValid(offset); if (isAfterLetterOrDigit(offset)) { if (mIterator.isBoundary(offset) && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) { return offset; } else { return mIterator.following(offset); } } else { if (isOnLetterOrDigit(offset)) { return mIterator.following(offset); } } return BreakIterator.DONE; } /** * If offset is within a group of punctuation as defined * by {@link #isPunctuation(int)}, returns the index of the first character * of that group, otherwise returns BreakIterator.DONE. * * @param offset the offset to search from. */ public int getPunctuationBeginning(int offset) { checkOffsetIsValid(offset); while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) { offset = prevBoundary(offset); } // No need to shift offset, prevBoundary handles that. return offset; } /** * If offset is within a group of punctuation as defined * by {@link #isPunctuation(int)}, returns the index of the last character * of that group plus one, otherwise returns BreakIterator.DONE. * * @param offset the offset to search from. */ public int getPunctuationEnd(int offset) { checkOffsetIsValid(offset); while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) { offset = nextBoundary(offset); } // No need to shift offset, nextBoundary handles that. return offset; } /** * Indicates if the provided offset is after a punctuation character * as defined by {@link #isPunctuation(int)}. * * @param offset the offset to check from. * @return Whether the offset is after a punctuation character. */ public boolean isAfterPunctuation(int offset) { if (mStart < offset && offset <= mEnd) { final int codePoint = Character.codePointBefore(mCharSeq, offset); return isPunctuation(codePoint); } return false; } /** * Indicates if the provided offset is at a punctuation character * as defined by {@link #isPunctuation(int)}. * * @param offset the offset to check from. * @return Whether the offset is at a punctuation character. */ public boolean isOnPunctuation(int offset) { if (mStart <= offset && offset < mEnd) { final int codePoint = Character.codePointAt(mCharSeq, offset); return isPunctuation(codePoint); } return false; } /** * Indicates if the codepoint is a mid-word-only punctuation. * * At the moment, this is locale-independent, and includes all the characters in * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are * in the middle of a word, but they become word breaks if they happen at the end of a word * (accroding to rule WB999 that breaks word in any place that is not prohibited otherwise). * * @param locale the locale to consider the codepoint in. Presently ignored. * @param codePoint the codepoint to check. * @return True if the codepoint is a mid-word punctuation. */ public static boolean isMidWordPunctuation(Locale locale, int codePoint) { final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK); return (wb == UCharacter.WordBreak.MIDLETTER || wb == UCharacter.WordBreak.MIDNUMLET || wb == UCharacter.WordBreak.SINGLE_QUOTE); } private boolean isPunctuationStartBoundary(int offset) { return isOnPunctuation(offset) && !isAfterPunctuation(offset); } private boolean isPunctuationEndBoundary(int offset) { return !isOnPunctuation(offset) && isAfterPunctuation(offset); } private static boolean isPunctuation(int cp) { final int type = Character.getType(cp); return (type == Character.CONNECTOR_PUNCTUATION || type == Character.DASH_PUNCTUATION || type == Character.END_PUNCTUATION || type == Character.FINAL_QUOTE_PUNCTUATION || type == Character.INITIAL_QUOTE_PUNCTUATION || type == Character.OTHER_PUNCTUATION || type == Character.START_PUNCTUATION); } private boolean isAfterLetterOrDigit(int offset) { if (mStart < offset && offset <= mEnd) { final int codePoint = Character.codePointBefore(mCharSeq, offset); if (Character.isLetterOrDigit(codePoint)) return true; } return false; } private boolean isOnLetterOrDigit(int offset) { if (mStart <= offset && offset < mEnd) { final int codePoint = Character.codePointAt(mCharSeq, offset); if (Character.isLetterOrDigit(codePoint)) return true; } return false; } private void checkOffsetIsValid(int offset) { if (!(mStart <= offset && offset <= mEnd)) { throw new IllegalArgumentException("Invalid offset: " + (offset) + ". Valid range is [" + mStart + ", " + mEnd + "]"); } } }