/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package android.text.method;

import android.annotation.NonNull;
import android.icu.lang.UCharacter;
import android.icu.lang.UProperty;
import android.icu.text.BreakIterator;
import android.text.CharSequenceCharacterIterator;
import android.text.Selection;

import java.util.Locale;

/**
 * Walks through cursor positions at word boundaries. Internally uses
 * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
 * for performance reasons.
 *
 * Also provides methods to determine word boundaries.
 *
 * All offsets accepted and returned by this class are indexes into the {@link CharSequence}
 * passed to {@link #setCharSequence(CharSequence, int, int)}. Only offsets inside the window
 * computed by that method are considered valid.
 * {@hide}
 */
public class WordIterator implements Selection.PositionIterator {
    // Size of the window for the word iterator, should be greater than the longest word's length
    private static final int WINDOW_WIDTH = 50;

    // Inclusive bounds of the window of mCharSeq that mIterator currently analyzes.
    private int mStart, mEnd;
    private CharSequence mCharSeq;
    private final BreakIterator mIterator;

    /**
     * Constructs a WordIterator using the default locale.
     */
    public WordIterator() {
        this(Locale.getDefault());
    }

    /**
     * Constructs a new WordIterator for the specified locale.
     * @param locale The locale to be used for analyzing the text.
     */
    public WordIterator(Locale locale) {
        mIterator = BreakIterator.getWordInstance(locale);
    }

    /**
     * Sets the text to analyze. The iterator works on a window of the text that extends
     * {@code WINDOW_WIDTH} characters before <code>start</code> and after <code>end</code>,
     * clamped to the bounds of the text. Offsets passed to the other methods must lie inside
     * that window.
     *
     * @param charSequence the text to analyze.
     * @param start the start of the range of interest.
     * @param end the end of the range of interest.
     * @throws IndexOutOfBoundsException if <code>start</code> is negative or <code>end</code> is
     * greater than the length of the text.
     */
    public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
        if (0 <= start && end <= charSequence.length()) {
            mCharSeq = charSequence;
            mStart = Math.max(0, start - WINDOW_WIDTH);
            mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
            mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
        } else {
            throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
        }
    }

    /** {@inheritDoc} */
    public int preceding(int offset) {
        checkOffsetIsValid(offset);
        // Skip boundaries that are not at the start of a letter or digit run.
        while (true) {
            offset = mIterator.preceding(offset);
            if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
                return offset;
            }
        }
    }

    /** {@inheritDoc} */
    public int following(int offset) {
        checkOffsetIsValid(offset);
        // Skip boundaries that do not immediately follow a letter or digit.
        while (true) {
            offset = mIterator.following(offset);
            if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
                return offset;
            }
        }
    }

    /**
     * Indicates whether the given offset is a boundary according to the underlying
     * {@link BreakIterator}.
     *
     * @param offset the offset to check.
     * @return whether the offset is a boundary.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public boolean isBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.isBoundary(offset);
    }

    /**
     * Returns the position of next boundary after the given offset. Returns
     * {@code DONE} if there is no boundary after the given offset.
     *
     * @param offset the given start position to search from.
     * @return the position of the first boundary following the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int nextBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.following(offset);
    }

    /**
     * Returns the position of boundary preceding the given offset or
     * {@code DONE} if the given offset specifies the starting position.
     *
     * @param offset the given start position to search from.
     * @return the position of the last boundary preceding the given offset.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int prevBoundary(int offset) {
        checkOffsetIsValid(offset);
        return mIterator.preceding(offset);
    }

    /** If <code>offset</code> is within a word, returns the index of the first character of that
     * word, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is the window established by
     * {@link #setCharSequence(CharSequence, int, int)} (both bounds inclusive).
     * The returned value is not greater than offset, or is BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getBeginning(int offset) {
        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
        // so this method can be removed.
        return getBeginning(offset, false);
    }

    /**
     * If <code>offset</code> is within a word, returns the index of the last character of that
     * word plus one, otherwise returns BreakIterator.DONE.
     *
     * The offsets that are considered to be part of a word are the indexes of its characters,
     * <i>as well as</i> the index of its last character plus one.
     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
     *
     * Valid range for offset is the window established by
     * {@link #setCharSequence(CharSequence, int, int)} (both bounds inclusive).
     * The returned value is not smaller than offset, or is BreakIterator.DONE.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getEnd(int offset) {
        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
        // so this method can be removed.
        return getEnd(offset, false);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the start of the previous word, AA.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
        return getBeginning(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not
     * be considered part of the word) then this returns the index of the last character
     * plus one of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, this would return the end of the next word, BB.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getNextWordEndOnTwoWordBoundary(int offset) {
        return getEnd(offset, true);
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the start of a word (e.g. _word where "_" is any character that would not
     * be considered part of the word) then this returns the index of the first character of
     * that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
     * return the start of the previous word, AA. Otherwise it would return the current offset,
     * the start of BB.
     *
     * Returns BreakIterator.DONE if there is no previous boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
        checkOffsetIsValid(offset);

        if (isOnLetterOrDigit(offset)) {
            // offset is already the start of a word unless it also ends the previous word and
            // the caller asked for the previous word in that case.
            if (mIterator.isBoundary(offset)
                    && (!isAfterLetterOrDigit(offset)
                            || !getPrevWordBeginningOnTwoWordsBoundary)) {
                return offset;
            } else {
                return mIterator.preceding(offset);
            }
        } else if (isAfterLetterOrDigit(offset)) {
            // offset is just past the last character of a word.
            return mIterator.preceding(offset);
        }
        return BreakIterator.DONE;
    }

    /**
     * If the <code>offset</code> is within a word or on a word boundary that can only be
     * considered the end of a word (e.g. word_ where "_" is any character that would not be
     * considered part of the word) then this returns the index of the last character plus one
     * of that word.
     *
     * If the offset is on a word boundary that can be considered the start and end of a
     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
     * the end of the next word, BB. Otherwise it would return the current offset, the end
     * of AA.
     *
     * Returns BreakIterator.DONE if there is no next boundary.
     *
     * @throws IllegalArgumentException if offset is not valid.
     */
    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
        checkOffsetIsValid(offset);

        if (isAfterLetterOrDigit(offset)) {
            // offset is already the end of a word unless it also starts the next word and
            // the caller asked for the next word in that case.
            if (mIterator.isBoundary(offset)
                    && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
                return offset;
            } else {
                return mIterator.following(offset);
            }
        } else if (isOnLetterOrDigit(offset)) {
            // offset is on the first character of a word.
            return mIterator.following(offset);
        }
        return BreakIterator.DONE;
    }

    /**
     * Searches backwards from <code>offset</code>, one boundary at a time, for the start of a
     * group of punctuation as defined by {@link #isPunctuation(int)}, i.e. a position that is
     * on a punctuation character and not after one. Returns <code>offset</code> itself if it
     * already is such a position, and BreakIterator.DONE if no such position is found.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getPunctuationBeginning(int offset) {
        checkOffsetIsValid(offset);
        while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
            offset = prevBoundary(offset);
        }
        // No need to shift offset, prevBoundary handles that.
        return offset;
    }

    /**
     * Searches forwards from <code>offset</code>, one boundary at a time, for the end of a
     * group of punctuation as defined by {@link #isPunctuation(int)}, i.e. a position that is
     * after a punctuation character and not on one. Returns <code>offset</code> itself if it
     * already is such a position, and BreakIterator.DONE if no such position is found.
     *
     * @param offset the offset to search from.
     * @throws IllegalArgumentException if offset is not valid.
     */
    public int getPunctuationEnd(int offset) {
        checkOffsetIsValid(offset);
        while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
            offset = nextBoundary(offset);
        }
        // No need to shift offset, nextBoundary handles that.
        return offset;
    }

    /**
     * Indicates if the provided offset is after a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * Offsets that have no preceding character inside the current window yield
     * <code>false</code> rather than an exception.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is after a punctuation character.
     */
    public boolean isAfterPunctuation(int offset) {
        if (mStart < offset && offset <= mEnd) {
            final int codePoint = Character.codePointBefore(mCharSeq, offset);
            return isPunctuation(codePoint);
        }
        return false;
    }

    /**
     * Indicates if the provided offset is at a punctuation character
     * as defined by {@link #isPunctuation(int)}.
     *
     * Offsets that do not address a character inside the current window yield
     * <code>false</code> rather than an exception.
     *
     * @param offset the offset to check from.
     * @return Whether the offset is at a punctuation character.
     */
    public boolean isOnPunctuation(int offset) {
        if (mStart <= offset && offset < mEnd) {
            final int codePoint = Character.codePointAt(mCharSeq, offset);
            return isPunctuation(codePoint);
        }
        return false;
    }

    /**
     * Indicates if the codepoint is a mid-word-only punctuation.
     *
     * At the moment, this is locale-independent, and includes all the characters in
     * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
     * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
     * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
     * in the middle of a word, but they become word breaks if they happen at the end of a word
     * (according to rule WB999 that breaks word in any place that is not prohibited otherwise).
     *
     * @param locale the locale to consider the codepoint in. Presently ignored.
     * @param codePoint the codepoint to check.
     * @return True if the codepoint is a mid-word punctuation.
     */
    public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
        final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
        return (wb == UCharacter.WordBreak.MIDLETTER
                || wb == UCharacter.WordBreak.MIDNUMLET
                || wb == UCharacter.WordBreak.SINGLE_QUOTE);
    }

    /** Whether offset is on a punctuation character that is not preceded by another one. */
    private boolean isPunctuationStartBoundary(int offset) {
        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
    }

    /** Whether offset follows a punctuation character and is not itself on one. */
    private boolean isPunctuationEndBoundary(int offset) {
        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
    }

    /** Whether the codepoint's general category is one of the punctuation categories. */
    private static boolean isPunctuation(int cp) {
        final int type = Character.getType(cp);
        return (type == Character.CONNECTOR_PUNCTUATION
                || type == Character.DASH_PUNCTUATION
                || type == Character.END_PUNCTUATION
                || type == Character.FINAL_QUOTE_PUNCTUATION
                || type == Character.INITIAL_QUOTE_PUNCTUATION
                || type == Character.OTHER_PUNCTUATION
                || type == Character.START_PUNCTUATION);
    }

    /**
     * Whether the codepoint ending just before offset is a letter or digit. Returns false when
     * there is no preceding character inside the current window.
     */
    private boolean isAfterLetterOrDigit(int offset) {
        if (mStart < offset && offset <= mEnd) {
            final int codePoint = Character.codePointBefore(mCharSeq, offset);
            return Character.isLetterOrDigit(codePoint);
        }
        return false;
    }

    /**
     * Whether the codepoint starting at offset is a letter or digit. Returns false when offset
     * does not address a character inside the current window.
     */
    private boolean isOnLetterOrDigit(int offset) {
        if (mStart <= offset && offset < mEnd) {
            final int codePoint = Character.codePointAt(mCharSeq, offset);
            return Character.isLetterOrDigit(codePoint);
        }
        return false;
    }

    /**
     * Verifies that offset lies inside the current window, both bounds inclusive.
     *
     * @throws IllegalArgumentException if offset is outside the window.
     */
    private void checkOffsetIsValid(int offset) {
        if (!(mStart <= offset && offset <= mEnd)) {
            throw new IllegalArgumentException("Invalid offset: " + offset
                    + ". Valid range is [" + mStart + ", " + mEnd + "]");
        }
    }
}