1e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne/*
2e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * Copyright (C) 2011 The Android Open Source Project
3e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne *
4e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * Licensed under the Apache License, Version 2.0 (the "License");
5e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * you may not use this file except in compliance with the License.
6e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * You may obtain a copy of the License at
7e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne *
8e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne *      http://www.apache.org/licenses/LICENSE-2.0
9e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne *
10e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * Unless required by applicable law or agreed to in writing, software
11e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * distributed under the License is distributed on an "AS IS" BASIS,
12e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * See the License for the specific language governing permissions and
14e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * limitations under the License.
15e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne */
16e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
17e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunnepackage android.text.method;
18e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
19b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournaderimport android.annotation.NonNull;
20f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournaderimport android.icu.lang.UCharacter;
21f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournaderimport android.icu.lang.UProperty;
22b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournaderimport android.icu.text.BreakIterator;
23b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournaderimport android.text.CharSequenceCharacterIterator;
244037d51b132a85dcfe37a95f9d2d91ad23d162fdAurimas Liutikasimport android.text.Selection;
25e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
26e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunneimport java.util.Locale;
27e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
28e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne/**
29e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * Walks through cursor positions at word boundaries. Internally uses
30e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * {@link BreakIterator#getWordInstance()}, and caches {@link CharSequence}
31e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * for performance reasons.
32e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne *
33e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * Also provides methods to determine word boundaries.
34e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne * {@hide}
35e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne */
36e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunnepublic class WordIterator implements Selection.PositionIterator {
37287d6c6e12a38864d019fa7b9184206bc8a31ea1Gilles Debunne    // Size of the window for the word iterator, should be greater than the longest word's length
38287d6c6e12a38864d019fa7b9184206bc8a31ea1Gilles Debunne    private static final int WINDOW_WIDTH = 50;
39287d6c6e12a38864d019fa7b9184206bc8a31ea1Gilles Debunne
40b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader    private int mStart, mEnd;
41b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader    private CharSequence mCharSeq;
42b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader    private final BreakIterator mIterator;
43e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
44e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    /**
45e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     * Constructs a WordIterator using the default locale.
46e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     */
public WordIterator() {
    // Delegate to the locale-taking constructor with the current default locale.
    this(Locale.getDefault());
}
50e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
51e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    /**
52e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     * Constructs a new WordIterator for the specified locale.
53b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader     * @param locale The locale to be used for analyzing the text.
54e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     */
public WordIterator(Locale locale) {
    // Obtain the word break iterator for the requested locale; it is kept for the
    // lifetime of this object and re-pointed at new text by setCharSequence.
    this.mIterator = BreakIterator.getWordInstance(locale);
}
58e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
59b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader    public void setCharSequence(@NonNull CharSequence charSequence, int start, int end) {
60b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        if (0 <= start && end <= charSequence.length()) {
61b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            mCharSeq = charSequence;
62b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            mStart = Math.max(0, start - WINDOW_WIDTH);
63b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            mEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);
64b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            mIterator.setText(new CharSequenceCharacterIterator(charSequence, mStart, mEnd));
65653d3a27878d5358b4a91518a756f6b9b3407b07Gilles Debunne        } else {
66b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            throw new IndexOutOfBoundsException("input indexes are outside the CharSequence");
67653d3a27878d5358b4a91518a756f6b9b3407b07Gilles Debunne        }
68e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    }
69e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
70e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    /** {@inheritDoc} */
71e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    public int preceding(int offset) {
72b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
73b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        while (true) {
74b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            offset = mIterator.preceding(offset);
75b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            if (offset == BreakIterator.DONE || isOnLetterOrDigit(offset)) {
76b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                return offset;
77e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne            }
78b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        }
79e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    }
80e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
81e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    /** {@inheritDoc} */
82e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    public int following(int offset) {
83b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
84b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        while (true) {
85b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            offset = mIterator.following(offset);
86b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            if (offset == BreakIterator.DONE || isAfterLetterOrDigit(offset)) {
87b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                return offset;
88e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne            }
89b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        }
90e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    }
91e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
926c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    /** {@inheritDoc} */
936c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    public boolean isBoundary(int offset) {
94b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
95b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        return mIterator.isBoundary(offset);
966c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    }
976c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor
986c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    /**
996c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     * Returns the position of next boundary after the given offset. Returns
1006c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     * {@code DONE} if there is no boundary after the given offset.
1016c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     *
1026c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     * @param offset the given start position to search from.
     * @return the position of the first boundary following the given offset.
1046c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     */
1056c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    public int nextBoundary(int offset) {
106b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
107b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        return mIterator.following(offset);
1086c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    }
1096c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor
1106c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    /**
1116c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     * Returns the position of boundary preceding the given offset or
1126c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     * {@code DONE} if the given offset specifies the starting position.
1136c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     *
1146c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     * @param offset the given start position to search from.
1156c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     * @return the position of the last boundary preceding the given offset.
1166c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor     */
1176c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    public int prevBoundary(int offset) {
118b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
119b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        return mIterator.preceding(offset);
1206c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor    }
1216c7b4ad690fe5c22c01ad79a232e567e835f676dMady Mellor
122e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    /** If <code>offset</code> is within a word, returns the index of the first character of that
123e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     * word, otherwise returns BreakIterator.DONE.
124e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     *
125e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     * The offsets that are considered to be part of a word are the indexes of its characters,
126e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     * <i>as well as</i> the index of its last character plus one.
127e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
128e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     *
129e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
130e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     * The returned value is within [0..offset] or BreakIterator.DONE.
131e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     *
     * @throws IllegalArgumentException if offset is not valid.
133e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     */
134e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    public int getBeginning(int offset) {
135e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor        // TODO: Check if usage of this can be updated to getBeginning(offset, true) if
136e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor        // so this method can be removed.
137e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor        return getBeginning(offset, false);
138e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    }
139e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor
140e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    /**
141e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If <code>offset</code> is within a word, returns the index of the last character of that
142e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * word plus one, otherwise returns BreakIterator.DONE.
143e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
144e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * The offsets that are considered to be part of a word are the indexes of its characters,
145e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * <i>as well as</i> the index of its last character plus one.
146e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If offset is the index of a low surrogate character, BreakIterator.DONE will be returned.
147e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
148e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * Valid range for offset is [0..textLength] (note the inclusive upper bound).
149e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * The returned value is within [offset..textLength] or BreakIterator.DONE.
150e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
     * @throws IllegalArgumentException if offset is not valid.
152e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     */
153e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    public int getEnd(int offset) {
154e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor        // TODO: Check if usage of this can be updated to getEnd(offset, true), if
155e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor        // so this method can be removed.
156e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor        return getEnd(offset, false);
157e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    }
158e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor
159e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    /**
160e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If the <code>offset</code> is within a word or on a word boundary that can only be
161e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * considered the start of a word (e.g. _word where "_" is any character that would not
162e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * be considered part of the word) then this returns the index of the first character of
163e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * that word.
164e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
165e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If the offset is on a word boundary that can be considered the start and end of a
166e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
167e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * between AA and BB, this would return the start of the previous word, AA.
168e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
169e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * Returns BreakIterator.DONE if there is no previous boundary.
170e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
     * @throws IllegalArgumentException if offset is not valid.
172e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     */
173e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    public int getPrevWordBeginningOnTwoWordsBoundary(int offset) {
174e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor        return getBeginning(offset, true);
175e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    }
176e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor
177e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    /**
178e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If the <code>offset</code> is within a word or on a word boundary that can only be
179e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * considered the end of a word (e.g. word_ where "_" is any character that would not
180e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * be considered part of the word) then this returns the index of the last character
181e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * plus one of that word.
182e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
183e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If the offset is on a word boundary that can be considered the start and end of a
184e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
185e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * between AA and BB, this would return the end of the next word, BB.
186e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
187e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * Returns BreakIterator.DONE if there is no next boundary.
188e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
     * @throws IllegalArgumentException if offset is not valid.
190e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     */
191e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    public int getNextWordEndOnTwoWordBoundary(int offset) {
192e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor        return getEnd(offset, true);
193e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    }
194e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor
195e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    /**
196e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If the <code>offset</code> is within a word or on a word boundary that can only be
197e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * considered the start of a word (e.g. _word where "_" is any character that would not
198e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * be considered part of the word) then this returns the index of the first character of
199e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * that word.
200e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
201e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If the offset is on a word boundary that can be considered the start and end of a
202e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
203e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * between AA and BB, and getPrevWordBeginningOnTwoWordsBoundary is true then this would
204e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * return the start of the previous word, AA. Otherwise it would return the current offset,
205e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * the start of BB.
206e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
207e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * Returns BreakIterator.DONE if there is no previous boundary.
208e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     *
     * @throws IllegalArgumentException if offset is not valid.
210e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     */
211e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    private int getBeginning(int offset, boolean getPrevWordBeginningOnTwoWordsBoundary) {
212b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
213e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
214b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        if (isOnLetterOrDigit(offset)) {
215b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            if (mIterator.isBoundary(offset)
216b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                    && (!isAfterLetterOrDigit(offset)
217e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor                            || !getPrevWordBeginningOnTwoWordsBoundary)) {
218b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                return offset;
219e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne            } else {
220b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                return mIterator.preceding(offset);
221e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne            }
222e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        } else {
223b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            if (isAfterLetterOrDigit(offset)) {
224b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                return mIterator.preceding(offset);
225e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne            }
226e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        }
227e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        return BreakIterator.DONE;
228e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    }
229e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
230e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    /**
231e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If the <code>offset</code> is within a word or on a word boundary that can only be
232e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * considered the end of a word (e.g. word_ where "_" is any character that would not be
233e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * considered part of the word) then this returns the index of the last character plus one
234e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * of that word.
235e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     *
236e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * If the offset is on a word boundary that can be considered the start and end of a
237e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * word, e.g. AABB (where AA and BB are both words) and the offset is the boundary
238e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * between AA and BB, and getNextWordEndOnTwoWordBoundary is true then this would return
239e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * the end of the next word, BB. Otherwise it would return the current offset, the end
240e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * of AA.
241e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     *
242e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor     * Returns BreakIterator.DONE if there is no next boundary.
243e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     *
     * @throws IllegalArgumentException if offset is not valid.
245e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne     */
246e264ac392a886788ebfd1069e1d366e2b1edef72Mady Mellor    private int getEnd(int offset, boolean getNextWordEndOnTwoWordBoundary) {
247b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
248e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
249b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        if (isAfterLetterOrDigit(offset)) {
250b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            if (mIterator.isBoundary(offset)
251b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                    && (!isOnLetterOrDigit(offset) || !getNextWordEndOnTwoWordBoundary)) {
252b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                return offset;
253e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne            } else {
254b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                return mIterator.following(offset);
255e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne            }
256e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        } else {
257b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            if (isOnLetterOrDigit(offset)) {
258b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                return mIterator.following(offset);
259e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne            }
260e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        }
261e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        return BreakIterator.DONE;
262e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    }
263e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
26458c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    /**
26558c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * If <code>offset</code> is within a group of punctuation as defined
26658c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * by {@link #isPunctuation(int)}, returns the index of the first character
26758c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * of that group, otherwise returns BreakIterator.DONE.
26858c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     *
26958c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * @param offset the offset to search from.
27058c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     */
27158c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    public int getPunctuationBeginning(int offset) {
272b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
27358c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        while (offset != BreakIterator.DONE && !isPunctuationStartBoundary(offset)) {
27458c9087137989da8411ffd212072f630d3fac4f3Mady Mellor            offset = prevBoundary(offset);
27558c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        }
27658c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        // No need to shift offset, prevBoundary handles that.
27758c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        return offset;
27858c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    }
27958c9087137989da8411ffd212072f630d3fac4f3Mady Mellor
28058c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    /**
28158c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * If <code>offset</code> is within a group of punctuation as defined
28258c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * by {@link #isPunctuation(int)}, returns the index of the last character
28358c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * of that group plus one, otherwise returns BreakIterator.DONE.
28458c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     *
28558c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * @param offset the offset to search from.
28658c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     */
28758c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    public int getPunctuationEnd(int offset) {
288b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        checkOffsetIsValid(offset);
28958c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        while (offset != BreakIterator.DONE && !isPunctuationEndBoundary(offset)) {
29058c9087137989da8411ffd212072f630d3fac4f3Mady Mellor            offset = nextBoundary(offset);
29158c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        }
29258c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        // No need to shift offset, nextBoundary handles that.
29358c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        return offset;
29458c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    }
29558c9087137989da8411ffd212072f630d3fac4f3Mady Mellor
29658c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    /**
29758c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * Indicates if the provided offset is after a punctuation character
29858c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * as defined by {@link #isPunctuation(int)}.
29958c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     *
30058c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * @param offset the offset to check from.
30158c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * @return Whether the offset is after a punctuation character.
30258c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     */
30358c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    public boolean isAfterPunctuation(int offset) {
304b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        if (mStart < offset && offset <= mEnd) {
305b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            final int codePoint = Character.codePointBefore(mCharSeq, offset);
30658c9087137989da8411ffd212072f630d3fac4f3Mady Mellor            return isPunctuation(codePoint);
30758c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        }
30858c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        return false;
30958c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    }
31058c9087137989da8411ffd212072f630d3fac4f3Mady Mellor
31158c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    /**
31258c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * Indicates if the provided offset is at a punctuation character
31358c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * as defined by {@link #isPunctuation(int)}.
31458c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     *
31558c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * @param offset the offset to check from.
31658c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     * @return Whether the offset is at a punctuation character.
31758c9087137989da8411ffd212072f630d3fac4f3Mady Mellor     */
31858c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    public boolean isOnPunctuation(int offset) {
319b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        if (mStart <= offset && offset < mEnd) {
320b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            final int codePoint = Character.codePointAt(mCharSeq, offset);
32158c9087137989da8411ffd212072f630d3fac4f3Mady Mellor            return isPunctuation(codePoint);
32258c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        }
32358c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        return false;
32458c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    }
32558c9087137989da8411ffd212072f630d3fac4f3Mady Mellor
326f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader    /**
327f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * Indicates if the codepoint is a mid-word-only punctuation.
328f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     *
329f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * At the moment, this is locale-independent, and includes all the characters in
330f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * the MidLetter, MidNumLet, and Single_Quote class of Unicode word breaking algorithm (see
331f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * UAX #29 "Unicode Text Segmentation" at http://unicode.org/reports/tr29/). These are all the
332f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * characters that according to the rules WB6 and WB7 of UAX #29 prevent word breaks if they are
333f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * in the middle of a word, but they become word breaks if they happen at the end of a word
     * (according to rule WB999 that breaks word in any place that is not prohibited otherwise).
335f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     *
336f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * @param locale the locale to consider the codepoint in. Presently ignored.
337f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * @param codePoint the codepoint to check.
338f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     * @return True if the codepoint is a mid-word punctuation.
339f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader     */
340f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader    public static boolean isMidWordPunctuation(Locale locale, int codePoint) {
341f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader        final int wb = UCharacter.getIntPropertyValue(codePoint, UProperty.WORD_BREAK);
342f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader        return (wb == UCharacter.WordBreak.MIDLETTER
343f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader                || wb == UCharacter.WordBreak.MIDNUMLET
344f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader                || wb == UCharacter.WordBreak.SINGLE_QUOTE);
345f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader    }
346f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader
34758c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    private boolean isPunctuationStartBoundary(int offset) {
34858c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        return isOnPunctuation(offset) && !isAfterPunctuation(offset);
34958c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    }
35058c9087137989da8411ffd212072f630d3fac4f3Mady Mellor
35158c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    private boolean isPunctuationEndBoundary(int offset) {
35258c9087137989da8411ffd212072f630d3fac4f3Mady Mellor        return !isOnPunctuation(offset) && isAfterPunctuation(offset);
35358c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    }
35458c9087137989da8411ffd212072f630d3fac4f3Mady Mellor
355b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader    private static boolean isPunctuation(int cp) {
356b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        final int type = Character.getType(cp);
357f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader        return (type == Character.CONNECTOR_PUNCTUATION
358f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader                || type == Character.DASH_PUNCTUATION
359f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader                || type == Character.END_PUNCTUATION
360f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader                || type == Character.FINAL_QUOTE_PUNCTUATION
361f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader                || type == Character.INITIAL_QUOTE_PUNCTUATION
362f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader                || type == Character.OTHER_PUNCTUATION
363f6952b83310af2fefddaa6d1f038f7709fa4a803Roozbeh Pournader                || type == Character.START_PUNCTUATION);
36458c9087137989da8411ffd212072f630d3fac4f3Mady Mellor    }
36558c9087137989da8411ffd212072f630d3fac4f3Mady Mellor
366b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader    private boolean isAfterLetterOrDigit(int offset) {
367b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        if (mStart < offset && offset <= mEnd) {
368b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            final int codePoint = Character.codePointBefore(mCharSeq, offset);
369287d6c6e12a38864d019fa7b9184206bc8a31ea1Gilles Debunne            if (Character.isLetterOrDigit(codePoint)) return true;
370e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        }
371e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        return false;
372e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    }
373e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
374b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader    private boolean isOnLetterOrDigit(int offset) {
375b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        if (mStart <= offset && offset < mEnd) {
376b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            final int codePoint = Character.codePointAt(mCharSeq, offset);
377287d6c6e12a38864d019fa7b9184206bc8a31ea1Gilles Debunne            if (Character.isLetterOrDigit(codePoint)) return true;
378e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        }
379e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        return false;
380e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    }
381e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne
382b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader    private void checkOffsetIsValid(int offset) {
383b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader        if (!(mStart <= offset && offset <= mEnd)) {
384b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader            throw new IllegalArgumentException("Invalid offset: " + (offset) +
385b08a07165fdc9ee6df40bbce94676755461ff3e0Roozbeh Pournader                    ". Valid range is [" + mStart + ", " + mEnd + "]");
386e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne        }
387e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne    }
388e193fd14346e6e808c6c266d2bb13c0c0cc6890eGilles Debunne}
389