WordBreaker.cpp revision 74b56175e5d41c1c1dc992208842b5576973d452
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18#include <cutils/log.h>
19
20#include <minikin/WordBreaker.h>
21#include "MinikinInternal.h"
22
23#include <unicode/uchar.h>
24#include <unicode/utf16.h>
25
26namespace android {
27
// U+00AD SOFT HYPHEN; isBreakValid() rejects a break that directly follows it.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// U+200D ZERO WIDTH JOINER; isBreakValid() rejects a break between it and a following emoji.
constexpr uint32_t CHAR_ZWJ = 0x200D;
30
31void WordBreaker::setLocale(const icu::Locale& locale) {
32    UErrorCode status = U_ZERO_ERROR;
33    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
34    // TODO: handle failure status
35    if (mText != nullptr) {
36        mBreakIterator->setText(&mUText, status);
37    }
38    mIteratorWasReset = true;
39}
40
41void WordBreaker::setText(const uint16_t* data, size_t size) {
42    mText = data;
43    mTextSize = size;
44    mIteratorWasReset = false;
45    mLast = 0;
46    mCurrent = 0;
47    mScanOffset = 0;
48    mInEmailOrUrl = false;
49    UErrorCode status = U_ZERO_ERROR;
50    utext_openUChars(&mUText, data, size, &status);
51    mBreakIterator->setText(&mUText, status);
52    mBreakIterator->first();
53}
54
55ssize_t WordBreaker::current() const {
56    return mCurrent;
57}
58
// States of the small scanner in WordBreaker::next() that looks for '@' (email) or "://" (URL).
// The numeric order matters: next() advances SAW_COLON -> SAW_COLON_SLASH ->
// SAW_COLON_SLASH_SLASH by adding one to the current state.
enum ScanState {
    START = 0,                  // nothing of interest seen yet
    SAW_AT = 1,                 // an '@' was seen
    SAW_COLON = 2,              // a ':' was seen
    SAW_COLON_SLASH = 3,        // ":/" was seen
    SAW_COLON_SLASH_SLASH = 4,  // "://" was seen
};
66
67/**
68 * Determine whether a line break at position i within the buffer buf is valid. This
69 * represents customization beyond the ICU behavior, because plain ICU provides some
70 * line break opportunities that we don't want.
71 **/
72static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
73    uint32_t codePoint;
74    size_t prev_offset = i;
75    U16_PREV(buf, 0, prev_offset, codePoint);
76    if (codePoint == CHAR_SOFT_HYPHEN) {
77        return false;
78    }
79    // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
80    // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
81    // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
82    // where no line break could be imagined, since the Myanmar virama is a pure stacker.
83    if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
84        return false;
85    }
86
87    uint32_t next_codepoint;
88    size_t next_offset = i;
89    U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
90
91    // Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt
92    // (AL | HL) × (PR | PO)
93    int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
94    if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) {
95        lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK);
96        if (lineBreak == U_LB_PREFIX_NUMERIC || lineBreak == U_LB_POSTFIX_NUMERIC) {
97            return false;
98        }
99    }
100
101    // Emoji ZWJ sequences.
102    if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
103        return false;
104    }
105
106    // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
107    // EB x EM
108    if (isEmojiModifier(next_codepoint)) {
109        if (codePoint == 0xFE0F && prev_offset > 0) {
110            // skip over emoji variation selector
111            U16_PREV(buf, 0, prev_offset, codePoint);
112        }
113        if (isEmojiBase(codePoint)) {
114            return false;
115        }
116    }
117    return true;
118}
119
// In URLs and email addresses, the Chicago Manual of Style recommends allowing a break
// after ':', '=' and '&'. Returns true if c is one of those characters.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
124
// In URLs and email addresses, the Chicago Manual of Style recommends allowing a break
// before any of: ~ . , - _ ? # % = &. Returns true if c is one of those characters.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
130
131ssize_t WordBreaker::next() {
132    mLast = mCurrent;
133
134    // scan forward from current ICU position for email address or URL
135    if (mLast >= mScanOffset) {
136        ScanState state = START;
137        size_t i;
138        for (i = mLast; i < mTextSize; i++) {
139            uint16_t c = mText[i];
140            // scan only ASCII characters, stop at space
141            if (!(' ' < c && c <= 0x007E)) {
142                break;
143            }
144            if (state == START && c == '@') {
145                state = SAW_AT;
146            } else if (state == START && c == ':') {
147                state = SAW_COLON;
148            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
149                if (c == '/') {
150                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
151                } else {
152                    state = START;
153                }
154            }
155        }
156        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
157            if (!mBreakIterator->isBoundary(i)) {
158                i = mBreakIterator->following(i);
159            }
160            mInEmailOrUrl = true;
161            mIteratorWasReset = true;
162        } else {
163            mInEmailOrUrl = false;
164        }
165        mScanOffset = i;
166    }
167
168    if (mInEmailOrUrl) {
169        // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
170        uint16_t lastChar = mText[mLast];
171        ssize_t i;
172        for (i = mLast + 1; i < mScanOffset; i++) {
173            if (breakAfter(lastChar)) {
174                break;
175            }
176            // break after double slash
177            if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
178                break;
179            }
180            uint16_t thisChar = mText[i];
181            // never break after hyphen
182            if (lastChar != '-') {
183                if (breakBefore(thisChar)) {
184                    break;
185                }
186                // break before single slash
187                if (thisChar == '/' && lastChar != '/' &&
188                            !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
189                    break;
190                }
191            }
192            lastChar = thisChar;
193        }
194        mCurrent = i;
195        return mCurrent;
196    }
197
198    int32_t result;
199    do {
200        if (mIteratorWasReset) {
201            result = mBreakIterator->following(mCurrent);
202            mIteratorWasReset = false;
203        } else {
204            result = mBreakIterator->next();
205        }
206    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
207            && !isBreakValid(mText, mTextSize, result));
208    mCurrent = (ssize_t)result;
209    return mCurrent;
210}
211
212ssize_t WordBreaker::wordStart() const {
213    if (mInEmailOrUrl) {
214        return mLast;
215    }
216    ssize_t result = mLast;
217    while (result < mCurrent) {
218        UChar32 c;
219        ssize_t ix = result;
220        U16_NEXT(mText, ix, mCurrent, c);
221        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
222        // strip leading punctuation, defined as OP and QU line breaking classes,
223        // see UAX #14
224        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
225            break;
226        }
227        result = ix;
228    }
229    return result;
230}
231
232ssize_t WordBreaker::wordEnd() const {
233    if (mInEmailOrUrl) {
234        return mLast;
235    }
236    ssize_t result = mCurrent;
237    while (result > mLast) {
238        UChar32 c;
239        ssize_t ix = result;
240        U16_PREV(mText, mLast, ix, c);
241        int32_t gc_mask = U_GET_GC_MASK(c);
242        // strip trailing space and punctuation
243        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
244            break;
245        }
246        result = ix;
247    }
248    return result;
249}
250
251int WordBreaker::breakBadness() const {
252    return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
253}
254
255void WordBreaker::finish() {
256    mText = nullptr;
257    // Note: calling utext_close multiply is safe
258    utext_close(&mUText);
259}
260
261}  // namespace android
262