WordBreaker.cpp revision d8917c69a9f7b7ca52f7ac850922dab4322113f5
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18#include <cutils/log.h>
19
20#include <minikin/WordBreaker.h>
21#include "MinikinInternal.h"
22
23#include <unicode/uchar.h>
24#include <unicode/utf16.h>
25
26namespace android {
27
// Code points that get special treatment when validating ICU break opportunities.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;  // U+00AD SOFT HYPHEN
constexpr uint32_t CHAR_ZWJ = 0x200D;          // U+200D ZERO WIDTH JOINER
30
31void WordBreaker::setLocale(const icu::Locale& locale) {
32    UErrorCode status = U_ZERO_ERROR;
33    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
34    // TODO: handle failure status
35    if (mText != nullptr) {
36        mBreakIterator->setText(&mUText, status);
37    }
38    mIteratorWasReset = true;
39}
40
41void WordBreaker::setText(const uint16_t* data, size_t size) {
42    mText = data;
43    mTextSize = size;
44    mIteratorWasReset = false;
45    mLast = 0;
46    mCurrent = 0;
47    mScanOffset = 0;
48    mInEmailOrUrl = false;
49    UErrorCode status = U_ZERO_ERROR;
50    utext_openUChars(&mUText, data, size, &status);
51    mBreakIterator->setText(&mUText, status);
52    mBreakIterator->first();
53}
54
55ssize_t WordBreaker::current() const {
56    return mCurrent;
57}
58
// States of the look-ahead scanner that recognizes email addresses ("@") and URLs ("://").
// The three colon states must stay numerically consecutive: the scanner in
// WordBreaker::next() moves from one to the next by adding 1 for each '/' it sees.
enum ScanState {
    START = 0,                  // nothing interesting seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
66
67/**
68 * Determine whether a line break at position i within the buffer buf is valid. This
69 * represents customization beyond the ICU behavior, because plain ICU provides some
70 * line break opportunities that we don't want.
71 **/
72static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
73    uint32_t codePoint;
74    size_t prev_offset = i;
75    U16_PREV(buf, 0, prev_offset, codePoint);
76    if (codePoint == CHAR_SOFT_HYPHEN) {
77        return false;
78    }
79    uint32_t next_codepoint;
80    size_t next_offset = i;
81    U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
82
83    // Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt
84    //(AL | HL) × (PR | PO)
85    int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
86    if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) {
87        lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK);
88        if (lineBreak == U_LB_PREFIX_NUMERIC || lineBreak == U_LB_POSTFIX_NUMERIC) {
89            return false;
90        }
91    }
92
93    // Known emoji ZWJ sequences
94    if (codePoint == CHAR_ZWJ) {
95        // Possible emoji ZWJ sequence
96        if (next_codepoint == 0x2764 ||       // HEAVY BLACK HEART
97                next_codepoint == 0x1F466 ||  // BOY
98                next_codepoint == 0x1F467 ||  // GIRL
99                next_codepoint == 0x1F468 ||  // MAN
100                next_codepoint == 0x1F469 ||  // WOMAN
101                next_codepoint == 0x1F48B ||  // KISS MARK
102                next_codepoint == 0x1F5E8) {  // LEFT SPEECH BUBBLE
103            return false;
104        }
105    }
106
107    // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
108    // EB x EM
109    if (isEmojiModifier(next_codepoint)) {
110        if (codePoint == 0xFE0F && prev_offset > 0) {
111            // skip over emoji variation selector
112            U16_PREV(buf, 0, prev_offset, codePoint);
113        }
114        if (isEmojiBase(codePoint)) {
115            return false;
116        }
117    }
118    return true;
119}
120
// Inside URLs and email addresses the Chicago Manual of Style recommends allowing a line
// break right after any of these characters.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
125
// Inside URLs and email addresses the Chicago Manual of Style recommends allowing a line
// break right before any of these characters.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
131
132ssize_t WordBreaker::next() {
133    mLast = mCurrent;
134
135    // scan forward from current ICU position for email address or URL
136    if (mLast >= mScanOffset) {
137        ScanState state = START;
138        size_t i;
139        for (i = mLast; i < mTextSize; i++) {
140            uint16_t c = mText[i];
141            // scan only ASCII characters, stop at space
142            if (!(' ' < c && c <= 0x007E)) {
143                break;
144            }
145            if (state == START && c == '@') {
146                state = SAW_AT;
147            } else if (state == START && c == ':') {
148                state = SAW_COLON;
149            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
150                if (c == '/') {
151                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
152                } else {
153                    state = START;
154                }
155            }
156        }
157        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
158            if (!mBreakIterator->isBoundary(i)) {
159                i = mBreakIterator->following(i);
160            }
161            mInEmailOrUrl = true;
162            mIteratorWasReset = true;
163        } else {
164            mInEmailOrUrl = false;
165        }
166        mScanOffset = i;
167    }
168
169    if (mInEmailOrUrl) {
170        // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
171        uint16_t lastChar = mText[mLast];
172        ssize_t i;
173        for (i = mLast + 1; i < mScanOffset; i++) {
174            if (breakAfter(lastChar)) {
175                break;
176            }
177            // break after double slash
178            if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
179                break;
180            }
181            uint16_t thisChar = mText[i];
182            // never break after hyphen
183            if (lastChar != '-') {
184                if (breakBefore(thisChar)) {
185                    break;
186                }
187                // break before single slash
188                if (thisChar == '/' && lastChar != '/' &&
189                            !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
190                    break;
191                }
192            }
193            lastChar = thisChar;
194        }
195        mCurrent = i;
196        return mCurrent;
197    }
198
199    int32_t result;
200    do {
201        if (mIteratorWasReset) {
202            result = mBreakIterator->following(mCurrent);
203            mIteratorWasReset = false;
204        } else {
205            result = mBreakIterator->next();
206        }
207    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
208            && !isBreakValid(mText, mTextSize, result));
209    mCurrent = (ssize_t)result;
210    return mCurrent;
211}
212
213ssize_t WordBreaker::wordStart() const {
214    if (mInEmailOrUrl) {
215        return mLast;
216    }
217    ssize_t result = mLast;
218    while (result < mCurrent) {
219        UChar32 c;
220        ssize_t ix = result;
221        U16_NEXT(mText, ix, mCurrent, c);
222        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
223        // strip leading punctuation, defined as OP and QU line breaking classes,
224        // see UAX #14
225        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
226            break;
227        }
228        result = ix;
229    }
230    return result;
231}
232
233ssize_t WordBreaker::wordEnd() const {
234    if (mInEmailOrUrl) {
235        return mLast;
236    }
237    ssize_t result = mCurrent;
238    while (result > mLast) {
239        UChar32 c;
240        ssize_t ix = result;
241        U16_PREV(mText, mLast, ix, c);
242        int32_t gc_mask = U_GET_GC_MASK(c);
243        // strip trailing space and punctuation
244        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
245            break;
246        }
247        result = ix;
248    }
249    return result;
250}
251
252int WordBreaker::breakBadness() const {
253    return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
254}
255
256void WordBreaker::finish() {
257    mText = nullptr;
258    // Note: calling utext_close multiply is safe
259    utext_close(&mUText);
260}
261
262}  // namespace android
263