1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18
19#include <android/log.h>
20
21#include <minikin/Emoji.h>
22#include <minikin/Hyphenator.h>
23#include <minikin/WordBreaker.h>
24#include "MinikinInternal.h"
25
26#include <unicode/uchar.h>
27#include <unicode/utf16.h>
28
29namespace minikin {
30
// U+00AD SOFT HYPHEN: never treated as a break opportunity by isBreakValid().
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// U+200D ZERO WIDTH JOINER: glues emoji ZWJ sequences together.
constexpr uint32_t CHAR_ZWJ = 0x200D;
33
34void WordBreaker::setLocale(const icu::Locale& locale) {
35    UErrorCode status = U_ZERO_ERROR;
36    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
37    // TODO: handle failure status
38    if (mText != nullptr) {
39        mBreakIterator->setText(&mUText, status);
40    }
41    mIteratorWasReset = true;
42}
43
44void WordBreaker::setText(const uint16_t* data, size_t size) {
45    mText = data;
46    mTextSize = size;
47    mIteratorWasReset = false;
48    mLast = 0;
49    mCurrent = 0;
50    mScanOffset = 0;
51    mInEmailOrUrl = false;
52    UErrorCode status = U_ZERO_ERROR;
53    utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size, &status);
54    mBreakIterator->setText(&mUText, status);
55    mBreakIterator->first();
56}
57
58ssize_t WordBreaker::current() const {
59    return mCurrent;
60}
61
62/**
63 * Determine whether a line break at position i within the buffer buf is valid. This
64 * represents customization beyond the ICU behavior, because plain ICU provides some
65 * line break opportunities that we don't want.
66 **/
67static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
68    uint32_t codePoint;
69    size_t prev_offset = i;
70    U16_PREV(buf, 0, prev_offset, codePoint);
71    // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
72    if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
73        return false;
74    }
75    // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
76    // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
77    // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
78    // where no line break could be imagined, since the Myanmar virama is a pure stacker.
79    if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
80        return false;
81    }
82
83    uint32_t next_codepoint;
84    size_t next_offset = i;
85    U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
86
87    // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
88    // emoji data than ICU does.
89    if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
90        return false;
91    }
92
93    // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
94    if (isEmojiModifier(next_codepoint)) {
95        if (codePoint == 0xFE0F && prev_offset > 0) {
96            // skip over emoji variation selector
97            U16_PREV(buf, 0, prev_offset, codePoint);
98        }
99        if (isEmojiBase(codePoint)) {
100            return false;
101        }
102    }
103    return true;
104}
105
106// Customized iteratorNext that takes care of both resets and our modifications
107// to ICU's behavior.
108int32_t WordBreaker::iteratorNext() {
109    int32_t result;
110    do {
111        if (mIteratorWasReset) {
112            result = mBreakIterator->following(mCurrent);
113            mIteratorWasReset = false;
114        } else {
115            result = mBreakIterator->next();
116        }
117    } while (!(result == icu::BreakIterator::DONE || (size_t)result == mTextSize
118            || isBreakValid(mText, mTextSize, result)));
119    return result;
120}
121
// In URLs and email addresses, the Chicago Manual of Style recommends allowing a break
// right after a colon, an equals sign or an ampersand.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
126
// In URLs and email addresses, the Chicago Manual of Style recommends allowing a break
// right before any of these punctuation characters.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
132
// States of the small scanner in WordBreaker::detectEmailOrUrl().
//
// The three SAW_COLON* values must stay consecutive and in this order: the scanner moves
// from one to the next by adding 1 each time it sees another '/'.
enum ScanState {
    START = 0,                  // nothing interesting seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
140
141void WordBreaker::detectEmailOrUrl() {
142    // scan forward from current ICU position for email address or URL
143    if (mLast >= mScanOffset) {
144        ScanState state = START;
145        size_t i;
146        for (i = mLast; i < mTextSize; i++) {
147            uint16_t c = mText[i];
148            // scan only ASCII characters, stop at space
149            if (!(' ' < c && c <= 0x007E)) {
150                break;
151            }
152            if (state == START && c == '@') {
153                state = SAW_AT;
154            } else if (state == START && c == ':') {
155                state = SAW_COLON;
156            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
157                if (c == '/') {
158                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
159                } else {
160                    state = START;
161                }
162            }
163        }
164        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
165            if (!mBreakIterator->isBoundary(i)) {
166                // If there are combining marks or such at the end of the URL or the email address,
167                // consider them a part of the URL or the email, and skip to the next actual
168                // boundary.
169                i = mBreakIterator->following(i);
170            }
171            mInEmailOrUrl = true;
172            mIteratorWasReset = true;
173        } else {
174            mInEmailOrUrl = false;
175        }
176        mScanOffset = i;
177    }
178}
179
180ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
181    // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
182    uint16_t lastChar = mText[mLast];
183    ssize_t i;
184    for (i = mLast + 1; i < mScanOffset; i++) {
185        if (breakAfter(lastChar)) {
186            break;
187        }
188        // break after double slash
189        if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
190            break;
191        }
192        const uint16_t thisChar = mText[i];
193        // never break after hyphen
194        if (lastChar != '-') {
195            if (breakBefore(thisChar)) {
196                break;
197            }
198            // break before single slash
199            if (thisChar == '/' && lastChar != '/' &&
200                        !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
201                break;
202            }
203        }
204        lastChar = thisChar;
205    }
206    return i;
207}
208
209ssize_t WordBreaker::next() {
210    mLast = mCurrent;
211
212    detectEmailOrUrl();
213    if (mInEmailOrUrl) {
214        mCurrent = findNextBreakInEmailOrUrl();
215    } else {  // Business as usual
216        mCurrent = (ssize_t) iteratorNext();
217    }
218    return mCurrent;
219}
220
221ssize_t WordBreaker::wordStart() const {
222    if (mInEmailOrUrl) {
223        return mLast;
224    }
225    ssize_t result = mLast;
226    while (result < mCurrent) {
227        UChar32 c;
228        ssize_t ix = result;
229        U16_NEXT(mText, ix, mCurrent, c);
230        const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
231        // strip leading punctuation, defined as OP and QU line breaking classes,
232        // see UAX #14
233        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
234            break;
235        }
236        result = ix;
237    }
238    return result;
239}
240
241ssize_t WordBreaker::wordEnd() const {
242    if (mInEmailOrUrl) {
243        return mLast;
244    }
245    ssize_t result = mCurrent;
246    while (result > mLast) {
247        UChar32 c;
248        ssize_t ix = result;
249        U16_PREV(mText, mLast, ix, c);
250        const int32_t gc_mask = U_GET_GC_MASK(c);
251        // strip trailing space and punctuation
252        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
253            break;
254        }
255        result = ix;
256    }
257    return result;
258}
259
260int WordBreaker::breakBadness() const {
261    return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
262}
263
264void WordBreaker::finish() {
265    mText = nullptr;
266    // Note: calling utext_close multiply is safe
267    utext_close(&mUText);
268}
269
270}  // namespace minikin
271