WordBreaker.cpp revision 6d15657e4a3826d4d47d5358f1dde211484527e9
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18#include <cutils/log.h>
19
20#include "minikin/WordBreaker.h"
21
22#include <unicode/uchar.h>
23#include <unicode/utf16.h>
24
25namespace android {
26
// U+00AD SOFT HYPHEN. next() does not stop at a break that directly follows this character.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
28
29void WordBreaker::setLocale(const icu::Locale& locale) {
30    UErrorCode status = U_ZERO_ERROR;
31    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
32    // TODO: handle failure status
33    if (mText != nullptr) {
34        mBreakIterator->setText(&mUText, status);
35    }
36    mIteratorWasReset = true;
37}
38
39void WordBreaker::setText(const uint16_t* data, size_t size) {
40    mText = data;
41    mTextSize = size;
42    mIteratorWasReset = false;
43    mLast = 0;
44    mCurrent = 0;
45    mScanOffset = 0;
46    mInEmailOrUrl = false;
47    UErrorCode status = U_ZERO_ERROR;
48    utext_openUChars(&mUText, data, size, &status);
49    mBreakIterator->setText(&mUText, status);
50    mBreakIterator->first();
51}
52
53ssize_t WordBreaker::current() const {
54    return mCurrent;
55}
56
// States of the email address / URL detector run by WordBreaker::next().
// The two slash states directly follow SAW_COLON numerically, so that "one more slash seen"
// can be expressed as "state + 1".
enum ScanState {
    START = 0,                  // nothing interesting seen yet
    SAW_AT = 1,                 // saw '@': treat the run as an email address
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://": treat the run as a URL
};
64
// Returns true for the characters after which a line break is allowed inside URLs and email
// addresses, following the Chicago Manual of Style recommendation: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
69
// Returns true for the characters before which a line break is allowed inside URLs and email
// addresses, following the Chicago Manual of Style recommendation:
// '~', '.', ',', '-', '_', '?', '#', '%', '=' and '&'.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
75
76ssize_t WordBreaker::next() {
77    mLast = mCurrent;
78
79    // scan forward from current ICU position for email address or URL
80    if (mLast >= mScanOffset) {
81        ScanState state = START;
82        size_t i;
83        for (i = mLast; i < mTextSize; i++) {
84            uint16_t c = mText[i];
85            // scan only ASCII characters, stop at space
86            if (!(' ' < c && c <= 0x007E)) {
87                break;
88            }
89            if (state == START && c == '@') {
90                state = SAW_AT;
91            } else if (state == START && c == ':') {
92                state = SAW_COLON;
93            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
94                if (c == '/') {
95                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
96                } else {
97                    state = START;
98                }
99            }
100        }
101        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
102            if (!mBreakIterator->isBoundary(i)) {
103                i = mBreakIterator->following(i);
104            }
105            mInEmailOrUrl = true;
106            mIteratorWasReset = true;
107        } else {
108            mInEmailOrUrl = false;
109        }
110        mScanOffset = i;
111    }
112
113    if (mInEmailOrUrl) {
114        // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
115        uint16_t lastChar = mText[mLast];
116        ssize_t i;
117        for (i = mLast + 1; i < mScanOffset; i++) {
118            if (breakAfter(lastChar)) {
119                break;
120            }
121            // break after double slash
122            if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
123                break;
124            }
125            uint16_t thisChar = mText[i];
126            // never break after hyphen
127            if (lastChar != '-') {
128                if (breakBefore(thisChar)) {
129                    break;
130                }
131                // break before single slash
132                if (thisChar == '/' && lastChar != '/' &&
133                            !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
134                    break;
135                }
136            }
137            lastChar = thisChar;
138        }
139        mCurrent = i;
140        return mCurrent;
141    }
142
143    int32_t result;
144    do {
145        if (mIteratorWasReset) {
146            result = mBreakIterator->following(mCurrent);
147            mIteratorWasReset = false;
148        } else {
149            result = mBreakIterator->next();
150        }
151    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
152             && mText[result - 1] == CHAR_SOFT_HYPHEN);
153    mCurrent = (ssize_t)result;
154    return mCurrent;
155}
156
157ssize_t WordBreaker::wordStart() const {
158    if (mInEmailOrUrl) {
159        return mLast;
160    }
161    ssize_t result = mLast;
162    while (result < mCurrent) {
163        UChar32 c;
164        ssize_t ix = result;
165        U16_NEXT(mText, ix, mCurrent, c);
166        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
167        // strip leading punctuation, defined as OP and QU line breaking classes,
168        // see UAX #14
169        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
170            break;
171        }
172        result = ix;
173    }
174    return result;
175}
176
177ssize_t WordBreaker::wordEnd() const {
178    if (mInEmailOrUrl) {
179        return mLast;
180    }
181    ssize_t result = mCurrent;
182    while (result > mLast) {
183        UChar32 c;
184        ssize_t ix = result;
185        U16_PREV(mText, mLast, ix, c);
186        int32_t gc_mask = U_GET_GC_MASK(c);
187        // strip trailing space and punctuation
188        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
189            break;
190        }
191        result = ix;
192    }
193    return result;
194}
195
196void WordBreaker::finish() {
197    mText = nullptr;
198    // Note: calling utext_close multiply is safe
199    utext_close(&mUText);
200}
201
202}  // namespace android
203