WordBreaker.cpp revision 9c4cc648abcae144f3b99d612e58ef01d5e52cce
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18#include <cutils/log.h>
19
20#include "minikin/WordBreaker.h"
21
22#include <unicode/uchar.h>
23#include <unicode/utf16.h>
24
25namespace android {
26
// U+00AD SOFT HYPHEN. A break position reported directly after this character
// is skipped by WordBreaker::next().
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
28
29void WordBreaker::setLocale(const icu::Locale& locale) {
30    UErrorCode status = U_ZERO_ERROR;
31    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
32    // TODO: handle failure status
33    if (mText != nullptr) {
34        mBreakIterator->setText(&mUText, status);
35    }
36    mIteratorWasReset = true;
37}
38
39void WordBreaker::setText(const uint16_t* data, size_t size) {
40    mText = data;
41    mTextSize = size;
42    mIteratorWasReset = false;
43    mLast = 0;
44    mCurrent = 0;
45    mScanOffset = 0;
46    mSuppressHyphen = false;
47    UErrorCode status = U_ZERO_ERROR;
48    utext_openUChars(&mUText, data, size, &status);
49    mBreakIterator->setText(&mUText, status);
50    mBreakIterator->first();
51}
52
53ssize_t WordBreaker::current() const {
54    return mCurrent;
55}
56
// States of the scanner that looks for e-mail addresses ('@') and URLs
// ("://") on behalf of WordBreaker::next().
//
// The three colon states must stay consecutive and in this order: the scanner
// advances from one to the next by adding 1 for each '/' it sees.
enum ScanState {
    START = 0,                  // nothing of interest seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
64
65ssize_t WordBreaker::next() {
66    mLast = mCurrent;
67
68    // scan forward from current ICU position for email address or URL
69    if (mLast >= mScanOffset) {
70        ScanState state = START;
71        size_t i;
72        for (i = mLast; i < mTextSize; i++) {
73            uint16_t c = mText[i];
74            // scan only ASCII characters, stop at space
75            if (!(' ' < c && c <= 0x007E)) {
76                break;
77            }
78            if (state == START && c == '@') {
79                state = SAW_AT;
80            } else if (state == START && c == ':') {
81                state = SAW_COLON;
82            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
83                if (c == '/') {
84                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
85                } else {
86                    state = START;
87                }
88            }
89        }
90        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
91            // no line breaks in entire email address or url
92            // TODO: refine this according to Chicago Manual of Style rules
93            while (i < mTextSize && mText[i] == ' ') {
94                i++;
95            }
96            mCurrent = i;
97            mSuppressHyphen = true;
98            // Setting mIteratorWasReset will cause next break to be computed following
99            // mCurrent, rather than following the current ICU iterator location.
100            mIteratorWasReset = true;
101            if (mBreakIterator->isBoundary(mCurrent)) {
102                return mCurrent;
103            }
104        } else {
105            mScanOffset = i;
106            mSuppressHyphen = false;
107        }
108    }
109
110    int32_t result;
111    do {
112        if (mIteratorWasReset) {
113            result = mBreakIterator->following(mCurrent);
114            mIteratorWasReset = false;
115        } else {
116            result = mBreakIterator->next();
117        }
118    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
119             && mText[result - 1] == CHAR_SOFT_HYPHEN);
120    mCurrent = (ssize_t)result;
121    return mCurrent;
122}
123
124ssize_t WordBreaker::wordStart() const {
125    if (mSuppressHyphen) {
126        return mLast;
127    }
128    ssize_t result = mLast;
129    while (result < mCurrent) {
130        UChar32 c;
131        ssize_t ix = result;
132        U16_NEXT(mText, ix, mCurrent, c);
133        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
134        // strip leading punctuation, defined as OP and QU line breaking classes,
135        // see UAX #14
136        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
137            break;
138        }
139        result = ix;
140    }
141    return result;
142}
143
144ssize_t WordBreaker::wordEnd() const {
145    if (mSuppressHyphen) {
146        return mLast;
147    }
148    ssize_t result = mCurrent;
149    while (result > mLast) {
150        UChar32 c;
151        ssize_t ix = result;
152        U16_PREV(mText, mLast, ix, c);
153        int32_t gc_mask = U_GET_GC_MASK(c);
154        // strip trailing space and punctuation
155        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
156            break;
157        }
158        result = ix;
159    }
160    return result;
161}
162
163void WordBreaker::finish() {
164    mText = nullptr;
165    // Note: calling utext_close multiply is safe
166    utext_close(&mUText);
167}
168
169}  // namespace android
170