WordBreaker.cpp revision 57b6dae9894b9362ef04517ff477fd491f9d433b
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18#include <cutils/log.h>
19
20#include "minikin/WordBreaker.h"
21
22#include <unicode/uchar.h>
23#include <unicode/utf16.h>
24
25namespace android {
26
// U+00AD SOFT HYPHEN. A break candidate that falls immediately after this code unit
// is skipped by WordBreaker::next().
constexpr uint32_t CHAR_SOFT_HYPHEN = 0xAD;
28
29void WordBreaker::setLocale(const icu::Locale& locale) {
30    UErrorCode status = U_ZERO_ERROR;
31    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
32    // TODO: handle failure status
33    if (mText != nullptr) {
34        mBreakIterator->setText(&mUText, status);
35    }
36    mIteratorWasReset = true;
37}
38
39void WordBreaker::setText(const uint16_t* data, size_t size) {
40    mText = data;
41    mTextSize = size;
42    mIteratorWasReset = false;
43    mLast = 0;
44    mCurrent = 0;
45    UErrorCode status = U_ZERO_ERROR;
46    utext_openUChars(&mUText, data, size, &status);
47    mBreakIterator->setText(&mUText, status);
48    mBreakIterator->first();
49}
50
51ssize_t WordBreaker::current() const {
52    return mCurrent;
53}
54
55ssize_t WordBreaker::next() {
56    int32_t result;
57    mLast = mCurrent;
58    do {
59        if (mIteratorWasReset) {
60            result = mBreakIterator->following(mCurrent);
61            mIteratorWasReset = false;
62        } else {
63            result = mBreakIterator->next();
64        }
65    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
66             && mText[result - 1] == CHAR_SOFT_HYPHEN);
67    mCurrent = (ssize_t)result;
68    return mCurrent;
69}
70
71ssize_t WordBreaker::wordStart() const {
72    ssize_t result = mLast;
73    while (result < mCurrent) {
74        UChar32 c;
75        ssize_t ix = result;
76        U16_NEXT(mText, ix, mCurrent, c);
77        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
78        // strip leading punctuation, defined as OP and QU line breaking classes,
79        // see UAX #14
80        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
81            break;
82        }
83        result = ix;
84    }
85    return result;
86}
87
88ssize_t WordBreaker::wordEnd() const {
89    ssize_t result = mCurrent;
90    while (result > mLast) {
91        UChar32 c;
92        ssize_t ix = result;
93        U16_PREV(mText, mLast, ix, c);
94        int32_t gc_mask = U_GET_GC_MASK(c);
95        // strip trailing space and punctuation
96        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
97            break;
98        }
99        result = ix;
100    }
101    return result;
102}
103
104void WordBreaker::finish() {
105    mText = nullptr;
106    // Note: calling utext_close multiply is safe
107    utext_close(&mUText);
108}
109
110}  // namespace android
111