WordBreaker.cpp revision 9c4cc648abcae144f3b99d612e58ef01d5e52cce
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin"
1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h>
1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "minikin/WordBreaker.h"
2157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uchar.h>
2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/utf16.h>
2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Leviennamespace android {
2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// U+00AD SOFT HYPHEN. Break positions that directly follow this code unit are
// skipped when iterating over ICU line boundaries.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setLocale(const icu::Locale& locale) {
3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    UErrorCode status = U_ZERO_ERROR;
3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    // TODO: handle failure status
3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    if (mText != nullptr) {
3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        mBreakIterator->setText(&mUText, status);
3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mIteratorWasReset = true;
3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setText(const uint16_t* data, size_t size) {
4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mText = data;
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mTextSize = size;
4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mIteratorWasReset = false;
4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mLast = 0;
4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mCurrent = 0;
459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    mScanOffset = 0;
469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    mSuppressHyphen = false;
4757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    UErrorCode status = U_ZERO_ERROR;
4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    utext_openUChars(&mUText, data, size, &status);
4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator->setText(&mUText, status);
5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator->first();
5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::current() const {
5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return mCurrent;
5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// States of the scanner that looks for an email address ('@') or a URL
// ("://") inside a run of printable ASCII characters.
//
// The values are spelled out because the three colon states are expected to be
// consecutive: scanner code may step from one to the next by incrementing the
// numeric value each time another '/' is seen.
enum ScanState {
    START = 0,                  // nothing interesting seen yet
    SAW_AT = 1,                 // an '@' was seen: treat the run as an email address
    SAW_COLON = 2,              // just saw ':'
    SAW_COLON_SLASH = 3,        // just saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://": treat the run as a URL
};

static_assert(SAW_COLON + 1 == SAW_COLON_SLASH,
              "SAW_COLON_SLASH must directly follow SAW_COLON");
static_assert(SAW_COLON_SLASH + 1 == SAW_COLON_SLASH_SLASH,
              "SAW_COLON_SLASH_SLASH must directly follow SAW_COLON_SLASH");
649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
6557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::next() {
6657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mLast = mCurrent;
679c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
689c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    // scan forward from current ICU position for email address or URL
699c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    if (mLast >= mScanOffset) {
709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        ScanState state = START;
719c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        size_t i;
729c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        for (i = mLast; i < mTextSize; i++) {
739c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            uint16_t c = mText[i];
749c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            // scan only ASCII characters, stop at space
759c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (!(' ' < c && c <= 0x007E)) {
769c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                break;
779c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
789c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (state == START && c == '@') {
799c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                state = SAW_AT;
809c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            } else if (state == START && c == ':') {
819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                state = SAW_COLON;
829c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
839c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                if (c == '/') {
849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                } else {
869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                    state = START;
879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                }
889c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
899c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        }
909c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
919c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            // no line breaks in entire email address or url
929c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            // TODO: refine this according to Chicago Manual of Style rules
939c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            while (i < mTextSize && mText[i] == ' ') {
949c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                i++;
959c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
969c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            mCurrent = i;
979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            mSuppressHyphen = true;
989c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            // Setting mIteratorWasReset will cause next break to be computed following
999c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            // mCurrent, rather than following the current ICU iterator location.
1009c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            mIteratorWasReset = true;
1019c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (mBreakIterator->isBoundary(mCurrent)) {
1029c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                return mCurrent;
1039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1049c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        } else {
1059c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            mScanOffset = i;
1069c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            mSuppressHyphen = false;
1079c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        }
1089c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
1099c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1109c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    int32_t result;
11157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    do {
11257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if (mIteratorWasReset) {
11357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            result = mBreakIterator->following(mCurrent);
11457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            mIteratorWasReset = false;
11557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        } else {
11657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            result = mBreakIterator->next();
11757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
11857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
11957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien             && mText[result - 1] == CHAR_SOFT_HYPHEN);
12057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mCurrent = (ssize_t)result;
12157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return mCurrent;
12257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
12357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
12457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordStart() const {
1259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    if (mSuppressHyphen) {
1269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        return mLast;
1279c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
12857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    ssize_t result = mLast;
12957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    while (result < mCurrent) {
13057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        UChar32 c;
13157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        ssize_t ix = result;
13257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        U16_NEXT(mText, ix, mCurrent, c);
13357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
13457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // strip leading punctuation, defined as OP and QU line breaking classes,
13557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // see UAX #14
13657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
13757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            break;
13857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
13957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        result = ix;
14057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
14157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return result;
14257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
14357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
14457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordEnd() const {
1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    if (mSuppressHyphen) {
1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        return mLast;
1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
14857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    ssize_t result = mCurrent;
14957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    while (result > mLast) {
15057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        UChar32 c;
15157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        ssize_t ix = result;
15257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        U16_PREV(mText, mLast, ix, c);
15357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        int32_t gc_mask = U_GET_GC_MASK(c);
15457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // strip trailing space and punctuation
15557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
15657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            break;
15757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
15857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        result = ix;
15957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
16057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return result;
16157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
16257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
16357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::finish() {
16457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mText = nullptr;
16557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    // Note: calling utext_close multiply is safe
16657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    utext_close(&mUText);
16757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
16857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
16957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}  // namespace android
170