WordBreaker.cpp revision 56840e8006ca2b822adb401fc8a65f3c075cde10
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin"
1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h>
1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include <minikin/WordBreaker.h>
2156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include "MinikinInternal.h"
2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uchar.h>
2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/utf16.h>
2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Leviennamespace android {
2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// Code points that get special treatment when ICU break opportunities are filtered.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;  // U+00AD SOFT HYPHEN
constexpr uint32_t CHAR_ZWJ = 0x200D;          // U+200D ZERO WIDTH JOINER
3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setLocale(const icu::Locale& locale) {
3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    UErrorCode status = U_ZERO_ERROR;
3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    // TODO: handle failure status
3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    if (mText != nullptr) {
3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        mBreakIterator->setText(&mUText, status);
3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mIteratorWasReset = true;
3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setText(const uint16_t* data, size_t size) {
4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mText = data;
4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mTextSize = size;
4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mIteratorWasReset = false;
4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mLast = 0;
4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mCurrent = 0;
479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    mScanOffset = 0;
486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    mInEmailOrUrl = false;
4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    UErrorCode status = U_ZERO_ERROR;
5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    utext_openUChars(&mUText, data, size, &status);
5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator->setText(&mUText, status);
5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator->first();
5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::current() const {
5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return mCurrent;
5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// States of the scanner that looks for an email address ('@') or a URL ("://") inside a run
// of printable ASCII. The values are consecutive, so each slash state is its predecessor
// plus one.
enum ScanState {
    START = 0,                  // nothing of interest seen so far
    SAW_AT = 1,                 // an '@' was seen: treat the run as an email address
    SAW_COLON = 2,              // a ':' was seen
    SAW_COLON_SLASH = 3,        // ":/" was seen
    SAW_COLON_SLASH_SLASH = 4,  // "://" was seen: treat the run as a URL
};
669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
67d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien/**
68d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * Determine whether a line break at position i within the buffer buf is valid. This
69d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * represents customization beyond the ICU behavior, because plain ICU provides some
70d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * line break opportunities that we don't want.
71d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien **/
7256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienstatic bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
7356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    uint32_t codePoint;
7456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    size_t prev_offset = i;
7556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    U16_PREV(buf, 0, prev_offset, codePoint);
7656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    if (codePoint == CHAR_SOFT_HYPHEN) {
77d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        return false;
78d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    }
7956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    uint32_t next_codepoint;
8056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    size_t next_offset = i;
8156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
8256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    if (codePoint == CHAR_ZWJ) {
83d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        // Possible emoji ZWJ sequence
84d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        if (next_codepoint == 0x2764 ||       // HEAVY BLACK HEART
85d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien                next_codepoint == 0x1F466 ||  // BOY
86d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien                next_codepoint == 0x1F467 ||  // GIRL
87d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien                next_codepoint == 0x1F468 ||  // MAN
88d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien                next_codepoint == 0x1F469 ||  // WOMAN
89d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien                next_codepoint == 0x1F48B ||  // KISS MARK
90d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien                next_codepoint == 0x1F5E8) {  // LEFT SPEECH BUBBLE
91d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien            return false;
92d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        }
93d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    }
9456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
9556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    // EB x EM
9656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    if (isEmojiModifier(next_codepoint)) {
9756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        if (codePoint == 0xFE0F && prev_offset > 0) {
9856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            // skip over emoji variation selector
9956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            U16_PREV(buf, 0, prev_offset, codePoint);
10056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        }
10156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        if (isEmojiBase(codePoint)) {
10256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            return false;
10356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        }
10456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    }
105d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    return true;
106d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien}
107d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien
// Returns true for the characters after which the Chicago Manual of Style recommends breaking
// inside URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
1126d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
// Returns true for the characters before which the Chicago Manual of Style recommends breaking
// inside URLs and email addresses.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
1186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
11957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::next() {
12057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mLast = mCurrent;
1219c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    // scan forward from current ICU position for email address or URL
1239c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    if (mLast >= mScanOffset) {
1249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        ScanState state = START;
1259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        size_t i;
1269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        for (i = mLast; i < mTextSize; i++) {
1279c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            uint16_t c = mText[i];
1289c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            // scan only ASCII characters, stop at space
1299c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (!(' ' < c && c <= 0x007E)) {
1309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                break;
1319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (state == START && c == '@') {
1339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                state = SAW_AT;
1349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            } else if (state == START && c == ':') {
1359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                state = SAW_COLON;
1369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
1379c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                if (c == '/') {
1389c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
1399c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                } else {
1409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                    state = START;
1419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                }
1429c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        }
1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
1456d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (!mBreakIterator->isBoundary(i)) {
1466d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                i = mBreakIterator->following(i);
1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            mInEmailOrUrl = true;
1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            mIteratorWasReset = true;
1509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        } else {
1516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            mInEmailOrUrl = false;
1526d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        }
1536d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        mScanOffset = i;
1546d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    }
1556d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
1566d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    if (mInEmailOrUrl) {
1576d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
1586d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        uint16_t lastChar = mText[mLast];
1596d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        ssize_t i;
1606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        for (i = mLast + 1; i < mScanOffset; i++) {
1616d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (breakAfter(lastChar)) {
1626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                break;
1636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            }
1646d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            // break after double slash
1656d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
1666d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                break;
1676d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            }
1686d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            uint16_t thisChar = mText[i];
1696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            // never break after hyphen
1706d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (lastChar != '-') {
1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                if (breakBefore(thisChar)) {
1726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                    break;
1736d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                }
1746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                // break before single slash
1756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                if (thisChar == '/' && lastChar != '/' &&
1766d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                            !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
1776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                    break;
1786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                }
1796d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            }
1806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            lastChar = thisChar;
1819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        }
1826d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        mCurrent = i;
1836d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        return mCurrent;
1849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
1859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    int32_t result;
18757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    do {
18857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if (mIteratorWasReset) {
18957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            result = mBreakIterator->following(mCurrent);
19057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            mIteratorWasReset = false;
19157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        } else {
19257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            result = mBreakIterator->next();
19357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
19457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
19556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            && !isBreakValid(mText, mTextSize, result));
19657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mCurrent = (ssize_t)result;
19757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return mCurrent;
19857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
19957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
20057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordStart() const {
2016d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    if (mInEmailOrUrl) {
2029c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        return mLast;
2039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
20457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    ssize_t result = mLast;
20557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    while (result < mCurrent) {
20657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        UChar32 c;
20757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        ssize_t ix = result;
20857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        U16_NEXT(mText, ix, mCurrent, c);
20957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
21057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // strip leading punctuation, defined as OP and QU line breaking classes,
21157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // see UAX #14
21257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
21357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            break;
21457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
21557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        result = ix;
21657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
21757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return result;
21857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
21957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
22057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordEnd() const {
2216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    if (mInEmailOrUrl) {
2229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        return mLast;
2239c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
22457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    ssize_t result = mCurrent;
22557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    while (result > mLast) {
22657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        UChar32 c;
22757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        ssize_t ix = result;
22857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        U16_PREV(mText, mLast, ix, c);
22957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        int32_t gc_mask = U_GET_GC_MASK(c);
23057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // strip trailing space and punctuation
23157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
23257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            break;
23357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
23457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        result = ix;
23557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
23657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return result;
23757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
23857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
239c88ef135fcc2661ec7addc171ebc60787df38affRaph Levienint WordBreaker::breakBadness() const {
240c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
241c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien}
242c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien
24357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::finish() {
24457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mText = nullptr;
24557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    // Note: calling utext_close multiply is safe
24657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    utext_close(&mUText);
24757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
24857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
24957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}  // namespace android
250