/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin"
1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h>
1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include <minikin/WordBreaker.h>
2156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include "MinikinInternal.h"
2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uchar.h>
2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/utf16.h>
2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Leviennamespace android {
2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// U+00AD SOFT HYPHEN: a break right after it is rejected by isBreakValid().
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// U+200D ZERO WIDTH JOINER: a break between it and a following emoji is rejected.
constexpr uint32_t CHAR_ZWJ = 0x200D;
3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setLocale(const icu::Locale& locale) {
3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    UErrorCode status = U_ZERO_ERROR;
3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    // TODO: handle failure status
3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    if (mText != nullptr) {
3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        mBreakIterator->setText(&mUText, status);
3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mIteratorWasReset = true;
3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setText(const uint16_t* data, size_t size) {
4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mText = data;
4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mTextSize = size;
4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mIteratorWasReset = false;
4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mLast = 0;
4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mCurrent = 0;
479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    mScanOffset = 0;
486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    mInEmailOrUrl = false;
4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    UErrorCode status = U_ZERO_ERROR;
5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    utext_openUChars(&mUText, data, size, &status);
5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator->setText(&mUText, status);
5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator->first();
5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::current() const {
5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return mCurrent;
5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// States of the email/URL detector in WordBreaker::next(). The values are consecutive: the
// scanner moves from SAW_COLON to SAW_COLON_SLASH to SAW_COLON_SLASH_SLASH by adding one.
enum ScanState {
    START = 0,                  // nothing interesting seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
67d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien/**
68d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * Determine whether a line break at position i within the buffer buf is valid. This
69d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * represents customization beyond the ICU behavior, because plain ICU provides some
70d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * line break opportunities that we don't want.
71d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien **/
7256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienstatic bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
7356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    uint32_t codePoint;
7456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    size_t prev_offset = i;
7556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    U16_PREV(buf, 0, prev_offset, codePoint);
7656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    if (codePoint == CHAR_SOFT_HYPHEN) {
77d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        return false;
78d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    }
7974b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
8074b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
8174b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
8274b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // where no line break could be imagined, since the Myanmar virama is a pure stacker.
8374b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
8474b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader        return false;
8574b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    }
8674b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader
8756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    uint32_t next_codepoint;
8856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    size_t next_offset = i;
8956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
90d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader
91d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    // Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt
9274b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // (AL | HL) × (PR | PO)
93d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
94d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) {
95d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader        lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK);
96d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader        if (lineBreak == U_LB_PREFIX_NUMERIC || lineBreak == U_LB_POSTFIX_NUMERIC) {
97d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader            return false;
98d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader        }
99d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    }
100d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader
10177f488345316fba46c271fc04bea470819ae1712Seigo Nonaka    // Emoji ZWJ sequences.
10277f488345316fba46c271fc04bea470819ae1712Seigo Nonaka    if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
10377f488345316fba46c271fc04bea470819ae1712Seigo Nonaka        return false;
104d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    }
105d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader
10656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
10756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    // EB x EM
10856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    if (isEmojiModifier(next_codepoint)) {
10956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        if (codePoint == 0xFE0F && prev_offset > 0) {
11056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            // skip over emoji variation selector
11156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            U16_PREV(buf, 0, prev_offset, codePoint);
11256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        }
11356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        if (isEmojiBase(codePoint)) {
11456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            return false;
11556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        }
11656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    }
117d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    return true;
118d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien}
119d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien
// Chicago Manual of Style recommends breaking after these characters in URLs and email
// addresses: colon, equals sign and ampersand.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
1246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
// Chicago Manual of Style recommends breaking before these characters in URLs and email
// addresses.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
1306d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
13157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::next() {
13257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mLast = mCurrent;
1339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    // scan forward from current ICU position for email address or URL
1359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    if (mLast >= mScanOffset) {
1369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        ScanState state = START;
1379c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        size_t i;
1389c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        for (i = mLast; i < mTextSize; i++) {
1399c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            uint16_t c = mText[i];
1409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            // scan only ASCII characters, stop at space
1419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (!(' ' < c && c <= 0x007E)) {
1429c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                break;
1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (state == START && c == '@') {
1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                state = SAW_AT;
1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            } else if (state == START && c == ':') {
1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                state = SAW_COLON;
1489c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                if (c == '/') {
1509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
1519c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                } else {
1529c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                    state = START;
1539c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                }
1549c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1559c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        }
1569c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
1576d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (!mBreakIterator->isBoundary(i)) {
1586d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                i = mBreakIterator->following(i);
1599c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            mInEmailOrUrl = true;
1619c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            mIteratorWasReset = true;
1629c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        } else {
1636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            mInEmailOrUrl = false;
1646d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        }
1656d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        mScanOffset = i;
1666d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    }
1676d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
1686d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    if (mInEmailOrUrl) {
1696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
1706d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        uint16_t lastChar = mText[mLast];
1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        ssize_t i;
1726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        for (i = mLast + 1; i < mScanOffset; i++) {
1736d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (breakAfter(lastChar)) {
1746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                break;
1756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            }
1766d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            // break after double slash
1776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
1786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                break;
1796d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            }
1806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            uint16_t thisChar = mText[i];
1816d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            // never break after hyphen
1826d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (lastChar != '-') {
1836d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                if (breakBefore(thisChar)) {
1846d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                    break;
1856d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                }
1866d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                // break before single slash
1876d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                if (thisChar == '/' && lastChar != '/' &&
1886d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                            !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
1896d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                    break;
1906d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                }
1916d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            }
1926d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            lastChar = thisChar;
1939c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        }
1946d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        mCurrent = i;
1956d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        return mCurrent;
1969c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
1979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1989c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    int32_t result;
19957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    do {
20057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if (mIteratorWasReset) {
20157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            result = mBreakIterator->following(mCurrent);
20257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            mIteratorWasReset = false;
20357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        } else {
20457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            result = mBreakIterator->next();
20557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
20657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
20756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            && !isBreakValid(mText, mTextSize, result));
20857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mCurrent = (ssize_t)result;
20957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return mCurrent;
21057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
21157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
21257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordStart() const {
2136d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    if (mInEmailOrUrl) {
2149c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        return mLast;
2159c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
21657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    ssize_t result = mLast;
21757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    while (result < mCurrent) {
21857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        UChar32 c;
21957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        ssize_t ix = result;
22057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        U16_NEXT(mText, ix, mCurrent, c);
22157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
22257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // strip leading punctuation, defined as OP and QU line breaking classes,
22357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // see UAX #14
22457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
22557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            break;
22657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
22757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        result = ix;
22857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
22957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return result;
23057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
23157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
23257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordEnd() const {
2336d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    if (mInEmailOrUrl) {
2349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        return mLast;
2359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
23657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    ssize_t result = mCurrent;
23757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    while (result > mLast) {
23857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        UChar32 c;
23957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        ssize_t ix = result;
24057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        U16_PREV(mText, mLast, ix, c);
24157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        int32_t gc_mask = U_GET_GC_MASK(c);
24257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // strip trailing space and punctuation
24357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
24457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            break;
24557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
24657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        result = ix;
24757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
24857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return result;
24957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
25057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
251c88ef135fcc2661ec7addc171ebc60787df38affRaph Levienint WordBreaker::breakBadness() const {
252c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
253c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien}
254c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien
25557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::finish() {
25657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mText = nullptr;
25757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    // Note: calling utext_close multiply is safe
25857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    utext_close(&mUText);
25957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
26057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
26157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}  // namespace android
262