// WordBreaker.cpp revision 56840e8006ca2b822adb401fc8a65f3c075cde10
157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/* 257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project 357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License. 657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at 757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software 1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and 1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License. 
1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */ 1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin" 1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h> 1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include <minikin/WordBreaker.h> 2156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include "MinikinInternal.h" 2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uchar.h> 2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/utf16.h> 2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Leviennamespace android { 2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienconst uint32_t CHAR_SOFT_HYPHEN = 0x00AD; 2956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienconst uint32_t CHAR_ZWJ = 0x200D; 3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setLocale(const icu::Locale& locale) { 3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UErrorCode status = U_ZERO_ERROR; 3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); 3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // TODO: handle failure status 3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (mText != nullptr) { 3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->setText(&mUText, status); 3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = true; 3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setText(const uint16_t* data, size_t size) { 4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mText = data; 4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mTextSize = size; 4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = false; 4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mLast = 0; 4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mCurrent = 0; 479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mScanOffset = 0; 486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = false; 4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UErrorCode status = U_ZERO_ERROR; 5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien utext_openUChars(&mUText, data, size, &status); 5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->setText(&mUText, status); 5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->first(); 5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::current() const { 5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return mCurrent; 5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 599c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levienenum ScanState { 609c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien START, 619c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_AT, 629c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON, 639c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON_SLASH, 649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON_SLASH_SLASH, 659c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}; 669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 67d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien/** 68d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * 
Determine whether a line break at position i within the buffer buf is valid. This 69d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * represents customization beyond the ICU behavior, because plain ICU provides some 70d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * line break opportunities that we don't want. 71d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien **/ 7256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienstatic bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) { 7356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien uint32_t codePoint; 7456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien size_t prev_offset = i; 7556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_PREV(buf, 0, prev_offset, codePoint); 7656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (codePoint == CHAR_SOFT_HYPHEN) { 77d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien return false; 78d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien } 7956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien uint32_t next_codepoint; 8056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien size_t next_offset = i; 8156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_NEXT(buf, next_offset, bufEnd, next_codepoint); 8256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (codePoint == CHAR_ZWJ) { 83d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien // Possible emoji ZWJ sequence 84d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien if (next_codepoint == 0x2764 || // HEAVY BLACK HEART 85d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien next_codepoint == 0x1F466 || // BOY 86d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien next_codepoint == 0x1F467 || // GIRL 87d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien next_codepoint == 0x1F468 || // MAN 88d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien next_codepoint == 0x1F469 || // WOMAN 89d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien next_codepoint == 0x1F48B || // KISS MARK 
90d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE 91d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien return false; 92d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien } 93d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien } 9456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf 9556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // EB x EM 9656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (isEmojiModifier(next_codepoint)) { 9756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (codePoint == 0xFE0F && prev_offset > 0) { 9856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // skip over emoji variation selector 9956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_PREV(buf, 0, prev_offset, codePoint); 10056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 10156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (isEmojiBase(codePoint)) { 10256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien return false; 10356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 10456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 105d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien return true; 106d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien} 107d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien 1086d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses 1096d15657e4a3826d4d47d5358f1dde211484527e9Raph Levienstatic bool breakAfter(uint16_t c) { 1106d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien return c == ':' || c == '=' || c == '&'; 1116d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 1126d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 1136d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses 
1146d15657e4a3826d4d47d5358f1dde211484527e9Raph Levienstatic bool breakBefore(uint16_t c) { 1156d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#' 1166d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien || c == '%' || c == '=' || c == '&'; 1176d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 1186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 11957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::next() { 12057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mLast = mCurrent; 1219c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // scan forward from current ICU position for email address or URL 1239c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (mLast >= mScanOffset) { 1249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien ScanState state = START; 1259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien size_t i; 1269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien for (i = mLast; i < mTextSize; i++) { 1279c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t c = mText[i]; 1289c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // scan only ASCII characters, stop at space 1299c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (!(' ' < c && c <= 0x007E)) { 1309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien break; 1319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (state == START && c == '@') { 1339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = SAW_AT; 1349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else if (state == START && c == ':') { 1359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = SAW_COLON; 1369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 1379c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (c == '/') { 
1389c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = static_cast<ScanState>((int)state + 1); // next state adds a slash 1399c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else { 1409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = START; 1419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1429c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 1456d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (!mBreakIterator->isBoundary(i)) { 1466d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien i = mBreakIterator->following(i); 1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = true; 1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mIteratorWasReset = true; 1509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else { 1516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = false; 1526d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1536d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mScanOffset = i; 1546d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1556d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 1566d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (mInEmailOrUrl) { 1576d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.) 
1586d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t lastChar = mText[mLast]; 1596d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien ssize_t i; 1606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien for (i = mLast + 1; i < mScanOffset; i++) { 1616d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (breakAfter(lastChar)) { 1626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1646d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien // break after double slash 1656d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') { 1666d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1676d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1686d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t thisChar = mText[i]; 1696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien // never break after hyphen 1706d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (lastChar != '-') { 1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (breakBefore(thisChar)) { 1726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1736d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien // break before single slash 1756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (thisChar == '/' && lastChar != '/' && 1766d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien !(i + 1 < mScanOffset && mText[i + 1] == '/')) { 1776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1796d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien lastChar = thisChar; 1819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1826d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mCurrent = i; 1836d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien return mCurrent; 
1849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien int32_t result; 18757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien do { 18857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (mIteratorWasReset) { 18957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = mBreakIterator->following(mCurrent); 19057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = false; 19157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } else { 19257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = mBreakIterator->next(); 19357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 19457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize 19556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien && !isBreakValid(mText, mTextSize, result)); 19657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mCurrent = (ssize_t)result; 19757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return mCurrent; 19857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 19957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 20057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordStart() const { 2016d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (mInEmailOrUrl) { 2029c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mLast; 2039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 20457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t result = mLast; 20557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien while (result < mCurrent) { 20657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UChar32 c; 20757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t ix = result; 20857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien U16_NEXT(mText, ix, mCurrent, c); 20957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien int32_t lb = u_getIntPropertyValue(c, 
UCHAR_LINE_BREAK); 21057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // strip leading punctuation, defined as OP and QU line breaking classes, 21157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // see UAX #14 21257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 21357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien break; 21457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 21557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = ix; 21657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 21757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return result; 21857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 21957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 22057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordEnd() const { 2216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (mInEmailOrUrl) { 2229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mLast; 2239c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 22457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t result = mCurrent; 22557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien while (result > mLast) { 22657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UChar32 c; 22757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t ix = result; 22857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien U16_PREV(mText, mLast, ix, c); 22957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien int32_t gc_mask = U_GET_GC_MASK(c); 23057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // strip trailing space and punctuation 23157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { 23257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien break; 23357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 23457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = ix; 23557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 
23657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return result; 23757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 23857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 239c88ef135fcc2661ec7addc171ebc60787df38affRaph Levienint WordBreaker::breakBadness() const { 240c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0; 241c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien} 242c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien 24357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::finish() { 24457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mText = nullptr; 24557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // Note: calling utext_close multiply is safe 24657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien utext_close(&mUText); 24757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 24857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 24957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} // namespace android 250