157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/* 257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project 357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License. 657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at 757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software 1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and 1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License. 1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */ 1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin" 1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h> 1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include <minikin/WordBreaker.h> 2156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include "MinikinInternal.h" 2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uchar.h> 2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/utf16.h> 2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Leviennamespace android { 2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienconst uint32_t CHAR_SOFT_HYPHEN = 0x00AD; 2956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienconst uint32_t CHAR_ZWJ = 0x200D; 3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setLocale(const icu::Locale& locale) { 3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UErrorCode status = U_ZERO_ERROR; 3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); 3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // TODO: handle failure status 3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (mText != nullptr) { 3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->setText(&mUText, status); 3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = true; 3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setText(const uint16_t* data, size_t size) { 4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mText = data; 4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mTextSize = size; 4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = false; 4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mLast = 0; 4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mCurrent = 0; 479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mScanOffset = 0; 486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = false; 4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UErrorCode status = U_ZERO_ERROR; 5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien utext_openUChars(&mUText, data, size, &status); 5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->setText(&mUText, status); 5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->first(); 5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::current() const { 5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return mCurrent; 5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 599c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levienenum ScanState { 609c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien START, 619c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_AT, 629c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON, 639c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON_SLASH, 649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien SAW_COLON_SLASH_SLASH, 659c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}; 669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 67d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien/** 68d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * Determine whether a line break at position i within the buffer buf is valid. This 69d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * represents customization beyond the ICU behavior, because plain ICU provides some 70d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * line break opportunities that we don't want. 71d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien **/ 7256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienstatic bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) { 7356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien uint32_t codePoint; 7456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien size_t prev_offset = i; 7556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_PREV(buf, 0, prev_offset, codePoint); 7656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (codePoint == CHAR_SOFT_HYPHEN) { 77d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien return false; 78d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien } 7974b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go 8074b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid 8174b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // too much looking around in the strings, we simply avoid breaking after any Myanmar virama, 8274b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // where no line break could be imagined, since the Myanmar virama is a pure stacker. 8374b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA 8474b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader return false; 8574b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader } 8674b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader 8756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien uint32_t next_codepoint; 8856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien size_t next_offset = i; 8956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_NEXT(buf, next_offset, bufEnd, next_codepoint); 90d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader 91d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader // Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt 9274b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // (AL | HL) × (PR | PO) 93d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK); 94d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) { 95d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK); 96d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader if (lineBreak == U_LB_PREFIX_NUMERIC || lineBreak == U_LB_POSTFIX_NUMERIC) { 97d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader return false; 98d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader } 99d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader } 100d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader 10177f488345316fba46c271fc04bea470819ae1712Seigo Nonaka // Emoji ZWJ sequences. 10277f488345316fba46c271fc04bea470819ae1712Seigo Nonaka if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) { 10377f488345316fba46c271fc04bea470819ae1712Seigo Nonaka return false; 104d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien } 105d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader 10656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf 10756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // EB x EM 10856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (isEmojiModifier(next_codepoint)) { 10956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (codePoint == 0xFE0F && prev_offset > 0) { 11056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // skip over emoji variation selector 11156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_PREV(buf, 0, prev_offset, codePoint); 11256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 11356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (isEmojiBase(codePoint)) { 11456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien return false; 11556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 11656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 117d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien return true; 118d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien} 119d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien 1206d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses 1216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levienstatic bool breakAfter(uint16_t c) { 1226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien return c == ':' || c == '=' || c == '&'; 1236d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 1246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 1256d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses 1266d15657e4a3826d4d47d5358f1dde211484527e9Raph Levienstatic bool breakBefore(uint16_t c) { 1276d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#' 1286d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien || c == '%' || c == '=' || c == '&'; 1296d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 1306d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 13157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::next() { 13257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mLast = mCurrent; 1339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // scan forward from current ICU position for email address or URL 1359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (mLast >= mScanOffset) { 1369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien ScanState state = START; 1379c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien size_t i; 1389c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien for (i = mLast; i < mTextSize; i++) { 1399c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t c = mText[i]; 1409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // scan only ASCII characters, stop at space 1419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (!(' ' < c && c <= 0x007E)) { 1429c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien break; 1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (state == START && c == '@') { 1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = SAW_AT; 1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else if (state == START && c == ':') { 1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = SAW_COLON; 1489c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (c == '/') { 1509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = static_cast<ScanState>((int)state + 1); // next state adds a slash 1519c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else { 1529c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = START; 1539c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1549c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1559c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1569c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 1576d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (!mBreakIterator->isBoundary(i)) { 1586d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien i = mBreakIterator->following(i); 1599c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = true; 1619c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mIteratorWasReset = true; 1629c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else { 1636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = false; 1646d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1656d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mScanOffset = i; 1666d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1676d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 1686d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (mInEmailOrUrl) { 1696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.) 1706d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t lastChar = mText[mLast]; 1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien ssize_t i; 1726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien for (i = mLast + 1; i < mScanOffset; i++) { 1736d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (breakAfter(lastChar)) { 1746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1766d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien // break after double slash 1776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') { 1786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1796d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t thisChar = mText[i]; 1816d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien // never break after hyphen 1826d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (lastChar != '-') { 1836d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (breakBefore(thisChar)) { 1846d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1856d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1866d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien // break before single slash 1876d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (thisChar == '/' && lastChar != '/' && 1886d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien !(i + 1 < mScanOffset && mText[i + 1] == '/')) { 1896d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1906d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1916d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1926d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien lastChar = thisChar; 1939c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1946d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mCurrent = i; 1956d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien return mCurrent; 1969c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1989c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien int32_t result; 19957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien do { 20057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (mIteratorWasReset) { 20157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = mBreakIterator->following(mCurrent); 20257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = false; 20357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } else { 20457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = mBreakIterator->next(); 20557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 20657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize 20756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien && !isBreakValid(mText, mTextSize, result)); 20857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mCurrent = (ssize_t)result; 20957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return mCurrent; 21057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 21157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 21257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordStart() const { 2136d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (mInEmailOrUrl) { 2149c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mLast; 2159c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 21657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t result = mLast; 21757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien while (result < mCurrent) { 21857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UChar32 c; 21957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t ix = result; 22057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien U16_NEXT(mText, ix, mCurrent, c); 22157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 22257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // strip leading punctuation, defined as OP and QU line breaking classes, 22357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // see UAX #14 22457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 22557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien break; 22657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 22757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = ix; 22857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 22957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return result; 23057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 23157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 23257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordEnd() const { 2336d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (mInEmailOrUrl) { 2349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mLast; 2359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 23657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t result = mCurrent; 23757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien while (result > mLast) { 23857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UChar32 c; 23957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t ix = result; 24057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien U16_PREV(mText, mLast, ix, c); 24157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien int32_t gc_mask = U_GET_GC_MASK(c); 24257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // strip trailing space and punctuation 24357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { 24457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien break; 24557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 24657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = ix; 24757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 24857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return result; 24957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 25057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 251c88ef135fcc2661ec7addc171ebc60787df38affRaph Levienint WordBreaker::breakBadness() const { 252c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0; 253c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien} 254c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien 25557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::finish() { 25657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mText = nullptr; 25757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // Note: calling utext_close multiply is safe 25857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien utext_close(&mUText); 25957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 26057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 26157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} // namespace android 262