157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/* 257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project 357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License. 657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at 757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software 1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and 1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License. 1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */ 1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin" 18555d84c6f98eafcbe677cdcb8e9605760acd8ce5Mark Salyzyn 19555d84c6f98eafcbe677cdcb8e9605760acd8ce5Mark Salyzyn#include <android/log.h> 2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 21bab3b98ceb29fa3fc5d8832284312859d7f32cc7Roozbeh Pournader#include <minikin/Emoji.h> 22c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader#include <minikin/Hyphenator.h> 23bab3b98ceb29fa3fc5d8832284312859d7f32cc7Roozbeh Pournader#include <minikin/WordBreaker.h> 2456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include "MinikinInternal.h" 2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uchar.h> 2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/utf16.h> 2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2914e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonakanamespace minikin { 3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienconst uint32_t CHAR_SOFT_HYPHEN = 0x00AD; 3256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienconst uint32_t CHAR_ZWJ = 0x200D; 3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setLocale(const icu::Locale& locale) { 3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UErrorCode status = U_ZERO_ERROR; 3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); 3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // TODO: handle failure status 3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (mText != nullptr) { 3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->setText(&mUText, status); 4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = true; 4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setText(const uint16_t* data, size_t size) { 4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mText = data; 4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mTextSize = size; 4757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mIteratorWasReset = false; 4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mLast = 0; 4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mCurrent = 0; 509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mScanOffset = 0; 516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = false; 5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UErrorCode status = U_ZERO_ERROR; 5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien utext_openUChars(&mUText, data, size, &status); 5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->setText(&mUText, status); 5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mBreakIterator->first(); 5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::current() const { 5957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return mCurrent; 6057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 6157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 62d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien/** 63d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * Determine whether a line break at position i within the buffer buf is valid. This 64d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * represents customization beyond the ICU behavior, because plain ICU provides some 65d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * line break opportunities that we don't want. 66d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien **/ 6756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienstatic bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) { 6856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien uint32_t codePoint; 6956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien size_t prev_offset = i; 7056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_PREV(buf, 0, prev_offset, codePoint); 71c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Do not break on hard or soft hyphens. These are handled by automatic hyphenation. 72c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) { 73d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien return false; 74d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien } 7574b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go 7674b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid 7774b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // too much looking around in the strings, we simply avoid breaking after any Myanmar virama, 7874b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader // where no line break could be imagined, since the Myanmar virama is a pure stacker. 7974b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA 8074b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader return false; 8174b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader } 8274b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader 8356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien uint32_t next_codepoint; 8456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien size_t next_offset = i; 8556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_NEXT(buf, next_offset, bufEnd, next_codepoint); 86d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader 87c97689439cb98ddf46fa279d8088b8c4a5f7b2f4Roozbeh Pournader // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher 88c97689439cb98ddf46fa279d8088b8c4a5f7b2f4Roozbeh Pournader // emoji data than ICU does. 8977f488345316fba46c271fc04bea470819ae1712Seigo Nonaka if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) { 9077f488345316fba46c271fc04bea470819ae1712Seigo Nonaka return false; 91d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien } 92d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader 93c97689439cb98ddf46fa279d8088b8c4a5f7b2f4Roozbeh Pournader // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does. 9456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (isEmojiModifier(next_codepoint)) { 9556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (codePoint == 0xFE0F && prev_offset > 0) { 9656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // skip over emoji variation selector 9756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien U16_PREV(buf, 0, prev_offset, codePoint); 9856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 9956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien if (isEmojiBase(codePoint)) { 10056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien return false; 10156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 10256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien } 103d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien return true; 104d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien} 105d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien 1068bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader// Customized iteratorNext that takes care of both resets and our modifications 1078bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader// to ICU's behavior. 1088bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournaderint32_t WordBreaker::iteratorNext() { 1098bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader int32_t result; 1108bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader do { 1118bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader if (mIteratorWasReset) { 1128bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader result = mBreakIterator->following(mCurrent); 1138bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader mIteratorWasReset = false; 1148bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader } else { 1158bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader result = mBreakIterator->next(); 1168bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader } 1178bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader } while (!(result == icu::BreakIterator::DONE || (size_t)result == mTextSize 1188bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader || isBreakValid(mText, mTextSize, result))); 1198bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader return result; 1208bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader} 1218bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader 1226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses 1236d15657e4a3826d4d47d5358f1dde211484527e9Raph Levienstatic bool breakAfter(uint16_t c) { 1246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien return c == ':' || c == '=' || c == '&'; 1256d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 1266d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 1276d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses 1286d15657e4a3826d4d47d5358f1dde211484527e9Raph Levienstatic bool breakBefore(uint16_t c) { 1296d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#' 1306d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien || c == '%' || c == '=' || c == '&'; 1316d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 1326d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 1338bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournaderenum ScanState { 1348bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader START, 1358bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader SAW_AT, 1368bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader SAW_COLON, 1378bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader SAW_COLON_SLASH, 1388bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader SAW_COLON_SLASH_SLASH, 1398bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader}; 1409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1418bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournadervoid WordBreaker::detectEmailOrUrl() { 1429c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // scan forward from current ICU position for email address or URL 1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (mLast >= mScanOffset) { 1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien ScanState state = START; 1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien size_t i; 1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien for (i = mLast; i < mTextSize; i++) { 1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t c = mText[i]; 1489c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien // scan only ASCII characters, stop at space 1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (!(' ' < c && c <= 0x007E)) { 1509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien break; 1519c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1529c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (state == START && c == '@') { 1539c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = SAW_AT; 1549c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else if (state == START && c == ':') { 1559c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = SAW_COLON; 1569c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 1579c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (c == '/') { 1589c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = static_cast<ScanState>((int)state + 1); // next state adds a slash 1599c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else { 1609c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien state = START; 1619c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1629c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1639c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 1656d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (!mBreakIterator->isBoundary(i)) { 1668bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader // If there are combining marks or such at the end of the URL or the email address, 1678bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader // consider them a part of the URL or the email, and skip to the next actual 1688bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader // boundary. 1696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien i = mBreakIterator->following(i); 1709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = true; 1729c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien mIteratorWasReset = true; 1739c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } else { 1746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mInEmailOrUrl = false; 1756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1766d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien mScanOffset = i; 1776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1788bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader} 1796d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 1808bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournaderssize_t WordBreaker::findNextBreakInEmailOrUrl() { 1818bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.) 1828bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader uint16_t lastChar = mText[mLast]; 1838bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader ssize_t i; 1848bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader for (i = mLast + 1; i < mScanOffset; i++) { 1858bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader if (breakAfter(lastChar)) { 1868bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader break; 1878bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader } 1888bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader // break after double slash 1898bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') { 1908bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader break; 1918bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader } 1928bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader const uint16_t thisChar = mText[i]; 1938bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader // never break after hyphen 1948bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader if (lastChar != '-') { 1958bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader if (breakBefore(thisChar)) { 1966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 1976d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 1988bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader // break before single slash 1998bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader if (thisChar == '/' && lastChar != '/' && 2008bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader !(i + 1 < mScanOffset && mText[i + 1] == '/')) { 2016d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien break; 2026d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien } 2039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 2048bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader lastChar = thisChar; 2059c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 2068bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader return i; 2078bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader} 2089c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 2098bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournaderssize_t WordBreaker::next() { 2108bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader mLast = mCurrent; 2118bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader 2128bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader detectEmailOrUrl(); 2138bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader if (mInEmailOrUrl) { 2148bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader mCurrent = findNextBreakInEmailOrUrl(); 2158bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader } else { // Business as usual 2168bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader mCurrent = (ssize_t) iteratorNext(); 2178bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader } 21857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return mCurrent; 21957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 22057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 22157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordStart() const { 2226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (mInEmailOrUrl) { 2239c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mLast; 2249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 22557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t result = mLast; 22657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien while (result < mCurrent) { 22757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UChar32 c; 22857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t ix = result; 22957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien U16_NEXT(mText, ix, mCurrent, c); 2308bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 23157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // strip leading punctuation, defined as OP and QU line breaking classes, 23257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // see UAX #14 23357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 23457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien break; 23557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 23657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = ix; 23757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 23857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return result; 23957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 24057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 24157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordEnd() const { 2426d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien if (mInEmailOrUrl) { 2439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien return mLast; 2449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien } 24557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t result = mCurrent; 24657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien while (result > mLast) { 24757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien UChar32 c; 24857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien ssize_t ix = result; 24957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien U16_PREV(mText, mLast, ix, c); 2508bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader const int32_t gc_mask = U_GET_GC_MASK(c); 25157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // strip trailing space and punctuation 25257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { 25357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien break; 25457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 25557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien result = ix; 25657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien } 25757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien return result; 25857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 25957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 260c88ef135fcc2661ec7addc171ebc60787df38affRaph Levienint WordBreaker::breakBadness() const { 261c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0; 262c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien} 263c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien 26457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::finish() { 26557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien mText = nullptr; 26657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien // Note: calling utext_close multiply is safe 26757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien utext_close(&mUText); 26857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 26957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 27014e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka} // namespace minikin 271