157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/*
257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project
357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien *
457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License");
557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License.
657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at
757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien *
857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien *      http://www.apache.org/licenses/LICENSE-2.0
957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien *
1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software
1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS,
1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and
1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License.
1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */
1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin"
18555d84c6f98eafcbe677cdcb8e9605760acd8ce5Mark Salyzyn
19555d84c6f98eafcbe677cdcb8e9605760acd8ce5Mark Salyzyn#include <android/log.h>
2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
21bab3b98ceb29fa3fc5d8832284312859d7f32cc7Roozbeh Pournader#include <minikin/Emoji.h>
22c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader#include <minikin/Hyphenator.h>
23bab3b98ceb29fa3fc5d8832284312859d7f32cc7Roozbeh Pournader#include <minikin/WordBreaker.h>
2456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#include "MinikinInternal.h"
2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uchar.h>
2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/utf16.h>
2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2914e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonakanamespace minikin {
3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// U+00AD SOFT HYPHEN: a break right after it is rejected by isBreakValid().
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// U+200D ZERO WIDTH JOINER: a break between it and a following emoji is rejected by isBreakValid().
constexpr uint32_t CHAR_ZWJ = 0x200D;
3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setLocale(const icu::Locale& locale) {
3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    UErrorCode status = U_ZERO_ERROR;
3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    // TODO: handle failure status
3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    if (mText != nullptr) {
3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        mBreakIterator->setText(&mUText, status);
4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mIteratorWasReset = true;
4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::setText(const uint16_t* data, size_t size) {
4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mText = data;
4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mTextSize = size;
4757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mIteratorWasReset = false;
4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mLast = 0;
4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mCurrent = 0;
509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    mScanOffset = 0;
516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    mInEmailOrUrl = false;
5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    UErrorCode status = U_ZERO_ERROR;
5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    utext_openUChars(&mUText, data, size, &status);
5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator->setText(&mUText, status);
5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mBreakIterator->first();
5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::current() const {
5957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return mCurrent;
6057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
6157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
62d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien/**
63d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * Determine whether a line break at position i within the buffer buf is valid. This
64d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * represents customization beyond the ICU behavior, because plain ICU provides some
65d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien * line break opportunities that we don't want.
66d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien **/
6756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levienstatic bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
6856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    uint32_t codePoint;
6956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    size_t prev_offset = i;
7056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    U16_PREV(buf, 0, prev_offset, codePoint);
71c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
72c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
73d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        return false;
74d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    }
7574b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
7674b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
7774b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
7874b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    // where no line break could be imagined, since the Myanmar virama is a pure stacker.
7974b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
8074b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader        return false;
8174b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    }
8274b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader
8356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    uint32_t next_codepoint;
8456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    size_t next_offset = i;
8556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
86d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader
87c97689439cb98ddf46fa279d8088b8c4a5f7b2f4Roozbeh Pournader    // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
88c97689439cb98ddf46fa279d8088b8c4a5f7b2f4Roozbeh Pournader    // emoji data than ICU does.
8977f488345316fba46c271fc04bea470819ae1712Seigo Nonaka    if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
9077f488345316fba46c271fc04bea470819ae1712Seigo Nonaka        return false;
91d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    }
92d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader
93c97689439cb98ddf46fa279d8088b8c4a5f7b2f4Roozbeh Pournader    // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
9456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    if (isEmojiModifier(next_codepoint)) {
9556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        if (codePoint == 0xFE0F && prev_offset > 0) {
9656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            // skip over emoji variation selector
9756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            U16_PREV(buf, 0, prev_offset, codePoint);
9856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        }
9956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        if (isEmojiBase(codePoint)) {
10056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien            return false;
10156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        }
10256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    }
103d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    return true;
104d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien}
105d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien
1068bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader// Customized iteratorNext that takes care of both resets and our modifications
1078bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader// to ICU's behavior.
1088bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournaderint32_t WordBreaker::iteratorNext() {
1098bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    int32_t result;
1108bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    do {
1118bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        if (mIteratorWasReset) {
1128bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            result = mBreakIterator->following(mCurrent);
1138bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            mIteratorWasReset = false;
1148bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        } else {
1158bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            result = mBreakIterator->next();
1168bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        }
1178bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    } while (!(result == icu::BreakIterator::DONE || (size_t)result == mTextSize
1188bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            || isBreakValid(mText, mTextSize, result)));
1198bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    return result;
1208bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader}
1218bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader
// Returns true for the characters after which the Chicago Manual of Style recommends breaking
// in URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
1266d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
// Returns true for the characters before which the Chicago Manual of Style recommends breaking
// in URLs and email addresses: '~', '.', ',', '-', '_', '?', '#', '%', '=' and '&'.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
1326d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
// Progress of the email/URL scanner in WordBreaker::detectEmailOrUrl(). The three colon/slash
// states are numbered consecutively, in the order the characters of "://" are consumed.
enum ScanState {
    START = 0,                  // neither '@' nor ':' matched yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
1409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1418bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournadervoid WordBreaker::detectEmailOrUrl() {
1429c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    // scan forward from current ICU position for email address or URL
1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    if (mLast >= mScanOffset) {
1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        ScanState state = START;
1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        size_t i;
1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        for (i = mLast; i < mTextSize; i++) {
1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            uint16_t c = mText[i];
1489c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            // scan only ASCII characters, stop at space
1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (!(' ' < c && c <= 0x007E)) {
1509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                break;
1519c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1529c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            if (state == START && c == '@') {
1539c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                state = SAW_AT;
1549c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            } else if (state == START && c == ':') {
1559c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                state = SAW_COLON;
1569c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
1579c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                if (c == '/') {
1589c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
1599c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                } else {
1609c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                    state = START;
1619c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien                }
1629c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1639c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        }
1649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
1656d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            if (!mBreakIterator->isBoundary(i)) {
1668bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader                // If there are combining marks or such at the end of the URL or the email address,
1678bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader                // consider them a part of the URL or the email, and skip to the next actual
1688bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader                // boundary.
1696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                i = mBreakIterator->following(i);
1709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            }
1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            mInEmailOrUrl = true;
1729c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien            mIteratorWasReset = true;
1739c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        } else {
1746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            mInEmailOrUrl = false;
1756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        }
1766d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        mScanOffset = i;
1776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    }
1788bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader}
1796d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
1808bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournaderssize_t WordBreaker::findNextBreakInEmailOrUrl() {
1818bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
1828bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    uint16_t lastChar = mText[mLast];
1838bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    ssize_t i;
1848bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    for (i = mLast + 1; i < mScanOffset; i++) {
1858bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        if (breakAfter(lastChar)) {
1868bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            break;
1878bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        }
1888bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        // break after double slash
1898bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
1908bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            break;
1918bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        }
1928bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        const uint16_t thisChar = mText[i];
1938bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        // never break after hyphen
1948bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        if (lastChar != '-') {
1958bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            if (breakBefore(thisChar)) {
1966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                break;
1976d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            }
1988bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            // break before single slash
1998bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader            if (thisChar == '/' && lastChar != '/' &&
2008bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader                        !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
2016d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien                break;
2026d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien            }
2039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        }
2048bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        lastChar = thisChar;
2059c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
2068bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    return i;
2078bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader}
2089c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
2098bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournaderssize_t WordBreaker::next() {
2108bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    mLast = mCurrent;
2118bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader
2128bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    detectEmailOrUrl();
2138bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    if (mInEmailOrUrl) {
2148bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        mCurrent = findNextBreakInEmailOrUrl();
2158bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    } else {  // Business as usual
2168bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        mCurrent = (ssize_t) iteratorNext();
2178bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader    }
21857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return mCurrent;
21957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
22057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
22157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordStart() const {
2226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    if (mInEmailOrUrl) {
2239c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        return mLast;
2249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
22557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    ssize_t result = mLast;
22657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    while (result < mCurrent) {
22757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        UChar32 c;
22857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        ssize_t ix = result;
22957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        U16_NEXT(mText, ix, mCurrent, c);
2308bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
23157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // strip leading punctuation, defined as OP and QU line breaking classes,
23257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // see UAX #14
23357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
23457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            break;
23557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
23657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        result = ix;
23757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
23857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return result;
23957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
24057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
24157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienssize_t WordBreaker::wordEnd() const {
2426d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    if (mInEmailOrUrl) {
2439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        return mLast;
2449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    }
24557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    ssize_t result = mCurrent;
24657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    while (result > mLast) {
24757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        UChar32 c;
24857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        ssize_t ix = result;
24957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        U16_PREV(mText, mLast, ix, c);
2508bdd9b948fc8a55ade32c2d84ff1a6b5be5659e1Roozbeh Pournader        const int32_t gc_mask = U_GET_GC_MASK(c);
25157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        // strip trailing space and punctuation
25257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
25357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien            break;
25457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        }
25557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        result = ix;
25657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    }
25757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    return result;
25857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
25957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
260c88ef135fcc2661ec7addc171ebc60787df38affRaph Levienint WordBreaker::breakBadness() const {
261c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
262c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien}
263c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien
26457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienvoid WordBreaker::finish() {
26557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    mText = nullptr;
26657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    // Note: calling utext_close multiply is safe
26757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    utext_close(&mUText);
26857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
26957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
27014e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka}  // namespace minikin
271