// WordBreaker.cpp revision d3f45892c721fb1738bf02fe19a5143a320ca4bf
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define LOG_TAG "Minikin" 18#include <cutils/log.h> 19 20#include "minikin/WordBreaker.h" 21 22#include <unicode/uchar.h> 23#include <unicode/utf16.h> 24 25namespace android { 26 27const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; 28const uint16_t CHAR_ZWJ = 0x200D; 29 30void WordBreaker::setLocale(const icu::Locale& locale) { 31 UErrorCode status = U_ZERO_ERROR; 32 mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); 33 // TODO: handle failure status 34 if (mText != nullptr) { 35 mBreakIterator->setText(&mUText, status); 36 } 37 mIteratorWasReset = true; 38} 39 40void WordBreaker::setText(const uint16_t* data, size_t size) { 41 mText = data; 42 mTextSize = size; 43 mIteratorWasReset = false; 44 mLast = 0; 45 mCurrent = 0; 46 mScanOffset = 0; 47 mInEmailOrUrl = false; 48 UErrorCode status = U_ZERO_ERROR; 49 utext_openUChars(&mUText, data, size, &status); 50 mBreakIterator->setText(&mUText, status); 51 mBreakIterator->first(); 52} 53 54ssize_t WordBreaker::current() const { 55 return mCurrent; 56} 57 58enum ScanState { 59 START, 60 SAW_AT, 61 SAW_COLON, 62 SAW_COLON_SLASH, 63 SAW_COLON_SLASH_SLASH, 64}; 65 66/** 67 * Determine whether a line break at position i within the buffer buf is valid. 
This 68 * represents customization beyond the ICU behavior, because plain ICU provides some 69 * line break opportunities that we don't want. 70 **/ 71static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) { 72 if (codeUnit == CHAR_SOFT_HYPHEN) { 73 return false; 74 } 75 if (codeUnit == CHAR_ZWJ) { 76 // Possible emoji ZWJ sequence 77 uint32_t next_codepoint; 78 U16_NEXT(buf, i, bufEnd, next_codepoint); 79 if (next_codepoint == 0x2764 || // HEAVY BLACK HEART 80 next_codepoint == 0x1F466 || // BOY 81 next_codepoint == 0x1F467 || // GIRL 82 next_codepoint == 0x1F468 || // MAN 83 next_codepoint == 0x1F469 || // WOMAN 84 next_codepoint == 0x1F48B || // KISS MARK 85 next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE 86 return false; 87 } 88 } 89 return true; 90} 91 92// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses 93static bool breakAfter(uint16_t c) { 94 return c == ':' || c == '=' || c == '&'; 95} 96 97// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses 98static bool breakBefore(uint16_t c) { 99 return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' 
|| c == '#' 100 || c == '%' || c == '=' || c == '&'; 101} 102 103ssize_t WordBreaker::next() { 104 mLast = mCurrent; 105 106 // scan forward from current ICU position for email address or URL 107 if (mLast >= mScanOffset) { 108 ScanState state = START; 109 size_t i; 110 for (i = mLast; i < mTextSize; i++) { 111 uint16_t c = mText[i]; 112 // scan only ASCII characters, stop at space 113 if (!(' ' < c && c <= 0x007E)) { 114 break; 115 } 116 if (state == START && c == '@') { 117 state = SAW_AT; 118 } else if (state == START && c == ':') { 119 state = SAW_COLON; 120 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 121 if (c == '/') { 122 state = static_cast<ScanState>((int)state + 1); // next state adds a slash 123 } else { 124 state = START; 125 } 126 } 127 } 128 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 129 if (!mBreakIterator->isBoundary(i)) { 130 i = mBreakIterator->following(i); 131 } 132 mInEmailOrUrl = true; 133 mIteratorWasReset = true; 134 } else { 135 mInEmailOrUrl = false; 136 } 137 mScanOffset = i; 138 } 139 140 if (mInEmailOrUrl) { 141 // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.) 
142 uint16_t lastChar = mText[mLast]; 143 ssize_t i; 144 for (i = mLast + 1; i < mScanOffset; i++) { 145 if (breakAfter(lastChar)) { 146 break; 147 } 148 // break after double slash 149 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') { 150 break; 151 } 152 uint16_t thisChar = mText[i]; 153 // never break after hyphen 154 if (lastChar != '-') { 155 if (breakBefore(thisChar)) { 156 break; 157 } 158 // break before single slash 159 if (thisChar == '/' && lastChar != '/' && 160 !(i + 1 < mScanOffset && mText[i + 1] == '/')) { 161 break; 162 } 163 } 164 lastChar = thisChar; 165 } 166 mCurrent = i; 167 return mCurrent; 168 } 169 170 int32_t result; 171 do { 172 if (mIteratorWasReset) { 173 result = mBreakIterator->following(mCurrent); 174 mIteratorWasReset = false; 175 } else { 176 result = mBreakIterator->next(); 177 } 178 } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize 179 && !isBreakValid(mText[result - 1], mText, mTextSize, result)); 180 mCurrent = (ssize_t)result; 181 return mCurrent; 182} 183 184ssize_t WordBreaker::wordStart() const { 185 if (mInEmailOrUrl) { 186 return mLast; 187 } 188 ssize_t result = mLast; 189 while (result < mCurrent) { 190 UChar32 c; 191 ssize_t ix = result; 192 U16_NEXT(mText, ix, mCurrent, c); 193 int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 194 // strip leading punctuation, defined as OP and QU line breaking classes, 195 // see UAX #14 196 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 197 break; 198 } 199 result = ix; 200 } 201 return result; 202} 203 204ssize_t WordBreaker::wordEnd() const { 205 if (mInEmailOrUrl) { 206 return mLast; 207 } 208 ssize_t result = mCurrent; 209 while (result > mLast) { 210 UChar32 c; 211 ssize_t ix = result; 212 U16_PREV(mText, mLast, ix, c); 213 int32_t gc_mask = U_GET_GC_MASK(c); 214 // strip trailing space and punctuation 215 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { 216 break; 217 } 218 result = ix; 219 } 220 return result; 
221} 222 223int WordBreaker::breakBadness() const { 224 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0; 225} 226 227void WordBreaker::finish() { 228 mText = nullptr; 229 // Note: calling utext_close multiply is safe 230 utext_close(&mUText); 231} 232 233} // namespace android 234