WordBreaker.cpp revision d8917c69a9f7b7ca52f7ac850922dab4322113f5
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define LOG_TAG "Minikin" 18#include <cutils/log.h> 19 20#include <minikin/WordBreaker.h> 21#include "MinikinInternal.h" 22 23#include <unicode/uchar.h> 24#include <unicode/utf16.h> 25 26namespace android { 27 28const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; 29const uint32_t CHAR_ZWJ = 0x200D; 30 31void WordBreaker::setLocale(const icu::Locale& locale) { 32 UErrorCode status = U_ZERO_ERROR; 33 mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); 34 // TODO: handle failure status 35 if (mText != nullptr) { 36 mBreakIterator->setText(&mUText, status); 37 } 38 mIteratorWasReset = true; 39} 40 41void WordBreaker::setText(const uint16_t* data, size_t size) { 42 mText = data; 43 mTextSize = size; 44 mIteratorWasReset = false; 45 mLast = 0; 46 mCurrent = 0; 47 mScanOffset = 0; 48 mInEmailOrUrl = false; 49 UErrorCode status = U_ZERO_ERROR; 50 utext_openUChars(&mUText, data, size, &status); 51 mBreakIterator->setText(&mUText, status); 52 mBreakIterator->first(); 53} 54 55ssize_t WordBreaker::current() const { 56 return mCurrent; 57} 58 59enum ScanState { 60 START, 61 SAW_AT, 62 SAW_COLON, 63 SAW_COLON_SLASH, 64 SAW_COLON_SLASH_SLASH, 65}; 66 67/** 68 * Determine whether a line break at position i within the buffer buf is valid. This 69 * represents customization beyond the ICU behavior, because plain ICU provides some 70 * line break opportunities that we don't want. 71 **/ 72static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) { 73 uint32_t codePoint; 74 size_t prev_offset = i; 75 U16_PREV(buf, 0, prev_offset, codePoint); 76 if (codePoint == CHAR_SOFT_HYPHEN) { 77 return false; 78 } 79 uint32_t next_codepoint; 80 size_t next_offset = i; 81 U16_NEXT(buf, next_offset, bufEnd, next_codepoint); 82 83 // Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt 84 //(AL | HL) × (PR | PO) 85 int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK); 86 if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) { 87 lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK); 88 if (lineBreak == U_LB_PREFIX_NUMERIC || lineBreak == U_LB_POSTFIX_NUMERIC) { 89 return false; 90 } 91 } 92 93 // Known emoji ZWJ sequences 94 if (codePoint == CHAR_ZWJ) { 95 // Possible emoji ZWJ sequence 96 if (next_codepoint == 0x2764 || // HEAVY BLACK HEART 97 next_codepoint == 0x1F466 || // BOY 98 next_codepoint == 0x1F467 || // GIRL 99 next_codepoint == 0x1F468 || // MAN 100 next_codepoint == 0x1F469 || // WOMAN 101 next_codepoint == 0x1F48B || // KISS MARK 102 next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE 103 return false; 104 } 105 } 106 107 // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf 108 // EB x EM 109 if (isEmojiModifier(next_codepoint)) { 110 if (codePoint == 0xFE0F && prev_offset > 0) { 111 // skip over emoji variation selector 112 U16_PREV(buf, 0, prev_offset, codePoint); 113 } 114 if (isEmojiBase(codePoint)) { 115 return false; 116 } 117 } 118 return true; 119} 120 121// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses 122static bool breakAfter(uint16_t c) { 123 return c == ':' || c == '=' || c == '&'; 124} 125 126// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses 127static bool breakBefore(uint16_t c) { 128 return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#' 129 || c == '%' || c == '=' || c == '&'; 130} 131 132ssize_t WordBreaker::next() { 133 mLast = mCurrent; 134 135 // scan forward from current ICU position for email address or URL 136 if (mLast >= mScanOffset) { 137 ScanState state = START; 138 size_t i; 139 for (i = mLast; i < mTextSize; i++) { 140 uint16_t c = mText[i]; 141 // scan only ASCII characters, stop at space 142 if (!(' ' < c && c <= 0x007E)) { 143 break; 144 } 145 if (state == START && c == '@') { 146 state = SAW_AT; 147 } else if (state == START && c == ':') { 148 state = SAW_COLON; 149 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 150 if (c == '/') { 151 state = static_cast<ScanState>((int)state + 1); // next state adds a slash 152 } else { 153 state = START; 154 } 155 } 156 } 157 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 158 if (!mBreakIterator->isBoundary(i)) { 159 i = mBreakIterator->following(i); 160 } 161 mInEmailOrUrl = true; 162 mIteratorWasReset = true; 163 } else { 164 mInEmailOrUrl = false; 165 } 166 mScanOffset = i; 167 } 168 169 if (mInEmailOrUrl) { 170 // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.) 171 uint16_t lastChar = mText[mLast]; 172 ssize_t i; 173 for (i = mLast + 1; i < mScanOffset; i++) { 174 if (breakAfter(lastChar)) { 175 break; 176 } 177 // break after double slash 178 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') { 179 break; 180 } 181 uint16_t thisChar = mText[i]; 182 // never break after hyphen 183 if (lastChar != '-') { 184 if (breakBefore(thisChar)) { 185 break; 186 } 187 // break before single slash 188 if (thisChar == '/' && lastChar != '/' && 189 !(i + 1 < mScanOffset && mText[i + 1] == '/')) { 190 break; 191 } 192 } 193 lastChar = thisChar; 194 } 195 mCurrent = i; 196 return mCurrent; 197 } 198 199 int32_t result; 200 do { 201 if (mIteratorWasReset) { 202 result = mBreakIterator->following(mCurrent); 203 mIteratorWasReset = false; 204 } else { 205 result = mBreakIterator->next(); 206 } 207 } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize 208 && !isBreakValid(mText, mTextSize, result)); 209 mCurrent = (ssize_t)result; 210 return mCurrent; 211} 212 213ssize_t WordBreaker::wordStart() const { 214 if (mInEmailOrUrl) { 215 return mLast; 216 } 217 ssize_t result = mLast; 218 while (result < mCurrent) { 219 UChar32 c; 220 ssize_t ix = result; 221 U16_NEXT(mText, ix, mCurrent, c); 222 int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 223 // strip leading punctuation, defined as OP and QU line breaking classes, 224 // see UAX #14 225 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 226 break; 227 } 228 result = ix; 229 } 230 return result; 231} 232 233ssize_t WordBreaker::wordEnd() const { 234 if (mInEmailOrUrl) { 235 return mLast; 236 } 237 ssize_t result = mCurrent; 238 while (result > mLast) { 239 UChar32 c; 240 ssize_t ix = result; 241 U16_PREV(mText, mLast, ix, c); 242 int32_t gc_mask = U_GET_GC_MASK(c); 243 // strip trailing space and punctuation 244 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { 245 break; 246 } 247 result = ix; 248 } 249 return result; 250} 251 252int WordBreaker::breakBadness() const { 253 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0; 254} 255 256void WordBreaker::finish() { 257 mText = nullptr; 258 // Note: calling utext_close multiply is safe 259 utext_close(&mUText); 260} 261 262} // namespace android 263