1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define LOG_TAG "Minikin" 18#include <cutils/log.h> 19 20#include <minikin/WordBreaker.h> 21#include "MinikinInternal.h" 22 23#include <unicode/uchar.h> 24#include <unicode/utf16.h> 25 26namespace android { 27 28const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; 29const uint32_t CHAR_ZWJ = 0x200D; 30 31void WordBreaker::setLocale(const icu::Locale& locale) { 32 UErrorCode status = U_ZERO_ERROR; 33 mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status)); 34 // TODO: handle failure status 35 if (mText != nullptr) { 36 mBreakIterator->setText(&mUText, status); 37 } 38 mIteratorWasReset = true; 39} 40 41void WordBreaker::setText(const uint16_t* data, size_t size) { 42 mText = data; 43 mTextSize = size; 44 mIteratorWasReset = false; 45 mLast = 0; 46 mCurrent = 0; 47 mScanOffset = 0; 48 mInEmailOrUrl = false; 49 UErrorCode status = U_ZERO_ERROR; 50 utext_openUChars(&mUText, data, size, &status); 51 mBreakIterator->setText(&mUText, status); 52 mBreakIterator->first(); 53} 54 55ssize_t WordBreaker::current() const { 56 return mCurrent; 57} 58 59enum ScanState { 60 START, 61 SAW_AT, 62 SAW_COLON, 63 SAW_COLON_SLASH, 64 SAW_COLON_SLASH_SLASH, 65}; 66 67/** 68 * Determine whether a line break at position i within the buffer buf is valid. This 69 * represents customization beyond the ICU behavior, because plain ICU provides some 70 * line break opportunities that we don't want. 71 **/ 72static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) { 73 uint32_t codePoint; 74 size_t prev_offset = i; 75 U16_PREV(buf, 0, prev_offset, codePoint); 76 if (codePoint == CHAR_SOFT_HYPHEN) { 77 return false; 78 } 79 // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go 80 // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid 81 // too much looking around in the strings, we simply avoid breaking after any Myanmar virama, 82 // where no line break could be imagined, since the Myanmar virama is a pure stacker. 83 if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA 84 return false; 85 } 86 87 uint32_t next_codepoint; 88 size_t next_offset = i; 89 U16_NEXT(buf, next_offset, bufEnd, next_codepoint); 90 91 // Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt 92 // (AL | HL) × (PR | PO) 93 int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK); 94 if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) { 95 lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK); 96 if (lineBreak == U_LB_PREFIX_NUMERIC || lineBreak == U_LB_POSTFIX_NUMERIC) { 97 return false; 98 } 99 } 100 101 // Emoji ZWJ sequences. 102 if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) { 103 return false; 104 } 105 106 // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf 107 // EB x EM 108 if (isEmojiModifier(next_codepoint)) { 109 if (codePoint == 0xFE0F && prev_offset > 0) { 110 // skip over emoji variation selector 111 U16_PREV(buf, 0, prev_offset, codePoint); 112 } 113 if (isEmojiBase(codePoint)) { 114 return false; 115 } 116 } 117 return true; 118} 119 120// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses 121static bool breakAfter(uint16_t c) { 122 return c == ':' || c == '=' || c == '&'; 123} 124 125// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses 126static bool breakBefore(uint16_t c) { 127 return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#' 128 || c == '%' || c == '=' || c == '&'; 129} 130 131ssize_t WordBreaker::next() { 132 mLast = mCurrent; 133 134 // scan forward from current ICU position for email address or URL 135 if (mLast >= mScanOffset) { 136 ScanState state = START; 137 size_t i; 138 for (i = mLast; i < mTextSize; i++) { 139 uint16_t c = mText[i]; 140 // scan only ASCII characters, stop at space 141 if (!(' ' < c && c <= 0x007E)) { 142 break; 143 } 144 if (state == START && c == '@') { 145 state = SAW_AT; 146 } else if (state == START && c == ':') { 147 state = SAW_COLON; 148 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 149 if (c == '/') { 150 state = static_cast<ScanState>((int)state + 1); // next state adds a slash 151 } else { 152 state = START; 153 } 154 } 155 } 156 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 157 if (!mBreakIterator->isBoundary(i)) { 158 i = mBreakIterator->following(i); 159 } 160 mInEmailOrUrl = true; 161 mIteratorWasReset = true; 162 } else { 163 mInEmailOrUrl = false; 164 } 165 mScanOffset = i; 166 } 167 168 if (mInEmailOrUrl) { 169 // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.) 170 uint16_t lastChar = mText[mLast]; 171 ssize_t i; 172 for (i = mLast + 1; i < mScanOffset; i++) { 173 if (breakAfter(lastChar)) { 174 break; 175 } 176 // break after double slash 177 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') { 178 break; 179 } 180 uint16_t thisChar = mText[i]; 181 // never break after hyphen 182 if (lastChar != '-') { 183 if (breakBefore(thisChar)) { 184 break; 185 } 186 // break before single slash 187 if (thisChar == '/' && lastChar != '/' && 188 !(i + 1 < mScanOffset && mText[i + 1] == '/')) { 189 break; 190 } 191 } 192 lastChar = thisChar; 193 } 194 mCurrent = i; 195 return mCurrent; 196 } 197 198 int32_t result; 199 do { 200 if (mIteratorWasReset) { 201 result = mBreakIterator->following(mCurrent); 202 mIteratorWasReset = false; 203 } else { 204 result = mBreakIterator->next(); 205 } 206 } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize 207 && !isBreakValid(mText, mTextSize, result)); 208 mCurrent = (ssize_t)result; 209 return mCurrent; 210} 211 212ssize_t WordBreaker::wordStart() const { 213 if (mInEmailOrUrl) { 214 return mLast; 215 } 216 ssize_t result = mLast; 217 while (result < mCurrent) { 218 UChar32 c; 219 ssize_t ix = result; 220 U16_NEXT(mText, ix, mCurrent, c); 221 int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 222 // strip leading punctuation, defined as OP and QU line breaking classes, 223 // see UAX #14 224 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 225 break; 226 } 227 result = ix; 228 } 229 return result; 230} 231 232ssize_t WordBreaker::wordEnd() const { 233 if (mInEmailOrUrl) { 234 return mLast; 235 } 236 ssize_t result = mCurrent; 237 while (result > mLast) { 238 UChar32 c; 239 ssize_t ix = result; 240 U16_PREV(mText, mLast, ix, c); 241 int32_t gc_mask = U_GET_GC_MASK(c); 242 // strip trailing space and punctuation 243 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) { 244 break; 245 } 246 result = ix; 247 } 248 return result; 249} 250 251int WordBreaker::breakBadness() const { 252 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0; 253} 254 255void WordBreaker::finish() { 256 mText = nullptr; 257 // Note: calling utext_close multiply is safe 258 utext_close(&mUText); 259} 260 261} // namespace android 262