LineBreaker.cpp revision 39ab40115fae6d0c948e435233b3dd997ee7d8e5
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define VERBOSE_DEBUG 0 18 19#define LOG_TAG "Minikin" 20 21#include <limits> 22 23#include <log/log.h> 24 25#include <minikin/Layout.h> 26#include <minikin/LineBreaker.h> 27 28using std::vector; 29 30namespace android { 31 32const int CHAR_TAB = 0x0009; 33 34// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these 35// constants are larger than any reasonable actual width score. 36const float SCORE_INFTY = std::numeric_limits<float>::max(); 37const float SCORE_OVERFULL = 1e12f; 38const float SCORE_DESPERATE = 1e10f; 39 40// Multiplier for hyphen penalty on last line. 41const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f; 42// Penalty assigned to each line break (to try to minimize number of lines) 43// TODO: when we implement full justification (so spaces can shrink and stretch), this is 44// probably not the most appropriate method. 45const float LINE_PENALTY_MULTIPLIER = 2.0f; 46 47// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for 48// unreasonably long words. This is somewhat of a heuristic because extremely long words 49// are possible in some languages. This does mean that very long real words can get 50// broken by desperate breaks, with no hyphens. 51const size_t LONGEST_HYPHENATED_WORD = 45; 52 53// When the text buffer is within this limit, capacity of vectors is retained at finish(), 54// to avoid allocation. 55const size_t MAX_TEXT_BUF_RETAIN = 32678; 56 57void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) { 58 mWordBreaker.setLocale(locale); 59 60 mHyphenator = hyphenator; 61} 62 63void LineBreaker::setText() { 64 mWordBreaker.setText(mTextBuf.data(), mTextBuf.size()); 65 66 // handle initial break here because addStyleRun may never be called 67 mWordBreaker.next(); 68 mCandidates.clear(); 69 Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0}; 70 mCandidates.push_back(cand); 71 72 // reset greedy breaker state 73 mBreaks.clear(); 74 mWidths.clear(); 75 mFlags.clear(); 76 mLastBreak = 0; 77 mBestBreak = 0; 78 mBestScore = SCORE_INFTY; 79 mPreBreak = 0; 80 mFirstTabIndex = INT_MAX; 81} 82 83void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) { 84 mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth); 85} 86 87 88void LineBreaker::setIndents(const std::vector<float>& indents) { 89 mLineWidths.setIndents(indents); 90} 91 92// This function determines whether a character is a space that disappears at end of line. 93// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]], 94// plus '\n'. 95// Note: all such characters are in the BMP, so it's ok to use code units for this. 96static bool isLineEndSpace(uint16_t c) { 97 return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) || 98 c == 0x205F || c == 0x3000; 99} 100 101// This function determines whether a character is like U+2010 HYPHEN in 102// line breaking and usage: a character immediately after which line breaks 103// are allowed, but words containing it should not be automatically 104// hyphenated. This is a curated set, created by manually inspecting all 105// the characters that have the Unicode line breaking property of BA or HY 106// and seeing which ones are hyphens. 107static bool isLineBreakingHyphen(uint16_t c) { 108 return (c == 0x002D || // HYPHEN-MINUS 109 c == 0x058A || // ARMENIAN HYPHEN 110 c == 0x05BE || // HEBREW PUNCTUATION MAQAF 111 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN 112 c == 0x2010 || // HYPHEN 113 c == 0x2013 || // EN DASH 114 c == 0x2027 || // HYPHENATION POINT 115 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN 116 c == 0x2E40); // DOUBLE HYPHEN 117} 118 119// Ordinarily, this method measures the text in the range given. However, when paint 120// is nullptr, it assumes the widths have already been calculated and stored in the 121// width buffer. 122// This method finds the candidate word breaks (using the ICU break iterator) and sends them 123// to addCandidate. 124float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface, 125 FontStyle style, size_t start, size_t end, bool isRtl) { 126 float width = 0.0f; 127 int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR; 128 129 float hyphenPenalty = 0.0; 130 if (paint != nullptr) { 131 width = Layout::measureText(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, 132 style, *paint, typeface, mCharWidths.data() + start); 133 134 // a heuristic that seems to perform well 135 hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0); 136 if (mHyphenationFrequency == kHyphenationFrequency_Normal) { 137 hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing 138 } 139 140 mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER); 141 } 142 143 size_t current = (size_t)mWordBreaker.current(); 144 size_t afterWord = start; 145 size_t lastBreak = start; 146 ParaWidth lastBreakWidth = mWidth; 147 ParaWidth postBreak = mWidth; 148 bool temporarilySkipHyphenation = false; 149 for (size_t i = start; i < end; i++) { 150 uint16_t c = mTextBuf[i]; 151 if (c == CHAR_TAB) { 152 mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak); 153 if (mFirstTabIndex == INT_MAX) { 154 mFirstTabIndex = (int)i; 155 } 156 // fall back to greedy; other modes don't know how to deal with tabs 157 mStrategy = kBreakStrategy_Greedy; 158 } else { 159 mWidth += mCharWidths[i]; 160 if (!isLineEndSpace(c)) { 161 postBreak = mWidth; 162 afterWord = i + 1; 163 } 164 } 165 if (i + 1 == current) { 166 // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so 167 // we can pass the whole word down to Hyphenator like the soft hyphen case. 168 bool wordEndsInHyphen = isLineBreakingHyphen(c); 169 size_t wordStart = mWordBreaker.wordStart(); 170 size_t wordEnd = mWordBreaker.wordEnd(); 171 if (paint != nullptr && mHyphenator != nullptr && 172 mHyphenationFrequency != kHyphenationFrequency_None && 173 !wordEndsInHyphen && !temporarilySkipHyphenation && 174 wordStart >= start && wordEnd > wordStart && 175 wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) { 176 mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart); 177#if VERBOSE_DEBUG 178 std::string hyphenatedString; 179 for (size_t j = wordStart; j < wordEnd; j++) { 180 if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-'); 181 // Note: only works with ASCII, should do UTF-8 conversion here 182 hyphenatedString.push_back(buffer()[j]); 183 } 184 ALOGD("hyphenated string: %s", hyphenatedString.c_str()); 185#endif 186 187 // measure hyphenated substrings 188 for (size_t j = wordStart; j < wordEnd; j++) { 189 uint8_t hyph = mHyphBuf[j - wordStart]; 190 if (hyph) { 191 paint->hyphenEdit = hyph; 192 193 const float firstPartWidth = Layout::measureText(mTextBuf.data(), 194 lastBreak, j - lastBreak, mTextBuf.size(), bidiFlags, style, 195 *paint, typeface, nullptr); 196 ParaWidth hyphPostBreak = lastBreakWidth + firstPartWidth; 197 paint->hyphenEdit = 0; 198 199 const float secondPartWith = Layout::measureText(mTextBuf.data(), j, 200 afterWord - j, mTextBuf.size(), bidiFlags, style, *paint, 201 typeface, nullptr); 202 ParaWidth hyphPreBreak = postBreak - secondPartWith; 203 addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph); 204 } 205 } 206 } 207 // Skip hyphenating the next word if and only if the present word ends in a hyphen 208 temporarilySkipHyphenation = wordEndsInHyphen; 209 210 // Skip break for zero-width characters inside replacement span 211 if (paint != nullptr || current == end || mCharWidths[current] > 0) { 212 float penalty = hyphenPenalty * mWordBreaker.breakBadness(); 213 addWordBreak(current, mWidth, postBreak, penalty, 0); 214 } 215 lastBreak = current; 216 lastBreakWidth = mWidth; 217 current = (size_t)mWordBreaker.next(); 218 } 219 } 220 221 return width; 222} 223 224// add a word break (possibly for a hyphenated fragment), and add desperate breaks if 225// needed (ie when word exceeds current line width) 226void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak, 227 float penalty, uint8_t hyph) { 228 Candidate cand; 229 ParaWidth width = mCandidates.back().preBreak; 230 if (postBreak - width > currentLineWidth()) { 231 // Add desperate breaks. 232 // Note: these breaks are based on the shaping of the (non-broken) original text; they 233 // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping. 234 size_t i = mCandidates.back().offset; 235 width += mCharWidths[i++]; 236 for (; i < offset; i++) { 237 float w = mCharWidths[i]; 238 if (w > 0) { 239 cand.offset = i; 240 cand.preBreak = width; 241 cand.postBreak = width; 242 cand.penalty = SCORE_DESPERATE; 243 cand.hyphenEdit = 0; 244#if VERBOSE_DEBUG 245 ALOGD("desperate cand: %zd %g:%g", 246 mCandidates.size(), cand.postBreak, cand.preBreak); 247#endif 248 addCandidate(cand); 249 width += w; 250 } 251 } 252 } 253 254 cand.offset = offset; 255 cand.preBreak = preBreak; 256 cand.postBreak = postBreak; 257 cand.penalty = penalty; 258 cand.hyphenEdit = hyph; 259#if VERBOSE_DEBUG 260 ALOGD("cand: %zd %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak); 261#endif 262 addCandidate(cand); 263} 264 265// TODO performance: could avoid populating mCandidates if greedy only 266void LineBreaker::addCandidate(Candidate cand) { 267 size_t candIndex = mCandidates.size(); 268 mCandidates.push_back(cand); 269 if (cand.postBreak - mPreBreak > currentLineWidth()) { 270 // This break would create an overfull line, pick the best break and break there (greedy) 271 if (mBestBreak == mLastBreak) { 272 mBestBreak = candIndex; 273 } 274 pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak, 275 mCandidates[mBestBreak].hyphenEdit); 276 mBestScore = SCORE_INFTY; 277#if VERBOSE_DEBUG 278 ALOGD("break: %d %g", mBreaks.back(), mWidths.back()); 279#endif 280 mLastBreak = mBestBreak; 281 mPreBreak = mCandidates[mBestBreak].preBreak; 282 } 283 if (cand.penalty <= mBestScore) { 284 mBestBreak = candIndex; 285 mBestScore = cand.penalty; 286 } 287} 288 289void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) { 290 mBreaks.push_back(offset); 291 mWidths.push_back(width); 292 int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift; 293 flags |= hyph; 294 mFlags.push_back(flags); 295 mFirstTabIndex = INT_MAX; 296} 297 298void LineBreaker::addReplacement(size_t start, size_t end, float width) { 299 mCharWidths[start] = width; 300 std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f); 301 addStyleRun(nullptr, nullptr, FontStyle(), start, end, false); 302} 303 304float LineBreaker::currentLineWidth() const { 305 return mLineWidths.getLineWidth(mBreaks.size()); 306} 307 308void LineBreaker::computeBreaksGreedy() { 309 // All breaks but the last have been added in addCandidate already. 310 size_t nCand = mCandidates.size(); 311 if (nCand == 1 || mLastBreak != nCand - 1) { 312 pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0); 313 // don't need to update mBestScore, because we're done 314#if VERBOSE_DEBUG 315 ALOGD("final break: %d %g", mBreaks.back(), mWidths.back()); 316#endif 317 } 318} 319 320// Follow "prev" links in mCandidates array, and copy to result arrays. 321void LineBreaker::finishBreaksOptimal() { 322 // clear existing greedy break result 323 mBreaks.clear(); 324 mWidths.clear(); 325 mFlags.clear(); 326 size_t nCand = mCandidates.size(); 327 size_t prev; 328 for (size_t i = nCand - 1; i > 0; i = prev) { 329 prev = mCandidates[i].prev; 330 mBreaks.push_back(mCandidates[i].offset); 331 mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak); 332 mFlags.push_back(mCandidates[i].hyphenEdit); 333 } 334 std::reverse(mBreaks.begin(), mBreaks.end()); 335 std::reverse(mWidths.begin(), mWidths.end()); 336 std::reverse(mFlags.begin(), mFlags.end()); 337} 338 339void LineBreaker::computeBreaksOptimal(bool isRectangle) { 340 size_t active = 0; 341 size_t nCand = mCandidates.size(); 342 float width = mLineWidths.getLineWidth(0); 343 for (size_t i = 1; i < nCand; i++) { 344 bool atEnd = i == nCand - 1; 345 float best = SCORE_INFTY; 346 size_t bestPrev = 0; 347 size_t lineNumberLast = 0; 348 349 if (!isRectangle) { 350 size_t lineNumberLast = mCandidates[active].lineNumber; 351 width = mLineWidths.getLineWidth(lineNumberLast); 352 } 353 ParaWidth leftEdge = mCandidates[i].postBreak - width; 354 float bestHope = 0; 355 356 for (size_t j = active; j < i; j++) { 357 if (!isRectangle) { 358 size_t lineNumber = mCandidates[j].lineNumber; 359 if (lineNumber != lineNumberLast) { 360 float widthNew = mLineWidths.getLineWidth(lineNumber); 361 if (widthNew != width) { 362 leftEdge = mCandidates[i].postBreak - width; 363 bestHope = 0; 364 width = widthNew; 365 } 366 lineNumberLast = lineNumber; 367 } 368 } 369 float jScore = mCandidates[j].score; 370 if (jScore + bestHope >= best) continue; 371 float delta = mCandidates[j].preBreak - leftEdge; 372 373 // compute width score for line 374 375 // Note: the "bestHope" optimization makes the assumption that, when delta is 376 // non-negative, widthScore will increase monotonically as successive candidate 377 // breaks are considered. 378 float widthScore = 0.0f; 379 float additionalPenalty = 0.0f; 380 if (delta < 0) { 381 widthScore = SCORE_OVERFULL; 382 } else if (atEnd && mStrategy != kBreakStrategy_Balanced) { 383 // increase penalty for hyphen on last line 384 additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty; 385 } else { 386 widthScore = delta * delta; 387 } 388 389 if (delta < 0) { 390 active = j + 1; 391 } else { 392 bestHope = widthScore; 393 } 394 395 float score = jScore + widthScore + additionalPenalty; 396 if (score <= best) { 397 best = score; 398 bestPrev = j; 399 } 400 } 401 mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty; 402 mCandidates[i].prev = bestPrev; 403 mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1; 404#if VERBOSE_DEBUG 405 ALOGD("break %zd: score=%g, prev=%zd", i, mCandidates[i].score, mCandidates[i].prev); 406#endif 407 } 408 finishBreaksOptimal(); 409} 410 411size_t LineBreaker::computeBreaks() { 412 if (mStrategy == kBreakStrategy_Greedy) { 413 computeBreaksGreedy(); 414 } else { 415 computeBreaksOptimal(mLineWidths.isConstant()); 416 } 417 return mBreaks.size(); 418} 419 420void LineBreaker::finish() { 421 mWordBreaker.finish(); 422 mWidth = 0; 423 mLineWidths.clear(); 424 mCandidates.clear(); 425 mBreaks.clear(); 426 mWidths.clear(); 427 mFlags.clear(); 428 if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) { 429 mTextBuf.clear(); 430 mTextBuf.shrink_to_fit(); 431 mCharWidths.clear(); 432 mCharWidths.shrink_to_fit(); 433 mHyphBuf.clear(); 434 mHyphBuf.shrink_to_fit(); 435 mCandidates.shrink_to_fit(); 436 mBreaks.shrink_to_fit(); 437 mWidths.shrink_to_fit(); 438 mFlags.shrink_to_fit(); 439 } 440 mStrategy = kBreakStrategy_Greedy; 441 mHyphenationFrequency = kHyphenationFrequency_Normal; 442 mLinePenalty = 0.0f; 443} 444 445} // namespace android 446