LineBreaker.cpp revision 510047d8fd99c960d131c701bdee217d1970987f
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define VERBOSE_DEBUG 0 18 19#define LOG_TAG "Minikin" 20 21#include <limits> 22 23#include <log/log.h> 24 25#include "LayoutUtils.h" 26#include <minikin/Layout.h> 27#include <minikin/LineBreaker.h> 28 29using std::vector; 30 31namespace minikin { 32 33constexpr uint16_t CHAR_TAB = 0x0009; 34constexpr uint16_t CHAR_NBSP = 0x00A0; 35 36// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these 37// constants are larger than any reasonable actual width score. 38const float SCORE_INFTY = std::numeric_limits<float>::max(); 39const float SCORE_OVERFULL = 1e12f; 40const float SCORE_DESPERATE = 1e10f; 41 42// Multiplier for hyphen penalty on last line. 43const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f; 44// Penalty assigned to each line break (to try to minimize number of lines) 45// TODO: when we implement full justification (so spaces can shrink and stretch), this is 46// probably not the most appropriate method. 47const float LINE_PENALTY_MULTIPLIER = 2.0f; 48 49// Penalty assigned to shrinking the whitepsace. 50const float SHRINK_PENALTY_MULTIPLIER = 4.0f; 51 52// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for 53// unreasonably long words. This is somewhat of a heuristic because extremely long words 54// are possible in some languages. This does mean that very long real words can get 55// broken by desperate breaks, with no hyphens. 56const size_t LONGEST_HYPHENATED_WORD = 45; 57 58// When the text buffer is within this limit, capacity of vectors is retained at finish(), 59// to avoid allocation. 60const size_t MAX_TEXT_BUF_RETAIN = 32678; 61 62// Maximum amount that spaces can shrink, in justified text. 63const float SHRINKABILITY = 1.0 / 3.0; 64 65void LineBreaker::setLocales(const char* locales, const std::vector<Hyphenator*>& hyphenators) { 66 bool goodLocaleFound = false; 67 const ssize_t numLocales = hyphenators.size(); 68 // For now, we ignore all locales except the first valid one. 69 // TODO: Support selecting the locale based on the script of the text. 70 const char* localeStart = locales; 71 for (ssize_t i = 0; i < numLocales - 1; i++) { // Loop over all locales, except the last one. 72 const char* localeEnd = strchr(localeStart, ','); 73 const size_t localeNameLength = localeEnd - localeStart; 74 char localeName[localeNameLength + 1]; 75 strncpy(localeName, localeStart, localeNameLength); 76 localeName[localeNameLength] = '\0'; 77 mLocale = icu::Locale::createFromName(localeName); 78 goodLocaleFound = !mLocale.isBogus(); 79 if (goodLocaleFound) { 80 mHyphenator = hyphenators[i]; 81 break; 82 } else { 83 localeStart = localeEnd + 1; 84 } 85 } 86 if (!goodLocaleFound) { // Try the last locale. 87 mLocale = icu::Locale::createFromName(localeStart); 88 if (mLocale.isBogus()) { 89 // No good locale. 90 mLocale = icu::Locale::getRoot(); 91 mHyphenator = nullptr; 92 } else { 93 mHyphenator = numLocales == 0 ? nullptr : hyphenators[numLocales - 1]; 94 } 95 } 96 mWordBreaker.setLocale(mLocale); 97} 98 99void LineBreaker::setText() { 100 mWordBreaker.setText(mTextBuf.data(), mTextBuf.size()); 101 102 // handle initial break here because addStyleRun may never be called 103 mWordBreaker.next(); 104 mCandidates.clear(); 105 Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, HyphenationType::DONT_BREAK}; 106 mCandidates.push_back(cand); 107 108 // reset greedy breaker state 109 mBreaks.clear(); 110 mWidths.clear(); 111 mFlags.clear(); 112 mLastBreak = 0; 113 mBestBreak = 0; 114 mBestScore = SCORE_INFTY; 115 mPreBreak = 0; 116 mLastHyphenation = HyphenEdit::NO_EDIT; 117 mFirstTabIndex = INT_MAX; 118 mSpaceCount = 0; 119} 120 121void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) { 122 mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth); 123} 124 125 126void LineBreaker::setIndents(const std::vector<float>& indents) { 127 mLineWidths.setIndents(indents); 128} 129 130// This function determines whether a character is a space that disappears at end of line. 131// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]], 132// plus '\n'. 133// Note: all such characters are in the BMP, so it's ok to use code units for this. 134static bool isLineEndSpace(uint16_t c) { 135 return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) || 136 c == 0x205F || c == 0x3000; 137} 138 139// Hyphenates a string potentially containing non-breaking spaces. The result would be saved 140// in mHyphBuf. 141void LineBreaker::hyphenate(const uint16_t* str, size_t len) { 142 mHyphBuf.clear(); 143 mHyphBuf.reserve(len); 144 145 // A word here is any consecutive string of non-NBSP characters. 146 bool inWord = false; 147 size_t wordStart = 0; // The initial value will never be accessed, but just in case. 148 for (size_t i = 0; i <= len; i++) { 149 if (i == len || str[i] == CHAR_NBSP) { 150 if (inWord) { 151 // A word just ended. Hyphenate it. 152 const size_t wordLen = i - wordStart; 153 if (wordLen <= LONGEST_HYPHENATED_WORD) { 154 if (wordStart == 0) { 155 // The string starts with a word. Use mHyphBuf directly. 156 mHyphenator->hyphenate(&mHyphBuf, str, wordLen, mLocale); 157 } else { 158 std::vector<HyphenationType> wordVec; 159 mHyphenator->hyphenate(&wordVec, str + wordStart, wordLen, mLocale); 160 mHyphBuf.insert(mHyphBuf.end(), wordVec.cbegin(), wordVec.cend()); 161 } 162 } else { // Word is too long. Inefficient to hyphenate. 163 mHyphBuf.insert(mHyphBuf.end(), wordLen, HyphenationType::DONT_BREAK); 164 } 165 inWord = false; 166 } 167 if (i < len) { 168 // Insert one DONT_BREAK for the NBSP. 169 mHyphBuf.push_back(HyphenationType::DONT_BREAK); 170 } 171 } else if (!inWord) { 172 inWord = true; 173 wordStart = i; 174 } 175 } 176} 177 178// Ordinarily, this method measures the text in the range given. However, when paint 179// is nullptr, it assumes the widths have already been calculated and stored in the 180// width buffer. 181// This method finds the candidate word breaks (using the ICU break iterator) and sends them 182// to addCandidate. 183float LineBreaker::addStyleRun(MinikinPaint* paint, const std::shared_ptr<FontCollection>& typeface, 184 FontStyle style, size_t start, size_t end, bool isRtl) { 185 float width = 0.0f; 186 int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR; 187 188 float hyphenPenalty = 0.0; 189 if (paint != nullptr) { 190 width = Layout::measureText(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, 191 style, *paint, typeface, mCharWidths.data() + start); 192 193 // a heuristic that seems to perform well 194 hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0); 195 if (mHyphenationFrequency == kHyphenationFrequency_Normal) { 196 hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing 197 } 198 199 if (mJustified) { 200 // Make hyphenation more aggressive for fully justified text (so that "normal" in 201 // justified mode is the same as "full" in ragged-right). 202 hyphenPenalty *= 0.25; 203 } else { 204 // Line penalty is zero for justified text. 205 mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER); 206 } 207 } 208 209 size_t current = (size_t)mWordBreaker.current(); 210 size_t afterWord = start; 211 size_t lastBreak = start; 212 ParaWidth lastBreakWidth = mWidth; 213 ParaWidth postBreak = mWidth; 214 size_t postSpaceCount = mSpaceCount; 215 for (size_t i = start; i < end; i++) { 216 uint16_t c = mTextBuf[i]; 217 if (c == CHAR_TAB) { 218 mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak); 219 if (mFirstTabIndex == INT_MAX) { 220 mFirstTabIndex = (int)i; 221 } 222 // fall back to greedy; other modes don't know how to deal with tabs 223 mStrategy = kBreakStrategy_Greedy; 224 } else { 225 if (isWordSpace(c)) mSpaceCount += 1; 226 mWidth += mCharWidths[i]; 227 if (!isLineEndSpace(c)) { 228 postBreak = mWidth; 229 postSpaceCount = mSpaceCount; 230 afterWord = i + 1; 231 } 232 } 233 if (i + 1 == current) { 234 size_t wordStart = mWordBreaker.wordStart(); 235 size_t wordEnd = mWordBreaker.wordEnd(); 236 if (paint != nullptr && mHyphenator != nullptr && 237 mHyphenationFrequency != kHyphenationFrequency_None && 238 wordStart >= start && wordEnd > wordStart) { 239 hyphenate(&mTextBuf[wordStart], wordEnd - wordStart); 240#if VERBOSE_DEBUG 241 std::string hyphenatedString; 242 for (size_t j = wordStart; j < wordEnd; j++) { 243 if (mHyphBuf[j - wordStart] == HyphenationType::BREAK_AND_INSERT_HYPHEN) { 244 hyphenatedString.push_back('-'); 245 } 246 // Note: only works with ASCII, should do UTF-8 conversion here 247 hyphenatedString.push_back(buffer()[j]); 248 } 249 ALOGD("hyphenated string: %s", hyphenatedString.c_str()); 250#endif 251 252 // measure hyphenated substrings 253 for (size_t j = wordStart; j < wordEnd; j++) { 254 HyphenationType hyph = mHyphBuf[j - wordStart]; 255 if (hyph != HyphenationType::DONT_BREAK) { 256 paint->hyphenEdit = HyphenEdit::editForThisLine(hyph); 257 const float firstPartWidth = Layout::measureText(mTextBuf.data(), 258 lastBreak, j - lastBreak, mTextBuf.size(), bidiFlags, style, 259 *paint, typeface, nullptr); 260 ParaWidth hyphPostBreak = lastBreakWidth + firstPartWidth; 261 262 paint->hyphenEdit = HyphenEdit::editForNextLine(hyph); 263 const float secondPartWidth = Layout::measureText(mTextBuf.data(), j, 264 afterWord - j, mTextBuf.size(), bidiFlags, style, *paint, 265 typeface, nullptr); 266 ParaWidth hyphPreBreak = postBreak - secondPartWidth; 267 268 addWordBreak(j, hyphPreBreak, hyphPostBreak, postSpaceCount, postSpaceCount, 269 hyphenPenalty, hyph); 270 271 paint->hyphenEdit = HyphenEdit::NO_EDIT; 272 } 273 } 274 } 275 276 // Skip break for zero-width characters inside replacement span 277 if (paint != nullptr || current == end || mCharWidths[current] > 0) { 278 float penalty = hyphenPenalty * mWordBreaker.breakBadness(); 279 addWordBreak(current, mWidth, postBreak, mSpaceCount, postSpaceCount, penalty, 280 HyphenationType::DONT_BREAK); 281 } 282 lastBreak = current; 283 lastBreakWidth = mWidth; 284 current = (size_t)mWordBreaker.next(); 285 } 286 } 287 288 return width; 289} 290 291// add a word break (possibly for a hyphenated fragment), and add desperate breaks if 292// needed (ie when word exceeds current line width) 293void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak, 294 size_t preSpaceCount, size_t postSpaceCount, float penalty, HyphenationType hyph) { 295 Candidate cand; 296 ParaWidth width = mCandidates.back().preBreak; 297 if (postBreak - width > currentLineWidth()) { 298 // Add desperate breaks. 299 // Note: these breaks are based on the shaping of the (non-broken) original text; they 300 // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping. 301 size_t i = mCandidates.back().offset; 302 width += mCharWidths[i++]; 303 for (; i < offset; i++) { 304 float w = mCharWidths[i]; 305 if (w > 0) { 306 cand.offset = i; 307 cand.preBreak = width; 308 cand.postBreak = width; 309 // postSpaceCount doesn't include trailing spaces 310 cand.preSpaceCount = postSpaceCount; 311 cand.postSpaceCount = postSpaceCount; 312 cand.penalty = SCORE_DESPERATE; 313 cand.hyphenType = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; 314#if VERBOSE_DEBUG 315 ALOGD("desperate cand: %zd %g:%g", 316 mCandidates.size(), cand.postBreak, cand.preBreak); 317#endif 318 addCandidate(cand); 319 width += w; 320 } 321 } 322 } 323 324 cand.offset = offset; 325 cand.preBreak = preBreak; 326 cand.postBreak = postBreak; 327 cand.penalty = penalty; 328 cand.preSpaceCount = preSpaceCount; 329 cand.postSpaceCount = postSpaceCount; 330 cand.hyphenType = hyph; 331#if VERBOSE_DEBUG 332 ALOGD("cand: %zd %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak); 333#endif 334 addCandidate(cand); 335} 336 337// Helper method for addCandidate() 338void LineBreaker::pushGreedyBreak() { 339 const Candidate& bestCandidate = mCandidates[mBestBreak]; 340 pushBreak(bestCandidate.offset, bestCandidate.postBreak - mPreBreak, 341 mLastHyphenation | HyphenEdit::editForThisLine(bestCandidate.hyphenType)); 342 mBestScore = SCORE_INFTY; 343#if VERBOSE_DEBUG 344 ALOGD("break: %d %g", mBreaks.back(), mWidths.back()); 345#endif 346 mLastBreak = mBestBreak; 347 mPreBreak = bestCandidate.preBreak; 348 mLastHyphenation = HyphenEdit::editForNextLine(bestCandidate.hyphenType); 349} 350 351// TODO performance: could avoid populating mCandidates if greedy only 352void LineBreaker::addCandidate(Candidate cand) { 353 const size_t candIndex = mCandidates.size(); 354 mCandidates.push_back(cand); 355 356 // mLastBreak is the index of the last line break we decided to do in mCandidates, 357 // and mPreBreak is its preBreak value. mBestBreak is the index of the best line breaking candidate 358 // we have found since then, and mBestScore is its penalty. 359 if (cand.postBreak - mPreBreak > currentLineWidth()) { 360 // This break would create an overfull line, pick the best break and break there (greedy) 361 if (mBestBreak == mLastBreak) { 362 // No good break has been found since last break. Break here. 363 mBestBreak = candIndex; 364 } 365 pushGreedyBreak(); 366 } 367 368 while (mLastBreak != candIndex && cand.postBreak - mPreBreak > currentLineWidth()) { 369 // We should rarely come here. But if we are here, we have broken the line, but the 370 // remaining part still doesn't fit. We now need to break at the second best place after the 371 // last break, but we have not kept that information, so we need to go back and find it. 372 // 373 // In some really rare cases, postBreak - preBreak of a candidate itself may be over the 374 // current line width. We protect ourselves against an infinite loop in that case by 375 // checking that we have not broken the line at this candidate already. 376 for (size_t i = mLastBreak + 1; i < candIndex; i++) { 377 const float penalty = mCandidates[i].penalty; 378 if (penalty <= mBestScore) { 379 mBestBreak = i; 380 mBestScore = penalty; 381 } 382 } 383 if (mBestBreak == mLastBreak) { 384 // We didn't find anything good. Break here. 385 mBestBreak = candIndex; 386 } 387 pushGreedyBreak(); 388 } 389 390 if (cand.penalty <= mBestScore) { 391 mBestBreak = candIndex; 392 mBestScore = cand.penalty; 393 } 394} 395 396void LineBreaker::pushBreak(int offset, float width, uint8_t hyphenEdit) { 397 mBreaks.push_back(offset); 398 mWidths.push_back(width); 399 int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift; 400 flags |= hyphenEdit; 401 mFlags.push_back(flags); 402 mFirstTabIndex = INT_MAX; 403} 404 405void LineBreaker::addReplacement(size_t start, size_t end, float width) { 406 mCharWidths[start] = width; 407 std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f); 408 addStyleRun(nullptr, nullptr, FontStyle(), start, end, false); 409} 410 411// Get the width of a space. May return 0 if there are no spaces. 412// Note: if there are multiple different widths for spaces (for example, because of mixing of 413// fonts), it's only guaranteed to pick one. 414float LineBreaker::getSpaceWidth() const { 415 for (size_t i = 0; i < mTextBuf.size(); i++) { 416 if (isWordSpace(mTextBuf[i])) { 417 return mCharWidths[i]; 418 } 419 } 420 return 0.0f; 421} 422 423float LineBreaker::currentLineWidth() const { 424 return mLineWidths.getLineWidth(mBreaks.size()); 425} 426 427void LineBreaker::computeBreaksGreedy() { 428 // All breaks but the last have been added in addCandidate already. 429 size_t nCand = mCandidates.size(); 430 if (nCand == 1 || mLastBreak != nCand - 1) { 431 pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 432 mLastHyphenation); 433 // don't need to update mBestScore, because we're done 434#if VERBOSE_DEBUG 435 ALOGD("final break: %d %g", mBreaks.back(), mWidths.back()); 436#endif 437 } 438} 439 440// Follow "prev" links in mCandidates array, and copy to result arrays. 441void LineBreaker::finishBreaksOptimal() { 442 // clear existing greedy break result 443 mBreaks.clear(); 444 mWidths.clear(); 445 mFlags.clear(); 446 size_t nCand = mCandidates.size(); 447 size_t prev; 448 for (size_t i = nCand - 1; i > 0; i = prev) { 449 prev = mCandidates[i].prev; 450 mBreaks.push_back(mCandidates[i].offset); 451 mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak); 452 int flags = HyphenEdit::editForThisLine(mCandidates[i].hyphenType); 453 if (prev > 0) { 454 flags |= HyphenEdit::editForNextLine(mCandidates[prev].hyphenType); 455 } 456 mFlags.push_back(flags); 457 } 458 std::reverse(mBreaks.begin(), mBreaks.end()); 459 std::reverse(mWidths.begin(), mWidths.end()); 460 std::reverse(mFlags.begin(), mFlags.end()); 461} 462 463void LineBreaker::computeBreaksOptimal(bool isRectangle) { 464 size_t active = 0; 465 size_t nCand = mCandidates.size(); 466 float width = mLineWidths.getLineWidth(0); 467 float maxShrink = mJustified ? SHRINKABILITY * getSpaceWidth() : 0.0f; 468 469 // "i" iterates through candidates for the end of the line. 470 for (size_t i = 1; i < nCand; i++) { 471 bool atEnd = i == nCand - 1; 472 float best = SCORE_INFTY; 473 size_t bestPrev = 0; 474 size_t lineNumberLast = 0; 475 476 if (!isRectangle) { 477 size_t lineNumberLast = mCandidates[active].lineNumber; 478 width = mLineWidths.getLineWidth(lineNumberLast); 479 } 480 ParaWidth leftEdge = mCandidates[i].postBreak - width; 481 float bestHope = 0; 482 483 // "j" iterates through candidates for the beginning of the line. 484 for (size_t j = active; j < i; j++) { 485 if (!isRectangle) { 486 size_t lineNumber = mCandidates[j].lineNumber; 487 if (lineNumber != lineNumberLast) { 488 float widthNew = mLineWidths.getLineWidth(lineNumber); 489 if (widthNew != width) { 490 leftEdge = mCandidates[i].postBreak - width; 491 bestHope = 0; 492 width = widthNew; 493 } 494 lineNumberLast = lineNumber; 495 } 496 } 497 float jScore = mCandidates[j].score; 498 if (jScore + bestHope >= best) continue; 499 float delta = mCandidates[j].preBreak - leftEdge; 500 501 // compute width score for line 502 503 // Note: the "bestHope" optimization makes the assumption that, when delta is 504 // non-negative, widthScore will increase monotonically as successive candidate 505 // breaks are considered. 506 float widthScore = 0.0f; 507 float additionalPenalty = 0.0f; 508 if ((atEnd || !mJustified) && delta < 0) { 509 widthScore = SCORE_OVERFULL; 510 } else if (atEnd && mStrategy != kBreakStrategy_Balanced) { 511 // increase penalty for hyphen on last line 512 additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty; 513 } else { 514 widthScore = delta * delta; 515 if (delta < 0) { 516 if (-delta < maxShrink * 517 (mCandidates[i].postSpaceCount - mCandidates[j].preSpaceCount)) { 518 widthScore *= SHRINK_PENALTY_MULTIPLIER; 519 } else { 520 widthScore = SCORE_OVERFULL; 521 } 522 } 523 } 524 525 if (delta < 0) { 526 active = j + 1; 527 } else { 528 bestHope = widthScore; 529 } 530 531 float score = jScore + widthScore + additionalPenalty; 532 if (score <= best) { 533 best = score; 534 bestPrev = j; 535 } 536 } 537 mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty; 538 mCandidates[i].prev = bestPrev; 539 mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1; 540#if VERBOSE_DEBUG 541 ALOGD("break %zd: score=%g, prev=%zd", i, mCandidates[i].score, mCandidates[i].prev); 542#endif 543 } 544 finishBreaksOptimal(); 545} 546 547size_t LineBreaker::computeBreaks() { 548 if (mStrategy == kBreakStrategy_Greedy) { 549 computeBreaksGreedy(); 550 } else { 551 computeBreaksOptimal(mLineWidths.isConstant()); 552 } 553 return mBreaks.size(); 554} 555 556void LineBreaker::finish() { 557 mWordBreaker.finish(); 558 mWidth = 0; 559 mLineWidths.clear(); 560 mCandidates.clear(); 561 mBreaks.clear(); 562 mWidths.clear(); 563 mFlags.clear(); 564 if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) { 565 mTextBuf.clear(); 566 mTextBuf.shrink_to_fit(); 567 mCharWidths.clear(); 568 mCharWidths.shrink_to_fit(); 569 mHyphBuf.clear(); 570 mHyphBuf.shrink_to_fit(); 571 mCandidates.shrink_to_fit(); 572 mBreaks.shrink_to_fit(); 573 mWidths.shrink_to_fit(); 574 mFlags.shrink_to_fit(); 575 } 576 mStrategy = kBreakStrategy_Greedy; 577 mHyphenationFrequency = kHyphenationFrequency_Normal; 578 mLinePenalty = 0.0f; 579 mJustified = false; 580} 581 582} // namespace minikin 583