LineBreaker.cpp revision c88ef135fcc2661ec7addc171ebc60787df38aff
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define VERBOSE_DEBUG 0 18 19#include <limits> 20 21#define LOG_TAG "Minikin" 22#include <cutils/log.h> 23 24#include <minikin/Layout.h> 25#include <minikin/LineBreaker.h> 26 27using std::vector; 28 29namespace android { 30 31const int CHAR_TAB = 0x0009; 32 33// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these 34// constants are larger than any reasonable actual width score. 35const float SCORE_INFTY = std::numeric_limits<float>::max(); 36const float SCORE_OVERFULL = 1e12f; 37const float SCORE_DESPERATE = 1e10f; 38 39// Multiplier for hyphen penalty on last line. 40const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f; 41// Penalty assigned to each line break (to try to minimize number of lines) 42// TODO: when we implement full justification (so spaces can shrink and stretch), this is 43// probably not the most appropriate method. 44const float LINE_PENALTY_MULTIPLIER = 2.0f; 45 46// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for 47// unreasonably long words. This is somewhat of a heuristic because extremely long words 48// are possible in some languages. This does mean that very long real words can get 49// broken by desperate breaks, with no hyphens. 50const size_t LONGEST_HYPHENATED_WORD = 45; 51 52// When the text buffer is within this limit, capacity of vectors is retained at finish(), 53// to avoid allocation. 54const size_t MAX_TEXT_BUF_RETAIN = 32678; 55 56void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) { 57 mWordBreaker.setLocale(locale); 58 59 mHyphenator = hyphenator; 60} 61 62void LineBreaker::setText() { 63 mWordBreaker.setText(mTextBuf.data(), mTextBuf.size()); 64 65 // handle initial break here because addStyleRun may never be called 66 mWordBreaker.next(); 67 mCandidates.clear(); 68 Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0}; 69 mCandidates.push_back(cand); 70 71 // reset greedy breaker state 72 mBreaks.clear(); 73 mWidths.clear(); 74 mFlags.clear(); 75 mLastBreak = 0; 76 mBestBreak = 0; 77 mBestScore = SCORE_INFTY; 78 mPreBreak = 0; 79 mFirstTabIndex = INT_MAX; 80} 81 82void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) { 83 mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth); 84} 85 86 87void LineBreaker::setIndents(const std::vector<float>& indents) { 88 mLineWidths.setIndents(indents); 89} 90 91// This function determines whether a character is a space that disappears at end of line. 92// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]], 93// plus '\n'. 94// Note: all such characters are in the BMP, so it's ok to use code units for this. 95static bool isLineEndSpace(uint16_t c) { 96 return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) || 97 c == 0x205F || c == 0x3000; 98} 99 100// This function determines whether a character is like U+2010 HYPHEN in 101// line breaking and usage: a character immediately after which line breaks 102// are allowed, but words containing it should not be automatically 103// hyphenated. This is a curated set, created by manually inspecting all 104// the characters that have the Unicode line breaking property of BA or HY 105// and seeing which ones are hyphens. 106static bool isLineBreakingHyphen(uint16_t c) { 107 return (c == 0x002D || // HYPHEN-MINUS 108 c == 0x058A || // ARMENIAN HYPHEN 109 c == 0x05BE || // HEBREW PUNCTUATION MAQAF 110 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN 111 c == 0x2010 || // HYPHEN 112 c == 0x2013 || // EN DASH 113 c == 0x2027 || // HYPHENATION POINT 114 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN 115 c == 0x2E40); // DOUBLE HYPHEN 116} 117 118// Ordinarily, this method measures the text in the range given. However, when paint 119// is nullptr, it assumes the widths have already been calculated and stored in the 120// width buffer. 121// This method finds the candidate word breaks (using the ICU break iterator) and sends them 122// to addCandidate. 123float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface, 124 FontStyle style, size_t start, size_t end, bool isRtl) { 125 Layout layout; // performance TODO: move layout to self object to reduce allocation cost? 126 float width = 0.0f; 127 int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR; 128 129 float hyphenPenalty = 0.0; 130 if (paint != nullptr) { 131 layout.setFontCollection(typeface); 132 layout.doLayout(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, style, 133 *paint); 134 layout.getAdvances(mCharWidths.data() + start); 135 width = layout.getAdvance(); 136 137 // a heuristic that seems to perform well 138 hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0); 139 if (mHyphenationFrequency == kHyphenationFrequency_Normal) { 140 hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing 141 } 142 143 mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER); 144 } 145 146 size_t current = (size_t)mWordBreaker.current(); 147 size_t afterWord = start; 148 size_t lastBreak = start; 149 ParaWidth lastBreakWidth = mWidth; 150 ParaWidth postBreak = mWidth; 151 bool temporarilySkipHyphenation = false; 152 for (size_t i = start; i < end; i++) { 153 uint16_t c = mTextBuf[i]; 154 if (c == CHAR_TAB) { 155 mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak); 156 if (mFirstTabIndex == INT_MAX) { 157 mFirstTabIndex = (int)i; 158 } 159 // fall back to greedy; other modes don't know how to deal with tabs 160 mStrategy = kBreakStrategy_Greedy; 161 } else { 162 mWidth += mCharWidths[i]; 163 if (!isLineEndSpace(c)) { 164 postBreak = mWidth; 165 afterWord = i + 1; 166 } 167 } 168 if (i + 1 == current) { 169 // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so 170 // we can pass the whole word down to Hyphenator like the soft hyphen case. 171 bool wordEndsInHyphen = isLineBreakingHyphen(c); 172 size_t wordStart = mWordBreaker.wordStart(); 173 size_t wordEnd = mWordBreaker.wordEnd(); 174 if (paint != nullptr && mHyphenator != nullptr && 175 mHyphenationFrequency != kHyphenationFrequency_None && 176 !wordEndsInHyphen && !temporarilySkipHyphenation && 177 wordEnd > wordStart && wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) { 178 mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart); 179#if VERBOSE_DEBUG 180 std::string hyphenatedString; 181 for (size_t j = wordStart; j < wordEnd; j++) { 182 if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-'); 183 // Note: only works with ASCII, should do UTF-8 conversion here 184 hyphenatedString.push_back(buffer()[j]); 185 } 186 ALOGD("hyphenated string: %s", hyphenatedString.c_str()); 187#endif 188 189 // measure hyphenated substrings 190 for (size_t j = wordStart; j < wordEnd; j++) { 191 uint8_t hyph = mHyphBuf[j - wordStart]; 192 if (hyph) { 193 paint->hyphenEdit = hyph; 194 layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak, 195 mTextBuf.size(), bidiFlags, style, *paint); 196 ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance(); 197 paint->hyphenEdit = 0; 198 layout.doLayout(mTextBuf.data(), j, afterWord - j, 199 mTextBuf.size(), bidiFlags, style, *paint); 200 ParaWidth hyphPreBreak = postBreak - layout.getAdvance(); 201 addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph); 202 } 203 } 204 } 205 // Skip hyphenating the next word if and only if the present word ends in a hyphen 206 temporarilySkipHyphenation = wordEndsInHyphen; 207 208 // Skip break for zero-width characters inside replacement span 209 if (paint != nullptr || current == end || mCharWidths[current] > 0) { 210 float penalty = hyphenPenalty * mWordBreaker.breakBadness(); 211 addWordBreak(current, mWidth, postBreak, penalty, 0); 212 } 213 lastBreak = current; 214 lastBreakWidth = mWidth; 215 current = (size_t)mWordBreaker.next(); 216 } 217 } 218 219 return width; 220} 221 222// add a word break (possibly for a hyphenated fragment), and add desperate breaks if 223// needed (ie when word exceeds current line width) 224void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak, 225 float penalty, uint8_t hyph) { 226 Candidate cand; 227 ParaWidth width = mCandidates.back().preBreak; 228 if (postBreak - width > currentLineWidth()) { 229 // Add desperate breaks. 230 // Note: these breaks are based on the shaping of the (non-broken) original text; they 231 // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping. 232 size_t i = mCandidates.back().offset; 233 width += mCharWidths[i++]; 234 for (; i < offset; i++) { 235 float w = mCharWidths[i]; 236 if (w > 0) { 237 cand.offset = i; 238 cand.preBreak = width; 239 cand.postBreak = width; 240 cand.penalty = SCORE_DESPERATE; 241 cand.hyphenEdit = 0; 242#if VERBOSE_DEBUG 243 ALOGD("desperate cand: %zd %g:%g", 244 mCandidates.size(), cand.postBreak, cand.preBreak); 245#endif 246 addCandidate(cand); 247 width += w; 248 } 249 } 250 } 251 252 cand.offset = offset; 253 cand.preBreak = preBreak; 254 cand.postBreak = postBreak; 255 cand.penalty = penalty; 256 cand.hyphenEdit = hyph; 257#if VERBOSE_DEBUG 258 ALOGD("cand: %zd %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak); 259#endif 260 addCandidate(cand); 261} 262 263// TODO performance: could avoid populating mCandidates if greedy only 264void LineBreaker::addCandidate(Candidate cand) { 265 size_t candIndex = mCandidates.size(); 266 mCandidates.push_back(cand); 267 if (cand.postBreak - mPreBreak > currentLineWidth()) { 268 // This break would create an overfull line, pick the best break and break there (greedy) 269 if (mBestBreak == mLastBreak) { 270 mBestBreak = candIndex; 271 } 272 pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak, 273 mCandidates[mBestBreak].hyphenEdit); 274 mBestScore = SCORE_INFTY; 275#if VERBOSE_DEBUG 276 ALOGD("break: %d %g", mBreaks.back(), mWidths.back()); 277#endif 278 mLastBreak = mBestBreak; 279 mPreBreak = mCandidates[mBestBreak].preBreak; 280 } 281 if (cand.penalty <= mBestScore) { 282 mBestBreak = candIndex; 283 mBestScore = cand.penalty; 284 } 285} 286 287void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) { 288 mBreaks.push_back(offset); 289 mWidths.push_back(width); 290 int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift; 291 flags |= hyph; 292 mFlags.push_back(flags); 293 mFirstTabIndex = INT_MAX; 294} 295 296void LineBreaker::addReplacement(size_t start, size_t end, float width) { 297 mCharWidths[start] = width; 298 std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f); 299 addStyleRun(nullptr, nullptr, FontStyle(), start, end, false); 300} 301 302float LineBreaker::currentLineWidth() const { 303 return mLineWidths.getLineWidth(mBreaks.size()); 304} 305 306void LineBreaker::computeBreaksGreedy() { 307 // All breaks but the last have been added in addCandidate already. 308 size_t nCand = mCandidates.size(); 309 if (nCand == 1 || mLastBreak != nCand - 1) { 310 pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0); 311 // don't need to update mBestScore, because we're done 312#if VERBOSE_DEBUG 313 ALOGD("final break: %d %g", mBreaks.back(), mWidths.back()); 314#endif 315 } 316} 317 318// Follow "prev" links in mCandidates array, and copy to result arrays. 319void LineBreaker::finishBreaksOptimal() { 320 // clear existing greedy break result 321 mBreaks.clear(); 322 mWidths.clear(); 323 mFlags.clear(); 324 size_t nCand = mCandidates.size(); 325 size_t prev; 326 for (size_t i = nCand - 1; i > 0; i = prev) { 327 prev = mCandidates[i].prev; 328 mBreaks.push_back(mCandidates[i].offset); 329 mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak); 330 mFlags.push_back(mCandidates[i].hyphenEdit); 331 } 332 std::reverse(mBreaks.begin(), mBreaks.end()); 333 std::reverse(mWidths.begin(), mWidths.end()); 334 std::reverse(mFlags.begin(), mFlags.end()); 335} 336 337void LineBreaker::computeBreaksOptimal(bool isRectangle) { 338 size_t active = 0; 339 size_t nCand = mCandidates.size(); 340 float width = mLineWidths.getLineWidth(0); 341 for (size_t i = 1; i < nCand; i++) { 342 bool atEnd = i == nCand - 1; 343 float best = SCORE_INFTY; 344 size_t bestPrev = 0; 345 size_t lineNumberLast = 0; 346 347 if (!isRectangle) { 348 size_t lineNumberLast = mCandidates[active].lineNumber; 349 width = mLineWidths.getLineWidth(lineNumberLast); 350 } 351 ParaWidth leftEdge = mCandidates[i].postBreak - width; 352 float bestHope = 0; 353 354 for (size_t j = active; j < i; j++) { 355 if (!isRectangle) { 356 size_t lineNumber = mCandidates[j].lineNumber; 357 if (lineNumber != lineNumberLast) { 358 float widthNew = mLineWidths.getLineWidth(lineNumber); 359 if (widthNew != width) { 360 leftEdge = mCandidates[i].postBreak - width; 361 bestHope = 0; 362 width = widthNew; 363 } 364 lineNumberLast = lineNumber; 365 } 366 } 367 float jScore = mCandidates[j].score; 368 if (jScore + bestHope >= best) continue; 369 float delta = mCandidates[j].preBreak - leftEdge; 370 371 // compute width score for line 372 373 // Note: the "bestHope" optimization makes the assumption that, when delta is 374 // non-negative, widthScore will increase monotonically as successive candidate 375 // breaks are considered. 376 float widthScore = 0.0f; 377 float additionalPenalty = 0.0f; 378 if (delta < 0) { 379 widthScore = SCORE_OVERFULL; 380 } else if (atEnd && mStrategy != kBreakStrategy_Balanced) { 381 // increase penalty for hyphen on last line 382 additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty; 383 } else { 384 widthScore = delta * delta; 385 } 386 387 if (delta < 0) { 388 active = j + 1; 389 } else { 390 bestHope = widthScore; 391 } 392 393 float score = jScore + widthScore + additionalPenalty; 394 if (score <= best) { 395 best = score; 396 bestPrev = j; 397 } 398 } 399 mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty; 400 mCandidates[i].prev = bestPrev; 401 mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1; 402#if VERBOSE_DEBUG 403 ALOGD("break %zd: score=%g, prev=%zd", i, mCandidates[i].score, mCandidates[i].prev); 404#endif 405 } 406 finishBreaksOptimal(); 407} 408 409size_t LineBreaker::computeBreaks() { 410 if (mStrategy == kBreakStrategy_Greedy) { 411 computeBreaksGreedy(); 412 } else { 413 computeBreaksOptimal(mLineWidths.isConstant()); 414 } 415 return mBreaks.size(); 416} 417 418void LineBreaker::finish() { 419 mWordBreaker.finish(); 420 mWidth = 0; 421 mCandidates.clear(); 422 mBreaks.clear(); 423 mWidths.clear(); 424 mFlags.clear(); 425 if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) { 426 mTextBuf.clear(); 427 mTextBuf.shrink_to_fit(); 428 mCharWidths.clear(); 429 mCharWidths.shrink_to_fit(); 430 mHyphBuf.clear(); 431 mHyphBuf.shrink_to_fit(); 432 mCandidates.shrink_to_fit(); 433 mBreaks.shrink_to_fit(); 434 mWidths.shrink_to_fit(); 435 mFlags.shrink_to_fit(); 436 } 437 mStrategy = kBreakStrategy_Greedy; 438 mHyphenationFrequency = kHyphenationFrequency_Normal; 439 mLinePenalty = 0.0f; 440} 441 442} // namespace android 443