LineBreaker.cpp revision e9ec9a1d1b9e177659b7cc2cd3a102760fcfec14
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define VERBOSE_DEBUG 0 18 19#include <limits> 20 21#define LOG_TAG "Minikin" 22#include <cutils/log.h> 23 24#include <minikin/Layout.h> 25#include <minikin/LineBreaker.h> 26 27using std::vector; 28 29namespace android { 30 31const int CHAR_TAB = 0x0009; 32const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; 33 34// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these 35// constants are larger than any reasonable actual width score. 36const float SCORE_INFTY = std::numeric_limits<float>::max(); 37const float SCORE_OVERFULL = 1e12f; 38const float SCORE_DESPERATE = 1e10f; 39 40// Multiplier for hyphen penalty on last line. 41const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f; 42// Penalty assigned to each line break (to try to minimize number of lines) 43// TODO: when we implement full justification (so spaces can shrink and stretch), this is 44// probably not the most appropriate method. 45const float LINE_PENALTY_MULTIPLIER = 2.0f; 46 47// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for 48// unreasonably long words. This is somewhat of a heuristic because extremely long words 49// are possible in some languages. This does mean that very long real words can get 50// broken by desperate breaks, with no hyphens. 51const size_t LONGEST_HYPHENATED_WORD = 45; 52 53// When the text buffer is within this limit, capacity of vectors is retained at finish(), 54// to avoid allocation. 55const size_t MAX_TEXT_BUF_RETAIN = 32678; 56 57void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) { 58 delete mBreakIterator; 59 UErrorCode status = U_ZERO_ERROR; 60 mBreakIterator = icu::BreakIterator::createLineInstance(locale, status); 61 // TODO: check status 62 63 // TODO: load actual resource dependent on locale; letting Minikin do it is a hack 64 mHyphenator = hyphenator; 65} 66 67void LineBreaker::setText() { 68 UErrorCode status = U_ZERO_ERROR; 69 utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status); 70 mBreakIterator->setText(&mUText, status); 71 mBreakIterator->first(); 72 73 // handle initial break here because addStyleRun may never be called 74 mBreakIterator->next(); 75 mCandidates.clear(); 76 Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0}; 77 mCandidates.push_back(cand); 78 79 // reset greedy breaker state 80 mBreaks.clear(); 81 mWidths.clear(); 82 mFlags.clear(); 83 mLastBreak = 0; 84 mBestBreak = 0; 85 mBestScore = SCORE_INFTY; 86 mPreBreak = 0; 87 mFirstTabIndex = INT_MAX; 88} 89 90void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) { 91 mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth); 92} 93 94 95void LineBreaker::setIndents(const std::vector<float>& indents) { 96 mLineWidths.setIndents(indents); 97} 98 99// This function determines whether a character is a space that disappears at end of line. 100// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]], 101// plus '\n'. 102// Note: all such characters are in the BMP, so it's ok to use code units for this. 103static bool isLineEndSpace(uint16_t c) { 104 return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) || 105 c == 0x205F || c == 0x3000; 106} 107 108// This function determines whether a character is like U+2010 HYPHEN in 109// line breaking and usage: a character immediately after which line breaks 110// are allowed, but words containing it should not be automatically 111// hyphenated. This is a curated set, created by manually inspecting all 112// the characters that have the Unicode line breaking property of BA or HY 113// and seeing which ones are hyphens. 114static bool isLineBreakingHyphen(uint16_t c) { 115 return (c == 0x002D || // HYPHEN-MINUS 116 c == 0x058A || // ARMENIAN HYPHEN 117 c == 0x05BE || // HEBREW PUNCTUATION MAQAF 118 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN 119 c == 0x2010 || // HYPHEN 120 c == 0x2013 || // EN DASH 121 c == 0x2027 || // HYPHENATION POINT 122 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN 123 c == 0x2E40); // DOUBLE HYPHEN 124} 125 126// Ordinarily, this method measures the text in the range given. However, when paint 127// is nullptr, it assumes the widths have already been calculated and stored in the 128// width buffer. 129// This method finds the candidate word breaks (using the ICU break iterator) and sends them 130// to addCandidate. 131float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface, 132 FontStyle style, size_t start, size_t end, bool isRtl) { 133 Layout layout; // performance TODO: move layout to self object to reduce allocation cost? 134 float width = 0.0f; 135 int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR; 136 137 float hyphenPenalty = 0.0; 138 if (paint != nullptr) { 139 layout.setFontCollection(typeface); 140 layout.doLayout(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, style, 141 *paint); 142 layout.getAdvances(mCharWidths.data() + start); 143 width = layout.getAdvance(); 144 145 // a heuristic that seems to perform well 146 hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0); 147 if (mHyphenationFrequency == kHyphenationFrequency_Normal) { 148 hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing 149 } 150 151 mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER); 152 } 153 154 size_t current = (size_t)mBreakIterator->current(); 155 size_t wordEnd = start; 156 size_t lastBreak = start; 157 ParaWidth lastBreakWidth = mWidth; 158 ParaWidth postBreak = mWidth; 159 bool temporarilySkipHyphenation = false; 160 for (size_t i = start; i < end; i++) { 161 uint16_t c = mTextBuf[i]; 162 if (c == CHAR_TAB) { 163 mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak); 164 if (mFirstTabIndex == INT_MAX) { 165 mFirstTabIndex = (int)i; 166 } 167 // fall back to greedy; other modes don't know how to deal with tabs 168 mStrategy = kBreakStrategy_Greedy; 169 } else { 170 mWidth += mCharWidths[i]; 171 if (!isLineEndSpace(c)) { 172 postBreak = mWidth; 173 wordEnd = i + 1; 174 } 175 } 176 if (i + 1 == current) { 177 // Override ICU's treatment of soft hyphen as a break opportunity, because we want it 178 // to be a hyphen break, with penalty and drawing behavior. 179 if (c != CHAR_SOFT_HYPHEN) { 180 // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so 181 // we can pass the whole word down to Hyphenator like the soft hyphen case. 182 bool wordEndsInHyphen = isLineBreakingHyphen(c); 183 if (paint != nullptr && mHyphenator != nullptr && 184 mHyphenationFrequency != kHyphenationFrequency_None && 185 !wordEndsInHyphen && !temporarilySkipHyphenation && 186 wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) { 187 mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak); 188 #if VERBOSE_DEBUG 189 std::string hyphenatedString; 190 for (size_t j = lastBreak; j < wordEnd; j++) { 191 if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-'); 192 // Note: only works with ASCII, should do UTF-8 conversion here 193 hyphenatedString.push_back(buffer()[j]); 194 } 195 ALOGD("hyphenated string: %s", hyphenatedString.c_str()); 196 #endif 197 198 // measure hyphenated substrings 199 for (size_t j = lastBreak; j < wordEnd; j++) { 200 uint8_t hyph = mHyphBuf[j - lastBreak]; 201 if (hyph) { 202 paint->hyphenEdit = hyph; 203 layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak, 204 mTextBuf.size(), bidiFlags, style, *paint); 205 ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance(); 206 paint->hyphenEdit = 0; 207 layout.doLayout(mTextBuf.data(), j, wordEnd - j, 208 mTextBuf.size(), bidiFlags, style, *paint); 209 ParaWidth hyphPreBreak = postBreak - layout.getAdvance(); 210 addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph); 211 } 212 } 213 } 214 // Skip hyphenating the next word if and only if the present word ends in a hyphen 215 temporarilySkipHyphenation = wordEndsInHyphen; 216 217 // Skip break for zero-width characters inside replacement span 218 if (paint != nullptr || current == end || mCharWidths[current] > 0) { 219 addWordBreak(current, mWidth, postBreak, 0.0, 0); 220 } 221 lastBreak = current; 222 lastBreakWidth = mWidth; 223 } 224 current = (size_t)mBreakIterator->next(); 225 } 226 } 227 228 return width; 229} 230 231// add a word break (possibly for a hyphenated fragment), and add desperate breaks if 232// needed (ie when word exceeds current line width) 233void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak, 234 float penalty, uint8_t hyph) { 235 Candidate cand; 236 ParaWidth width = mCandidates.back().preBreak; 237 if (postBreak - width > currentLineWidth()) { 238 // Add desperate breaks. 239 // Note: these breaks are based on the shaping of the (non-broken) original text; they 240 // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping. 241 size_t i = mCandidates.back().offset; 242 width += mCharWidths[i++]; 243 for (; i < offset; i++) { 244 float w = mCharWidths[i]; 245 if (w > 0) { 246 cand.offset = i; 247 cand.preBreak = width; 248 cand.postBreak = width; 249 cand.penalty = SCORE_DESPERATE; 250 cand.hyphenEdit = 0; 251#if VERBOSE_DEBUG 252 ALOGD("desperate cand: %d %g:%g", 253 mCandidates.size(), cand.postBreak, cand.preBreak); 254#endif 255 addCandidate(cand); 256 width += w; 257 } 258 } 259 } 260 261 cand.offset = offset; 262 cand.preBreak = preBreak; 263 cand.postBreak = postBreak; 264 cand.penalty = penalty; 265 cand.hyphenEdit = hyph; 266#if VERBOSE_DEBUG 267 ALOGD("cand: %d %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak); 268#endif 269 addCandidate(cand); 270} 271 272// TODO performance: could avoid populating mCandidates if greedy only 273void LineBreaker::addCandidate(Candidate cand) { 274 size_t candIndex = mCandidates.size(); 275 mCandidates.push_back(cand); 276 if (cand.postBreak - mPreBreak > currentLineWidth()) { 277 // This break would create an overfull line, pick the best break and break there (greedy) 278 if (mBestBreak == mLastBreak) { 279 mBestBreak = candIndex; 280 } 281 pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak, 282 mCandidates[mBestBreak].hyphenEdit); 283 mBestScore = SCORE_INFTY; 284#if VERBOSE_DEBUG 285 ALOGD("break: %d %g", mBreaks.back(), mWidths.back()); 286#endif 287 mLastBreak = mBestBreak; 288 mPreBreak = mCandidates[mBestBreak].preBreak; 289 } 290 if (cand.penalty <= mBestScore) { 291 mBestBreak = candIndex; 292 mBestScore = cand.penalty; 293 } 294} 295 296void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) { 297 mBreaks.push_back(offset); 298 mWidths.push_back(width); 299 int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift; 300 flags |= hyph; 301 mFlags.push_back(flags); 302 mFirstTabIndex = INT_MAX; 303} 304 305void LineBreaker::addReplacement(size_t start, size_t end, float width) { 306 mCharWidths[start] = width; 307 std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f); 308 addStyleRun(nullptr, nullptr, FontStyle(), start, end, false); 309} 310 311float LineBreaker::currentLineWidth() const { 312 return mLineWidths.getLineWidth(mBreaks.size()); 313} 314 315void LineBreaker::computeBreaksGreedy() { 316 // All breaks but the last have been added in addCandidate already. 317 size_t nCand = mCandidates.size(); 318 if (nCand == 1 || mLastBreak != nCand - 1) { 319 pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0); 320 // don't need to update mBestScore, because we're done 321#if VERBOSE_DEBUG 322 ALOGD("final break: %d %g", mBreaks.back(), mWidths.back()); 323#endif 324 } 325} 326 327// Follow "prev" links in mCandidates array, and copy to result arrays. 328void LineBreaker::finishBreaksOptimal() { 329 // clear existing greedy break result 330 mBreaks.clear(); 331 mWidths.clear(); 332 mFlags.clear(); 333 size_t nCand = mCandidates.size(); 334 size_t prev; 335 for (size_t i = nCand - 1; i > 0; i = prev) { 336 prev = mCandidates[i].prev; 337 mBreaks.push_back(mCandidates[i].offset); 338 mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak); 339 mFlags.push_back(mCandidates[i].hyphenEdit); 340 } 341 std::reverse(mBreaks.begin(), mBreaks.end()); 342 std::reverse(mWidths.begin(), mWidths.end()); 343 std::reverse(mFlags.begin(), mFlags.end()); 344} 345 346void LineBreaker::computeBreaksOptimal(bool isRectangle) { 347 size_t active = 0; 348 size_t nCand = mCandidates.size(); 349 float width = mLineWidths.getLineWidth(0); 350 for (size_t i = 1; i < nCand; i++) { 351 bool atEnd = i == nCand - 1; 352 float best = SCORE_INFTY; 353 size_t bestPrev = 0; 354 size_t lineNumberLast = 0; 355 356 if (!isRectangle) { 357 size_t lineNumberLast = mCandidates[active].lineNumber; 358 width = mLineWidths.getLineWidth(lineNumberLast); 359 } 360 ParaWidth leftEdge = mCandidates[i].postBreak - width; 361 float bestHope = 0; 362 363 for (size_t j = active; j < i; j++) { 364 if (!isRectangle) { 365 size_t lineNumber = mCandidates[j].lineNumber; 366 if (lineNumber != lineNumberLast) { 367 float widthNew = mLineWidths.getLineWidth(lineNumber); 368 if (widthNew != width) { 369 leftEdge = mCandidates[i].postBreak - width; 370 bestHope = 0; 371 width = widthNew; 372 } 373 lineNumberLast = lineNumber; 374 } 375 } 376 float jScore = mCandidates[j].score; 377 if (jScore + bestHope >= best) continue; 378 float delta = mCandidates[j].preBreak - leftEdge; 379 380 // compute width score for line 381 382 // Note: the "bestHope" optimization makes the assumption that, when delta is 383 // non-negative, widthScore will increase monotonically as successive candidate 384 // breaks are considered. 385 float widthScore = 0.0f; 386 float additionalPenalty = 0.0f; 387 if (delta < 0) { 388 widthScore = SCORE_OVERFULL; 389 } else if (atEnd && mStrategy != kBreakStrategy_Balanced) { 390 // increase penalty for hyphen on last line 391 additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty; 392 } else { 393 widthScore = delta * delta; 394 } 395 396 if (delta < 0) { 397 active = j + 1; 398 } else { 399 bestHope = widthScore; 400 } 401 402 float score = jScore + widthScore + additionalPenalty; 403 if (score <= best) { 404 best = score; 405 bestPrev = j; 406 } 407 } 408 mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty; 409 mCandidates[i].prev = bestPrev; 410 mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1; 411#if VERBOSE_DEBUG 412 ALOGD("break %d: score=%g, prev=%d", i, mCandidates[i].score, mCandidates[i].prev); 413#endif 414 } 415 finishBreaksOptimal(); 416} 417 418size_t LineBreaker::computeBreaks() { 419 if (mStrategy == kBreakStrategy_Greedy) { 420 computeBreaksGreedy(); 421 } else { 422 computeBreaksOptimal(mLineWidths.isConstant()); 423 } 424 return mBreaks.size(); 425} 426 427void LineBreaker::finish() { 428 mWidth = 0; 429 mCandidates.clear(); 430 mBreaks.clear(); 431 mWidths.clear(); 432 mFlags.clear(); 433 if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) { 434 mTextBuf.clear(); 435 mTextBuf.shrink_to_fit(); 436 mCharWidths.clear(); 437 mCharWidths.shrink_to_fit(); 438 mHyphBuf.clear(); 439 mHyphBuf.shrink_to_fit(); 440 mCandidates.shrink_to_fit(); 441 mBreaks.shrink_to_fit(); 442 mWidths.shrink_to_fit(); 443 mFlags.shrink_to_fit(); 444 } 445 mStrategy = kBreakStrategy_Greedy; 446 mHyphenationFrequency = kHyphenationFrequency_Normal; 447 mLinePenalty = 0.0f; 448} 449 450} // namespace android 451