LineBreaker.cpp revision 510047d8fd99c960d131c701bdee217d1970987f
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define VERBOSE_DEBUG 0
18
19#define LOG_TAG "Minikin"
20
21#include <limits>
22
23#include <log/log.h>
24
25#include "LayoutUtils.h"
26#include <minikin/Layout.h>
27#include <minikin/LineBreaker.h>
28
29using std::vector;
30
31namespace minikin {
32
33constexpr uint16_t CHAR_TAB = 0x0009;
34constexpr uint16_t CHAR_NBSP = 0x00A0;
35
36// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
37// constants are larger than any reasonable actual width score.
38const float SCORE_INFTY = std::numeric_limits<float>::max();
39const float SCORE_OVERFULL = 1e12f;
40const float SCORE_DESPERATE = 1e10f;
41
42// Multiplier for hyphen penalty on last line.
43const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f;
44// Penalty assigned to each line break (to try to minimize number of lines)
45// TODO: when we implement full justification (so spaces can shrink and stretch), this is
46// probably not the most appropriate method.
47const float LINE_PENALTY_MULTIPLIER = 2.0f;
48
49// Penalty assigned to shrinking the whitepsace.
50const float SHRINK_PENALTY_MULTIPLIER = 4.0f;
51
52// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for
53// unreasonably long words. This is somewhat of a heuristic because extremely long words
54// are possible in some languages. This does mean that very long real words can get
55// broken by desperate breaks, with no hyphens.
56const size_t LONGEST_HYPHENATED_WORD = 45;
57
58// When the text buffer is within this limit, capacity of vectors is retained at finish(),
59// to avoid allocation.
60const size_t MAX_TEXT_BUF_RETAIN = 32678;
61
62// Maximum amount that spaces can shrink, in justified text.
63const float SHRINKABILITY = 1.0 / 3.0;
64
65void LineBreaker::setLocales(const char* locales, const std::vector<Hyphenator*>& hyphenators) {
66    bool goodLocaleFound = false;
67    const ssize_t numLocales = hyphenators.size();
68    // For now, we ignore all locales except the first valid one.
69    // TODO: Support selecting the locale based on the script of the text.
70    const char* localeStart = locales;
71    for (ssize_t i = 0; i < numLocales - 1; i++) { // Loop over all locales, except the last one.
72        const char* localeEnd = strchr(localeStart, ',');
73        const size_t localeNameLength = localeEnd - localeStart;
74        char localeName[localeNameLength + 1];
75        strncpy(localeName, localeStart, localeNameLength);
76        localeName[localeNameLength] = '\0';
77        mLocale = icu::Locale::createFromName(localeName);
78        goodLocaleFound = !mLocale.isBogus();
79        if (goodLocaleFound) {
80            mHyphenator = hyphenators[i];
81            break;
82        } else {
83            localeStart = localeEnd + 1;
84        }
85    }
86    if (!goodLocaleFound) { // Try the last locale.
87        mLocale = icu::Locale::createFromName(localeStart);
88        if (mLocale.isBogus()) {
89            // No good locale.
90            mLocale = icu::Locale::getRoot();
91            mHyphenator = nullptr;
92        } else {
93            mHyphenator = numLocales == 0 ? nullptr : hyphenators[numLocales - 1];
94        }
95    }
96    mWordBreaker.setLocale(mLocale);
97}
98
99void LineBreaker::setText() {
100    mWordBreaker.setText(mTextBuf.data(), mTextBuf.size());
101
102    // handle initial break here because addStyleRun may never be called
103    mWordBreaker.next();
104    mCandidates.clear();
105    Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0, HyphenationType::DONT_BREAK};
106    mCandidates.push_back(cand);
107
108    // reset greedy breaker state
109    mBreaks.clear();
110    mWidths.clear();
111    mFlags.clear();
112    mLastBreak = 0;
113    mBestBreak = 0;
114    mBestScore = SCORE_INFTY;
115    mPreBreak = 0;
116    mLastHyphenation = HyphenEdit::NO_EDIT;
117    mFirstTabIndex = INT_MAX;
118    mSpaceCount = 0;
119}
120
121void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) {
122    mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth);
123}
124
125
126void LineBreaker::setIndents(const std::vector<float>& indents) {
127    mLineWidths.setIndents(indents);
128}
129
130// This function determines whether a character is a space that disappears at end of line.
131// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]],
132// plus '\n'.
133// Note: all such characters are in the BMP, so it's ok to use code units for this.
134static bool isLineEndSpace(uint16_t c) {
135    return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) ||
136            c == 0x205F || c == 0x3000;
137}
138
139// Hyphenates a string potentially containing non-breaking spaces. The result would be saved
140// in mHyphBuf.
141void LineBreaker::hyphenate(const uint16_t* str, size_t len) {
142    mHyphBuf.clear();
143    mHyphBuf.reserve(len);
144
145    // A word here is any consecutive string of non-NBSP characters.
146    bool inWord = false;
147    size_t wordStart = 0; // The initial value will never be accessed, but just in case.
148    for (size_t i = 0; i <= len; i++) {
149        if (i == len || str[i] == CHAR_NBSP) {
150            if (inWord) {
151                // A word just ended. Hyphenate it.
152                const size_t wordLen = i - wordStart;
153                if (wordLen <= LONGEST_HYPHENATED_WORD) {
154                    if (wordStart == 0) {
155                        // The string starts with a word. Use mHyphBuf directly.
156                        mHyphenator->hyphenate(&mHyphBuf, str, wordLen, mLocale);
157                    } else {
158                        std::vector<HyphenationType> wordVec;
159                        mHyphenator->hyphenate(&wordVec, str + wordStart, wordLen, mLocale);
160                        mHyphBuf.insert(mHyphBuf.end(), wordVec.cbegin(), wordVec.cend());
161                    }
162                } else { // Word is too long. Inefficient to hyphenate.
163                    mHyphBuf.insert(mHyphBuf.end(), wordLen, HyphenationType::DONT_BREAK);
164                }
165                inWord = false;
166            }
167            if (i < len) {
168                // Insert one DONT_BREAK for the NBSP.
169                mHyphBuf.push_back(HyphenationType::DONT_BREAK);
170            }
171        } else if (!inWord) {
172            inWord = true;
173            wordStart = i;
174        }
175    }
176}
177
178// Ordinarily, this method measures the text in the range given. However, when paint
179// is nullptr, it assumes the widths have already been calculated and stored in the
180// width buffer.
181// This method finds the candidate word breaks (using the ICU break iterator) and sends them
182// to addCandidate.
183float LineBreaker::addStyleRun(MinikinPaint* paint, const std::shared_ptr<FontCollection>& typeface,
184        FontStyle style, size_t start, size_t end, bool isRtl) {
185    float width = 0.0f;
186    int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR;
187
188    float hyphenPenalty = 0.0;
189    if (paint != nullptr) {
190        width = Layout::measureText(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags,
191                style, *paint, typeface, mCharWidths.data() + start);
192
193        // a heuristic that seems to perform well
194        hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0);
195        if (mHyphenationFrequency == kHyphenationFrequency_Normal) {
196            hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing
197        }
198
199        if (mJustified) {
200            // Make hyphenation more aggressive for fully justified text (so that "normal" in
201            // justified mode is the same as "full" in ragged-right).
202            hyphenPenalty *= 0.25;
203        } else {
204            // Line penalty is zero for justified text.
205            mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
206        }
207    }
208
209    size_t current = (size_t)mWordBreaker.current();
210    size_t afterWord = start;
211    size_t lastBreak = start;
212    ParaWidth lastBreakWidth = mWidth;
213    ParaWidth postBreak = mWidth;
214    size_t postSpaceCount = mSpaceCount;
215    for (size_t i = start; i < end; i++) {
216        uint16_t c = mTextBuf[i];
217        if (c == CHAR_TAB) {
218            mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak);
219            if (mFirstTabIndex == INT_MAX) {
220                mFirstTabIndex = (int)i;
221            }
222            // fall back to greedy; other modes don't know how to deal with tabs
223            mStrategy = kBreakStrategy_Greedy;
224        } else {
225            if (isWordSpace(c)) mSpaceCount += 1;
226            mWidth += mCharWidths[i];
227            if (!isLineEndSpace(c)) {
228                postBreak = mWidth;
229                postSpaceCount = mSpaceCount;
230                afterWord = i + 1;
231            }
232        }
233        if (i + 1 == current) {
234            size_t wordStart = mWordBreaker.wordStart();
235            size_t wordEnd = mWordBreaker.wordEnd();
236            if (paint != nullptr && mHyphenator != nullptr &&
237                    mHyphenationFrequency != kHyphenationFrequency_None &&
238                    wordStart >= start && wordEnd > wordStart) {
239                hyphenate(&mTextBuf[wordStart], wordEnd - wordStart);
240#if VERBOSE_DEBUG
241                std::string hyphenatedString;
242                for (size_t j = wordStart; j < wordEnd; j++) {
243                    if (mHyphBuf[j - wordStart] == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
244                        hyphenatedString.push_back('-');
245                    }
246                    // Note: only works with ASCII, should do UTF-8 conversion here
247                    hyphenatedString.push_back(buffer()[j]);
248                }
249                ALOGD("hyphenated string: %s", hyphenatedString.c_str());
250#endif
251
252                // measure hyphenated substrings
253                for (size_t j = wordStart; j < wordEnd; j++) {
254                    HyphenationType hyph = mHyphBuf[j - wordStart];
255                    if (hyph != HyphenationType::DONT_BREAK) {
256                        paint->hyphenEdit = HyphenEdit::editForThisLine(hyph);
257                        const float firstPartWidth = Layout::measureText(mTextBuf.data(),
258                                lastBreak, j - lastBreak, mTextBuf.size(), bidiFlags, style,
259                                *paint, typeface, nullptr);
260                        ParaWidth hyphPostBreak = lastBreakWidth + firstPartWidth;
261
262                        paint->hyphenEdit = HyphenEdit::editForNextLine(hyph);
263                        const float secondPartWidth = Layout::measureText(mTextBuf.data(), j,
264                                afterWord - j, mTextBuf.size(), bidiFlags, style, *paint,
265                                typeface, nullptr);
266                        ParaWidth hyphPreBreak = postBreak - secondPartWidth;
267
268                        addWordBreak(j, hyphPreBreak, hyphPostBreak, postSpaceCount, postSpaceCount,
269                                hyphenPenalty, hyph);
270
271                        paint->hyphenEdit = HyphenEdit::NO_EDIT;
272                    }
273                }
274            }
275
276            // Skip break for zero-width characters inside replacement span
277            if (paint != nullptr || current == end || mCharWidths[current] > 0) {
278                float penalty = hyphenPenalty * mWordBreaker.breakBadness();
279                addWordBreak(current, mWidth, postBreak, mSpaceCount, postSpaceCount, penalty,
280                        HyphenationType::DONT_BREAK);
281            }
282            lastBreak = current;
283            lastBreakWidth = mWidth;
284            current = (size_t)mWordBreaker.next();
285        }
286    }
287
288    return width;
289}
290
291// add a word break (possibly for a hyphenated fragment), and add desperate breaks if
292// needed (ie when word exceeds current line width)
293void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak,
294        size_t preSpaceCount, size_t postSpaceCount, float penalty, HyphenationType hyph) {
295    Candidate cand;
296    ParaWidth width = mCandidates.back().preBreak;
297    if (postBreak - width > currentLineWidth()) {
298        // Add desperate breaks.
299        // Note: these breaks are based on the shaping of the (non-broken) original text; they
300        // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping.
301        size_t i = mCandidates.back().offset;
302        width += mCharWidths[i++];
303        for (; i < offset; i++) {
304            float w = mCharWidths[i];
305            if (w > 0) {
306                cand.offset = i;
307                cand.preBreak = width;
308                cand.postBreak = width;
309                // postSpaceCount doesn't include trailing spaces
310                cand.preSpaceCount = postSpaceCount;
311                cand.postSpaceCount = postSpaceCount;
312                cand.penalty = SCORE_DESPERATE;
313                cand.hyphenType = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
314#if VERBOSE_DEBUG
315                ALOGD("desperate cand: %zd %g:%g",
316                        mCandidates.size(), cand.postBreak, cand.preBreak);
317#endif
318                addCandidate(cand);
319                width += w;
320            }
321        }
322    }
323
324    cand.offset = offset;
325    cand.preBreak = preBreak;
326    cand.postBreak = postBreak;
327    cand.penalty = penalty;
328    cand.preSpaceCount = preSpaceCount;
329    cand.postSpaceCount = postSpaceCount;
330    cand.hyphenType = hyph;
331#if VERBOSE_DEBUG
332    ALOGD("cand: %zd %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak);
333#endif
334    addCandidate(cand);
335}
336
337// Helper method for addCandidate()
338void LineBreaker::pushGreedyBreak() {
339    const Candidate& bestCandidate = mCandidates[mBestBreak];
340    pushBreak(bestCandidate.offset, bestCandidate.postBreak - mPreBreak,
341            mLastHyphenation | HyphenEdit::editForThisLine(bestCandidate.hyphenType));
342    mBestScore = SCORE_INFTY;
343#if VERBOSE_DEBUG
344    ALOGD("break: %d %g", mBreaks.back(), mWidths.back());
345#endif
346    mLastBreak = mBestBreak;
347    mPreBreak = bestCandidate.preBreak;
348    mLastHyphenation = HyphenEdit::editForNextLine(bestCandidate.hyphenType);
349}
350
351// TODO performance: could avoid populating mCandidates if greedy only
352void LineBreaker::addCandidate(Candidate cand) {
353    const size_t candIndex = mCandidates.size();
354    mCandidates.push_back(cand);
355
356    // mLastBreak is the index of the last line break we decided to do in mCandidates,
357    // and mPreBreak is its preBreak value. mBestBreak is the index of the best line breaking candidate
358    // we have found since then, and mBestScore is its penalty.
359    if (cand.postBreak - mPreBreak > currentLineWidth()) {
360        // This break would create an overfull line, pick the best break and break there (greedy)
361        if (mBestBreak == mLastBreak) {
362            // No good break has been found since last break. Break here.
363            mBestBreak = candIndex;
364        }
365        pushGreedyBreak();
366    }
367
368    while (mLastBreak != candIndex && cand.postBreak - mPreBreak > currentLineWidth()) {
369        // We should rarely come here. But if we are here, we have broken the line, but the
370        // remaining part still doesn't fit. We now need to break at the second best place after the
371        // last break, but we have not kept that information, so we need to go back and find it.
372        //
373        // In some really rare cases, postBreak - preBreak of a candidate itself may be over the
374        // current line width. We protect ourselves against an infinite loop in that case by
375        // checking that we have not broken the line at this candidate already.
376        for (size_t i = mLastBreak + 1; i < candIndex; i++) {
377            const float penalty = mCandidates[i].penalty;
378            if (penalty <= mBestScore) {
379                mBestBreak = i;
380                mBestScore = penalty;
381            }
382        }
383        if (mBestBreak == mLastBreak) {
384            // We didn't find anything good. Break here.
385            mBestBreak = candIndex;
386        }
387        pushGreedyBreak();
388    }
389
390    if (cand.penalty <= mBestScore) {
391        mBestBreak = candIndex;
392        mBestScore = cand.penalty;
393    }
394}
395
396void LineBreaker::pushBreak(int offset, float width, uint8_t hyphenEdit) {
397    mBreaks.push_back(offset);
398    mWidths.push_back(width);
399    int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift;
400    flags |= hyphenEdit;
401    mFlags.push_back(flags);
402    mFirstTabIndex = INT_MAX;
403}
404
405void LineBreaker::addReplacement(size_t start, size_t end, float width) {
406    mCharWidths[start] = width;
407    std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f);
408    addStyleRun(nullptr, nullptr, FontStyle(), start, end, false);
409}
410
411// Get the width of a space. May return 0 if there are no spaces.
412// Note: if there are multiple different widths for spaces (for example, because of mixing of
413// fonts), it's only guaranteed to pick one.
414float LineBreaker::getSpaceWidth() const {
415    for (size_t i = 0; i < mTextBuf.size(); i++) {
416        if (isWordSpace(mTextBuf[i])) {
417            return mCharWidths[i];
418        }
419    }
420    return 0.0f;
421}
422
423float LineBreaker::currentLineWidth() const {
424    return mLineWidths.getLineWidth(mBreaks.size());
425}
426
427void LineBreaker::computeBreaksGreedy() {
428    // All breaks but the last have been added in addCandidate already.
429    size_t nCand = mCandidates.size();
430    if (nCand == 1 || mLastBreak != nCand - 1) {
431        pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak,
432                mLastHyphenation);
433        // don't need to update mBestScore, because we're done
434#if VERBOSE_DEBUG
435        ALOGD("final break: %d %g", mBreaks.back(), mWidths.back());
436#endif
437    }
438}
439
440// Follow "prev" links in mCandidates array, and copy to result arrays.
441void LineBreaker::finishBreaksOptimal() {
442    // clear existing greedy break result
443    mBreaks.clear();
444    mWidths.clear();
445    mFlags.clear();
446    size_t nCand = mCandidates.size();
447    size_t prev;
448    for (size_t i = nCand - 1; i > 0; i = prev) {
449        prev = mCandidates[i].prev;
450        mBreaks.push_back(mCandidates[i].offset);
451        mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak);
452        int flags = HyphenEdit::editForThisLine(mCandidates[i].hyphenType);
453        if (prev > 0) {
454            flags |= HyphenEdit::editForNextLine(mCandidates[prev].hyphenType);
455        }
456        mFlags.push_back(flags);
457    }
458    std::reverse(mBreaks.begin(), mBreaks.end());
459    std::reverse(mWidths.begin(), mWidths.end());
460    std::reverse(mFlags.begin(), mFlags.end());
461}
462
463void LineBreaker::computeBreaksOptimal(bool isRectangle) {
464    size_t active = 0;
465    size_t nCand = mCandidates.size();
466    float width = mLineWidths.getLineWidth(0);
467    float maxShrink = mJustified ? SHRINKABILITY * getSpaceWidth() : 0.0f;
468
469    // "i" iterates through candidates for the end of the line.
470    for (size_t i = 1; i < nCand; i++) {
471        bool atEnd = i == nCand - 1;
472        float best = SCORE_INFTY;
473        size_t bestPrev = 0;
474        size_t lineNumberLast = 0;
475
476        if (!isRectangle) {
477            size_t lineNumberLast = mCandidates[active].lineNumber;
478            width = mLineWidths.getLineWidth(lineNumberLast);
479        }
480        ParaWidth leftEdge = mCandidates[i].postBreak - width;
481        float bestHope = 0;
482
483        // "j" iterates through candidates for the beginning of the line.
484        for (size_t j = active; j < i; j++) {
485            if (!isRectangle) {
486                size_t lineNumber = mCandidates[j].lineNumber;
487                if (lineNumber != lineNumberLast) {
488                    float widthNew = mLineWidths.getLineWidth(lineNumber);
489                    if (widthNew != width) {
490                        leftEdge = mCandidates[i].postBreak - width;
491                        bestHope = 0;
492                        width = widthNew;
493                    }
494                    lineNumberLast = lineNumber;
495                }
496            }
497            float jScore = mCandidates[j].score;
498            if (jScore + bestHope >= best) continue;
499            float delta = mCandidates[j].preBreak - leftEdge;
500
501            // compute width score for line
502
503            // Note: the "bestHope" optimization makes the assumption that, when delta is
504            // non-negative, widthScore will increase monotonically as successive candidate
505            // breaks are considered.
506            float widthScore = 0.0f;
507            float additionalPenalty = 0.0f;
508            if ((atEnd || !mJustified) && delta < 0) {
509                widthScore = SCORE_OVERFULL;
510            } else if (atEnd && mStrategy != kBreakStrategy_Balanced) {
511                // increase penalty for hyphen on last line
512                additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty;
513            } else {
514                widthScore = delta * delta;
515                if (delta < 0) {
516                    if (-delta < maxShrink *
517                            (mCandidates[i].postSpaceCount - mCandidates[j].preSpaceCount)) {
518                        widthScore *= SHRINK_PENALTY_MULTIPLIER;
519                    } else {
520                        widthScore = SCORE_OVERFULL;
521                    }
522                }
523            }
524
525            if (delta < 0) {
526                active = j + 1;
527            } else {
528                bestHope = widthScore;
529            }
530
531            float score = jScore + widthScore + additionalPenalty;
532            if (score <= best) {
533                best = score;
534                bestPrev = j;
535            }
536        }
537        mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty;
538        mCandidates[i].prev = bestPrev;
539        mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1;
540#if VERBOSE_DEBUG
541        ALOGD("break %zd: score=%g, prev=%zd", i, mCandidates[i].score, mCandidates[i].prev);
542#endif
543    }
544    finishBreaksOptimal();
545}
546
547size_t LineBreaker::computeBreaks() {
548    if (mStrategy == kBreakStrategy_Greedy) {
549        computeBreaksGreedy();
550    } else {
551        computeBreaksOptimal(mLineWidths.isConstant());
552    }
553    return mBreaks.size();
554}
555
556void LineBreaker::finish() {
557    mWordBreaker.finish();
558    mWidth = 0;
559    mLineWidths.clear();
560    mCandidates.clear();
561    mBreaks.clear();
562    mWidths.clear();
563    mFlags.clear();
564    if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) {
565        mTextBuf.clear();
566        mTextBuf.shrink_to_fit();
567        mCharWidths.clear();
568        mCharWidths.shrink_to_fit();
569        mHyphBuf.clear();
570        mHyphBuf.shrink_to_fit();
571        mCandidates.shrink_to_fit();
572        mBreaks.shrink_to_fit();
573        mWidths.shrink_to_fit();
574        mFlags.shrink_to_fit();
575    }
576    mStrategy = kBreakStrategy_Greedy;
577    mHyphenationFrequency = kHyphenationFrequency_Normal;
578    mLinePenalty = 0.0f;
579    mJustified = false;
580}
581
582}  // namespace minikin
583