LineBreaker.cpp revision c88ef135fcc2661ec7addc171ebc60787df38aff
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define VERBOSE_DEBUG 0
18
19#include <limits>
20
21#define LOG_TAG "Minikin"
22#include <cutils/log.h>
23
24#include <minikin/Layout.h>
25#include <minikin/LineBreaker.h>
26
27using std::vector;
28
29namespace android {
30
31const int CHAR_TAB = 0x0009;
32
33// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
34// constants are larger than any reasonable actual width score.
35const float SCORE_INFTY = std::numeric_limits<float>::max();
36const float SCORE_OVERFULL = 1e12f;
37const float SCORE_DESPERATE = 1e10f;
38
39// Multiplier for hyphen penalty on last line.
40const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f;
41// Penalty assigned to each line break (to try to minimize number of lines)
42// TODO: when we implement full justification (so spaces can shrink and stretch), this is
43// probably not the most appropriate method.
44const float LINE_PENALTY_MULTIPLIER = 2.0f;
45
46// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for
47// unreasonably long words. This is somewhat of a heuristic because extremely long words
48// are possible in some languages. This does mean that very long real words can get
49// broken by desperate breaks, with no hyphens.
50const size_t LONGEST_HYPHENATED_WORD = 45;
51
52// When the text buffer is within this limit, capacity of vectors is retained at finish(),
53// to avoid allocation.
54const size_t MAX_TEXT_BUF_RETAIN = 32678;
55
56void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
57    mWordBreaker.setLocale(locale);
58
59    mHyphenator = hyphenator;
60}
61
62void LineBreaker::setText() {
63    mWordBreaker.setText(mTextBuf.data(), mTextBuf.size());
64
65    // handle initial break here because addStyleRun may never be called
66    mWordBreaker.next();
67    mCandidates.clear();
68    Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
69    mCandidates.push_back(cand);
70
71    // reset greedy breaker state
72    mBreaks.clear();
73    mWidths.clear();
74    mFlags.clear();
75    mLastBreak = 0;
76    mBestBreak = 0;
77    mBestScore = SCORE_INFTY;
78    mPreBreak = 0;
79    mFirstTabIndex = INT_MAX;
80}
81
82void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) {
83    mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth);
84}
85
86
87void LineBreaker::setIndents(const std::vector<float>& indents) {
88    mLineWidths.setIndents(indents);
89}
90
91// This function determines whether a character is a space that disappears at end of line.
92// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]],
93// plus '\n'.
94// Note: all such characters are in the BMP, so it's ok to use code units for this.
95static bool isLineEndSpace(uint16_t c) {
96    return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) ||
97            c == 0x205F || c == 0x3000;
98}
99
100// This function determines whether a character is like U+2010 HYPHEN in
101// line breaking and usage: a character immediately after which line breaks
102// are allowed, but words containing it should not be automatically
103// hyphenated. This is a curated set, created by manually inspecting all
104// the characters that have the Unicode line breaking property of BA or HY
105// and seeing which ones are hyphens.
106static bool isLineBreakingHyphen(uint16_t c) {
107    return (c == 0x002D || // HYPHEN-MINUS
108            c == 0x058A || // ARMENIAN HYPHEN
109            c == 0x05BE || // HEBREW PUNCTUATION MAQAF
110            c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
111            c == 0x2010 || // HYPHEN
112            c == 0x2013 || // EN DASH
113            c == 0x2027 || // HYPHENATION POINT
114            c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
115            c == 0x2E40);  // DOUBLE HYPHEN
116}
117
118// Ordinarily, this method measures the text in the range given. However, when paint
119// is nullptr, it assumes the widths have already been calculated and stored in the
120// width buffer.
121// This method finds the candidate word breaks (using the ICU break iterator) and sends them
122// to addCandidate.
123float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface,
124        FontStyle style, size_t start, size_t end, bool isRtl) {
125    Layout layout;  // performance TODO: move layout to self object to reduce allocation cost?
126    float width = 0.0f;
127    int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR;
128
129    float hyphenPenalty = 0.0;
130    if (paint != nullptr) {
131        layout.setFontCollection(typeface);
132        layout.doLayout(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, style,
133                *paint);
134        layout.getAdvances(mCharWidths.data() + start);
135        width = layout.getAdvance();
136
137        // a heuristic that seems to perform well
138        hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0);
139        if (mHyphenationFrequency == kHyphenationFrequency_Normal) {
140            hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing
141        }
142
143        mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
144    }
145
146    size_t current = (size_t)mWordBreaker.current();
147    size_t afterWord = start;
148    size_t lastBreak = start;
149    ParaWidth lastBreakWidth = mWidth;
150    ParaWidth postBreak = mWidth;
151    bool temporarilySkipHyphenation = false;
152    for (size_t i = start; i < end; i++) {
153        uint16_t c = mTextBuf[i];
154        if (c == CHAR_TAB) {
155            mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak);
156            if (mFirstTabIndex == INT_MAX) {
157                mFirstTabIndex = (int)i;
158            }
159            // fall back to greedy; other modes don't know how to deal with tabs
160            mStrategy = kBreakStrategy_Greedy;
161        } else {
162            mWidth += mCharWidths[i];
163            if (!isLineEndSpace(c)) {
164                postBreak = mWidth;
165                afterWord = i + 1;
166            }
167        }
168        if (i + 1 == current) {
169            // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
170            // we can pass the whole word down to Hyphenator like the soft hyphen case.
171            bool wordEndsInHyphen = isLineBreakingHyphen(c);
172            size_t wordStart = mWordBreaker.wordStart();
173            size_t wordEnd = mWordBreaker.wordEnd();
174            if (paint != nullptr && mHyphenator != nullptr &&
175                    mHyphenationFrequency != kHyphenationFrequency_None &&
176                    !wordEndsInHyphen && !temporarilySkipHyphenation &&
177                    wordEnd > wordStart && wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) {
178                mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart);
179#if VERBOSE_DEBUG
180                std::string hyphenatedString;
181                for (size_t j = wordStart; j < wordEnd; j++) {
182                    if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-');
183                    // Note: only works with ASCII, should do UTF-8 conversion here
184                    hyphenatedString.push_back(buffer()[j]);
185                }
186                ALOGD("hyphenated string: %s", hyphenatedString.c_str());
187#endif
188
189                // measure hyphenated substrings
190                for (size_t j = wordStart; j < wordEnd; j++) {
191                    uint8_t hyph = mHyphBuf[j - wordStart];
192                    if (hyph) {
193                        paint->hyphenEdit = hyph;
194                        layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
195                                mTextBuf.size(), bidiFlags, style, *paint);
196                        ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
197                        paint->hyphenEdit = 0;
198                        layout.doLayout(mTextBuf.data(), j, afterWord - j,
199                                mTextBuf.size(), bidiFlags, style, *paint);
200                        ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
201                        addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
202                    }
203                }
204            }
205            // Skip hyphenating the next word if and only if the present word ends in a hyphen
206            temporarilySkipHyphenation = wordEndsInHyphen;
207
208            // Skip break for zero-width characters inside replacement span
209            if (paint != nullptr || current == end || mCharWidths[current] > 0) {
210                float penalty = hyphenPenalty * mWordBreaker.breakBadness();
211                addWordBreak(current, mWidth, postBreak, penalty, 0);
212            }
213            lastBreak = current;
214            lastBreakWidth = mWidth;
215            current = (size_t)mWordBreaker.next();
216        }
217    }
218
219    return width;
220}
221
222// add a word break (possibly for a hyphenated fragment), and add desperate breaks if
223// needed (ie when word exceeds current line width)
224void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak,
225        float penalty, uint8_t hyph) {
226    Candidate cand;
227    ParaWidth width = mCandidates.back().preBreak;
228    if (postBreak - width > currentLineWidth()) {
229        // Add desperate breaks.
230        // Note: these breaks are based on the shaping of the (non-broken) original text; they
231        // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping.
232        size_t i = mCandidates.back().offset;
233        width += mCharWidths[i++];
234        for (; i < offset; i++) {
235            float w = mCharWidths[i];
236            if (w > 0) {
237                cand.offset = i;
238                cand.preBreak = width;
239                cand.postBreak = width;
240                cand.penalty = SCORE_DESPERATE;
241                cand.hyphenEdit = 0;
242#if VERBOSE_DEBUG
243                ALOGD("desperate cand: %zd %g:%g",
244                        mCandidates.size(), cand.postBreak, cand.preBreak);
245#endif
246                addCandidate(cand);
247                width += w;
248            }
249        }
250    }
251
252    cand.offset = offset;
253    cand.preBreak = preBreak;
254    cand.postBreak = postBreak;
255    cand.penalty = penalty;
256    cand.hyphenEdit = hyph;
257#if VERBOSE_DEBUG
258    ALOGD("cand: %zd %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak);
259#endif
260    addCandidate(cand);
261}
262
263// TODO performance: could avoid populating mCandidates if greedy only
264void LineBreaker::addCandidate(Candidate cand) {
265    size_t candIndex = mCandidates.size();
266    mCandidates.push_back(cand);
267    if (cand.postBreak - mPreBreak > currentLineWidth()) {
268        // This break would create an overfull line, pick the best break and break there (greedy)
269        if (mBestBreak == mLastBreak) {
270            mBestBreak = candIndex;
271        }
272        pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak,
273                mCandidates[mBestBreak].hyphenEdit);
274        mBestScore = SCORE_INFTY;
275#if VERBOSE_DEBUG
276        ALOGD("break: %d %g", mBreaks.back(), mWidths.back());
277#endif
278        mLastBreak = mBestBreak;
279        mPreBreak = mCandidates[mBestBreak].preBreak;
280    }
281    if (cand.penalty <= mBestScore) {
282        mBestBreak = candIndex;
283        mBestScore = cand.penalty;
284    }
285}
286
287void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) {
288    mBreaks.push_back(offset);
289    mWidths.push_back(width);
290    int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift;
291    flags |= hyph;
292    mFlags.push_back(flags);
293    mFirstTabIndex = INT_MAX;
294}
295
296void LineBreaker::addReplacement(size_t start, size_t end, float width) {
297    mCharWidths[start] = width;
298    std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f);
299    addStyleRun(nullptr, nullptr, FontStyle(), start, end, false);
300}
301
302float LineBreaker::currentLineWidth() const {
303    return mLineWidths.getLineWidth(mBreaks.size());
304}
305
306void LineBreaker::computeBreaksGreedy() {
307    // All breaks but the last have been added in addCandidate already.
308    size_t nCand = mCandidates.size();
309    if (nCand == 1 || mLastBreak != nCand - 1) {
310        pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0);
311        // don't need to update mBestScore, because we're done
312#if VERBOSE_DEBUG
313        ALOGD("final break: %d %g", mBreaks.back(), mWidths.back());
314#endif
315    }
316}
317
318// Follow "prev" links in mCandidates array, and copy to result arrays.
319void LineBreaker::finishBreaksOptimal() {
320    // clear existing greedy break result
321    mBreaks.clear();
322    mWidths.clear();
323    mFlags.clear();
324    size_t nCand = mCandidates.size();
325    size_t prev;
326    for (size_t i = nCand - 1; i > 0; i = prev) {
327        prev = mCandidates[i].prev;
328        mBreaks.push_back(mCandidates[i].offset);
329        mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak);
330        mFlags.push_back(mCandidates[i].hyphenEdit);
331    }
332    std::reverse(mBreaks.begin(), mBreaks.end());
333    std::reverse(mWidths.begin(), mWidths.end());
334    std::reverse(mFlags.begin(), mFlags.end());
335}
336
337void LineBreaker::computeBreaksOptimal(bool isRectangle) {
338    size_t active = 0;
339    size_t nCand = mCandidates.size();
340    float width = mLineWidths.getLineWidth(0);
341    for (size_t i = 1; i < nCand; i++) {
342        bool atEnd = i == nCand - 1;
343        float best = SCORE_INFTY;
344        size_t bestPrev = 0;
345        size_t lineNumberLast = 0;
346
347        if (!isRectangle) {
348            size_t lineNumberLast = mCandidates[active].lineNumber;
349            width = mLineWidths.getLineWidth(lineNumberLast);
350        }
351        ParaWidth leftEdge = mCandidates[i].postBreak - width;
352        float bestHope = 0;
353
354        for (size_t j = active; j < i; j++) {
355            if (!isRectangle) {
356                size_t lineNumber = mCandidates[j].lineNumber;
357                if (lineNumber != lineNumberLast) {
358                    float widthNew = mLineWidths.getLineWidth(lineNumber);
359                    if (widthNew != width) {
360                        leftEdge = mCandidates[i].postBreak - width;
361                        bestHope = 0;
362                        width = widthNew;
363                    }
364                    lineNumberLast = lineNumber;
365                }
366            }
367            float jScore = mCandidates[j].score;
368            if (jScore + bestHope >= best) continue;
369            float delta = mCandidates[j].preBreak - leftEdge;
370
371            // compute width score for line
372
373            // Note: the "bestHope" optimization makes the assumption that, when delta is
374            // non-negative, widthScore will increase monotonically as successive candidate
375            // breaks are considered.
376            float widthScore = 0.0f;
377            float additionalPenalty = 0.0f;
378            if (delta < 0) {
379                widthScore = SCORE_OVERFULL;
380            } else if (atEnd && mStrategy != kBreakStrategy_Balanced) {
381                // increase penalty for hyphen on last line
382                additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty;
383            } else {
384                widthScore = delta * delta;
385            }
386
387            if (delta < 0) {
388                active = j + 1;
389            } else {
390                bestHope = widthScore;
391            }
392
393            float score = jScore + widthScore + additionalPenalty;
394            if (score <= best) {
395                best = score;
396                bestPrev = j;
397            }
398        }
399        mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty;
400        mCandidates[i].prev = bestPrev;
401        mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1;
402#if VERBOSE_DEBUG
403        ALOGD("break %zd: score=%g, prev=%zd", i, mCandidates[i].score, mCandidates[i].prev);
404#endif
405    }
406    finishBreaksOptimal();
407}
408
409size_t LineBreaker::computeBreaks() {
410    if (mStrategy == kBreakStrategy_Greedy) {
411        computeBreaksGreedy();
412    } else {
413        computeBreaksOptimal(mLineWidths.isConstant());
414    }
415    return mBreaks.size();
416}
417
418void LineBreaker::finish() {
419    mWordBreaker.finish();
420    mWidth = 0;
421    mCandidates.clear();
422    mBreaks.clear();
423    mWidths.clear();
424    mFlags.clear();
425    if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) {
426        mTextBuf.clear();
427        mTextBuf.shrink_to_fit();
428        mCharWidths.clear();
429        mCharWidths.shrink_to_fit();
430        mHyphBuf.clear();
431        mHyphBuf.shrink_to_fit();
432        mCandidates.shrink_to_fit();
433        mBreaks.shrink_to_fit();
434        mWidths.shrink_to_fit();
435        mFlags.shrink_to_fit();
436    }
437    mStrategy = kBreakStrategy_Greedy;
438    mHyphenationFrequency = kHyphenationFrequency_Normal;
439    mLinePenalty = 0.0f;
440}
441
442}  // namespace android
443