LineBreaker.cpp revision a6f8bf6f530f80b00a7cc64605667dd1d75a971c
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define VERBOSE_DEBUG 0
18
19#include <limits>
20
21#define LOG_TAG "Minikin"
22#include <cutils/log.h>
23
24#include <minikin/Layout.h>
25#include <minikin/LineBreaker.h>
26
27using std::vector;
28
29namespace android {
30
31const int CHAR_TAB = 0x0009;
32const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
33
34// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
35// constants are larger than any reasonable actual width score.
36const float SCORE_INFTY = std::numeric_limits<float>::max();
37const float SCORE_OVERFULL = 1e12f;
38const float SCORE_DESPERATE = 1e10f;
39
40// Multiplier for hyphen penalty on last line.
41const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f;
42// Penalty assigned to each line break (to try to minimize number of lines)
43// TODO: when we implement full justification (so spaces can shrink and stretch), this is
44// probably not the most appropriate method.
45const float LINE_PENALTY_MULTIPLIER = 2.0f;
46
47// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for
48// unreasonably long words. This is somewhat of a heuristic because extremely long words
49// are possible in some languages. This does mean that very long real words can get
50// broken by desperate breaks, with no hyphens.
51const size_t LONGEST_HYPHENATED_WORD = 45;
52
53// When the text buffer is within this limit, capacity of vectors is retained at finish(),
54// to avoid allocation.
55const size_t MAX_TEXT_BUF_RETAIN = 32678;
56
57void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
58    delete mBreakIterator;
59    UErrorCode status = U_ZERO_ERROR;
60    mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
61    // TODO: check status
62
63    // TODO: load actual resource dependent on locale; letting Minikin do it is a hack
64    mHyphenator = hyphenator;
65}
66
67void LineBreaker::setText() {
68    UErrorCode status = U_ZERO_ERROR;
69    utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
70    mBreakIterator->setText(&mUText, status);
71    mBreakIterator->first();
72
73    // handle initial break here because addStyleRun may never be called
74    mBreakIterator->next();
75    mCandidates.clear();
76    Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
77    mCandidates.push_back(cand);
78
79    // reset greedy breaker state
80    mBreaks.clear();
81    mWidths.clear();
82    mFlags.clear();
83    mLastBreak = 0;
84    mBestBreak = 0;
85    mBestScore = SCORE_INFTY;
86    mPreBreak = 0;
87    mFirstTabIndex = INT_MAX;
88}
89
90void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) {
91    mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth);
92}
93
94
95void LineBreaker::setIndents(const std::vector<float>& indents) {
96    mLineWidths.setIndents(indents);
97}
98
99// This function determines whether a character is a space that disappears at end of line.
100// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]],
101// plus '\n'.
102// Note: all such characters are in the BMP, so it's ok to use code units for this.
103static bool isLineEndSpace(uint16_t c) {
104    return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) ||
105            c == 0x205F || c == 0x3000;
106}
107
108// Ordinarily, this method measures the text in the range given. However, when paint
109// is nullptr, it assumes the widths have already been calculated and stored in the
110// width buffer.
111// This method finds the candidate word breaks (using the ICU break iterator) and sends them
112// to addCandidate.
113float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface,
114        FontStyle style, size_t start, size_t end, bool isRtl) {
115    Layout layout;  // performance TODO: move layout to self object to reduce allocation cost?
116    float width = 0.0f;
117    int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR;
118
119    float hyphenPenalty = 0.0;
120    if (paint != nullptr) {
121        layout.setFontCollection(typeface);
122        layout.doLayout(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, style,
123                *paint);
124        layout.getAdvances(mCharWidths.data() + start);
125        width = layout.getAdvance();
126
127        // a heuristic that seems to perform well
128        hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0);
129        if (mHyphenationFrequency == kHyphenationFrequency_Normal) {
130            hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing
131        }
132
133        mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
134    }
135
136    size_t current = (size_t)mBreakIterator->current();
137    size_t wordEnd = start;
138    size_t lastBreak = start;
139    ParaWidth lastBreakWidth = mWidth;
140    ParaWidth postBreak = mWidth;
141    for (size_t i = start; i < end; i++) {
142        uint16_t c = mTextBuf[i];
143        if (c == CHAR_TAB) {
144            mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak);
145            if (mFirstTabIndex == INT_MAX) {
146                mFirstTabIndex = (int)i;
147            }
148            // fall back to greedy; other modes don't know how to deal with tabs
149            mStrategy = kBreakStrategy_Greedy;
150        } else {
151            mWidth += mCharWidths[i];
152            if (!isLineEndSpace(c)) {
153                postBreak = mWidth;
154                wordEnd = i + 1;
155            }
156        }
157        if (i + 1 == current) {
158            // Override ICU's treatment of soft hyphen as a break opportunity, because we want it
159            // to be a hyphen break, with penalty and drawing behavior.
160            if (c != CHAR_SOFT_HYPHEN) {
161                if (paint != nullptr && mHyphenator != nullptr &&
162                        mHyphenationFrequency != kHyphenationFrequency_None &&
163                        wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) {
164                    mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
165    #if VERBOSE_DEBUG
166                    std::string hyphenatedString;
167                    for (size_t j = lastBreak; j < wordEnd; j++) {
168                        if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
169                        // Note: only works with ASCII, should do UTF-8 conversion here
170                        hyphenatedString.push_back(buffer()[j]);
171                    }
172                    ALOGD("hyphenated string: %s", hyphenatedString.c_str());
173    #endif
174
175                    // measure hyphenated substrings
176                    for (size_t j = lastBreak; j < wordEnd; j++) {
177                        uint8_t hyph = mHyphBuf[j - lastBreak];
178                        if (hyph) {
179                            paint->hyphenEdit = hyph;
180                            layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
181                                    mTextBuf.size(), bidiFlags, style, *paint);
182                            ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
183                            paint->hyphenEdit = 0;
184                            layout.doLayout(mTextBuf.data(), j, wordEnd - j,
185                                    mTextBuf.size(), bidiFlags, style, *paint);
186                            ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
187                            addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
188                        }
189                    }
190                }
191
192                // Skip break for zero-width characters.
193                if (current == mTextBuf.size() || mCharWidths[current] > 0) {
194                    addWordBreak(current, mWidth, postBreak, 0.0, 0);
195                }
196                lastBreak = current;
197                lastBreakWidth = mWidth;
198            }
199            current = (size_t)mBreakIterator->next();
200        }
201    }
202
203    return width;
204}
205
206// add a word break (possibly for a hyphenated fragment), and add desperate breaks if
207// needed (ie when word exceeds current line width)
208void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak,
209        float penalty, uint8_t hyph) {
210    Candidate cand;
211    ParaWidth width = mCandidates.back().preBreak;
212    if (postBreak - width > currentLineWidth()) {
213        // Add desperate breaks.
214        // Note: these breaks are based on the shaping of the (non-broken) original text; they
215        // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping.
216        size_t i = mCandidates.back().offset;
217        width += mCharWidths[i++];
218        for (; i < offset; i++) {
219            float w = mCharWidths[i];
220            if (w > 0) {
221                cand.offset = i;
222                cand.preBreak = width;
223                cand.postBreak = width;
224                cand.penalty = SCORE_DESPERATE;
225                cand.hyphenEdit = 0;
226#if VERBOSE_DEBUG
227                ALOGD("desperate cand: %d %g:%g",
228                        mCandidates.size(), cand.postBreak, cand.preBreak);
229#endif
230                addCandidate(cand);
231                width += w;
232            }
233        }
234    }
235
236    cand.offset = offset;
237    cand.preBreak = preBreak;
238    cand.postBreak = postBreak;
239    cand.penalty = penalty;
240    cand.hyphenEdit = hyph;
241#if VERBOSE_DEBUG
242    ALOGD("cand: %d %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak);
243#endif
244    addCandidate(cand);
245}
246
247// TODO performance: could avoid populating mCandidates if greedy only
248void LineBreaker::addCandidate(Candidate cand) {
249    size_t candIndex = mCandidates.size();
250    mCandidates.push_back(cand);
251    if (cand.postBreak - mPreBreak > currentLineWidth()) {
252        // This break would create an overfull line, pick the best break and break there (greedy)
253        if (mBestBreak == mLastBreak) {
254            mBestBreak = candIndex;
255        }
256        pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak,
257                mCandidates[mBestBreak].hyphenEdit);
258        mBestScore = SCORE_INFTY;
259#if VERBOSE_DEBUG
260        ALOGD("break: %d %g", mBreaks.back(), mWidths.back());
261#endif
262        mLastBreak = mBestBreak;
263        mPreBreak = mCandidates[mBestBreak].preBreak;
264    }
265    if (cand.penalty <= mBestScore) {
266        mBestBreak = candIndex;
267        mBestScore = cand.penalty;
268    }
269}
270
271void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) {
272    mBreaks.push_back(offset);
273    mWidths.push_back(width);
274    int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift;
275    flags |= hyph;
276    mFlags.push_back(flags);
277    mFirstTabIndex = INT_MAX;
278}
279
280void LineBreaker::addReplacement(size_t start, size_t end, float width) {
281    mCharWidths[start] = width;
282    std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f);
283    addStyleRun(nullptr, nullptr, FontStyle(), start, end, false);
284}
285
286float LineBreaker::currentLineWidth() const {
287    return mLineWidths.getLineWidth(mBreaks.size());
288}
289
290void LineBreaker::computeBreaksGreedy() {
291    // All breaks but the last have been added in addCandidate already.
292    size_t nCand = mCandidates.size();
293    if (nCand == 1 || mLastBreak != nCand - 1) {
294        pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0);
295        // don't need to update mBestScore, because we're done
296#if VERBOSE_DEBUG
297        ALOGD("final break: %d %g", mBreaks.back(), mWidths.back());
298#endif
299    }
300}
301
302// Follow "prev" links in mCandidates array, and copy to result arrays.
303void LineBreaker::finishBreaksOptimal() {
304    // clear existing greedy break result
305    mBreaks.clear();
306    mWidths.clear();
307    mFlags.clear();
308    size_t nCand = mCandidates.size();
309    size_t prev;
310    for (size_t i = nCand - 1; i > 0; i = prev) {
311        prev = mCandidates[i].prev;
312        mBreaks.push_back(mCandidates[i].offset);
313        mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak);
314        mFlags.push_back(mCandidates[i].hyphenEdit);
315    }
316    std::reverse(mBreaks.begin(), mBreaks.end());
317    std::reverse(mWidths.begin(), mWidths.end());
318    std::reverse(mFlags.begin(), mFlags.end());
319}
320
321void LineBreaker::computeBreaksOptimal(bool isRectangle) {
322    size_t active = 0;
323    size_t nCand = mCandidates.size();
324    float width = mLineWidths.getLineWidth(0);
325    for (size_t i = 1; i < nCand; i++) {
326        bool atEnd = i == nCand - 1;
327        float best = SCORE_INFTY;
328        size_t bestPrev = 0;
329        size_t lineNumberLast = 0;
330
331        if (!isRectangle) {
332            size_t lineNumberLast = mCandidates[active].lineNumber;
333            width = mLineWidths.getLineWidth(lineNumberLast);
334        }
335        ParaWidth leftEdge = mCandidates[i].postBreak - width;
336        float bestHope = 0;
337
338        for (size_t j = active; j < i; j++) {
339            if (!isRectangle) {
340                size_t lineNumber = mCandidates[j].lineNumber;
341                if (lineNumber != lineNumberLast) {
342                    float widthNew = mLineWidths.getLineWidth(lineNumber);
343                    if (widthNew != width) {
344                        leftEdge = mCandidates[i].postBreak - width;
345                        bestHope = 0;
346                        width = widthNew;
347                    }
348                    lineNumberLast = lineNumber;
349                }
350            }
351            float jScore = mCandidates[j].score;
352            if (jScore + bestHope >= best) continue;
353            float delta = mCandidates[j].preBreak - leftEdge;
354
355            // compute width score for line
356
357            // Note: the "bestHope" optimization makes the assumption that, when delta is
358            // non-negative, widthScore will increase monotonically as successive candidate
359            // breaks are considered.
360            float widthScore = 0.0f;
361            float additionalPenalty = 0.0f;
362            if (delta < 0) {
363                widthScore = SCORE_OVERFULL;
364            } else if (atEnd && mStrategy != kBreakStrategy_Balanced) {
365                // increase penalty for hyphen on last line
366                additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty;
367            } else {
368                widthScore = delta * delta;
369            }
370
371            if (delta < 0) {
372                active = j + 1;
373            } else {
374                bestHope = widthScore;
375            }
376
377            float score = jScore + widthScore + additionalPenalty;
378            if (score <= best) {
379                best = score;
380                bestPrev = j;
381            }
382        }
383        mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty;
384        mCandidates[i].prev = bestPrev;
385        mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1;
386#ifdef VERBOSE_DEBUG
387        ALOGD("break %d: score=%g, prev=%d", i, mCandidates[i].score, mCandidates[i].prev);
388#endif
389    }
390    finishBreaksOptimal();
391}
392
393size_t LineBreaker::computeBreaks() {
394    if (mStrategy == kBreakStrategy_Greedy) {
395        computeBreaksGreedy();
396    } else {
397        computeBreaksOptimal(mLineWidths.isConstant());
398    }
399    return mBreaks.size();
400}
401
402void LineBreaker::finish() {
403    mWidth = 0;
404    mCandidates.clear();
405    mBreaks.clear();
406    mWidths.clear();
407    mFlags.clear();
408    if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) {
409        mTextBuf.clear();
410        mTextBuf.shrink_to_fit();
411        mCharWidths.clear();
412        mCharWidths.shrink_to_fit();
413        mHyphBuf.clear();
414        mHyphBuf.shrink_to_fit();
415        mCandidates.shrink_to_fit();
416        mBreaks.shrink_to_fit();
417        mWidths.shrink_to_fit();
418        mFlags.shrink_to_fit();
419    }
420    mStrategy = kBreakStrategy_Greedy;
421    mHyphenationFrequency = kHyphenationFrequency_Normal;
422    mLinePenalty = 0.0f;
423}
424
425}  // namespace android
426