LineBreaker.cpp revision bae347682989d2627081310129a5b60541ed6ad0
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define VERBOSE_DEBUG 0
18
19#include <limits>
20
21#define LOG_TAG "Minikin"
22#include <cutils/log.h>
23
24#include <minikin/Layout.h>
25#include <minikin/LineBreaker.h>
26
27using std::vector;
28
29namespace android {
30
31const int CHAR_TAB = 0x0009;
32const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
33
34// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
35// constants are larger than any reasonable actual width score.
36const float SCORE_INFTY = std::numeric_limits<float>::max();
37const float SCORE_OVERFULL = 1e12f;
38const float SCORE_DESPERATE = 1e10f;
39
40// Multiplier for hyphen penalty on last line.
41const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f;
42// Penalty assigned to each line break (to try to minimize number of lines)
43// TODO: when we implement full justification (so spaces can shrink and stretch), this is
44// probably not the most appropriate method.
45const float LINE_PENALTY_MULTIPLIER = 2.0f;
46
47// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for
48// unreasonably long words. This is somewhat of a heuristic because extremely long words
49// are possible in some languages. This does mean that very long real words can get
50// broken by desperate breaks, with no hyphens.
51const size_t LONGEST_HYPHENATED_WORD = 45;
52
53// When the text buffer is within this limit, capacity of vectors is retained at finish(),
54// to avoid allocation.
55const size_t MAX_TEXT_BUF_RETAIN = 32678;
56
57void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
58    delete mBreakIterator;
59    UErrorCode status = U_ZERO_ERROR;
60    mBreakIterator = icu::BreakIterator::createLineInstance(locale, status);
61    // TODO: check status
62
63    // TODO: load actual resource dependent on locale; letting Minikin do it is a hack
64    mHyphenator = hyphenator;
65}
66
67void LineBreaker::setText() {
68    UErrorCode status = U_ZERO_ERROR;
69    utext_openUChars(&mUText, mTextBuf.data(), mTextBuf.size(), &status);
70    mBreakIterator->setText(&mUText, status);
71    mBreakIterator->first();
72
73    // handle initial break here because addStyleRun may never be called
74    mBreakIterator->next();
75    mCandidates.clear();
76    Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
77    mCandidates.push_back(cand);
78
79    // reset greedy breaker state
80    mBreaks.clear();
81    mWidths.clear();
82    mFlags.clear();
83    mLastBreak = 0;
84    mBestBreak = 0;
85    mBestScore = SCORE_INFTY;
86    mPreBreak = 0;
87    mFirstTabIndex = INT_MAX;
88}
89
90void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) {
91    mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth);
92}
93
94
95void LineBreaker::setIndents(const std::vector<float>& indents) {
96    mLineWidths.setIndents(indents);
97}
98
99// This function determines whether a character is a space that disappears at end of line.
100// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]],
101// plus '\n'.
102// Note: all such characters are in the BMP, so it's ok to use code units for this.
103static bool isLineEndSpace(uint16_t c) {
104    return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) ||
105            c == 0x205F || c == 0x3000;
106}
107
108// This function determines whether a character is like U+2010 HYPHEN in
109// line breaking and usage: a character immediately after which line breaks
110// are allowed, but words containing it should not be automatically
111// hyphenated. This is a curated set, created by manually inspecting all
112// the characters that have the Unicode line breaking property of BA or HY
113// and seeing which ones are hyphens.
114static bool isLineBreakingHyphen(uint16_t c) {
115    return (c == 0x002D || // HYPHEN-MINUS
116            c == 0x058A || // ARMENIAN HYPHEN
117            c == 0x05BE || // HEBREW PUNCTUATION MAQAF
118            c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
119            c == 0x2010 || // HYPHEN
120            c == 0x2013 || // EN DASH
121            c == 0x2027 || // HYPHENATION POINT
122            c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
123            c == 0x2E40);  // DOUBLE HYPHEN
124}
125
126// Ordinarily, this method measures the text in the range given. However, when paint
127// is nullptr, it assumes the widths have already been calculated and stored in the
128// width buffer.
129// This method finds the candidate word breaks (using the ICU break iterator) and sends them
130// to addCandidate.
131float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface,
132        FontStyle style, size_t start, size_t end, bool isRtl) {
133    Layout layout;  // performance TODO: move layout to self object to reduce allocation cost?
134    float width = 0.0f;
135    int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR;
136
137    float hyphenPenalty = 0.0;
138    if (paint != nullptr) {
139        layout.setFontCollection(typeface);
140        layout.doLayout(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags, style,
141                *paint);
142        layout.getAdvances(mCharWidths.data() + start);
143        width = layout.getAdvance();
144
145        // a heuristic that seems to perform well
146        hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0);
147        if (mHyphenationFrequency == kHyphenationFrequency_Normal) {
148            hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing
149        }
150
151        mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
152    }
153
154    size_t current = (size_t)mBreakIterator->current();
155    size_t wordEnd = start;
156    size_t lastBreak = start;
157    ParaWidth lastBreakWidth = mWidth;
158    ParaWidth postBreak = mWidth;
159    bool temporarilySkipHyphenation = false;
160    for (size_t i = start; i < end; i++) {
161        uint16_t c = mTextBuf[i];
162        if (c == CHAR_TAB) {
163            mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak);
164            if (mFirstTabIndex == INT_MAX) {
165                mFirstTabIndex = (int)i;
166            }
167            // fall back to greedy; other modes don't know how to deal with tabs
168            mStrategy = kBreakStrategy_Greedy;
169        } else {
170            mWidth += mCharWidths[i];
171            if (!isLineEndSpace(c)) {
172                postBreak = mWidth;
173                wordEnd = i + 1;
174            }
175        }
176        if (i + 1 == current) {
177            // Override ICU's treatment of soft hyphen as a break opportunity, because we want it
178            // to be a hyphen break, with penalty and drawing behavior.
179            if (c != CHAR_SOFT_HYPHEN) {
180                // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
181                // we can pass the whole word down to Hyphenator like the soft hyphen case.
182                bool wordEndsInHyphen = isLineBreakingHyphen(c);
183                if (paint != nullptr && mHyphenator != nullptr &&
184                        mHyphenationFrequency != kHyphenationFrequency_None &&
185                        !wordEndsInHyphen && !temporarilySkipHyphenation &&
186                        wordEnd > lastBreak && wordEnd - lastBreak <= LONGEST_HYPHENATED_WORD) {
187                    mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[lastBreak], wordEnd - lastBreak);
188    #if VERBOSE_DEBUG
189                    std::string hyphenatedString;
190                    for (size_t j = lastBreak; j < wordEnd; j++) {
191                        if (mHyphBuf[j - lastBreak]) hyphenatedString.push_back('-');
192                        // Note: only works with ASCII, should do UTF-8 conversion here
193                        hyphenatedString.push_back(buffer()[j]);
194                    }
195                    ALOGD("hyphenated string: %s", hyphenatedString.c_str());
196    #endif
197
198                    // measure hyphenated substrings
199                    for (size_t j = lastBreak; j < wordEnd; j++) {
200                        uint8_t hyph = mHyphBuf[j - lastBreak];
201                        if (hyph) {
202                            paint->hyphenEdit = hyph;
203                            layout.doLayout(mTextBuf.data(), lastBreak, j - lastBreak,
204                                    mTextBuf.size(), bidiFlags, style, *paint);
205                            ParaWidth hyphPostBreak = lastBreakWidth + layout.getAdvance();
206                            paint->hyphenEdit = 0;
207                            layout.doLayout(mTextBuf.data(), j, wordEnd - j,
208                                    mTextBuf.size(), bidiFlags, style, *paint);
209                            ParaWidth hyphPreBreak = postBreak - layout.getAdvance();
210                            addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
211                        }
212                    }
213                }
214                // Skip hyphenating the next word if and only if the present word ends in a hyphen
215                temporarilySkipHyphenation = wordEndsInHyphen;
216
217                // Skip break for zero-width characters inside replacement span
218                if (paint != nullptr || current == end || mCharWidths[current] > 0) {
219                    addWordBreak(current, mWidth, postBreak, 0.0, 0);
220                }
221                lastBreak = current;
222                lastBreakWidth = mWidth;
223            }
224            current = (size_t)mBreakIterator->next();
225        }
226    }
227
228    return width;
229}
230
231// add a word break (possibly for a hyphenated fragment), and add desperate breaks if
232// needed (ie when word exceeds current line width)
233void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak,
234        float penalty, uint8_t hyph) {
235    Candidate cand;
236    ParaWidth width = mCandidates.back().preBreak;
237    if (postBreak - width > currentLineWidth()) {
238        // Add desperate breaks.
239        // Note: these breaks are based on the shaping of the (non-broken) original text; they
240        // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping.
241        size_t i = mCandidates.back().offset;
242        width += mCharWidths[i++];
243        for (; i < offset; i++) {
244            float w = mCharWidths[i];
245            if (w > 0) {
246                cand.offset = i;
247                cand.preBreak = width;
248                cand.postBreak = width;
249                cand.penalty = SCORE_DESPERATE;
250                cand.hyphenEdit = 0;
251#if VERBOSE_DEBUG
252                ALOGD("desperate cand: %zd %g:%g",
253                        mCandidates.size(), cand.postBreak, cand.preBreak);
254#endif
255                addCandidate(cand);
256                width += w;
257            }
258        }
259    }
260
261    cand.offset = offset;
262    cand.preBreak = preBreak;
263    cand.postBreak = postBreak;
264    cand.penalty = penalty;
265    cand.hyphenEdit = hyph;
266#if VERBOSE_DEBUG
267    ALOGD("cand: %zd %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak);
268#endif
269    addCandidate(cand);
270}
271
272// TODO performance: could avoid populating mCandidates if greedy only
273void LineBreaker::addCandidate(Candidate cand) {
274    size_t candIndex = mCandidates.size();
275    mCandidates.push_back(cand);
276    if (cand.postBreak - mPreBreak > currentLineWidth()) {
277        // This break would create an overfull line, pick the best break and break there (greedy)
278        if (mBestBreak == mLastBreak) {
279            mBestBreak = candIndex;
280        }
281        pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak,
282                mCandidates[mBestBreak].hyphenEdit);
283        mBestScore = SCORE_INFTY;
284#if VERBOSE_DEBUG
285        ALOGD("break: %d %g", mBreaks.back(), mWidths.back());
286#endif
287        mLastBreak = mBestBreak;
288        mPreBreak = mCandidates[mBestBreak].preBreak;
289    }
290    if (cand.penalty <= mBestScore) {
291        mBestBreak = candIndex;
292        mBestScore = cand.penalty;
293    }
294}
295
296void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) {
297    mBreaks.push_back(offset);
298    mWidths.push_back(width);
299    int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift;
300    flags |= hyph;
301    mFlags.push_back(flags);
302    mFirstTabIndex = INT_MAX;
303}
304
305void LineBreaker::addReplacement(size_t start, size_t end, float width) {
306    mCharWidths[start] = width;
307    std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f);
308    addStyleRun(nullptr, nullptr, FontStyle(), start, end, false);
309}
310
311float LineBreaker::currentLineWidth() const {
312    return mLineWidths.getLineWidth(mBreaks.size());
313}
314
315void LineBreaker::computeBreaksGreedy() {
316    // All breaks but the last have been added in addCandidate already.
317    size_t nCand = mCandidates.size();
318    if (nCand == 1 || mLastBreak != nCand - 1) {
319        pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0);
320        // don't need to update mBestScore, because we're done
321#if VERBOSE_DEBUG
322        ALOGD("final break: %d %g", mBreaks.back(), mWidths.back());
323#endif
324    }
325}
326
327// Follow "prev" links in mCandidates array, and copy to result arrays.
328void LineBreaker::finishBreaksOptimal() {
329    // clear existing greedy break result
330    mBreaks.clear();
331    mWidths.clear();
332    mFlags.clear();
333    size_t nCand = mCandidates.size();
334    size_t prev;
335    for (size_t i = nCand - 1; i > 0; i = prev) {
336        prev = mCandidates[i].prev;
337        mBreaks.push_back(mCandidates[i].offset);
338        mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak);
339        mFlags.push_back(mCandidates[i].hyphenEdit);
340    }
341    std::reverse(mBreaks.begin(), mBreaks.end());
342    std::reverse(mWidths.begin(), mWidths.end());
343    std::reverse(mFlags.begin(), mFlags.end());
344}
345
346void LineBreaker::computeBreaksOptimal(bool isRectangle) {
347    size_t active = 0;
348    size_t nCand = mCandidates.size();
349    float width = mLineWidths.getLineWidth(0);
350    for (size_t i = 1; i < nCand; i++) {
351        bool atEnd = i == nCand - 1;
352        float best = SCORE_INFTY;
353        size_t bestPrev = 0;
354        size_t lineNumberLast = 0;
355
356        if (!isRectangle) {
357            size_t lineNumberLast = mCandidates[active].lineNumber;
358            width = mLineWidths.getLineWidth(lineNumberLast);
359        }
360        ParaWidth leftEdge = mCandidates[i].postBreak - width;
361        float bestHope = 0;
362
363        for (size_t j = active; j < i; j++) {
364            if (!isRectangle) {
365                size_t lineNumber = mCandidates[j].lineNumber;
366                if (lineNumber != lineNumberLast) {
367                    float widthNew = mLineWidths.getLineWidth(lineNumber);
368                    if (widthNew != width) {
369                        leftEdge = mCandidates[i].postBreak - width;
370                        bestHope = 0;
371                        width = widthNew;
372                    }
373                    lineNumberLast = lineNumber;
374                }
375            }
376            float jScore = mCandidates[j].score;
377            if (jScore + bestHope >= best) continue;
378            float delta = mCandidates[j].preBreak - leftEdge;
379
380            // compute width score for line
381
382            // Note: the "bestHope" optimization makes the assumption that, when delta is
383            // non-negative, widthScore will increase monotonically as successive candidate
384            // breaks are considered.
385            float widthScore = 0.0f;
386            float additionalPenalty = 0.0f;
387            if (delta < 0) {
388                widthScore = SCORE_OVERFULL;
389            } else if (atEnd && mStrategy != kBreakStrategy_Balanced) {
390                // increase penalty for hyphen on last line
391                additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty;
392            } else {
393                widthScore = delta * delta;
394            }
395
396            if (delta < 0) {
397                active = j + 1;
398            } else {
399                bestHope = widthScore;
400            }
401
402            float score = jScore + widthScore + additionalPenalty;
403            if (score <= best) {
404                best = score;
405                bestPrev = j;
406            }
407        }
408        mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty;
409        mCandidates[i].prev = bestPrev;
410        mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1;
411#if VERBOSE_DEBUG
412        ALOGD("break %zd: score=%g, prev=%zd", i, mCandidates[i].score, mCandidates[i].prev);
413#endif
414    }
415    finishBreaksOptimal();
416}
417
418size_t LineBreaker::computeBreaks() {
419    if (mStrategy == kBreakStrategy_Greedy) {
420        computeBreaksGreedy();
421    } else {
422        computeBreaksOptimal(mLineWidths.isConstant());
423    }
424    return mBreaks.size();
425}
426
427void LineBreaker::finish() {
428    mWidth = 0;
429    mCandidates.clear();
430    mBreaks.clear();
431    mWidths.clear();
432    mFlags.clear();
433    if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) {
434        mTextBuf.clear();
435        mTextBuf.shrink_to_fit();
436        mCharWidths.clear();
437        mCharWidths.shrink_to_fit();
438        mHyphBuf.clear();
439        mHyphBuf.shrink_to_fit();
440        mCandidates.shrink_to_fit();
441        mBreaks.shrink_to_fit();
442        mWidths.shrink_to_fit();
443        mFlags.shrink_to_fit();
444    }
445    mStrategy = kBreakStrategy_Greedy;
446    mHyphenationFrequency = kHyphenationFrequency_Normal;
447    mLinePenalty = 0.0f;
448}
449
450}  // namespace android
451