LineBreaker.cpp revision 39ab40115fae6d0c948e435233b3dd997ee7d8e5
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define VERBOSE_DEBUG 0
18
19#define LOG_TAG "Minikin"
20
21#include <limits>
22
23#include <log/log.h>
24
25#include <minikin/Layout.h>
26#include <minikin/LineBreaker.h>
27
28using std::vector;
29
30namespace android {
31
32const int CHAR_TAB = 0x0009;
33
34// Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
35// constants are larger than any reasonable actual width score.
36const float SCORE_INFTY = std::numeric_limits<float>::max();
37const float SCORE_OVERFULL = 1e12f;
38const float SCORE_DESPERATE = 1e10f;
39
40// Multiplier for hyphen penalty on last line.
41const float LAST_LINE_PENALTY_MULTIPLIER = 4.0f;
42// Penalty assigned to each line break (to try to minimize number of lines)
43// TODO: when we implement full justification (so spaces can shrink and stretch), this is
44// probably not the most appropriate method.
45const float LINE_PENALTY_MULTIPLIER = 2.0f;
46
47// Very long words trigger O(n^2) behavior in hyphenation, so we disable hyphenation for
48// unreasonably long words. This is somewhat of a heuristic because extremely long words
49// are possible in some languages. This does mean that very long real words can get
50// broken by desperate breaks, with no hyphens.
51const size_t LONGEST_HYPHENATED_WORD = 45;
52
53// When the text buffer is within this limit, capacity of vectors is retained at finish(),
54// to avoid allocation.
55const size_t MAX_TEXT_BUF_RETAIN = 32678;
56
57void LineBreaker::setLocale(const icu::Locale& locale, Hyphenator* hyphenator) {
58    mWordBreaker.setLocale(locale);
59
60    mHyphenator = hyphenator;
61}
62
63void LineBreaker::setText() {
64    mWordBreaker.setText(mTextBuf.data(), mTextBuf.size());
65
66    // handle initial break here because addStyleRun may never be called
67    mWordBreaker.next();
68    mCandidates.clear();
69    Candidate cand = {0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0};
70    mCandidates.push_back(cand);
71
72    // reset greedy breaker state
73    mBreaks.clear();
74    mWidths.clear();
75    mFlags.clear();
76    mLastBreak = 0;
77    mBestBreak = 0;
78    mBestScore = SCORE_INFTY;
79    mPreBreak = 0;
80    mFirstTabIndex = INT_MAX;
81}
82
83void LineBreaker::setLineWidths(float firstWidth, int firstWidthLineCount, float restWidth) {
84    mLineWidths.setWidths(firstWidth, firstWidthLineCount, restWidth);
85}
86
87
88void LineBreaker::setIndents(const std::vector<float>& indents) {
89    mLineWidths.setIndents(indents);
90}
91
92// This function determines whether a character is a space that disappears at end of line.
93// It is the Unicode set: [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]],
94// plus '\n'.
95// Note: all such characters are in the BMP, so it's ok to use code units for this.
96static bool isLineEndSpace(uint16_t c) {
97    return c == '\n' || c == ' ' || c == 0x1680 || (0x2000 <= c && c <= 0x200A && c != 0x2007) ||
98            c == 0x205F || c == 0x3000;
99}
100
101// This function determines whether a character is like U+2010 HYPHEN in
102// line breaking and usage: a character immediately after which line breaks
103// are allowed, but words containing it should not be automatically
104// hyphenated. This is a curated set, created by manually inspecting all
105// the characters that have the Unicode line breaking property of BA or HY
106// and seeing which ones are hyphens.
107static bool isLineBreakingHyphen(uint16_t c) {
108    return (c == 0x002D || // HYPHEN-MINUS
109            c == 0x058A || // ARMENIAN HYPHEN
110            c == 0x05BE || // HEBREW PUNCTUATION MAQAF
111            c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
112            c == 0x2010 || // HYPHEN
113            c == 0x2013 || // EN DASH
114            c == 0x2027 || // HYPHENATION POINT
115            c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
116            c == 0x2E40);  // DOUBLE HYPHEN
117}
118
119// Ordinarily, this method measures the text in the range given. However, when paint
120// is nullptr, it assumes the widths have already been calculated and stored in the
121// width buffer.
122// This method finds the candidate word breaks (using the ICU break iterator) and sends them
123// to addCandidate.
124float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typeface,
125        FontStyle style, size_t start, size_t end, bool isRtl) {
126    float width = 0.0f;
127    int bidiFlags = isRtl ? kBidi_Force_RTL : kBidi_Force_LTR;
128
129    float hyphenPenalty = 0.0;
130    if (paint != nullptr) {
131        width = Layout::measureText(mTextBuf.data(), start, end - start, mTextBuf.size(), bidiFlags,
132                style, *paint, typeface, mCharWidths.data() + start);
133
134        // a heuristic that seems to perform well
135        hyphenPenalty = 0.5 * paint->size * paint->scaleX * mLineWidths.getLineWidth(0);
136        if (mHyphenationFrequency == kHyphenationFrequency_Normal) {
137            hyphenPenalty *= 4.0; // TODO: Replace with a better value after some testing
138        }
139
140        mLinePenalty = std::max(mLinePenalty, hyphenPenalty * LINE_PENALTY_MULTIPLIER);
141    }
142
143    size_t current = (size_t)mWordBreaker.current();
144    size_t afterWord = start;
145    size_t lastBreak = start;
146    ParaWidth lastBreakWidth = mWidth;
147    ParaWidth postBreak = mWidth;
148    bool temporarilySkipHyphenation = false;
149    for (size_t i = start; i < end; i++) {
150        uint16_t c = mTextBuf[i];
151        if (c == CHAR_TAB) {
152            mWidth = mPreBreak + mTabStops.nextTab(mWidth - mPreBreak);
153            if (mFirstTabIndex == INT_MAX) {
154                mFirstTabIndex = (int)i;
155            }
156            // fall back to greedy; other modes don't know how to deal with tabs
157            mStrategy = kBreakStrategy_Greedy;
158        } else {
159            mWidth += mCharWidths[i];
160            if (!isLineEndSpace(c)) {
161                postBreak = mWidth;
162                afterWord = i + 1;
163            }
164        }
165        if (i + 1 == current) {
166            // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
167            // we can pass the whole word down to Hyphenator like the soft hyphen case.
168            bool wordEndsInHyphen = isLineBreakingHyphen(c);
169            size_t wordStart = mWordBreaker.wordStart();
170            size_t wordEnd = mWordBreaker.wordEnd();
171            if (paint != nullptr && mHyphenator != nullptr &&
172                    mHyphenationFrequency != kHyphenationFrequency_None &&
173                    !wordEndsInHyphen && !temporarilySkipHyphenation &&
174                    wordStart >= start && wordEnd > wordStart &&
175                    wordEnd - wordStart <= LONGEST_HYPHENATED_WORD) {
176                mHyphenator->hyphenate(&mHyphBuf, &mTextBuf[wordStart], wordEnd - wordStart);
177#if VERBOSE_DEBUG
178                std::string hyphenatedString;
179                for (size_t j = wordStart; j < wordEnd; j++) {
180                    if (mHyphBuf[j - wordStart]) hyphenatedString.push_back('-');
181                    // Note: only works with ASCII, should do UTF-8 conversion here
182                    hyphenatedString.push_back(buffer()[j]);
183                }
184                ALOGD("hyphenated string: %s", hyphenatedString.c_str());
185#endif
186
187                // measure hyphenated substrings
188                for (size_t j = wordStart; j < wordEnd; j++) {
189                    uint8_t hyph = mHyphBuf[j - wordStart];
190                    if (hyph) {
191                        paint->hyphenEdit = hyph;
192
193                        const float firstPartWidth = Layout::measureText(mTextBuf.data(),
194                                lastBreak, j - lastBreak, mTextBuf.size(), bidiFlags, style,
195                                *paint, typeface, nullptr);
196                        ParaWidth hyphPostBreak = lastBreakWidth + firstPartWidth;
197                        paint->hyphenEdit = 0;
198
199                        const float secondPartWith = Layout::measureText(mTextBuf.data(), j,
200                                afterWord - j, mTextBuf.size(), bidiFlags, style, *paint,
201                                typeface, nullptr);
202                        ParaWidth hyphPreBreak = postBreak - secondPartWith;
203                        addWordBreak(j, hyphPreBreak, hyphPostBreak, hyphenPenalty, hyph);
204                    }
205                }
206            }
207            // Skip hyphenating the next word if and only if the present word ends in a hyphen
208            temporarilySkipHyphenation = wordEndsInHyphen;
209
210            // Skip break for zero-width characters inside replacement span
211            if (paint != nullptr || current == end || mCharWidths[current] > 0) {
212                float penalty = hyphenPenalty * mWordBreaker.breakBadness();
213                addWordBreak(current, mWidth, postBreak, penalty, 0);
214            }
215            lastBreak = current;
216            lastBreakWidth = mWidth;
217            current = (size_t)mWordBreaker.next();
218        }
219    }
220
221    return width;
222}
223
224// add a word break (possibly for a hyphenated fragment), and add desperate breaks if
225// needed (ie when word exceeds current line width)
226void LineBreaker::addWordBreak(size_t offset, ParaWidth preBreak, ParaWidth postBreak,
227        float penalty, uint8_t hyph) {
228    Candidate cand;
229    ParaWidth width = mCandidates.back().preBreak;
230    if (postBreak - width > currentLineWidth()) {
231        // Add desperate breaks.
232        // Note: these breaks are based on the shaping of the (non-broken) original text; they
233        // are imprecise especially in the presence of kerning, ligatures, and Arabic shaping.
234        size_t i = mCandidates.back().offset;
235        width += mCharWidths[i++];
236        for (; i < offset; i++) {
237            float w = mCharWidths[i];
238            if (w > 0) {
239                cand.offset = i;
240                cand.preBreak = width;
241                cand.postBreak = width;
242                cand.penalty = SCORE_DESPERATE;
243                cand.hyphenEdit = 0;
244#if VERBOSE_DEBUG
245                ALOGD("desperate cand: %zd %g:%g",
246                        mCandidates.size(), cand.postBreak, cand.preBreak);
247#endif
248                addCandidate(cand);
249                width += w;
250            }
251        }
252    }
253
254    cand.offset = offset;
255    cand.preBreak = preBreak;
256    cand.postBreak = postBreak;
257    cand.penalty = penalty;
258    cand.hyphenEdit = hyph;
259#if VERBOSE_DEBUG
260    ALOGD("cand: %zd %g:%g", mCandidates.size(), cand.postBreak, cand.preBreak);
261#endif
262    addCandidate(cand);
263}
264
265// TODO performance: could avoid populating mCandidates if greedy only
266void LineBreaker::addCandidate(Candidate cand) {
267    size_t candIndex = mCandidates.size();
268    mCandidates.push_back(cand);
269    if (cand.postBreak - mPreBreak > currentLineWidth()) {
270        // This break would create an overfull line, pick the best break and break there (greedy)
271        if (mBestBreak == mLastBreak) {
272            mBestBreak = candIndex;
273        }
274        pushBreak(mCandidates[mBestBreak].offset, mCandidates[mBestBreak].postBreak - mPreBreak,
275                mCandidates[mBestBreak].hyphenEdit);
276        mBestScore = SCORE_INFTY;
277#if VERBOSE_DEBUG
278        ALOGD("break: %d %g", mBreaks.back(), mWidths.back());
279#endif
280        mLastBreak = mBestBreak;
281        mPreBreak = mCandidates[mBestBreak].preBreak;
282    }
283    if (cand.penalty <= mBestScore) {
284        mBestBreak = candIndex;
285        mBestScore = cand.penalty;
286    }
287}
288
289void LineBreaker::pushBreak(int offset, float width, uint8_t hyph) {
290    mBreaks.push_back(offset);
291    mWidths.push_back(width);
292    int flags = (mFirstTabIndex < mBreaks.back()) << kTab_Shift;
293    flags |= hyph;
294    mFlags.push_back(flags);
295    mFirstTabIndex = INT_MAX;
296}
297
298void LineBreaker::addReplacement(size_t start, size_t end, float width) {
299    mCharWidths[start] = width;
300    std::fill(&mCharWidths[start + 1], &mCharWidths[end], 0.0f);
301    addStyleRun(nullptr, nullptr, FontStyle(), start, end, false);
302}
303
304float LineBreaker::currentLineWidth() const {
305    return mLineWidths.getLineWidth(mBreaks.size());
306}
307
308void LineBreaker::computeBreaksGreedy() {
309    // All breaks but the last have been added in addCandidate already.
310    size_t nCand = mCandidates.size();
311    if (nCand == 1 || mLastBreak != nCand - 1) {
312        pushBreak(mCandidates[nCand - 1].offset, mCandidates[nCand - 1].postBreak - mPreBreak, 0);
313        // don't need to update mBestScore, because we're done
314#if VERBOSE_DEBUG
315        ALOGD("final break: %d %g", mBreaks.back(), mWidths.back());
316#endif
317    }
318}
319
320// Follow "prev" links in mCandidates array, and copy to result arrays.
321void LineBreaker::finishBreaksOptimal() {
322    // clear existing greedy break result
323    mBreaks.clear();
324    mWidths.clear();
325    mFlags.clear();
326    size_t nCand = mCandidates.size();
327    size_t prev;
328    for (size_t i = nCand - 1; i > 0; i = prev) {
329        prev = mCandidates[i].prev;
330        mBreaks.push_back(mCandidates[i].offset);
331        mWidths.push_back(mCandidates[i].postBreak - mCandidates[prev].preBreak);
332        mFlags.push_back(mCandidates[i].hyphenEdit);
333    }
334    std::reverse(mBreaks.begin(), mBreaks.end());
335    std::reverse(mWidths.begin(), mWidths.end());
336    std::reverse(mFlags.begin(), mFlags.end());
337}
338
339void LineBreaker::computeBreaksOptimal(bool isRectangle) {
340    size_t active = 0;
341    size_t nCand = mCandidates.size();
342    float width = mLineWidths.getLineWidth(0);
343    for (size_t i = 1; i < nCand; i++) {
344        bool atEnd = i == nCand - 1;
345        float best = SCORE_INFTY;
346        size_t bestPrev = 0;
347        size_t lineNumberLast = 0;
348
349        if (!isRectangle) {
350            size_t lineNumberLast = mCandidates[active].lineNumber;
351            width = mLineWidths.getLineWidth(lineNumberLast);
352        }
353        ParaWidth leftEdge = mCandidates[i].postBreak - width;
354        float bestHope = 0;
355
356        for (size_t j = active; j < i; j++) {
357            if (!isRectangle) {
358                size_t lineNumber = mCandidates[j].lineNumber;
359                if (lineNumber != lineNumberLast) {
360                    float widthNew = mLineWidths.getLineWidth(lineNumber);
361                    if (widthNew != width) {
362                        leftEdge = mCandidates[i].postBreak - width;
363                        bestHope = 0;
364                        width = widthNew;
365                    }
366                    lineNumberLast = lineNumber;
367                }
368            }
369            float jScore = mCandidates[j].score;
370            if (jScore + bestHope >= best) continue;
371            float delta = mCandidates[j].preBreak - leftEdge;
372
373            // compute width score for line
374
375            // Note: the "bestHope" optimization makes the assumption that, when delta is
376            // non-negative, widthScore will increase monotonically as successive candidate
377            // breaks are considered.
378            float widthScore = 0.0f;
379            float additionalPenalty = 0.0f;
380            if (delta < 0) {
381                widthScore = SCORE_OVERFULL;
382            } else if (atEnd && mStrategy != kBreakStrategy_Balanced) {
383                // increase penalty for hyphen on last line
384                additionalPenalty = LAST_LINE_PENALTY_MULTIPLIER * mCandidates[j].penalty;
385            } else {
386                widthScore = delta * delta;
387            }
388
389            if (delta < 0) {
390                active = j + 1;
391            } else {
392                bestHope = widthScore;
393            }
394
395            float score = jScore + widthScore + additionalPenalty;
396            if (score <= best) {
397                best = score;
398                bestPrev = j;
399            }
400        }
401        mCandidates[i].score = best + mCandidates[i].penalty + mLinePenalty;
402        mCandidates[i].prev = bestPrev;
403        mCandidates[i].lineNumber = mCandidates[bestPrev].lineNumber + 1;
404#if VERBOSE_DEBUG
405        ALOGD("break %zd: score=%g, prev=%zd", i, mCandidates[i].score, mCandidates[i].prev);
406#endif
407    }
408    finishBreaksOptimal();
409}
410
411size_t LineBreaker::computeBreaks() {
412    if (mStrategy == kBreakStrategy_Greedy) {
413        computeBreaksGreedy();
414    } else {
415        computeBreaksOptimal(mLineWidths.isConstant());
416    }
417    return mBreaks.size();
418}
419
420void LineBreaker::finish() {
421    mWordBreaker.finish();
422    mWidth = 0;
423    mLineWidths.clear();
424    mCandidates.clear();
425    mBreaks.clear();
426    mWidths.clear();
427    mFlags.clear();
428    if (mTextBuf.size() > MAX_TEXT_BUF_RETAIN) {
429        mTextBuf.clear();
430        mTextBuf.shrink_to_fit();
431        mCharWidths.clear();
432        mCharWidths.shrink_to_fit();
433        mHyphBuf.clear();
434        mHyphBuf.shrink_to_fit();
435        mCandidates.shrink_to_fit();
436        mBreaks.shrink_to_fit();
437        mWidths.shrink_to_fit();
438        mFlags.shrink_to_fit();
439    }
440    mStrategy = kBreakStrategy_Greedy;
441    mHyphenationFrequency = kHyphenationFrequency_Normal;
442    mLinePenalty = 0.0f;
443}
444
445}  // namespace android
446