1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/**
18 * An implementation of Liang's hyphenation algorithm.
19 */
20
21#ifndef MINIKIN_HYPHENATOR_H
22#define MINIKIN_HYPHENATOR_H
23
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

#include "minikin/Characters.h"
#include "minikin/U16StringPiece.h"
29
30namespace minikin {
31
// Forward declaration; the full class is defined further down in this header.
class Hyphenator;

// Registers the hyphenator for the given locale string.
// The registry does not take ownership of the hyphenator, and callers do not need to manage its
// lifetime either: in Android, the Hyphenator is allocated in Zygote and never gets released.
void addHyphenator(const std::string& localeStr, const Hyphenator* hyphenator);
// NOTE(review): presumably makes fromLocaleStr resolve to the hyphenator registered for
// toLocaleStr -- confirm against the definition, which is not visible in this header.
void addHyphenatorAlias(const std::string& fromLocaleStr, const std::string& toLocaleStr);
39
// Describes what happens at a potential break position inside a word.
//
// Note: code elsewhere implicitly assumes that DONT_BREAK is 0, so that value must not change.
enum class HyphenationType : uint8_t {
    // No break is allowed here.
    DONT_BREAK = 0,

    // Break, appending a normal hyphen to the line.
    BREAK_AND_INSERT_HYPHEN = 1,

    // Break, appending an Armenian hyphen (U+058A).
    BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,

    // Break, appending a maqaf (Hebrew hyphen, U+05BE).
    BREAK_AND_INSERT_MAQAF = 3,

    // Break, appending a Canadian Syllabics hyphen (U+1400).
    BREAK_AND_INSERT_UCAS_HYPHEN = 4,

    // Break without adding any hyphen. Applies when a hyphen is already present or when the
    // script does not use one (e.g. in Malayalam).
    BREAK_AND_DONT_INSERT_HYPHEN = 5,

    // Break and substitute a hyphen for the last code unit. Used for Catalan "l·l", which
    // hyphenates as "l-/l".
    BREAK_AND_REPLACE_WITH_HYPHEN = 6,

    // Break and repeat the hyphen (the last character of the line) at the start of the next
    // line. Used in Polish (where "czerwono-niebieska" should hyphenate as
    // "czerwono-/-niebieska") and Slovenian.
    BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,

    // Break, adding a ZWJ and a hyphen to the first line and a ZWJ to the second line. This is
    // used in Arabic script, mostly for writing systems of Central Asia, and is the default
    // behavior when a soft hyphen is used in Arabic script.
    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
};
68
// A hyphen edit represents an edit to the string when a word is hyphenated.
// The most common hyphen edit is adding a "-" at the end of a syllable, but nonstandard hyphenation
// allows for more choices.
// Each break carries two edits: one applied at the end of the line being broken (EndHyphenEdit)
// and one applied at the beginning of the following line (StartHyphenEdit).
enum class EndHyphenEdit : uint8_t {
    // Note that everything inserting characters must have a value greater than or equal to
    // INSERT_HYPHEN; isInsertion(EndHyphenEdit) relies on this ordering.
    NO_EDIT = 0b000,
    REPLACE_WITH_HYPHEN = 0b001,

    INSERT_HYPHEN = 0b010,
    INSERT_ARMENIAN_HYPHEN = 0b011,
    INSERT_MAQAF = 0b100,
    INSERT_UCAS_HYPHEN = 0b101,
    INSERT_ZWJ_AND_HYPHEN = 0b110,
};

enum class StartHyphenEdit : uint8_t {
    NO_EDIT = 0b00,

    INSERT_HYPHEN = 0b01,
    INSERT_ZWJ = 0b10,
};

// Packed form of a (StartHyphenEdit, EndHyphenEdit) pair: the end edit occupies the low
// START_BITS_SHIFT bits and the start edit the bits selected by MASK_START_OF_LINE.
using HyphenEdit = uint8_t;
constexpr uint8_t START_BITS_SHIFT = 3;
// The following two masks must keep in sync with the definitions in the Java code at:
// frameworks/base/graphics/java/android/graphics/Paint.java
constexpr uint8_t MASK_END_OF_LINE = 0b00111;
constexpr uint8_t MASK_START_OF_LINE = 0b11000;

// Compile-time checks that the masks, the shift and the enum value ranges agree with each other.
static_assert(MASK_END_OF_LINE == (1 << START_BITS_SHIFT) - 1,
              "the end-of-line field must fill exactly the bits below START_BITS_SHIFT");
static_assert(static_cast<uint8_t>(EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN) <= MASK_END_OF_LINE,
              "every EndHyphenEdit value must fit in MASK_END_OF_LINE");
static_assert((static_cast<uint8_t>(StartHyphenEdit::INSERT_ZWJ) << START_BITS_SHIFT) <=
                      MASK_START_OF_LINE,
              "every StartHyphenEdit value must fit in MASK_START_OF_LINE");

// Combines a start-of-line and an end-of-line edit into a single packed HyphenEdit.
inline HyphenEdit packHyphenEdit(StartHyphenEdit start, EndHyphenEdit end) {
    return static_cast<HyphenEdit>(static_cast<uint8_t>(start) << START_BITS_SHIFT |
                                   static_cast<uint8_t>(end));
}

// Extracts the end-of-line edit from a packed HyphenEdit. Bits outside MASK_END_OF_LINE are
// ignored.
inline EndHyphenEdit endHyphenEdit(HyphenEdit hyphenEdit) {
    return static_cast<EndHyphenEdit>(hyphenEdit & MASK_END_OF_LINE);
}

// Extracts the start-of-line edit from a packed HyphenEdit. Bits outside MASK_START_OF_LINE are
// ignored, mirroring endHyphenEdit(), so stray high bits cannot produce a value that is not a
// StartHyphenEdit enumerator.
inline StartHyphenEdit startHyphenEdit(HyphenEdit hyphenEdit) {
    return static_cast<StartHyphenEdit>((hyphenEdit & MASK_START_OF_LINE) >> START_BITS_SHIFT);
}

// Returns true if the end-of-line edit replaces the last code unit instead of appending to it.
inline bool isReplacement(EndHyphenEdit hyph) {
    return hyph == EndHyphenEdit::REPLACE_WITH_HYPHEN;
}

// Returns true if the start-of-line edit inserts anything, i.e. it is not NO_EDIT.
inline bool isInsertion(StartHyphenEdit hyph) {
    return hyph != StartHyphenEdit::NO_EDIT;
}

// Returns true if the end-of-line edit inserts characters. This relies on all inserting
// enumerators being numerically greater than or equal to INSERT_HYPHEN.
inline bool isInsertion(EndHyphenEdit hyph) {
    return static_cast<uint8_t>(hyph) >= static_cast<uint8_t>(EndHyphenEdit::INSERT_HYPHEN);
}
123
// Yields the element count of a built-in array as a constant expression. The parameter is a
// reference to an array, so the count is deduced from the argument's type.
template <typename Elem, size_t N>
constexpr size_t ARRAYSIZE(const Elem (&)[N]) {
    return N;
}
// Code point sequences that the hyphen edits insert. The CHAR_* constants are not defined in this
// header (presumably they come from minikin/Characters.h).
constexpr uint32_t HYPHEN_STR_ZWJ[] = {CHAR_ZWJ};
constexpr uint32_t HYPHEN_STR_HYPHEN[] = {CHAR_HYPHEN};
constexpr uint32_t HYPHEN_STR_ARMENIAN_HYPHEN[] = {CHAR_ARMENIAN_HYPHEN};
constexpr uint32_t HYPHEN_STR_MAQAF[] = {CHAR_MAQAF};
constexpr uint32_t HYPHEN_STR_UCAS_HYPHEN[] = {CHAR_UCAS_HYPHEN};
constexpr uint32_t HYPHEN_STR_ZWJ_AND_HYPHEN[] = {CHAR_ZWJ, CHAR_HYPHEN};
// Result for edits that insert nothing: a null pointer paired with a length of zero.
constexpr std::pair<const uint32_t*, size_t> EMPTY_HYPHEN_STR(nullptr, 0);
// Pairs an array with its element count. `chars` must be a real array (not a pointer) because
// ARRAYSIZE only accepts arrays. The macro is #undef'd after the getHyphenString overloads.
#define MAKE_HYPHEN_STR(chars) std::make_pair((chars), ARRAYSIZE(chars))
136
137inline std::pair<const uint32_t*, size_t> getHyphenString(StartHyphenEdit hyph) {
138    if (hyph == StartHyphenEdit::INSERT_ZWJ) {
139        return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ);
140    } else if (hyph == StartHyphenEdit::INSERT_HYPHEN) {
141        return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
142    } else {
143        return EMPTY_HYPHEN_STR;
144    }
145}
146
147inline std::pair<const uint32_t*, size_t> getHyphenString(EndHyphenEdit hyph) {
148    switch (hyph) {
149        case EndHyphenEdit::REPLACE_WITH_HYPHEN:  // fall through
150        case EndHyphenEdit::INSERT_HYPHEN:
151            return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
152        case EndHyphenEdit::INSERT_ARMENIAN_HYPHEN:
153            return MAKE_HYPHEN_STR(HYPHEN_STR_ARMENIAN_HYPHEN);
154        case EndHyphenEdit::INSERT_MAQAF:
155            return MAKE_HYPHEN_STR(HYPHEN_STR_MAQAF);
156        case EndHyphenEdit::INSERT_UCAS_HYPHEN:
157            return MAKE_HYPHEN_STR(HYPHEN_STR_UCAS_HYPHEN);
158        case EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN:
159            return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ_AND_HYPHEN);
160        case EndHyphenEdit::NO_EDIT:
161        default:
162            return EMPTY_HYPHEN_STR;
163    }
164}
165#undef MAKE_HYPHEN_STR
166
// Conversions from a HyphenationType to the edits stored in a HyphenEdit. Both are defined
// outside this header.
// NOTE(review): judging by the names and return types, editForThisLine yields the edit for the end
// of the line being broken and editForNextLine the edit for the start of the following line --
// confirm against the definitions.
EndHyphenEdit editForThisLine(HyphenationType type);
StartHyphenEdit editForNextLine(HyphenationType type);

// hyb file header; implementation details are in the .cpp file
struct Header;
172
173class Hyphenator {
174public:
175    // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
176    // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
177    // corresponding code unit offset in the word.
178    //
179    // out must have at least the length of the word capacity.
180    //
181    // Example: word is "hyphen", result is the following, corresponding to "hy-phen":
182    // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
183    void hyphenate(const U16StringPiece& word, HyphenationType* out) const;
184
185    // Compute the hyphenation of a word.
186    //
187    // out will be resized to word length.
188    void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const {
189        out->resize(word.size());
190        return hyphenate(word, out->data());
191    }
192
193    // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character
194    // immediately after which line breaks are allowed, but words containing it should not be
195    // automatically hyphenated.
196    static bool isLineBreakingHyphen(uint32_t cp);
197
198    // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
199    // the caller is responsible for ensuring that the lifetime of the pattern data is
200    // at least as long as the Hyphenator object.
201
202    // This class doesn't copy or take ownership of patternData. Caller must keep the data valid
203    // until this instance is deleted.
204    // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
205    static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
206                                  const std::string& locale);
207
208private:
209    enum class HyphenationLocale : uint8_t {
210        OTHER = 0,
211        CATALAN = 1,
212        POLISH = 2,
213        SLOVENIAN = 3,
214    };
215
216    // Use Hyphenator::loadBinary instead.
217    Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
218               HyphenationLocale hyphenLocale);
219
220    // apply various hyphenation rules including hard and soft hyphens, ignoring patterns
221    void hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const;
222
223    // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map.
224    // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or
225    // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen.
226    // Note that this method writes len+2 entries into alpha_codes (including start and stop)
227    HyphenationType alphabetLookup(uint16_t* alpha_codes, const U16StringPiece& word) const;
228
229    // calculate hyphenation from patterns, assuming alphabet lookup has already been done
230    void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
231                            HyphenationType* out) const;
232
233    // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
234    // that temporary buffers can be stack-allocated without waste, which is a slightly
235    // different use case. It measures UTF-16 code units.
236    static const size_t MAX_HYPHENATED_SIZE = 64;
237
238    const uint8_t* mPatternData;
239    const size_t mMinPrefix, mMinSuffix;
240    const HyphenationLocale mHyphenationLocale;
241
242    // accessors for binary data
243    const Header* getHeader() const { return reinterpret_cast<const Header*>(mPatternData); }
244};
245
246}  // namespace minikin
247
248#endif  // MINIKIN_HYPHENATOR_H
249