15cdad92c300a65cab89b172e952186f0c5870657Raph Levien/*
25cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Copyright (C) 2015 The Android Open Source Project
35cdad92c300a65cab89b172e952186f0c5870657Raph Levien *
45cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Licensed under the Apache License, Version 2.0 (the "License");
55cdad92c300a65cab89b172e952186f0c5870657Raph Levien * you may not use this file except in compliance with the License.
65cdad92c300a65cab89b172e952186f0c5870657Raph Levien * You may obtain a copy of the License at
75cdad92c300a65cab89b172e952186f0c5870657Raph Levien *
85cdad92c300a65cab89b172e952186f0c5870657Raph Levien *      http://www.apache.org/licenses/LICENSE-2.0
95cdad92c300a65cab89b172e952186f0c5870657Raph Levien *
105cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Unless required by applicable law or agreed to in writing, software
115cdad92c300a65cab89b172e952186f0c5870657Raph Levien * distributed under the License is distributed on an "AS IS" BASIS,
125cdad92c300a65cab89b172e952186f0c5870657Raph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
135cdad92c300a65cab89b172e952186f0c5870657Raph Levien * See the License for the specific language governing permissions and
145cdad92c300a65cab89b172e952186f0c5870657Raph Levien * limitations under the License.
155cdad92c300a65cab89b172e952186f0c5870657Raph Levien */
165cdad92c300a65cab89b172e952186f0c5870657Raph Levien
175cdad92c300a65cab89b172e952186f0c5870657Raph Levien/**
185cdad92c300a65cab89b172e952186f0c5870657Raph Levien * An implementation of Liang's hyphenation algorithm.
195cdad92c300a65cab89b172e952186f0c5870657Raph Levien */
205cdad92c300a65cab89b172e952186f0c5870657Raph Levien
218fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka#ifndef MINIKIN_HYPHENATOR_H
228fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka#define MINIKIN_HYPHENATOR_H
238fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka
248fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka#include <string>
25b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka#include <vector>
26b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka
27b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka#include "minikin/Characters.h"
28524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka#include "minikin/U16StringPiece.h"
295cdad92c300a65cab89b172e952186f0c5870657Raph Levien
3014e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonakanamespace minikin {
315cdad92c300a65cab89b172e952186f0c5870657Raph Levien
328fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonakaclass Hyphenator;
338fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka
// Registers the hyphenator.
// This does not take ownership of the hyphenator, and callers do not need to manage its lifetime
// either: in Android, the Hyphenator is allocated in Zygote and never gets released.
378fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonakavoid addHyphenator(const std::string& localeStr, const Hyphenator* hyphenator);
388fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonakavoid addHyphenatorAlias(const std::string& fromLocaleStr, const std::string& toLocaleStr);
398fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka
// What to do at a candidate break position inside a word.
//
// Note: code elsewhere implicitly assumes that DONT_BREAK is 0, so the numeric values must not be
// rearranged.
enum class HyphenationType : uint8_t {
    // No break at this position.
    DONT_BREAK = 0,

    // Break the line and insert a normal hyphen.
    BREAK_AND_INSERT_HYPHEN = 1,

    // Break the line and insert an Armenian hyphen (U+058A).
    BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,

    // Break the line and insert a maqaf (Hebrew hyphen, U+05BE).
    BREAK_AND_INSERT_MAQAF = 3,

    // Break the line and insert a Canadian Syllabics hyphen (U+1400).
    BREAK_AND_INSERT_UCAS_HYPHEN = 4,

    // Break the line without inserting a hyphen. Used when a hyphen is already present or when
    // the script does not use a hyphen (e.g. Malayalam).
    BREAK_AND_DONT_INSERT_HYPHEN = 5,

    // Break the line and replace the last code unit with a hyphen. Used for Catalan "l·l", which
    // hyphenates as "l-/l".
    BREAK_AND_REPLACE_WITH_HYPHEN = 6,

    // Break the line and repeat the hyphen (which is the last character) at the beginning of the
    // next line. Used in Polish (where "czerwono-niebieska" should hyphenate as
    // "czerwono-/-niebieska") and Slovenian.
    BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,

    // Break the line, insert a ZWJ and a hyphen at the end of the first line, and a ZWJ at the
    // start of the second line. Used in Arabic script, mostly for writing systems of Central
    // Asia. This is the default behavior when a soft hyphen is used in Arabic script.
    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8,
};
68c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
// A hyphen edit represents an edit to the string when a word is hyphenated.
// The most common hyphen edit is adding a "-" at the end of a syllable, but nonstandard hyphenation
// allows for more choices.
// A break carries two edits: one for the end of the line being broken (EndHyphenEdit) and one for
// the beginning of the line that follows (StartHyphenEdit).
enum class EndHyphenEdit : uint8_t {
    // Note that everything inserting characters must have a value greater than or equal to
    // INSERT_HYPHEN; isInsertion(EndHyphenEdit) depends on this ordering.
    NO_EDIT = 0b000,
    REPLACE_WITH_HYPHEN = 0b001,

    INSERT_HYPHEN = 0b010,
    INSERT_ARMENIAN_HYPHEN = 0b011,
    INSERT_MAQAF = 0b100,
    INSERT_UCAS_HYPHEN = 0b101,
    INSERT_ZWJ_AND_HYPHEN = 0b110,
};

enum class StartHyphenEdit : uint8_t {
    NO_EDIT = 0b00,

    INSERT_HYPHEN = 0b01,
    INSERT_ZWJ = 0b10,
};

// Packed form of a (StartHyphenEdit, EndHyphenEdit) pair: the end edit occupies the bits selected
// by MASK_END_OF_LINE, the start edit the bits selected by MASK_START_OF_LINE.
using HyphenEdit = uint8_t;
// Number of bits the start edit is shifted left by inside a HyphenEdit.
constexpr uint8_t START_BITS_SHIFT = 3;
// The following two masks must be kept in sync with the definitions in the Java code at:
// frameworks/base/graphics/java/android/graphics/Paint.java
constexpr uint8_t MASK_END_OF_LINE = 0b00111;
constexpr uint8_t MASK_START_OF_LINE = 0b11000;

// Combines a start-of-line edit and an end-of-line edit into a single packed HyphenEdit.
inline HyphenEdit packHyphenEdit(StartHyphenEdit start, EndHyphenEdit end) {
    return static_cast<HyphenEdit>((static_cast<uint8_t>(start) << START_BITS_SHIFT) |
                                   static_cast<uint8_t>(end));
}

// Extracts the end-of-line edit from a packed HyphenEdit. Bits outside MASK_END_OF_LINE are
// ignored.
inline EndHyphenEdit endHyphenEdit(HyphenEdit hyphenEdit) {
    return static_cast<EndHyphenEdit>(hyphenEdit & MASK_END_OF_LINE);
}

// Extracts the start-of-line edit from a packed HyphenEdit. Bits outside MASK_START_OF_LINE are
// ignored, mirroring endHyphenEdit(); without the mask, stray high bits would leak into the
// returned value.
inline StartHyphenEdit startHyphenEdit(HyphenEdit hyphenEdit) {
    return static_cast<StartHyphenEdit>((hyphenEdit & MASK_START_OF_LINE) >> START_BITS_SHIFT);
}

// Returns true if the edit replaces the last character of the line instead of appending to it.
inline bool isReplacement(EndHyphenEdit hyph) {
    return hyph == EndHyphenEdit::REPLACE_WITH_HYPHEN;
}

// Returns true if the edit inserts characters at the start of the line, i.e. it is anything
// other than NO_EDIT.
inline bool isInsertion(StartHyphenEdit hyph) {
    return hyph != StartHyphenEdit::NO_EDIT;
}

// Returns true if the edit inserts characters at the end of the line. Relies on every inserting
// enumerator being numerically greater than or equal to INSERT_HYPHEN.
inline bool isInsertion(EndHyphenEdit hyph) {
    return static_cast<uint8_t>(hyph) >= static_cast<uint8_t>(EndHyphenEdit::INSERT_HYPHEN);
}
123b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka
// Yields the element count of a built-in array. The count is deduced from the array's type, so
// the array contents are never read.
template <typename Element, size_t Count>
constexpr size_t ARRAYSIZE(Element const (&)[Count]) {
    return Count;
}
128b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_ZWJ[] = {CHAR_ZWJ};
129b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_HYPHEN[] = {CHAR_HYPHEN};
130b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_ARMENIAN_HYPHEN[] = {CHAR_ARMENIAN_HYPHEN};
131b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_MAQAF[] = {CHAR_MAQAF};
132b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_UCAS_HYPHEN[] = {CHAR_UCAS_HYPHEN};
133b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_ZWJ_AND_HYPHEN[] = {CHAR_ZWJ, CHAR_HYPHEN};
134b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr std::pair<const uint32_t*, size_t> EMPTY_HYPHEN_STR(nullptr, 0);
135b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka#define MAKE_HYPHEN_STR(chars) std::make_pair((chars), ARRAYSIZE(chars))
136b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka
137b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline std::pair<const uint32_t*, size_t> getHyphenString(StartHyphenEdit hyph) {
138b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka    if (hyph == StartHyphenEdit::INSERT_ZWJ) {
139b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ);
140b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka    } else if (hyph == StartHyphenEdit::INSERT_HYPHEN) {
141b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
142b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka    } else {
143b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        return EMPTY_HYPHEN_STR;
144b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka    }
145b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka}
146b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka
147b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline std::pair<const uint32_t*, size_t> getHyphenString(EndHyphenEdit hyph) {
148b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka    switch (hyph) {
149b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        case EndHyphenEdit::REPLACE_WITH_HYPHEN:  // fall through
150b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        case EndHyphenEdit::INSERT_HYPHEN:
151b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka            return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
152b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        case EndHyphenEdit::INSERT_ARMENIAN_HYPHEN:
153b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka            return MAKE_HYPHEN_STR(HYPHEN_STR_ARMENIAN_HYPHEN);
154b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        case EndHyphenEdit::INSERT_MAQAF:
155b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka            return MAKE_HYPHEN_STR(HYPHEN_STR_MAQAF);
156b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        case EndHyphenEdit::INSERT_UCAS_HYPHEN:
157b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka            return MAKE_HYPHEN_STR(HYPHEN_STR_UCAS_HYPHEN);
158b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        case EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN:
159b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka            return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ_AND_HYPHEN);
160b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        case EndHyphenEdit::NO_EDIT:
161b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka        default:
162b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka            return EMPTY_HYPHEN_STR;
163b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka    }
164b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka}
165b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka#undef MAKE_HYPHEN_STR
166c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
167b1363f277a872e9b932059b3bd88bc17be1f3577Seigo NonakaEndHyphenEdit editForThisLine(HyphenationType type);
168b1363f277a872e9b932059b3bd88bc17be1f3577Seigo NonakaStartHyphenEdit editForNextLine(HyphenationType type);
169c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
170f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien// hyb file header; implementation details are in the .cpp file
171f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienstruct Header;
1725cdad92c300a65cab89b172e952186f0c5870657Raph Levien
1735cdad92c300a65cab89b172e952186f0c5870657Raph Levienclass Hyphenator {
1745cdad92c300a65cab89b172e952186f0c5870657Raph Levienpublic:
175c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
176c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
177c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // corresponding code unit offset in the word.
178c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    //
179524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    // out must have at least the length of the word capacity.
180524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    //
181c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Example: word is "hyphen", result is the following, corresponding to "hy-phen":
182c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
183524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    void hyphenate(const U16StringPiece& word, HyphenationType* out) const;
184524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka
185524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    // Compute the hyphenation of a word.
186524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    //
187524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    // out will be resized to word length.
188524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const {
189524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka        out->resize(word.size());
190524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka        return hyphenate(word, out->data());
191524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    }
192c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
193c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character
194c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // immediately after which line breaks are allowed, but words containing it should not be
195c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // automatically hyphenated.
196c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    static bool isLineBreakingHyphen(uint32_t cp);
1975cdad92c300a65cab89b172e952186f0c5870657Raph Levien
198f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
199f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // the caller is responsible for ensuring that the lifetime of the pattern data is
200f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // at least as long as the Hyphenator object.
201f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
2025aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    // This class doesn't copy or take ownership of patternData. Caller must keep the data valid
2035aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    // until this instance is deleted.
204c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
2055aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
2066c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka                                  const std::string& locale);
2076c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka
2085cdad92c300a65cab89b172e952186f0c5870657Raph Levienprivate:
2095aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    enum class HyphenationLocale : uint8_t {
2105aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka        OTHER = 0,
211237f0665cc54875c38d3b9d2333d5f50c6fd6d3aRoozbeh Pournader        CATALAN = 1,
212237f0665cc54875c38d3b9d2333d5f50c6fd6d3aRoozbeh Pournader        POLISH = 2,
213237f0665cc54875c38d3b9d2333d5f50c6fd6d3aRoozbeh Pournader        SLOVENIAN = 3,
2145aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    };
2155aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka
2165aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    // Use Hyphenator::loadBinary instead.
2175aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
2186c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka               HyphenationLocale hyphenLocale);
2195aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka
220c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // apply various hyphenation rules including hard and soft hyphens, ignoring patterns
221524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    void hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const;
2225cdad92c300a65cab89b172e952186f0c5870657Raph Levien
223c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map.
224c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or
225c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen.
226c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Note that this method writes len+2 entries into alpha_codes (including start and stop)
227524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    HyphenationType alphabetLookup(uint16_t* alpha_codes, const U16StringPiece& word) const;
228f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
229f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // calculate hyphenation from patterns, assuming alphabet lookup has already been done
230524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka    void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
231524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka                            HyphenationType* out) const;
2325cdad92c300a65cab89b172e952186f0c5870657Raph Levien
233f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
234f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // that temporary buffers can be stack-allocated without waste, which is a slightly
235f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // different use case. It measures UTF-16 code units.
236f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    static const size_t MAX_HYPHENATED_SIZE = 64;
237f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
2385aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    const uint8_t* mPatternData;
2395aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    const size_t mMinPrefix, mMinSuffix;
2405aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka    const HyphenationLocale mHyphenationLocale;
241f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
242f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // accessors for binary data
2436c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka    const Header* getHeader() const { return reinterpret_cast<const Header*>(mPatternData); }
2445cdad92c300a65cab89b172e952186f0c5870657Raph Levien};
2455cdad92c300a65cab89b172e952186f0c5870657Raph Levien
24614e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka}  // namespace minikin
2475cdad92c300a65cab89b172e952186f0c5870657Raph Levien
2486c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka#endif  // MINIKIN_HYPHENATOR_H
249