15cdad92c300a65cab89b172e952186f0c5870657Raph Levien/* 25cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Copyright (C) 2015 The Android Open Source Project 35cdad92c300a65cab89b172e952186f0c5870657Raph Levien * 45cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 55cdad92c300a65cab89b172e952186f0c5870657Raph Levien * you may not use this file except in compliance with the License. 65cdad92c300a65cab89b172e952186f0c5870657Raph Levien * You may obtain a copy of the License at 75cdad92c300a65cab89b172e952186f0c5870657Raph Levien * 85cdad92c300a65cab89b172e952186f0c5870657Raph Levien * http://www.apache.org/licenses/LICENSE-2.0 95cdad92c300a65cab89b172e952186f0c5870657Raph Levien * 105cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Unless required by applicable law or agreed to in writing, software 115cdad92c300a65cab89b172e952186f0c5870657Raph Levien * distributed under the License is distributed on an "AS IS" BASIS, 125cdad92c300a65cab89b172e952186f0c5870657Raph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 135cdad92c300a65cab89b172e952186f0c5870657Raph Levien * See the License for the specific language governing permissions and 145cdad92c300a65cab89b172e952186f0c5870657Raph Levien * limitations under the License. 155cdad92c300a65cab89b172e952186f0c5870657Raph Levien */ 165cdad92c300a65cab89b172e952186f0c5870657Raph Levien 175cdad92c300a65cab89b172e952186f0c5870657Raph Levien/** 185cdad92c300a65cab89b172e952186f0c5870657Raph Levien * An implementation of Liang's hyphenation algorithm. 195cdad92c300a65cab89b172e952186f0c5870657Raph Levien */ 205cdad92c300a65cab89b172e952186f0c5870657Raph Levien 218fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka#ifndef MINIKIN_HYPHENATOR_H 228fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka#define MINIKIN_HYPHENATOR_H 238fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka 248fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka#include <string> 25b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka#include <vector> 26b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 27b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka#include "minikin/Characters.h" 28524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka#include "minikin/U16StringPiece.h" 295cdad92c300a65cab89b172e952186f0c5870657Raph Levien 3014e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonakanamespace minikin { 315cdad92c300a65cab89b172e952186f0c5870657Raph Levien 328fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonakaclass Hyphenator; 338fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka 348fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka// Registers the hyphenator. 358fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka// This doesn't take ownership of the hyphenator but we don't need to care about the ownership. 368fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka// In Android, the Hyphenator is allocated in Zygote and never gets released. 378fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonakavoid addHyphenator(const std::string& localeStr, const Hyphenator* hyphenator); 388fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonakavoid addHyphenatorAlias(const std::string& fromLocaleStr, const std::string& toLocaleStr); 398fbcbda6f9a2ee254ad8d22a5fe025e094fd6ff0Seigo Nonaka 40c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderenum class HyphenationType : uint8_t { 41c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Note: There are implicit assumptions scattered in the code that DONT_BREAK is 0. 42c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 43c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Do not break. 44c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader DONT_BREAK = 0, 45c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break the line and insert a normal hyphen. 46c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader BREAK_AND_INSERT_HYPHEN = 1, 47c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break the line and insert an Armenian hyphen (U+058A). 48c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2, 49c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break the line and insert a maqaf (Hebrew hyphen, U+05BE). 50c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader BREAK_AND_INSERT_MAQAF = 3, 51c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break the line and insert a Canadian Syllabics hyphen (U+1400). 52c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader BREAK_AND_INSERT_UCAS_HYPHEN = 4, 53c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen 54c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // present or the script does not use a hyphen (e.g. in Malayalam). 55c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader BREAK_AND_DONT_INSERT_HYPHEN = 5, 56c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates 57c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // as "l-/l". 58c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader BREAK_AND_REPLACE_WITH_HYPHEN = 6, 59c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break the line, and repeat the hyphen (which is the last character) at the beginning of the 60237f0665cc54875c38d3b9d2333d5f50c6fd6d3aRoozbeh Pournader // next line. Used in Polish (where "czerwono-niebieska" should hyphenate as 61237f0665cc54875c38d3b9d2333d5f50c6fd6d3aRoozbeh Pournader // "czerwono-/-niebieska") and Slovenian. 62c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7, 63c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line. 64c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default 65c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // behavior when a soft hyphen is used in Arabic script. 66c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8 67c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}; 68c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 69b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka// The hyphen edit represents an edit to the string when a word is hyphenated. 70b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka// The most common hyphen edit is adding a "-" at the end of a syllable, but nonstandard hyphenation 71b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka// allows for more choices. 72c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// One at the beginning of the string/line and one at the end. 73b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaenum class EndHyphenEdit : uint8_t { 74b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka // Note that everything inserting characters must have a value greater than or equal to 75b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka // INSERT_HYPHEN. 766c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka NO_EDIT = 0b000, 776c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka REPLACE_WITH_HYPHEN = 0b001, 78b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 796c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka INSERT_HYPHEN = 0b010, 80b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka INSERT_ARMENIAN_HYPHEN = 0b011, 816c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka INSERT_MAQAF = 0b100, 826c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka INSERT_UCAS_HYPHEN = 0b101, 836c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka INSERT_ZWJ_AND_HYPHEN = 0b110, 84b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka}; 85c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 86b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaenum class StartHyphenEdit : uint8_t { 876c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka NO_EDIT = 0b00, 88c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 89b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka INSERT_HYPHEN = 0b01, 906c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka INSERT_ZWJ = 0b10, 91b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka}; 92c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 93b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakatypedef uint8_t HyphenEdit; 94b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint8_t START_BITS_SHIFT = 3; 95b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka// The following two masks must keep in sync with the definitions in the Java code at: 96b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka// frameworks/base/graphics/java/android/graphics/Paint.java 976c8722e217ff5238f0b849152d7936959a728103Seigo Nonakaconstexpr uint8_t MASK_END_OF_LINE = 0b00111; 98b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint8_t MASK_START_OF_LINE = 0b11000; 99b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 100b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline HyphenEdit packHyphenEdit(StartHyphenEdit start, EndHyphenEdit end) { 101b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return static_cast<uint8_t>(start) << START_BITS_SHIFT | static_cast<uint8_t>(end); 102b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka} 103b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 104b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline EndHyphenEdit endHyphenEdit(HyphenEdit hyphenEdit) { 105b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return static_cast<EndHyphenEdit>(hyphenEdit & MASK_END_OF_LINE); 106b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka} 107b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 108b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline StartHyphenEdit startHyphenEdit(HyphenEdit hyphenEdit) { 109b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return static_cast<StartHyphenEdit>(hyphenEdit >> START_BITS_SHIFT); 110b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka} 111b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 112b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline bool isReplacement(EndHyphenEdit hyph) { 113b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return hyph == EndHyphenEdit::REPLACE_WITH_HYPHEN; 114b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka} 115b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 116b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline bool isInsertion(StartHyphenEdit hyph) { 117b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return hyph != StartHyphenEdit::NO_EDIT; 118b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka} 119b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 120b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline bool isInsertion(EndHyphenEdit hyph) { 121b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return static_cast<uint8_t>(hyph) >= static_cast<uint8_t>(EndHyphenEdit::INSERT_HYPHEN); 122b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka} 123b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 1246c8722e217ff5238f0b849152d7936959a728103Seigo Nonakatemplate <typename T, size_t size> 1256c8722e217ff5238f0b849152d7936959a728103Seigo Nonakaconstexpr size_t ARRAYSIZE(T const (&)[size]) { 1266c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka return size; 1276c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka} 128b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_ZWJ[] = {CHAR_ZWJ}; 129b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_HYPHEN[] = {CHAR_HYPHEN}; 130b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_ARMENIAN_HYPHEN[] = {CHAR_ARMENIAN_HYPHEN}; 131b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_MAQAF[] = {CHAR_MAQAF}; 132b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_UCAS_HYPHEN[] = {CHAR_UCAS_HYPHEN}; 133b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr uint32_t HYPHEN_STR_ZWJ_AND_HYPHEN[] = {CHAR_ZWJ, CHAR_HYPHEN}; 134b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakaconstexpr std::pair<const uint32_t*, size_t> EMPTY_HYPHEN_STR(nullptr, 0); 135b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka#define MAKE_HYPHEN_STR(chars) std::make_pair((chars), ARRAYSIZE(chars)) 136b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 137b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline std::pair<const uint32_t*, size_t> getHyphenString(StartHyphenEdit hyph) { 138b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka if (hyph == StartHyphenEdit::INSERT_ZWJ) { 139b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ); 140b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka } else if (hyph == StartHyphenEdit::INSERT_HYPHEN) { 141b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN); 142b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka } else { 143b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return EMPTY_HYPHEN_STR; 144b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka } 145b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka} 146b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka 147b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonakainline std::pair<const uint32_t*, size_t> getHyphenString(EndHyphenEdit hyph) { 148b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka switch (hyph) { 149b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka case EndHyphenEdit::REPLACE_WITH_HYPHEN: // fall through 150b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka case EndHyphenEdit::INSERT_HYPHEN: 151b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN); 152b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka case EndHyphenEdit::INSERT_ARMENIAN_HYPHEN: 153b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return MAKE_HYPHEN_STR(HYPHEN_STR_ARMENIAN_HYPHEN); 154b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka case EndHyphenEdit::INSERT_MAQAF: 155b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return MAKE_HYPHEN_STR(HYPHEN_STR_MAQAF); 156b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka case EndHyphenEdit::INSERT_UCAS_HYPHEN: 157b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return MAKE_HYPHEN_STR(HYPHEN_STR_UCAS_HYPHEN); 158b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka case EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN: 159b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ_AND_HYPHEN); 160b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka case EndHyphenEdit::NO_EDIT: 161b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka default: 162b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka return EMPTY_HYPHEN_STR; 163b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka } 164b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka} 165b1363f277a872e9b932059b3bd88bc17be1f3577Seigo Nonaka#undef MAKE_HYPHEN_STR 166c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 167b1363f277a872e9b932059b3bd88bc17be1f3577Seigo NonakaEndHyphenEdit editForThisLine(HyphenationType type); 168b1363f277a872e9b932059b3bd88bc17be1f3577Seigo NonakaStartHyphenEdit editForNextLine(HyphenationType type); 169c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 170f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien// hyb file header; implementation details are in the .cpp file 171f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienstruct Header; 1725cdad92c300a65cab89b172e952186f0c5870657Raph Levien 1735cdad92c300a65cab89b172e952186f0c5870657Raph Levienclass Hyphenator { 1745cdad92c300a65cab89b172e952186f0c5870657Raph Levienpublic: 175c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in 176c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the 177c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // corresponding code unit offset in the word. 178c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // 179524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka // out must have at least the length of the word capacity. 180524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka // 181c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Example: word is "hyphen", result is the following, corresponding to "hy-phen": 182c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK] 183524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka void hyphenate(const U16StringPiece& word, HyphenationType* out) const; 184524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka 185524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka // Compute the hyphenation of a word. 186524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka // 187524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka // out will be resized to word length. 188524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const { 189524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka out->resize(word.size()); 190524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka return hyphenate(word, out->data()); 191524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka } 192c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 193c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character 194c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // immediately after which line breaks are allowed, but words containing it should not be 195c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // automatically hyphenated. 196c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader static bool isLineBreakingHyphen(uint32_t cp); 1975cdad92c300a65cab89b172e952186f0c5870657Raph Levien 198f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // pattern data is in binary format, as described in doc/hyb_file_format.md. Note: 199f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // the caller is responsible for ensuring that the lifetime of the pattern data is 200f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // at least as long as the Hyphenator object. 201f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 2025aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka // This class doesn't copy or take ownership of patternData. Caller must keep the data valid 2035aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka // until this instance is deleted. 204c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens. 2055aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix, 2066c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka const std::string& locale); 2076c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka 2085cdad92c300a65cab89b172e952186f0c5870657Raph Levienprivate: 2095aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka enum class HyphenationLocale : uint8_t { 2105aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka OTHER = 0, 211237f0665cc54875c38d3b9d2333d5f50c6fd6d3aRoozbeh Pournader CATALAN = 1, 212237f0665cc54875c38d3b9d2333d5f50c6fd6d3aRoozbeh Pournader POLISH = 2, 213237f0665cc54875c38d3b9d2333d5f50c6fd6d3aRoozbeh Pournader SLOVENIAN = 3, 2145aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka }; 2155aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka 2165aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka // Use Hyphenator::loadBinary instead. 2175aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka Hyphenator(const uint8_t* patternData, size_t minPrefix, size_t minSuffix, 2186c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka HyphenationLocale hyphenLocale); 2195aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka 220c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // apply various hyphenation rules including hard and soft hyphens, ignoring patterns 221524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka void hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const; 2225cdad92c300a65cab89b172e952186f0c5870657Raph Levien 223c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map. 224c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or 225c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen. 226c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Note that this method writes len+2 entries into alpha_codes (including start and stop) 227524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka HyphenationType alphabetLookup(uint16_t* alpha_codes, const U16StringPiece& word) const; 228f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 229f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // calculate hyphenation from patterns, assuming alphabet lookup has already been done 230524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue, 231524d294be051584e5506e5a4ad6ec70471bf4c08Seigo Nonaka HyphenationType* out) const; 2325cdad92c300a65cab89b172e952186f0c5870657Raph Levien 233f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so 234f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // that temporary buffers can be stack-allocated without waste, which is a slightly 235f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // different use case. It measures UTF-16 code units. 236f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien static const size_t MAX_HYPHENATED_SIZE = 64; 237f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 2385aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka const uint8_t* mPatternData; 2395aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka const size_t mMinPrefix, mMinSuffix; 2405aa870f7ccd5138af60c96ec232192b52f967530Seigo Nonaka const HyphenationLocale mHyphenationLocale; 241f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 242f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // accessors for binary data 2436c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka const Header* getHeader() const { return reinterpret_cast<const Header*>(mPatternData); } 2445cdad92c300a65cab89b172e952186f0c5870657Raph Levien}; 2455cdad92c300a65cab89b172e952186f0c5870657Raph Levien 24614e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka} // namespace minikin 2475cdad92c300a65cab89b172e952186f0c5870657Raph Levien 2486c8722e217ff5238f0b849152d7936959a728103Seigo Nonaka#endif // MINIKIN_HYPHENATOR_H 249