15cdad92c300a65cab89b172e952186f0c5870657Raph Levien/* 25cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Copyright (C) 2015 The Android Open Source Project 35cdad92c300a65cab89b172e952186f0c5870657Raph Levien * 45cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 55cdad92c300a65cab89b172e952186f0c5870657Raph Levien * you may not use this file except in compliance with the License. 65cdad92c300a65cab89b172e952186f0c5870657Raph Levien * You may obtain a copy of the License at 75cdad92c300a65cab89b172e952186f0c5870657Raph Levien * 85cdad92c300a65cab89b172e952186f0c5870657Raph Levien * http://www.apache.org/licenses/LICENSE-2.0 95cdad92c300a65cab89b172e952186f0c5870657Raph Levien * 105cdad92c300a65cab89b172e952186f0c5870657Raph Levien * Unless required by applicable law or agreed to in writing, software 115cdad92c300a65cab89b172e952186f0c5870657Raph Levien * distributed under the License is distributed on an "AS IS" BASIS, 125cdad92c300a65cab89b172e952186f0c5870657Raph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 135cdad92c300a65cab89b172e952186f0c5870657Raph Levien * See the License for the specific language governing permissions and 145cdad92c300a65cab89b172e952186f0c5870657Raph Levien * limitations under the License. 155cdad92c300a65cab89b172e952186f0c5870657Raph Levien */ 165cdad92c300a65cab89b172e952186f0c5870657Raph Levien 175cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <vector> 185cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <memory> 195cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <algorithm> 205cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <string> 21cdd19dadd11a611409c24bb69e6629eab6812d98Roozbeh Pournader#include <unicode/uchar.h> 22c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader#include <unicode/uscript.h> 235cdad92c300a65cab89b172e952186f0c5870657Raph Levien 245cdad92c300a65cab89b172e952186f0c5870657Raph Levien// HACK: for reading pattern file 255cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <fcntl.h> 265cdad92c300a65cab89b172e952186f0c5870657Raph Levien 275cdad92c300a65cab89b172e952186f0c5870657Raph Levien#define LOG_TAG "Minikin" 285cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include "utils/Log.h" 295cdad92c300a65cab89b172e952186f0c5870657Raph Levien 305cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include "minikin/Hyphenator.h" 315cdad92c300a65cab89b172e952186f0c5870657Raph Levien 325cdad92c300a65cab89b172e952186f0c5870657Raph Levienusing std::vector; 335cdad92c300a65cab89b172e952186f0c5870657Raph Levien 3414e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonakanamespace minikin { 355cdad92c300a65cab89b172e952186f0c5870657Raph Levien 36c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic const uint16_t CHAR_HYPHEN_MINUS = 0x002D; 375cdad92c300a65cab89b172e952186f0c5870657Raph Levienstatic const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; 38c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic const uint16_t CHAR_MIDDLE_DOT = 0x00B7; 39c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic const uint16_t CHAR_HYPHEN = 0x2010; 405cdad92c300a65cab89b172e952186f0c5870657Raph Levien 41f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien// The following are structs that correspond to tables inside the hyb file format 42f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 43f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienstruct AlphabetTable0 { 44f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t version; 45f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t min_codepoint; 46f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t max_codepoint; 47f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint8_t data[1]; // actually flexible array, size is known at runtime 48f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien}; 49f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 50f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienstruct AlphabetTable1 { 51f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t version; 52f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t n_entries; 53f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t data[1]; // actually flexible array, size is known at runtime 54f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 55f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien static uint32_t codepoint(uint32_t entry) { return entry >> 11; } 56f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien static uint32_t value(uint32_t entry) { return entry & 0x7ff; } 57f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien}; 58f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 59f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienstruct Trie { 60f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t version; 61f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t char_mask; 62f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t link_shift; 63f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t link_mask; 64f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t pattern_shift; 65f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t n_entries; 66f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t data[1]; // actually flexible array, size is known at runtime 67f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien}; 68f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 69f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienstruct Pattern { 70f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t version; 71f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t n_entries; 72f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t pattern_offset; 73f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t pattern_size; 74f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t data[1]; // actually flexible array, size is known at runtime 75f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 76f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // accessors 77f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien static uint32_t len(uint32_t entry) { return entry >> 26; } 78f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; } 79f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const uint8_t* buf(uint32_t entry) const { 80f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff); 81f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 82f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien}; 83f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 84f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienstruct Header { 85f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t magic; 86f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t version; 87f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t alphabet_offset; 88f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t trie_offset; 89f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t pattern_offset; 90f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t file_size; 91f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 92f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // accessors 93f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); } 94f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t alphabetVersion() const { 95f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset); 965cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 97f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const AlphabetTable0* alphabetTable0() const { 98f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset); 995cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 100f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const AlphabetTable1* alphabetTable1() const { 101f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset); 102f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 103f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const Trie* trieTable() const { 104f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return reinterpret_cast<const Trie*>(bytes() + trie_offset); 105f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 106f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const Pattern* patternTable() const { 107f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return reinterpret_cast<const Pattern*>(bytes() + pattern_offset); 108f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 109f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien}; 110f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 111d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh PournaderHyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix) { 112f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien Hyphenator* result = new Hyphenator; 113f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result->patternData = patternData; 114d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader result->minPrefix = minPrefix; 115d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader result->minSuffix = minSuffix; 116f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return result; 117f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien} 118f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 119c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournadervoid Hyphenator::hyphenate(vector<HyphenationType>* result, const uint16_t* word, size_t len, 120c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader const icu::Locale& locale) { 121f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result->clear(); 122f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien result->resize(len); 123f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const size_t paddedLen = len + 2; // start and stop code each count for 1 124f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if (patternData != nullptr && 125d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader len >= minPrefix + minSuffix && paddedLen <= MAX_HYPHENATED_SIZE) { 126f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; 127c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len); 128c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (hyphenValue != HyphenationType::DONT_BREAK) { 129c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue); 130f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien return; 131f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 132f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // TODO: try NFC normalization 133f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // TODO: handle non-BMP Unicode (requires remapping of offsets) 1345cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 135c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the 136c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup 137c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // would return DONT_BREAK. 138c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader hyphenateWithNoPatterns(result->data(), word, len, locale); 1395cdad92c300a65cab89b172e952186f0c5870657Raph Levien} 1405cdad92c300a65cab89b172e952186f0c5870657Raph Levien 141c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// This function determines whether a character is like U+2010 HYPHEN in 142c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// line breaking and usage: a character immediately after which line breaks 143c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// are allowed, but words containing it should not be automatically 144c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// hyphenated using patterns. This is a curated set, created by manually 145c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// inspecting all the characters that have the Unicode line breaking 146c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// property of BA or HY and seeing which ones are hyphens. 147c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderbool Hyphenator::isLineBreakingHyphen(uint32_t c) { 148c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return (c == 0x002D || // HYPHEN-MINUS 149c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader c == 0x058A || // ARMENIAN HYPHEN 150c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader c == 0x05BE || // HEBREW PUNCTUATION MAQAF 151c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader c == 0x1400 || // CANADIAN SYLLABICS HYPHEN 152c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader c == 0x2010 || // HYPHEN 153c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader c == 0x2013 || // EN DASH 154c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader c == 0x2027 || // HYPHENATION POINT 155c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN 156c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader c == 0x2E40); // DOUBLE HYPHEN 157c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader} 158c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 159c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderconst static uint32_t HYPHEN_STR[] = {0x2010, 0}; 160c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderconst static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0}; 161c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderconst static uint32_t MAQAF_STR[] = {0x05BE, 0}; 162c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderconst static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0}; 163c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderconst static uint32_t ZWJ_STR[] = {0x200D, 0}; 164c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderconst static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0}; 165c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 166c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderconst uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) { 167c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader switch (hyph) { 168c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case INSERT_HYPHEN_AT_END: 169c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case REPLACE_WITH_HYPHEN_AT_END: 170c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case INSERT_HYPHEN_AT_START: 171c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HYPHEN_STR; 172c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case INSERT_ARMENIAN_HYPHEN_AT_END: 173c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return ARMENIAN_HYPHEN_STR; 174c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case INSERT_MAQAF_AT_END: 175c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return MAQAF_STR; 176c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case INSERT_UCAS_HYPHEN_AT_END: 177c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return UCAS_HYPHEN_STR; 178c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case INSERT_ZWJ_AND_HYPHEN_AT_END: 179c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return ZWJ_AND_HYPHEN_STR; 180c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case INSERT_ZWJ_AT_START: 181c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return ZWJ_STR; 182c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader default: 183c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return nullptr; 184c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 185c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader} 186c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 187c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderuint32_t HyphenEdit::editForThisLine(HyphenationType type) { 188c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader switch (type) { 189c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::DONT_BREAK: 190c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return NO_EDIT; 191c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::BREAK_AND_INSERT_HYPHEN: 192c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return INSERT_HYPHEN_AT_END; 193c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN: 194c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return INSERT_ARMENIAN_HYPHEN_AT_END; 195c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::BREAK_AND_INSERT_MAQAF: 196c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return INSERT_MAQAF_AT_END; 197c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN: 198c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return INSERT_UCAS_HYPHEN_AT_END; 199c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN: 200c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return REPLACE_WITH_HYPHEN_AT_END; 201c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: 202c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return INSERT_ZWJ_AND_HYPHEN_AT_END; 203c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader default: 204c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return BREAK_AT_END; 205c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 206c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader} 207c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 208c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderuint32_t HyphenEdit::editForNextLine(HyphenationType type) { 209c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader switch (type) { 210c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::DONT_BREAK: 211c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return NO_EDIT; 212c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE: 213c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return INSERT_HYPHEN_AT_START; 214c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ: 215c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return INSERT_ZWJ_AT_START; 216c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader default: 217c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return BREAK_AT_START; 218c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 219c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader} 220c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 221c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic UScriptCode getScript(uint32_t codePoint) { 222c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader UErrorCode errorCode = U_ZERO_ERROR; 223c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode); 224c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (U_SUCCESS(errorCode)) { 225c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return script; 226c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else { 227c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return USCRIPT_INVALID_CODE; 228c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 229c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader} 230c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 231c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) { 232c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen 233c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens 234c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch 235c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // to maqaf for Hebrew, we can simply add a condition here. 236c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader const UScriptCode script = getScript(codePoint); 237c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (script == USCRIPT_KANNADA 238c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader || script == USCRIPT_MALAYALAM 239c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader || script == USCRIPT_TAMIL 240c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader || script == USCRIPT_TELUGU) { 241c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Grantha is not included, since we don't support non-BMP hyphenation yet. 242c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; 243c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else if (script == USCRIPT_ARMENIAN) { 244c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN; 245c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else if (script == USCRIPT_CANADIAN_ABORIGINAL) { 246c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN; 247c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else { 248c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::BREAK_AND_INSERT_HYPHEN; 249c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 250c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader} 251c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 252c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic inline int32_t getJoiningType(UChar32 codepoint) { 253c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE); 254c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader} 255c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 256c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN. 257c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// This function decides if the letters before and after the hyphen should appear as joining. 258c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic inline HyphenationType getHyphTypeForArabic(const uint16_t* word, size_t len, 259c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader size_t location) { 260c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader ssize_t i = location; 261c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader int32_t type = U_JT_NON_JOINING; 262c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader while (static_cast<size_t>(i) < len && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { 263c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader i++; 264c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 265c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) { 266c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // The next character is of the type that may join the last character. See if the last 267c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // character is also of the right type. 268c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader i = location - 2; // Skip the soft hyphen 269c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader type = U_JT_NON_JOINING; 270c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) { 271c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader i--; 272c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 273c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) { 274c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ; 275c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 276c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 277c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::BREAK_AND_INSERT_HYPHEN; 278c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader} 279c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 280c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words 281c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections 282c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen). 283c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournadervoid Hyphenator::hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len, 284c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader const icu::Locale& locale) { 285c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result[0] = HyphenationType::DONT_BREAK; 2865cdad92c300a65cab89b172e952186f0c5870657Raph Levien for (size_t i = 1; i < len; i++) { 287c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader const uint16_t prevChar = word[i - 1]; 288c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (i > 1 && isLineBreakingHyphen(prevChar)) { 289c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break after hyphens, but only if they don't start the word. 290c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 291c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) 292c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader && strcmp(locale.getLanguage(), "pl") == 0 293c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader && getScript(word[i]) == USCRIPT_LATIN ) { 294c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // In Polish, hyphens get repeated at the next line. To be safe, 295c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // we will do this only if the next character is Latin. 296c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE; 297c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else { 298c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN; 299c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 300c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) { 301c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Break after soft hyphens, but only if they don't start the word (a soft hyphen 302c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // starting the word doesn't give any useful break opportunities). The type of the break 303c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // is based on the script of the character we break on. 304c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (getScript(word[i]) == USCRIPT_ARABIC) { 305c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // For Arabic, we need to look and see if the characters around the soft hyphen 306c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // actually join. If they don't, we'll just insert a normal hyphen. 307c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result[i] = getHyphTypeForArabic(word, len, i); 308c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else { 309c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result[i] = hyphenationTypeBasedOnScript(word[i]); 310c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 311c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else if (prevChar == CHAR_MIDDLE_DOT 312d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader && minPrefix < i && i <= len - minSuffix 313c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader && ((word[i - 2] == 'l' && word[i] == 'l') 314c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader || (word[i - 2] == 'L' && word[i] == 'L')) 315c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader && strcmp(locale.getLanguage(), "ca") == 0) { 316c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // In Catalan, "l·l" should break as "l-" on the first line 317c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // and "l" on the next line. 318c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN; 319c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } else { 320c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result[i] = HyphenationType::DONT_BREAK; 321c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 322f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 3235cdad92c300a65cab89b172e952186f0c5870657Raph Levien} 3245cdad92c300a65cab89b172e952186f0c5870657Raph Levien 325c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh PournaderHyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, 326c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader size_t len) { 327f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const Header* header = getHeader(); 328c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN; 329f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // TODO: check header magic 330f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t alphabetVersion = header->alphabetVersion(); 331f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if (alphabetVersion == 0) { 332f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const AlphabetTable0* alphabet = header->alphabetTable0(); 333f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t min_codepoint = alphabet->min_codepoint; 334f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t max_codepoint = alphabet->max_codepoint; 335f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alpha_codes[0] = 0; // word start 336f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for (size_t i = 0; i < len; i++) { 337f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint16_t c = word[i]; 338f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if (c < min_codepoint || c >= max_codepoint) { 339c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::DONT_BREAK; 340f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 341f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint8_t code = alphabet->data[c - min_codepoint]; 342f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if (code == 0) { 343c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::DONT_BREAK; 344c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 345c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { 346c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result = hyphenationTypeBasedOnScript(c); 347f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 348f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alpha_codes[i + 1] = code; 349f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 350f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alpha_codes[len + 1] = 0; // word termination 351c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return result; 352f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } else if (alphabetVersion == 1) { 353f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const AlphabetTable1* alphabet = header->alphabetTable1(); 354f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien size_t n_entries = alphabet->n_entries; 355f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const uint32_t* begin = alphabet->data; 356f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const uint32_t* end = begin + n_entries; 357f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alpha_codes[0] = 0; 358f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for (size_t i = 0; i < len; i++) { 359f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint16_t c = word[i]; 360f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien auto p = std::lower_bound(begin, end, c << 11); 361f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if (p == end) { 362c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::DONT_BREAK; 3635cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 364f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t entry = *p; 365f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if (AlphabetTable1::codepoint(entry) != c) { 366c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::DONT_BREAK; 367c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader } 368c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) { 369c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result = hyphenationTypeBasedOnScript(c); 370f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 371f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alpha_codes[i + 1] = AlphabetTable1::value(entry); 372f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 373f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien alpha_codes[len + 1] = 0; 374c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return result; 375f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien } 376c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader return HyphenationType::DONT_BREAK; 377f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien} 378f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien 379f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien/** 380f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien * Internal implementation, after conversion to codes. All case folding and normalization 381f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien * has been done by now, and all characters have been found in the alphabet. 382f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien * Note: len here is the padded length including 0 codes at start and end. 383f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien **/ 384c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournadervoid Hyphenator::hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len, 385c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader HyphenationType hyphenValue) { 386c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t."); 387c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Reuse the result array as a buffer for calculating intermediate hyphenation numbers. 388c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader uint8_t* buffer = reinterpret_cast<uint8_t*>(result); 389c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader 390f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const Header* header = getHeader(); 391f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const Trie* trie = header->trieTable(); 392f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const Pattern* pattern = header->patternTable(); 393f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t char_mask = trie->char_mask; 394f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t link_shift = trie->link_shift; 395f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t link_mask = trie->link_mask; 396f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t pattern_shift = trie->pattern_shift; 397d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader size_t maxOffset = len - minSuffix - 1; 398f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for (size_t i = 0; i < len - 1; i++) { 399f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t node = 0; // index into Trie table 400f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien for (size_t j = i; j < len; j++) { 401f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint16_t c = codes[j]; 402f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t entry = trie->data[node + c]; 403f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if ((entry & char_mask) == c) { 404f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien node = (entry & link_mask) >> link_shift; 4055cdad92c300a65cab89b172e952186f0c5870657Raph Levien } else { 4065cdad92c300a65cab89b172e952186f0c5870657Raph Levien break; 4075cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 408f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t pat_ix = trie->data[node] >> pattern_shift; 409f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset 410f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien // into the buf pool. This is the pattern for the substring (i..j) we just matched, 411c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // which we combine (via point-wise max) into the buffer vector. 412f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien if (pat_ix != 0) { 413f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien uint32_t pat_entry = pattern->data[pat_ix]; 414f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien int pat_len = Pattern::len(pat_entry); 415f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien int pat_shift = Pattern::shift(pat_entry); 416f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien const uint8_t* pat_buf = pattern->buf(pat_entry); 417f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien int offset = j + 1 - (pat_len + pat_shift); 418c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // offset is the index within buffer that lines up with the start of pat_buf 419d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader int start = std::max((int)minPrefix - offset, 0); 420f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien int end = std::min(pat_len, (int)maxOffset - offset); 4215cdad92c300a65cab89b172e952186f0c5870657Raph Levien for (int k = start; k < end; k++) { 422c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]); 4235cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 4245cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 4255cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 4265cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 4275cdad92c300a65cab89b172e952186f0c5870657Raph Levien // Since the above calculation does not modify values outside 428d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK. 429d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader for (size_t i = minPrefix; i < maxOffset; i++) { 430c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader // Hyphenation opportunities happen when the hyphenation numbers are odd. 431c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK; 4325cdad92c300a65cab89b172e952186f0c5870657Raph Levien } 4335cdad92c300a65cab89b172e952186f0c5870657Raph Levien} 4345cdad92c300a65cab89b172e952186f0c5870657Raph Levien 43514e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka} // namespace minikin 436