/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
165cdad92c300a65cab89b172e952186f0c5870657Raph Levien
175cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <vector>
185cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <memory>
195cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <algorithm>
205cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <string>
21cdd19dadd11a611409c24bb69e6629eab6812d98Roozbeh Pournader#include <unicode/uchar.h>
22c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader#include <unicode/uscript.h>
235cdad92c300a65cab89b172e952186f0c5870657Raph Levien
245cdad92c300a65cab89b172e952186f0c5870657Raph Levien// HACK: for reading pattern file
255cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include <fcntl.h>
265cdad92c300a65cab89b172e952186f0c5870657Raph Levien
275cdad92c300a65cab89b172e952186f0c5870657Raph Levien#define LOG_TAG "Minikin"
285cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include "utils/Log.h"
295cdad92c300a65cab89b172e952186f0c5870657Raph Levien
305cdad92c300a65cab89b172e952186f0c5870657Raph Levien#include "minikin/Hyphenator.h"
315cdad92c300a65cab89b172e952186f0c5870657Raph Levien
325cdad92c300a65cab89b172e952186f0c5870657Raph Levienusing std::vector;
335cdad92c300a65cab89b172e952186f0c5870657Raph Levien
3414e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonakanamespace minikin {
355cdad92c300a65cab89b172e952186f0c5870657Raph Levien
// UTF-16 code units of the characters that get special treatment in this file.
static constexpr uint16_t CHAR_HYPHEN_MINUS = 0x002D;  // U+002D
static constexpr uint16_t CHAR_SOFT_HYPHEN = 0x00AD;   // U+00AD
static constexpr uint16_t CHAR_MIDDLE_DOT = 0x00B7;    // U+00B7
static constexpr uint16_t CHAR_HYPHEN = 0x2010;        // U+2010
405cdad92c300a65cab89b172e952186f0c5870657Raph Levien
41f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien// The following are structs that correspond to tables inside the hyb file format
42f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
// Version-0 alphabet table of the hyb format. This struct is overlaid directly on the pattern
// data, so the order and width of the fields must not change.
struct AlphabetTable0 {
    uint32_t version;
    // Lookups treat a code unit c as covered when min_codepoint <= c < max_codepoint.
    uint32_t min_codepoint;
    uint32_t max_codepoint;
    // Indexed by (c - min_codepoint); a zero entry is treated as "not in the alphabet".
    // Declared with one element, but really a flexible array whose size is known at runtime.
    uint8_t data[1];
};
49f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
// Version-1 alphabet table of the hyb format. This struct is overlaid directly on the pattern
// data, so the order and width of the fields must not change.
struct AlphabetTable1 {
    uint32_t version;
    uint32_t n_entries;
    // Packed entries; declared with one element, but really a flexible array whose size is
    // known at runtime.
    uint32_t data[1];

    // Upper 21 bits of a packed entry.
    static uint32_t codepoint(uint32_t entry) {
        const uint32_t valueBits = 11;
        return entry >> valueBits;
    }

    // Lower 11 bits of a packed entry.
    static uint32_t value(uint32_t entry) {
        const uint32_t valueMask = 0x7ff;
        return entry & valueMask;
    }
};
58f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
// Trie table of the hyb format. This struct is overlaid directly on the pattern data, so the
// order and width of the fields must not change.
struct Trie {
    uint32_t version;
    // Masks and shift amounts stored alongside the packed entries in data[].
    uint32_t char_mask;
    uint32_t link_shift;
    uint32_t link_mask;
    uint32_t pattern_shift;
    uint32_t n_entries;
    // Declared with one element, but really a flexible array whose size is known at runtime.
    uint32_t data[1];
};
68f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
// Pattern table of the hyb format. This struct is overlaid directly on the pattern data, so the
// order and width of the fields must not change.
struct Pattern {
    uint32_t version;
    uint32_t n_entries;
    uint32_t pattern_offset;
    uint32_t pattern_size;
    // Packed entries; declared with one element, but really a flexible array whose size is
    // known at runtime.
    uint32_t data[1];

    // Accessors for the three bit fields of a packed entry:
    //   bits 31..26 -> len, bits 25..20 -> shift, bits 19..0 -> byte offset used by buf().

    static uint32_t len(uint32_t entry) {
        const uint32_t lenShift = 26;
        return entry >> lenShift;
    }

    static uint32_t shift(uint32_t entry) {
        const uint32_t shiftShift = 20;
        const uint32_t sixBitMask = 0x3f;
        return (entry >> shiftShift) & sixBitMask;
    }

    // Address of the entry's bytes: start of this table + pattern_offset + the entry's low
    // 20 bits.
    const uint8_t* buf(uint32_t entry) const {
        const uint32_t offsetMask = 0xfffff;
        const uint8_t* const tableStart = reinterpret_cast<const uint8_t*>(this);
        return tableStart + pattern_offset + (entry & offsetMask);
    }
};
83f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
84f0be43de02a1e07308d3d95408349c3c7f973430Raph Levienstruct Header {
85f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t magic;
86f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t version;
87f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t alphabet_offset;
88f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t trie_offset;
89f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t pattern_offset;
90f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t file_size;
91f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
92f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // accessors
93f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
94f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t alphabetVersion() const {
95f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
965cdad92c300a65cab89b172e952186f0c5870657Raph Levien    }
97f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const AlphabetTable0* alphabetTable0() const {
98f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
995cdad92c300a65cab89b172e952186f0c5870657Raph Levien    }
100f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const AlphabetTable1* alphabetTable1() const {
101f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
102f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    }
103f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const Trie* trieTable() const {
104f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return reinterpret_cast<const Trie*>(bytes() + trie_offset);
105f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    }
106f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const Pattern* patternTable() const {
107f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
108f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    }
109f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien};
110f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
111d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh PournaderHyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix) {
112f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    Hyphenator* result = new Hyphenator;
113f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    result->patternData = patternData;
114d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader    result->minPrefix = minPrefix;
115d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader    result->minSuffix = minSuffix;
116f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    return result;
117f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien}
118f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
119c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournadervoid Hyphenator::hyphenate(vector<HyphenationType>* result, const uint16_t* word, size_t len,
120c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        const icu::Locale& locale) {
121f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    result->clear();
122f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    result->resize(len);
123f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const size_t paddedLen = len + 2;  // start and stop code each count for 1
124f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if (patternData != nullptr &&
125d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader            len >= minPrefix + minSuffix && paddedLen <= MAX_HYPHENATED_SIZE) {
126f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
127c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len);
128c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        if (hyphenValue != HyphenationType::DONT_BREAK) {
129c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue);
130f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            return;
131f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        }
132f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        // TODO: try NFC normalization
133f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        // TODO: handle non-BMP Unicode (requires remapping of offsets)
1345cdad92c300a65cab89b172e952186f0c5870657Raph Levien    }
135c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the
136c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup
137c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // would return DONT_BREAK.
138c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    hyphenateWithNoPatterns(result->data(), word, len, locale);
1395cdad92c300a65cab89b172e952186f0c5870657Raph Levien}
1405cdad92c300a65cab89b172e952186f0c5870657Raph Levien
141c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// This function determines whether a character is like U+2010 HYPHEN in
142c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// line breaking and usage: a character immediately after which line breaks
143c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// are allowed, but words containing it should not be automatically
144c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// hyphenated using patterns. This is a curated set, created by manually
145c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// inspecting all the characters that have the Unicode line breaking
146c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// property of BA or HY and seeing which ones are hyphens.
147c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderbool Hyphenator::isLineBreakingHyphen(uint32_t c) {
148c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    return (c == 0x002D || // HYPHEN-MINUS
149c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            c == 0x058A || // ARMENIAN HYPHEN
150c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            c == 0x05BE || // HEBREW PUNCTUATION MAQAF
151c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
152c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            c == 0x2010 || // HYPHEN
153c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            c == 0x2013 || // EN DASH
154c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            c == 0x2027 || // HYPHENATION POINT
155c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
156c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            c == 0x2E40);  // DOUBLE HYPHEN
157c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}
158c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
// Zero-terminated code point sequences handed out by HyphenEdit::getHyphenString().
static const uint32_t HYPHEN_STR[] = { 0x2010, 0 };                  // HYPHEN
static const uint32_t ARMENIAN_HYPHEN_STR[] = { 0x058A, 0 };         // ARMENIAN HYPHEN
static const uint32_t MAQAF_STR[] = { 0x05BE, 0 };                   // HEBREW PUNCTUATION MAQAF
static const uint32_t UCAS_HYPHEN_STR[] = { 0x1400, 0 };             // CANADIAN SYLLABICS HYPHEN
static const uint32_t ZWJ_STR[] = { 0x200D, 0 };                     // ZERO WIDTH JOINER
static const uint32_t ZWJ_AND_HYPHEN_STR[] = { 0x200D, 0x2010, 0 };  // ZWJ followed by HYPHEN
165c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
166c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderconst uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) {
167c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    switch (hyph) {
168c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case INSERT_HYPHEN_AT_END:
169c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case REPLACE_WITH_HYPHEN_AT_END:
170c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case INSERT_HYPHEN_AT_START:
171c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return HYPHEN_STR;
172c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case INSERT_ARMENIAN_HYPHEN_AT_END:
173c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return ARMENIAN_HYPHEN_STR;
174c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case INSERT_MAQAF_AT_END:
175c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return MAQAF_STR;
176c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case INSERT_UCAS_HYPHEN_AT_END:
177c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return UCAS_HYPHEN_STR;
178c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case INSERT_ZWJ_AND_HYPHEN_AT_END:
179c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return ZWJ_AND_HYPHEN_STR;
180c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case INSERT_ZWJ_AT_START:
181c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return ZWJ_STR;
182c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        default:
183c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return nullptr;
184c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    }
185c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}
186c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
187c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderuint32_t HyphenEdit::editForThisLine(HyphenationType type) {
188c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    switch (type) {
189c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::DONT_BREAK:
190c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return NO_EDIT;
191c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::BREAK_AND_INSERT_HYPHEN:
192c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return INSERT_HYPHEN_AT_END;
193c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
194c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return INSERT_ARMENIAN_HYPHEN_AT_END;
195c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::BREAK_AND_INSERT_MAQAF:
196c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return INSERT_MAQAF_AT_END;
197c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
198c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return INSERT_UCAS_HYPHEN_AT_END;
199c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
200c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return REPLACE_WITH_HYPHEN_AT_END;
201c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
202c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return INSERT_ZWJ_AND_HYPHEN_AT_END;
203c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        default:
204c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return BREAK_AT_END;
205c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    }
206c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}
207c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
208c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderuint32_t HyphenEdit::editForNextLine(HyphenationType type) {
209c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    switch (type) {
210c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::DONT_BREAK:
211c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return NO_EDIT;
212c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
213c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return INSERT_HYPHEN_AT_START;
214c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
215c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return INSERT_ZWJ_AT_START;
216c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        default:
217c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return BREAK_AT_START;
218c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    }
219c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}
220c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
221c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic UScriptCode getScript(uint32_t codePoint) {
222c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    UErrorCode errorCode = U_ZERO_ERROR;
223c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
224c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    if (U_SUCCESS(errorCode)) {
225c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        return script;
226c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    } else {
227c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        return USCRIPT_INVALID_CODE;
228c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    }
229c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}
230c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
231c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
232c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
233c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
234c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
235c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // to maqaf for Hebrew, we can simply add a condition here.
236c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    const UScriptCode script = getScript(codePoint);
237c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    if (script == USCRIPT_KANNADA
238c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            || script == USCRIPT_MALAYALAM
239c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            || script == USCRIPT_TAMIL
240c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            || script == USCRIPT_TELUGU) {
241c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        // Grantha is not included, since we don't support non-BMP hyphenation yet.
242c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
243c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    } else if (script == USCRIPT_ARMENIAN) {
244c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
245c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
246c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
247c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    } else {
248c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        return HyphenationType::BREAK_AND_INSERT_HYPHEN;
249c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    }
250c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}
251c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
252c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic inline int32_t getJoiningType(UChar32 codepoint) {
253c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
254c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}
255c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
256c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN.
257c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// This function decides if the letters before and after the hyphen should appear as joining.
258c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournaderstatic inline HyphenationType getHyphTypeForArabic(const uint16_t* word, size_t len,
259c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        size_t location) {
260c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    ssize_t i = location;
261c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    int32_t type = U_JT_NON_JOINING;
262c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    while (static_cast<size_t>(i) < len && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
263c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        i++;
264c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    }
265c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) {
266c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        // The next character is of the type that may join the last character. See if the last
267c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        // character is also of the right type.
268c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        i = location - 2; // Skip the soft hyphen
269c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        type = U_JT_NON_JOINING;
270c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
271c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            i--;
272c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        }
273c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) {
274c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
275c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        }
276c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    }
277c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    return HyphenationType::BREAK_AND_INSERT_HYPHEN;
278c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader}
279c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
280c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
281c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
282c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader// 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
283c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournadervoid Hyphenator::hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len,
284c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        const icu::Locale& locale) {
285c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    result[0] = HyphenationType::DONT_BREAK;
2865cdad92c300a65cab89b172e952186f0c5870657Raph Levien    for (size_t i = 1; i < len; i++) {
287c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        const uint16_t prevChar = word[i - 1];
288c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        if (i > 1 && isLineBreakingHyphen(prevChar)) {
289c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            // Break after hyphens, but only if they don't start the word.
290c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
291c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN)
292c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                    && strcmp(locale.getLanguage(), "pl") == 0
293c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                    && getScript(word[i]) == USCRIPT_LATIN ) {
294c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                // In Polish, hyphens get repeated at the next line. To be safe,
295c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                // we will do this only if the next character is Latin.
296c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
297c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            } else {
298c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
299c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            }
300c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
301c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            // Break after soft hyphens, but only if they don't start the word (a soft hyphen
302c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            // starting the word doesn't give any useful break opportunities). The type of the break
303c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            // is based on the script of the character we break on.
304c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            if (getScript(word[i]) == USCRIPT_ARABIC) {
305c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                // For Arabic, we need to look and see if the characters around the soft hyphen
306c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                // actually join. If they don't, we'll just insert a normal hyphen.
307c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                result[i] = getHyphTypeForArabic(word, len, i);
308c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            } else {
309c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                result[i] = hyphenationTypeBasedOnScript(word[i]);
310c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            }
311c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        } else if (prevChar == CHAR_MIDDLE_DOT
312d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader                && minPrefix < i && i <= len - minSuffix
313c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                && ((word[i - 2] == 'l' && word[i] == 'l')
314c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                        || (word[i - 2] == 'L' && word[i] == 'L'))
315c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                && strcmp(locale.getLanguage(), "ca") == 0) {
316c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            // In Catalan, "l·l" should break as "l-" on the first line
317c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            // and "l" on the next line.
318c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
319c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        } else {
320c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            result[i] = HyphenationType::DONT_BREAK;
321c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        }
322f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien     }
3235cdad92c300a65cab89b172e952186f0c5870657Raph Levien}
3245cdad92c300a65cab89b172e952186f0c5870657Raph Levien
325c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh PournaderHyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word,
326c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        size_t len) {
327f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const Header* header = getHeader();
328c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
329f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    // TODO: check header magic
330f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t alphabetVersion = header->alphabetVersion();
331f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    if (alphabetVersion == 0) {
332f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        const AlphabetTable0* alphabet = header->alphabetTable0();
333f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        uint32_t min_codepoint = alphabet->min_codepoint;
334f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        uint32_t max_codepoint = alphabet->max_codepoint;
335f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        alpha_codes[0] = 0;  // word start
336f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for (size_t i = 0; i < len; i++) {
337f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            uint16_t c = word[i];
338f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if (c < min_codepoint || c >= max_codepoint) {
339c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                return HyphenationType::DONT_BREAK;
340f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            }
341f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            uint8_t code = alphabet->data[c - min_codepoint];
342f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if (code == 0) {
343c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                return HyphenationType::DONT_BREAK;
344c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            }
345c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
346c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                result = hyphenationTypeBasedOnScript(c);
347f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            }
348f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            alpha_codes[i + 1] = code;
349f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        }
350f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        alpha_codes[len + 1] = 0;  // word termination
351c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        return result;
352f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    } else if (alphabetVersion == 1) {
353f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        const AlphabetTable1* alphabet = header->alphabetTable1();
354f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        size_t n_entries = alphabet->n_entries;
355f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        const uint32_t* begin = alphabet->data;
356f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        const uint32_t* end = begin + n_entries;
357f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        alpha_codes[0] = 0;
358f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for (size_t i = 0; i < len; i++) {
359f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            uint16_t c = word[i];
360f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            auto p = std::lower_bound(begin, end, c << 11);
361f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if (p == end) {
362c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                return HyphenationType::DONT_BREAK;
3635cdad92c300a65cab89b172e952186f0c5870657Raph Levien            }
364f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            uint32_t entry = *p;
365f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if (AlphabetTable1::codepoint(entry) != c) {
366c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                return HyphenationType::DONT_BREAK;
367c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            }
368c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
369c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                result = hyphenationTypeBasedOnScript(c);
370f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            }
371f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            alpha_codes[i + 1] = AlphabetTable1::value(entry);
372f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        }
373f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        alpha_codes[len + 1] = 0;
374c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        return result;
375f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    }
376c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    return HyphenationType::DONT_BREAK;
377f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien}
378f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien
379f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien/**
380f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien * Internal implementation, after conversion to codes. All case folding and normalization
381f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien * has been done by now, and all characters have been found in the alphabet.
382f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien * Note: len here is the padded length including 0 codes at start and end.
383f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien **/
384c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournadervoid Hyphenator::hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len,
385c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        HyphenationType hyphenValue) {
386c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t.");
387c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    // Reuse the result array as a buffer for calculating intermediate hyphenation numbers.
388c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader    uint8_t* buffer = reinterpret_cast<uint8_t*>(result);
389c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader
390f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const Header* header = getHeader();
391f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const Trie* trie = header->trieTable();
392f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    const Pattern* pattern = header->patternTable();
393f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t char_mask = trie->char_mask;
394f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t link_shift = trie->link_shift;
395f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t link_mask = trie->link_mask;
396f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    uint32_t pattern_shift = trie->pattern_shift;
397d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader    size_t maxOffset = len - minSuffix - 1;
398f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien    for (size_t i = 0; i < len - 1; i++) {
399f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        uint32_t node = 0;  // index into Trie table
400f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien        for (size_t j = i; j < len; j++) {
401f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            uint16_t c = codes[j];
402f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            uint32_t entry = trie->data[node + c];
403f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if ((entry & char_mask) == c) {
404f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                node = (entry & link_mask) >> link_shift;
4055cdad92c300a65cab89b172e952186f0c5870657Raph Levien            } else {
4065cdad92c300a65cab89b172e952186f0c5870657Raph Levien                break;
4075cdad92c300a65cab89b172e952186f0c5870657Raph Levien            }
408f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            uint32_t pat_ix = trie->data[node] >> pattern_shift;
409f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
410f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            // into the buf pool. This is the pattern for the substring (i..j) we just matched,
411c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader            // which we combine (via point-wise max) into the buffer vector.
412f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien            if (pat_ix != 0) {
413f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                uint32_t pat_entry = pattern->data[pat_ix];
414f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                int pat_len = Pattern::len(pat_entry);
415f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                int pat_shift = Pattern::shift(pat_entry);
416f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                const uint8_t* pat_buf = pattern->buf(pat_entry);
417f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                int offset = j + 1 - (pat_len + pat_shift);
418c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                // offset is the index within buffer that lines up with the start of pat_buf
419d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader                int start = std::max((int)minPrefix - offset, 0);
420f0be43de02a1e07308d3d95408349c3c7f973430Raph Levien                int end = std::min(pat_len, (int)maxOffset - offset);
4215cdad92c300a65cab89b172e952186f0c5870657Raph Levien                for (int k = start; k < end; k++) {
422c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader                    buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
4235cdad92c300a65cab89b172e952186f0c5870657Raph Levien                }
4245cdad92c300a65cab89b172e952186f0c5870657Raph Levien            }
4255cdad92c300a65cab89b172e952186f0c5870657Raph Levien        }
4265cdad92c300a65cab89b172e952186f0c5870657Raph Levien    }
4275cdad92c300a65cab89b172e952186f0c5870657Raph Levien    // Since the above calculation does not modify values outside
428d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader    // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK.
429d78f260a988024b878909555edbfcd7159e7ad2fRoozbeh Pournader    for (size_t i = minPrefix; i < maxOffset; i++) {
430c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        // Hyphenation opportunities happen when the hyphenation numbers are odd.
431c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8eeRoozbeh Pournader        result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
4325cdad92c300a65cab89b172e952186f0c5870657Raph Levien    }
4335cdad92c300a65cab89b172e952186f0c5870657Raph Levien}
4345cdad92c300a65cab89b172e952186f0c5870657Raph Levien
43514e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka}  // namespace minikin
436