1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <vector>
18#include <memory>
19#include <algorithm>
20#include <string>
21#include <unicode/uchar.h>
22#include <unicode/uscript.h>
23
24// HACK: for reading pattern file
25#include <fcntl.h>
26
27#define LOG_TAG "Minikin"
28#include "utils/Log.h"
29
30#include "minikin/Hyphenator.h"
31
32using std::vector;
33
34namespace minikin {
35
36static const uint16_t CHAR_HYPHEN_MINUS = 0x002D;
37static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
38static const uint16_t CHAR_MIDDLE_DOT = 0x00B7;
39static const uint16_t CHAR_HYPHEN = 0x2010;
40
41// The following are structs that correspond to tables inside the hyb file format
42
43struct AlphabetTable0 {
44    uint32_t version;
45    uint32_t min_codepoint;
46    uint32_t max_codepoint;
47    uint8_t data[1];  // actually flexible array, size is known at runtime
48};
49
50struct AlphabetTable1 {
51    uint32_t version;
52    uint32_t n_entries;
53    uint32_t data[1]; // actually flexible array, size is known at runtime
54
55    static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
56    static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
57};
58
59struct Trie {
60    uint32_t version;
61    uint32_t char_mask;
62    uint32_t link_shift;
63    uint32_t link_mask;
64    uint32_t pattern_shift;
65    uint32_t n_entries;
66    uint32_t data[1];  // actually flexible array, size is known at runtime
67};
68
69struct Pattern {
70    uint32_t version;
71    uint32_t n_entries;
72    uint32_t pattern_offset;
73    uint32_t pattern_size;
74    uint32_t data[1];  // actually flexible array, size is known at runtime
75
76    // accessors
77    static uint32_t len(uint32_t entry) { return entry >> 26; }
78    static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
79    const uint8_t* buf(uint32_t entry) const {
80        return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
81    }
82};
83
84struct Header {
85    uint32_t magic;
86    uint32_t version;
87    uint32_t alphabet_offset;
88    uint32_t trie_offset;
89    uint32_t pattern_offset;
90    uint32_t file_size;
91
92    // accessors
93    const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
94    uint32_t alphabetVersion() const {
95        return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
96    }
97    const AlphabetTable0* alphabetTable0() const {
98        return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
99    }
100    const AlphabetTable1* alphabetTable1() const {
101        return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
102    }
103    const Trie* trieTable() const {
104        return reinterpret_cast<const Trie*>(bytes() + trie_offset);
105    }
106    const Pattern* patternTable() const {
107        return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
108    }
109};
110
111Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix) {
112    Hyphenator* result = new Hyphenator;
113    result->patternData = patternData;
114    result->minPrefix = minPrefix;
115    result->minSuffix = minSuffix;
116    return result;
117}
118
119void Hyphenator::hyphenate(vector<HyphenationType>* result, const uint16_t* word, size_t len,
120        const icu::Locale& locale) {
121    result->clear();
122    result->resize(len);
123    const size_t paddedLen = len + 2;  // start and stop code each count for 1
124    if (patternData != nullptr &&
125            len >= minPrefix + minSuffix && paddedLen <= MAX_HYPHENATED_SIZE) {
126        uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
127        const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len);
128        if (hyphenValue != HyphenationType::DONT_BREAK) {
129            hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue);
130            return;
131        }
132        // TODO: try NFC normalization
133        // TODO: handle non-BMP Unicode (requires remapping of offsets)
134    }
135    // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the
136    // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup
137    // would return DONT_BREAK.
138    hyphenateWithNoPatterns(result->data(), word, len, locale);
139}
140
141// This function determines whether a character is like U+2010 HYPHEN in
142// line breaking and usage: a character immediately after which line breaks
143// are allowed, but words containing it should not be automatically
144// hyphenated using patterns. This is a curated set, created by manually
145// inspecting all the characters that have the Unicode line breaking
146// property of BA or HY and seeing which ones are hyphens.
147bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
148    return (c == 0x002D || // HYPHEN-MINUS
149            c == 0x058A || // ARMENIAN HYPHEN
150            c == 0x05BE || // HEBREW PUNCTUATION MAQAF
151            c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
152            c == 0x2010 || // HYPHEN
153            c == 0x2013 || // EN DASH
154            c == 0x2027 || // HYPHENATION POINT
155            c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
156            c == 0x2E40);  // DOUBLE HYPHEN
157}
158
159const static uint32_t HYPHEN_STR[] = {0x2010, 0};
160const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0};
161const static uint32_t MAQAF_STR[] = {0x05BE, 0};
162const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0};
163const static uint32_t ZWJ_STR[] = {0x200D, 0};
164const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0};
165
166const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) {
167    switch (hyph) {
168        case INSERT_HYPHEN_AT_END:
169        case REPLACE_WITH_HYPHEN_AT_END:
170        case INSERT_HYPHEN_AT_START:
171            return HYPHEN_STR;
172        case INSERT_ARMENIAN_HYPHEN_AT_END:
173            return ARMENIAN_HYPHEN_STR;
174        case INSERT_MAQAF_AT_END:
175            return MAQAF_STR;
176        case INSERT_UCAS_HYPHEN_AT_END:
177            return UCAS_HYPHEN_STR;
178        case INSERT_ZWJ_AND_HYPHEN_AT_END:
179            return ZWJ_AND_HYPHEN_STR;
180        case INSERT_ZWJ_AT_START:
181            return ZWJ_STR;
182        default:
183            return nullptr;
184    }
185}
186
187uint32_t HyphenEdit::editForThisLine(HyphenationType type) {
188    switch (type) {
189        case HyphenationType::DONT_BREAK:
190            return NO_EDIT;
191        case HyphenationType::BREAK_AND_INSERT_HYPHEN:
192            return INSERT_HYPHEN_AT_END;
193        case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
194            return INSERT_ARMENIAN_HYPHEN_AT_END;
195        case HyphenationType::BREAK_AND_INSERT_MAQAF:
196            return INSERT_MAQAF_AT_END;
197        case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
198            return INSERT_UCAS_HYPHEN_AT_END;
199        case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
200            return REPLACE_WITH_HYPHEN_AT_END;
201        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
202            return INSERT_ZWJ_AND_HYPHEN_AT_END;
203        default:
204            return BREAK_AT_END;
205    }
206}
207
208uint32_t HyphenEdit::editForNextLine(HyphenationType type) {
209    switch (type) {
210        case HyphenationType::DONT_BREAK:
211            return NO_EDIT;
212        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
213            return INSERT_HYPHEN_AT_START;
214        case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
215            return INSERT_ZWJ_AT_START;
216        default:
217            return BREAK_AT_START;
218    }
219}
220
221static UScriptCode getScript(uint32_t codePoint) {
222    UErrorCode errorCode = U_ZERO_ERROR;
223    const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
224    if (U_SUCCESS(errorCode)) {
225        return script;
226    } else {
227        return USCRIPT_INVALID_CODE;
228    }
229}
230
231static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
232    // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
233    // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
234    // for now to be safe.  BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
235    // to maqaf for Hebrew, we can simply add a condition here.
236    const UScriptCode script = getScript(codePoint);
237    if (script == USCRIPT_KANNADA
238            || script == USCRIPT_MALAYALAM
239            || script == USCRIPT_TAMIL
240            || script == USCRIPT_TELUGU) {
241        // Grantha is not included, since we don't support non-BMP hyphenation yet.
242        return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
243    } else if (script == USCRIPT_ARMENIAN) {
244        return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
245    } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
246        return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
247    } else {
248        return HyphenationType::BREAK_AND_INSERT_HYPHEN;
249    }
250}
251
252static inline int32_t getJoiningType(UChar32 codepoint) {
253    return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
254}
255
256// Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN.
257// This function decides if the letters before and after the hyphen should appear as joining.
258static inline HyphenationType getHyphTypeForArabic(const uint16_t* word, size_t len,
259        size_t location) {
260    ssize_t i = location;
261    int32_t type = U_JT_NON_JOINING;
262    while (static_cast<size_t>(i) < len && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
263        i++;
264    }
265    if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) {
266        // The next character is of the type that may join the last character. See if the last
267        // character is also of the right type.
268        i = location - 2; // Skip the soft hyphen
269        type = U_JT_NON_JOINING;
270        while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
271            i--;
272        }
273        if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) {
274            return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
275        }
276    }
277    return HyphenationType::BREAK_AND_INSERT_HYPHEN;
278}
279
280// Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
281// that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
282// 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
283void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len,
284        const icu::Locale& locale) {
285    result[0] = HyphenationType::DONT_BREAK;
286    for (size_t i = 1; i < len; i++) {
287        const uint16_t prevChar = word[i - 1];
288        if (i > 1 && isLineBreakingHyphen(prevChar)) {
289            // Break after hyphens, but only if they don't start the word.
290
291            if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN)
292                    && strcmp(locale.getLanguage(), "pl") == 0
293                    && getScript(word[i]) == USCRIPT_LATIN ) {
294                // In Polish, hyphens get repeated at the next line. To be safe,
295                // we will do this only if the next character is Latin.
296                result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
297            } else {
298                result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
299            }
300        } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
301            // Break after soft hyphens, but only if they don't start the word (a soft hyphen
302            // starting the word doesn't give any useful break opportunities). The type of the break
303            // is based on the script of the character we break on.
304            if (getScript(word[i]) == USCRIPT_ARABIC) {
305                // For Arabic, we need to look and see if the characters around the soft hyphen
306                // actually join. If they don't, we'll just insert a normal hyphen.
307                result[i] = getHyphTypeForArabic(word, len, i);
308            } else {
309                result[i] = hyphenationTypeBasedOnScript(word[i]);
310            }
311        } else if (prevChar == CHAR_MIDDLE_DOT
312                && minPrefix < i && i <= len - minSuffix
313                && ((word[i - 2] == 'l' && word[i] == 'l')
314                        || (word[i - 2] == 'L' && word[i] == 'L'))
315                && strcmp(locale.getLanguage(), "ca") == 0) {
316            // In Catalan, "l·l" should break as "l-" on the first line
317            // and "l" on the next line.
318            result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
319        } else {
320            result[i] = HyphenationType::DONT_BREAK;
321        }
322     }
323}
324
325HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word,
326        size_t len) {
327    const Header* header = getHeader();
328    HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
329    // TODO: check header magic
330    uint32_t alphabetVersion = header->alphabetVersion();
331    if (alphabetVersion == 0) {
332        const AlphabetTable0* alphabet = header->alphabetTable0();
333        uint32_t min_codepoint = alphabet->min_codepoint;
334        uint32_t max_codepoint = alphabet->max_codepoint;
335        alpha_codes[0] = 0;  // word start
336        for (size_t i = 0; i < len; i++) {
337            uint16_t c = word[i];
338            if (c < min_codepoint || c >= max_codepoint) {
339                return HyphenationType::DONT_BREAK;
340            }
341            uint8_t code = alphabet->data[c - min_codepoint];
342            if (code == 0) {
343                return HyphenationType::DONT_BREAK;
344            }
345            if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
346                result = hyphenationTypeBasedOnScript(c);
347            }
348            alpha_codes[i + 1] = code;
349        }
350        alpha_codes[len + 1] = 0;  // word termination
351        return result;
352    } else if (alphabetVersion == 1) {
353        const AlphabetTable1* alphabet = header->alphabetTable1();
354        size_t n_entries = alphabet->n_entries;
355        const uint32_t* begin = alphabet->data;
356        const uint32_t* end = begin + n_entries;
357        alpha_codes[0] = 0;
358        for (size_t i = 0; i < len; i++) {
359            uint16_t c = word[i];
360            auto p = std::lower_bound(begin, end, c << 11);
361            if (p == end) {
362                return HyphenationType::DONT_BREAK;
363            }
364            uint32_t entry = *p;
365            if (AlphabetTable1::codepoint(entry) != c) {
366                return HyphenationType::DONT_BREAK;
367            }
368            if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
369                result = hyphenationTypeBasedOnScript(c);
370            }
371            alpha_codes[i + 1] = AlphabetTable1::value(entry);
372        }
373        alpha_codes[len + 1] = 0;
374        return result;
375    }
376    return HyphenationType::DONT_BREAK;
377}
378
379/**
380 * Internal implementation, after conversion to codes. All case folding and normalization
381 * has been done by now, and all characters have been found in the alphabet.
382 * Note: len here is the padded length including 0 codes at start and end.
383 **/
384void Hyphenator::hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len,
385        HyphenationType hyphenValue) {
386    static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t.");
387    // Reuse the result array as a buffer for calculating intermediate hyphenation numbers.
388    uint8_t* buffer = reinterpret_cast<uint8_t*>(result);
389
390    const Header* header = getHeader();
391    const Trie* trie = header->trieTable();
392    const Pattern* pattern = header->patternTable();
393    uint32_t char_mask = trie->char_mask;
394    uint32_t link_shift = trie->link_shift;
395    uint32_t link_mask = trie->link_mask;
396    uint32_t pattern_shift = trie->pattern_shift;
397    size_t maxOffset = len - minSuffix - 1;
398    for (size_t i = 0; i < len - 1; i++) {
399        uint32_t node = 0;  // index into Trie table
400        for (size_t j = i; j < len; j++) {
401            uint16_t c = codes[j];
402            uint32_t entry = trie->data[node + c];
403            if ((entry & char_mask) == c) {
404                node = (entry & link_mask) >> link_shift;
405            } else {
406                break;
407            }
408            uint32_t pat_ix = trie->data[node] >> pattern_shift;
409            // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
410            // into the buf pool. This is the pattern for the substring (i..j) we just matched,
411            // which we combine (via point-wise max) into the buffer vector.
412            if (pat_ix != 0) {
413                uint32_t pat_entry = pattern->data[pat_ix];
414                int pat_len = Pattern::len(pat_entry);
415                int pat_shift = Pattern::shift(pat_entry);
416                const uint8_t* pat_buf = pattern->buf(pat_entry);
417                int offset = j + 1 - (pat_len + pat_shift);
418                // offset is the index within buffer that lines up with the start of pat_buf
419                int start = std::max((int)minPrefix - offset, 0);
420                int end = std::min(pat_len, (int)maxOffset - offset);
421                for (int k = start; k < end; k++) {
422                    buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
423                }
424            }
425        }
426    }
427    // Since the above calculation does not modify values outside
428    // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK.
429    for (size_t i = minPrefix; i < maxOffset; i++) {
430        // Hyphenation opportunities happen when the hyphenation numbers are odd.
431        result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
432    }
433}
434
435}  // namespace minikin
436