1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <vector>
18#include <memory>
19#include <algorithm>
20#include <string>
21#include <unicode/uchar.h>
22
23// HACK: for reading pattern file
24#include <fcntl.h>
25
26#define LOG_TAG "Minikin"
27#include "utils/Log.h"
28
29#include "minikin/Hyphenator.h"
30
31using std::vector;
32
33namespace android {
34
35static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
36
37// The following are structs that correspond to tables inside the hyb file format
38
39struct AlphabetTable0 {
40    uint32_t version;
41    uint32_t min_codepoint;
42    uint32_t max_codepoint;
43    uint8_t data[1];  // actually flexible array, size is known at runtime
44};
45
46struct AlphabetTable1 {
47    uint32_t version;
48    uint32_t n_entries;
49    uint32_t data[1]; // actually flexible array, size is known at runtime
50
51    static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
52    static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
53};
54
55struct Trie {
56    uint32_t version;
57    uint32_t char_mask;
58    uint32_t link_shift;
59    uint32_t link_mask;
60    uint32_t pattern_shift;
61    uint32_t n_entries;
62    uint32_t data[1];  // actually flexible array, size is known at runtime
63};
64
65struct Pattern {
66    uint32_t version;
67    uint32_t n_entries;
68    uint32_t pattern_offset;
69    uint32_t pattern_size;
70    uint32_t data[1];  // actually flexible array, size is known at runtime
71
72    // accessors
73    static uint32_t len(uint32_t entry) { return entry >> 26; }
74    static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
75    const uint8_t* buf(uint32_t entry) const {
76        return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
77    }
78};
79
80struct Header {
81    uint32_t magic;
82    uint32_t version;
83    uint32_t alphabet_offset;
84    uint32_t trie_offset;
85    uint32_t pattern_offset;
86    uint32_t file_size;
87
88    // accessors
89    const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
90    uint32_t alphabetVersion() const {
91        return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
92    }
93    const AlphabetTable0* alphabetTable0() const {
94        return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
95    }
96    const AlphabetTable1* alphabetTable1() const {
97        return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
98    }
99    const Trie* trieTable() const {
100        return reinterpret_cast<const Trie*>(bytes() + trie_offset);
101    }
102    const Pattern* patternTable() const {
103        return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
104    }
105};
106
107Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) {
108    Hyphenator* result = new Hyphenator;
109    result->patternData = patternData;
110    return result;
111}
112
113void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
114    result->clear();
115    result->resize(len);
116    const size_t paddedLen = len + 2;  // start and stop code each count for 1
117    if (patternData != nullptr &&
118            (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_SIZE) {
119        uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
120        if (alphabetLookup(alpha_codes, word, len)) {
121            hyphenateFromCodes(result->data(), alpha_codes, paddedLen);
122            return;
123        }
124        // TODO: try NFC normalization
125        // TODO: handle non-BMP Unicode (requires remapping of offsets)
126    }
127    hyphenateSoft(result->data(), word, len);
128}
129
130// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
131// as recommended in UAX #14 (Use of Soft Hyphen)
132void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len) {
133    result[0] = 0;
134    for (size_t i = 1; i < len; i++) {
135        result[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
136     }
137}
138
139bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len) {
140    const Header* header = getHeader();
141    // TODO: check header magic
142    uint32_t alphabetVersion = header->alphabetVersion();
143    if (alphabetVersion == 0) {
144        const AlphabetTable0* alphabet = header->alphabetTable0();
145        uint32_t min_codepoint = alphabet->min_codepoint;
146        uint32_t max_codepoint = alphabet->max_codepoint;
147        alpha_codes[0] = 0;  // word start
148        for (size_t i = 0; i < len; i++) {
149            uint16_t c = word[i];
150            if (c < min_codepoint || c >= max_codepoint) {
151                return false;
152            }
153            uint8_t code = alphabet->data[c - min_codepoint];
154            if (code == 0) {
155                return false;
156            }
157            alpha_codes[i + 1] = code;
158        }
159        alpha_codes[len + 1] = 0;  // word termination
160        return true;
161    } else if (alphabetVersion == 1) {
162        const AlphabetTable1* alphabet = header->alphabetTable1();
163        size_t n_entries = alphabet->n_entries;
164        const uint32_t* begin = alphabet->data;
165        const uint32_t* end = begin + n_entries;
166        alpha_codes[0] = 0;
167        for (size_t i = 0; i < len; i++) {
168            uint16_t c = word[i];
169            auto p = std::lower_bound(begin, end, c << 11);
170            if (p == end) {
171                return false;
172            }
173            uint32_t entry = *p;
174            if (AlphabetTable1::codepoint(entry) != c) {
175                return false;
176            }
177            alpha_codes[i + 1] = AlphabetTable1::value(entry);
178        }
179        alpha_codes[len + 1] = 0;
180        return true;
181    }
182    return false;
183}
184
185/**
186 * Internal implementation, after conversion to codes. All case folding and normalization
187 * has been done by now, and all characters have been found in the alphabet.
188 * Note: len here is the padded length including 0 codes at start and end.
189 **/
190void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len) {
191    const Header* header = getHeader();
192    const Trie* trie = header->trieTable();
193    const Pattern* pattern = header->patternTable();
194    uint32_t char_mask = trie->char_mask;
195    uint32_t link_shift = trie->link_shift;
196    uint32_t link_mask = trie->link_mask;
197    uint32_t pattern_shift = trie->pattern_shift;
198    size_t maxOffset = len - MIN_SUFFIX - 1;
199    for (size_t i = 0; i < len - 1; i++) {
200        uint32_t node = 0;  // index into Trie table
201        for (size_t j = i; j < len; j++) {
202            uint16_t c = codes[j];
203            uint32_t entry = trie->data[node + c];
204            if ((entry & char_mask) == c) {
205                node = (entry & link_mask) >> link_shift;
206            } else {
207                break;
208            }
209            uint32_t pat_ix = trie->data[node] >> pattern_shift;
210            // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
211            // into the buf pool. This is the pattern for the substring (i..j) we just matched,
212            // which we combine (via point-wise max) into the result vector.
213            if (pat_ix != 0) {
214                uint32_t pat_entry = pattern->data[pat_ix];
215                int pat_len = Pattern::len(pat_entry);
216                int pat_shift = Pattern::shift(pat_entry);
217                const uint8_t* pat_buf = pattern->buf(pat_entry);
218                int offset = j + 1 - (pat_len + pat_shift);
219                // offset is the index within result that lines up with the start of pat_buf
220                int start = std::max(MIN_PREFIX - offset, 0);
221                int end = std::min(pat_len, (int)maxOffset - offset);
222                for (int k = start; k < end; k++) {
223                    result[offset + k] = std::max(result[offset + k], pat_buf[k]);
224                }
225            }
226        }
227    }
228    // Since the above calculation does not modify values outside
229    // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
230    for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
231        result[i] &= 1;
232    }
233}
234
235}  // namespace android
236