Hyphenator.cpp revision 6e2cccdc518f8d3424c84ae6fbe0e87ae3c3f66a
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <vector> 18#include <memory> 19#include <algorithm> 20#include <string> 21#include <unicode/uchar.h> 22 23// HACK: for reading pattern file 24#include <fcntl.h> 25 26#define LOG_TAG "Minikin" 27#include "utils/Log.h" 28 29#include "minikin/Hyphenator.h" 30 31using std::vector; 32 33namespace android { 34 35static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; 36 37// The following are structs that correspond to tables inside the hyb file format 38 39struct AlphabetTable0 { 40 uint32_t version; 41 uint32_t min_codepoint; 42 uint32_t max_codepoint; 43 uint8_t data[1]; // actually flexible array, size is known at runtime 44}; 45 46struct AlphabetTable1 { 47 uint32_t version; 48 uint32_t n_entries; 49 uint32_t data[1]; // actually flexible array, size is known at runtime 50 51 static uint32_t codepoint(uint32_t entry) { return entry >> 11; } 52 static uint32_t value(uint32_t entry) { return entry & 0x7ff; } 53}; 54 55struct Trie { 56 uint32_t version; 57 uint32_t char_mask; 58 uint32_t link_shift; 59 uint32_t link_mask; 60 uint32_t pattern_shift; 61 uint32_t n_entries; 62 uint32_t data[1]; // actually flexible array, size is known at runtime 63}; 64 65struct Pattern { 66 uint32_t version; 67 uint32_t n_entries; 68 uint32_t pattern_offset; 69 uint32_t pattern_size; 70 uint32_t data[1]; // actually flexible array, size is known at runtime 71 72 // accessors 73 static uint32_t len(uint32_t entry) { return entry >> 26; } 74 static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; } 75 const uint8_t* buf(uint32_t entry) const { 76 return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff); 77 } 78}; 79 80struct Header { 81 uint32_t magic; 82 uint32_t version; 83 uint32_t alphabet_offset; 84 uint32_t trie_offset; 85 uint32_t pattern_offset; 86 uint32_t file_size; 87 88 // accessors 89 const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); } 90 uint32_t alphabetVersion() const { 91 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset); 92 } 93 const AlphabetTable0* alphabetTable0() const { 94 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset); 95 } 96 const AlphabetTable1* alphabetTable1() const { 97 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset); 98 } 99 const Trie* trieTable() const { 100 return reinterpret_cast<const Trie*>(bytes() + trie_offset); 101 } 102 const Pattern* patternTable() const { 103 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset); 104 } 105}; 106 107Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) { 108 Hyphenator* result = new Hyphenator; 109 result->patternData = patternData; 110 return result; 111} 112 113void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) { 114 result->clear(); 115 result->resize(len); 116 const size_t paddedLen = len + 2; // start and stop code each count for 1 117 if (patternData != nullptr && 118 (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_SIZE) { 119 uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; 120 if (alphabetLookup(alpha_codes, word, len)) { 121 hyphenateFromCodes(result->data(), alpha_codes, paddedLen); 122 return; 123 } 124 // TODO: try NFC normalization 125 // TODO: handle non-BMP Unicode (requires remapping of offsets) 126 } 127 hyphenateSoft(result->data(), word, len); 128} 129 130// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation, 131// as recommended in UAX #14 (Use of Soft Hyphen) 132void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len) { 133 result[0] = 0; 134 for (size_t i = 1; i < len; i++) { 135 result[i] = word[i - 1] == CHAR_SOFT_HYPHEN; 136 } 137} 138 139bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len) { 140 const Header* header = getHeader(); 141 // TODO: check header magic 142 uint32_t alphabetVersion = header->alphabetVersion(); 143 if (alphabetVersion == 0) { 144 const AlphabetTable0* alphabet = header->alphabetTable0(); 145 uint32_t min_codepoint = alphabet->min_codepoint; 146 uint32_t max_codepoint = alphabet->max_codepoint; 147 alpha_codes[0] = 0; // word start 148 for (size_t i = 0; i < len; i++) { 149 uint16_t c = word[i]; 150 if (c < min_codepoint || c >= max_codepoint) { 151 return false; 152 } 153 uint8_t code = alphabet->data[c - min_codepoint]; 154 if (code == 0) { 155 return false; 156 } 157 alpha_codes[i + 1] = code; 158 } 159 alpha_codes[len + 1] = 0; // word termination 160 return true; 161 } else if (alphabetVersion == 1) { 162 const AlphabetTable1* alphabet = header->alphabetTable1(); 163 size_t n_entries = alphabet->n_entries; 164 const uint32_t* begin = alphabet->data; 165 const uint32_t* end = begin + n_entries; 166 alpha_codes[0] = 0; 167 for (size_t i = 0; i < len; i++) { 168 uint16_t c = word[i]; 169 auto p = std::lower_bound(begin, end, c << 11); 170 if (p == end) { 171 return false; 172 } 173 uint32_t entry = *p; 174 if (AlphabetTable1::codepoint(entry) != c) { 175 return false; 176 } 177 alpha_codes[i + 1] = AlphabetTable1::value(entry); 178 } 179 alpha_codes[len + 1] = 0; 180 return true; 181 } 182 return false; 183} 184 185/** 186 * Internal implementation, after conversion to codes. All case folding and normalization 187 * has been done by now, and all characters have been found in the alphabet. 188 * Note: len here is the padded length including 0 codes at start and end. 189 **/ 190void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len) { 191 const Header* header = getHeader(); 192 const Trie* trie = header->trieTable(); 193 const Pattern* pattern = header->patternTable(); 194 uint32_t char_mask = trie->char_mask; 195 uint32_t link_shift = trie->link_shift; 196 uint32_t link_mask = trie->link_mask; 197 uint32_t pattern_shift = trie->pattern_shift; 198 size_t maxOffset = len - MIN_SUFFIX - 1; 199 for (size_t i = 0; i < len - 1; i++) { 200 uint32_t node = 0; // index into Trie table 201 for (size_t j = i; j < len; j++) { 202 uint16_t c = codes[j]; 203 uint32_t entry = trie->data[node + c]; 204 if ((entry & char_mask) == c) { 205 node = (entry & link_mask) >> link_shift; 206 } else { 207 break; 208 } 209 uint32_t pat_ix = trie->data[node] >> pattern_shift; 210 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset 211 // into the buf pool. This is the pattern for the substring (i..j) we just matched, 212 // which we combine (via point-wise max) into the result vector. 213 if (pat_ix != 0) { 214 uint32_t pat_entry = pattern->data[pat_ix]; 215 int pat_len = Pattern::len(pat_entry); 216 int pat_shift = Pattern::shift(pat_entry); 217 const uint8_t* pat_buf = pattern->buf(pat_entry); 218 int offset = j + 1 - (pat_len + pat_shift); 219 // offset is the index within result that lines up with the start of pat_buf 220 int start = std::max(MIN_PREFIX - offset, 0); 221 int end = std::min(pat_len, (int)maxOffset - offset); 222 for (int k = start; k < end; k++) { 223 result[offset + k] = std::max(result[offset + k], pat_buf[k]); 224 } 225 } 226 } 227 } 228 // Since the above calculation does not modify values outside 229 // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0. 230 for (size_t i = MIN_PREFIX; i < maxOffset; i++) { 231 result[i] &= 1; 232 } 233} 234 235} // namespace android 236