Hyphenator.cpp revision cdd19dadd11a611409c24bb69e6629eab6812d98
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <vector> 18#include <memory> 19#include <algorithm> 20#include <string> 21#include <unicode/uchar.h> 22 23// HACK: for reading pattern file 24#include <fcntl.h> 25 26#define LOG_TAG "Minikin" 27#include "utils/Log.h" 28 29#include "minikin/Hyphenator.h" 30 31using std::vector; 32 33namespace android { 34 35static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; 36 37void Hyphenator::addPattern(const uint16_t* pattern, size_t size) { 38 vector<uint16_t> word; 39 vector<uint8_t> result; 40 41 // start by parsing the Liang-format pattern into a word and a result vector, the 42 // vector right-aligned but without leading zeros. Examples: 43 // a1bc2d -> abcd [1, 0, 2, 0] 44 // abc1 -> abc [1] 45 // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5] 46 bool lastWasLetter = false; 47 bool haveSeenNumber = false; 48 for (size_t i = 0; i < size; i++) { 49 uint16_t c = pattern[i]; 50 if (isdigit(c)) { 51 result.push_back(c - '0'); 52 lastWasLetter = false; 53 haveSeenNumber = true; 54 } else { 55 word.push_back(c); 56 if (lastWasLetter && haveSeenNumber) { 57 result.push_back(0); 58 } 59 lastWasLetter = true; 60 } 61 } 62 if (lastWasLetter) { 63 result.push_back(0); 64 } 65 Trie* t = &root; 66 for (size_t i = 0; i < word.size(); i++) { 67 t = &t->succ[word[i]]; 68 } 69 t->result = result; 70} 71 72// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation, 73// as recommended in UAX #14 (Use of Soft Hyphen) 74void Hyphenator::hyphenateSoft(vector<uint8_t>* result, const uint16_t* word, size_t len) { 75 (*result)[0] = 0; 76 for (size_t i = 1; i < len; i++) { 77 (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN; 78 } 79} 80 81void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) { 82 result->clear(); 83 result->resize(len); 84 if (len < MIN_PREFIX + MIN_SUFFIX) return; 85 size_t maxOffset = len - MIN_SUFFIX + 1; 86 for (size_t i = 0; i < len + 1; i++) { 87 const Trie* node = &root; 88 for (size_t j = i; j < len + 2; j++) { 89 uint16_t c; 90 if (j == 0 || j == len + 1) { 91 c = '.'; // word boundary character in pattern data files 92 } else { 93 c = word[j - 1]; 94 if (c == CHAR_SOFT_HYPHEN) { 95 hyphenateSoft(result, word, len); 96 return; 97 } 98 // TODO: This uses ICU's simple character to character lowercasing, which ignores 99 // the locale, and ignores cases when lowercasing a character results in more than 100 // one character. It should be fixed to consider the locale (in order for it to work 101 // correctly for Turkish and Azerbaijani), as well as support one-to-many, and 102 // many-to-many case conversions (including non-BMP cases). 103 if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character 104 // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is 105 if (0x0041 <= c && c <= 0x005A) { 106 c += 0x0020; 107 } 108 } else { 109 c = u_tolower(c); 110 } 111 } 112 auto search = node->succ.find(c); 113 if (search != node->succ.end()) { 114 node = &search->second; 115 } else { 116 break; 117 } 118 if (!node->result.empty()) { 119 int resultLen = node->result.size(); 120 int offset = j + 1 - resultLen; 121 int start = std::max(MIN_PREFIX - offset, 0); 122 int end = std::min(resultLen, (int)maxOffset - offset); 123 // TODO performance: this inner loop can profitably be optimized 124 for (int k = start; k < end; k++) { 125 (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]); 126 } 127#if 0 128 // debug printing of matched patterns 129 std::string dbg; 130 for (size_t k = i; k <= j + 1; k++) { 131 int off = k - j - 2 + resultLen; 132 if (off >= 0 && node->result[off] != 0) { 133 dbg.push_back((char)('0' + node->result[off])); 134 } 135 if (k < j + 1) { 136 uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1]; 137 dbg.push_back((char)c); 138 } 139 } 140 ALOGD("%d:%d %s", i, j, dbg.c_str()); 141#endif 142 } 143 } 144 } 145 // Since the above calculation does not modify values outside 146 // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0. 147 for (size_t i = MIN_PREFIX; i < maxOffset; i++) { 148 (*result)[i] &= 1; 149 } 150} 151 152Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) { 153 Hyphenator* result = new Hyphenator; 154 for (size_t i = 0; i < size; i++) { 155 size_t end = i; 156 while (patternData[end] != '\n') end++; 157 result->addPattern(patternData + i, end - i); 158 i = end; 159 } 160 return result; 161} 162 163} // namespace android 164