Hyphenator.cpp revision 5cdad92c300a65cab89b172e952186f0c5870657
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <vector> 18#include <memory> 19#include <cctype> 20#include <algorithm> 21#include <string> 22 23// HACK: for reading pattern file 24#include <fcntl.h> 25 26#define LOG_TAG "Minikin" 27#include "utils/Log.h" 28 29#include "minikin/Hyphenator.h" 30 31using std::vector; 32 33namespace android { 34 35static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; 36 37void Hyphenator::addPattern(const uint16_t* pattern, size_t size) { 38 vector<uint16_t> word; 39 vector<uint8_t> result; 40 41 // start by parsing the Liang-format pattern into a word and a result vector, the 42 // vector right-aligned but without leading zeros. Examples: 43 // a1bc2d -> abcd [1, 0, 2, 0] 44 // abc1 -> abc [1] 45 // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5] 46 bool lastWasLetter = false; 47 bool haveSeenNumber = false; 48 for (size_t i = 0; i < size; i++) { 49 uint16_t c = pattern[i]; 50 if (isdigit(c)) { 51 result.push_back(c - '0'); 52 lastWasLetter = false; 53 haveSeenNumber = true; 54 } else { 55 word.push_back(c); 56 if (lastWasLetter && haveSeenNumber) { 57 result.push_back(0); 58 } 59 lastWasLetter = true; 60 } 61 } 62 if (lastWasLetter) { 63 result.push_back(0); 64 } 65 Trie* t = &root; 66 for (size_t i = 0; i < word.size(); i++) { 67 t = &t->succ[word[i]]; 68 } 69 t->result = result; 70} 71 72// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation, 73// as recommended in UAX #14 (Use of Soft Hyphen) 74void Hyphenator::hyphenateSoft(vector<uint8_t>* result, const uint16_t* word, size_t len) { 75 (*result)[0] = 0; 76 for (size_t i = 1; i < len; i++) { 77 (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN; 78 } 79} 80 81void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) { 82 result->clear(); 83 result->resize(len); 84 if (len < MIN_PREFIX + MIN_SUFFIX) return; 85 size_t maxOffset = len - MIN_SUFFIX + 1; 86 for (size_t i = 0; i < len + 1; i++) { 87 const Trie* node = &root; 88 for (size_t j = i; j < len + 2; j++) { 89 uint16_t c; 90 if (j == 0 || j == len + 1) { 91 c = '.'; // word boundary character in pattern data files 92 } else { 93 c = word[j - 1]; 94 if (c == CHAR_SOFT_HYPHEN) { 95 hyphenateSoft(result, word, len); 96 return; 97 } 98 // TODO: use locale-sensitive case folding from ICU. 99 c = tolower(c); 100 } 101 auto search = node->succ.find(c); 102 if (search != node->succ.end()) { 103 node = &search->second; 104 } else { 105 break; 106 } 107 if (!node->result.empty()) { 108 int resultLen = node->result.size(); 109 int offset = j + 1 - resultLen; 110 int start = std::max(MIN_PREFIX - offset, 0); 111 int end = std::min(resultLen, (int)maxOffset - offset); 112 // TODO performance: this inner loop can profitably be optimized 113 for (int k = start; k < end; k++) { 114 (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]); 115 } 116#if 0 117 // debug printing of matched patterns 118 std::string dbg; 119 for (size_t k = i; k <= j + 1; k++) { 120 int off = k - j - 2 + resultLen; 121 if (off >= 0 && node->result[off] != 0) { 122 dbg.push_back((char)('0' + node->result[off])); 123 } 124 if (k < j + 1) { 125 uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1]; 126 dbg.push_back((char)c); 127 } 128 } 129 ALOGD("%d:%d %s", i, j, dbg.c_str()); 130#endif 131 } 132 } 133 } 134 // Since the above calculation does not modify values outside 135 // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0. 136 for (size_t i = MIN_PREFIX; i < maxOffset; i++) { 137 (*result)[i] &= 1; 138 } 139} 140 141Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) { 142 Hyphenator* result = new Hyphenator; 143 for (size_t i = 0; i < size; i++) { 144 size_t end = i; 145 while (patternData[end] != '\n') end++; 146 result->addPattern(patternData + i, end - i); 147 i = end; 148 } 149 return result; 150} 151 152} // namespace android 153