Hyphenator.cpp revision 5cdad92c300a65cab89b172e952186f0c5870657
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <vector>
18#include <memory>
19#include <cctype>
20#include <algorithm>
21#include <string>
22
23// HACK: for reading pattern file
24#include <fcntl.h>
25
26#define LOG_TAG "Minikin"
27#include "utils/Log.h"
28
29#include "minikin/Hyphenator.h"
30
31using std::vector;
32
33namespace android {
34
35static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
36
37void Hyphenator::addPattern(const uint16_t* pattern, size_t size) {
38    vector<uint16_t> word;
39    vector<uint8_t> result;
40
41    // start by parsing the Liang-format pattern into a word and a result vector, the
42    // vector right-aligned but without leading zeros. Examples:
43    // a1bc2d -> abcd [1, 0, 2, 0]
44    // abc1 -> abc [1]
45    // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5]
46    bool lastWasLetter = false;
47    bool haveSeenNumber = false;
48    for (size_t i = 0; i < size; i++) {
49        uint16_t c = pattern[i];
50        if (isdigit(c)) {
51            result.push_back(c - '0');
52            lastWasLetter = false;
53            haveSeenNumber = true;
54        } else {
55            word.push_back(c);
56            if (lastWasLetter && haveSeenNumber) {
57                result.push_back(0);
58            }
59            lastWasLetter = true;
60        }
61    }
62    if (lastWasLetter) {
63        result.push_back(0);
64    }
65    Trie* t = &root;
66    for (size_t i = 0; i < word.size(); i++) {
67        t = &t->succ[word[i]];
68    }
69    t->result = result;
70}
71
72// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
73// as recommended in UAX #14 (Use of Soft Hyphen)
74void Hyphenator::hyphenateSoft(vector<uint8_t>* result, const uint16_t* word, size_t len) {
75    (*result)[0] = 0;
76    for (size_t i = 1; i < len; i++) {
77        (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
78    }
79}
80
81void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
82    result->clear();
83    result->resize(len);
84    if (len < MIN_PREFIX + MIN_SUFFIX) return;
85    size_t maxOffset = len - MIN_SUFFIX + 1;
86    for (size_t i = 0; i < len + 1; i++) {
87        const Trie* node = &root;
88        for (size_t j = i; j < len + 2; j++) {
89            uint16_t c;
90            if (j == 0 || j == len + 1) {
91                c = '.';  // word boundary character in pattern data files
92            } else {
93                c = word[j - 1];
94                if (c == CHAR_SOFT_HYPHEN) {
95                    hyphenateSoft(result, word, len);
96                    return;
97                }
98                // TODO: use locale-sensitive case folding from ICU.
99                c = tolower(c);
100            }
101            auto search = node->succ.find(c);
102            if (search != node->succ.end()) {
103                node = &search->second;
104            } else {
105                break;
106            }
107            if (!node->result.empty()) {
108                int resultLen = node->result.size();
109                int offset = j + 1 - resultLen;
110                int start = std::max(MIN_PREFIX - offset, 0);
111                int end = std::min(resultLen, (int)maxOffset - offset);
112                // TODO performance: this inner loop can profitably be optimized
113                for (int k = start; k < end; k++) {
114                    (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]);
115                }
116#if 0
117                // debug printing of matched patterns
118                std::string dbg;
119                for (size_t k = i; k <= j + 1; k++) {
120                    int off = k - j - 2 + resultLen;
121                    if (off >= 0 && node->result[off] != 0) {
122                        dbg.push_back((char)('0' + node->result[off]));
123                    }
124                    if (k < j + 1) {
125                        uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1];
126                        dbg.push_back((char)c);
127                    }
128                }
129                ALOGD("%d:%d %s", i, j, dbg.c_str());
130#endif
131            }
132        }
133    }
134    // Since the above calculation does not modify values outside
135    // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
136    for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
137        (*result)[i] &= 1;
138    }
139}
140
141Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) {
142    Hyphenator* result = new Hyphenator;
143    for (size_t i = 0; i < size; i++) {
144        size_t end = i;
145        while (patternData[end] != '\n') end++;
146        result->addPattern(patternData + i, end - i);
147        i = end;
148    }
149    return result;
150}
151
152}  // namespace android
153