Hyphenator.h revision f0be43de02a1e07308d3d95408349c3c7f973430
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/**
18 * An implementation of Liang's hyphenation algorithm.
19 */
20
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>
23
24#ifndef MINIKIN_HYPHENATOR_H
25#define MINIKIN_HYPHENATOR_H
26
namespace android {

// Header of a hyb pattern file. Only the name is declared here; the layout is an
// implementation detail of the .cpp file.
struct Header;

// Hyphenator based on Liang's hyphenation algorithm, driven by binary pattern data.
class Hyphenator {
public:
    // Note: this will also require a locale, for proper case folding behavior
    // NOTE(review): ownership of the returned object is not stated in this header;
    // presumably the caller owns it -- confirm against the .cpp file.
    static Hyphenator* load(const uint16_t* patternData, size_t size);

    // Compute the hyphenation of a word, storing the hyphenation in result vector. Each
    // entry in the vector is a "hyphen edit" to be applied at the corresponding code unit
    // offset in the word. Currently 0 means no hyphen and 1 means insert hyphen and break,
    // but this will be expanded to other edits for nonstandard hyphenation.
    // Example: word is "hyphen", result is [0 0 1 0 0 0], corresponding to "hy-phen".
    void hyphenate(std::vector<uint8_t>* result, const uint16_t* word, size_t len);

    // Load a hyphenator from pattern data in binary format, as described in
    // doc/hyb_file_format.md. The caller is responsible for ensuring that the lifetime
    // of the pattern data is at least as long as the Hyphenator object.
    //
    // Note: nullptr is valid input, in which case the hyphenator only processes soft
    // hyphens.
    static Hyphenator* loadBinary(const uint8_t* patternData);

private:
    // apply soft hyphens only, ignoring patterns
    void hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len);

    // try looking up word in alphabet table, return false if any code units fail to map
    // Note that this method writes len+2 entries into alpha_codes (including start and
    // stop), so the caller must provide a buffer of at least that size.
    bool alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len);

    // calculate hyphenation from patterns, assuming alphabet lookup has already been done
    void hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len);

    // TODO: these should become parameters, as they might vary by locale, screen size, and
    // possibly explicit user control.
    static const int MIN_PREFIX = 2;
    static const int MIN_SUFFIX = 3;

    // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
    // that temporary buffers can be stack-allocated without waste, which is a slightly
    // different use case. It measures UTF-16 code units.
    static const size_t MAX_HYPHENATED_SIZE = 64;

    // Start of the binary pattern data; not owned (see loadBinary). Defaults to nullptr
    // so that a default-constructed Hyphenator never carries an indeterminate pointer;
    // nullptr is the documented "soft hyphens only" state.
    const uint8_t* patternData = nullptr;

    // accessors for binary data
    // Reinterprets the start of the pattern data as the hyb file header. The result is
    // null when patternData is null.
    const Header* getHeader() const {
        return reinterpret_cast<const Header*>(patternData);
    }
};

}  // namespace android
82
83#endif   // MINIKIN_HYPHENATOR_H
84