// HyphenatorTest.cpp revision c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8ee
/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
#include <cstdint>
#include <memory>
#include <vector>

#include <gtest/gtest.h>

#include "ICUTestBase.h"
#include <minikin/Hyphenator.h>
#include <FileUtils.h>
22
// Element count of a C array. Must be given a real array: applied to a pointer it would divide
// the pointer size by the element size instead.
#ifndef NELEM
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif
26
27namespace minikin {
28
29const char* usHyph = "/system/usr/hyphen-data/hyph-en-us.hyb";
30const char* malayalamHyph = "/system/usr/hyphen-data/hyph-ml.hyb";
31
32typedef ICUTestBase HyphenatorTest;
33
34const icu::Locale catalanLocale("ca", "ES", nullptr, nullptr);
35const icu::Locale polishLocale("pl", "PL", nullptr, nullptr);
36const icu::Locale& usLocale = icu::Locale::getUS();
37
38const uint16_t HYPHEN_MINUS = 0x002D;
39const uint16_t SOFT_HYPHEN = 0x00AD;
40const uint16_t MIDDLE_DOT = 0x00B7;
41const uint16_t GREEK_LOWER_ALPHA = 0x03B1;
42const uint16_t ARMENIAN_AYB = 0x0531;
43const uint16_t HEBREW_ALEF = 0x05D0;
44const uint16_t ARABIC_ALEF = 0x0627;
45const uint16_t ARABIC_BEH = 0x0628;
46const uint16_t ARABIC_ZWARAKAY = 0x0659;
47const uint16_t MALAYALAM_KA = 0x0D15;
48const uint16_t UCAS_E = 0x1401;
49const uint16_t HYPHEN = 0x2010;
50const uint16_t EN_DASH = 0x2013;
51
52// Simple test for US English. This tests "table", which happens to be the in the exceptions list.
53TEST_F(HyphenatorTest, usEnglishAutomaticHyphenation) {
54    Hyphenator* hyphenator = Hyphenator::loadBinary(readWholeFile(usHyph).data());
55    const uint16_t word[] = {'t', 'a', 'b', 'l', 'e'};
56    std::vector<HyphenationType> result;
57    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
58    EXPECT_EQ((size_t) 5, result.size());
59    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
60    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
61    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
62    EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
63    EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
64}
65
66// Catalan l·l should break as l-/l
67TEST_F(HyphenatorTest, catalanMiddleDot) {
68    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
69    const uint16_t word[] = {'l', 'l', MIDDLE_DOT, 'l', 'l', 'l'};
70    std::vector<HyphenationType> result;
71    hyphenator->hyphenate(&result, word, NELEM(word), catalanLocale);
72    EXPECT_EQ((size_t) 6, result.size());
73    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
74    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
75    EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
76    EXPECT_EQ(HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN, result[3]);
77    EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
78    EXPECT_EQ(HyphenationType::DONT_BREAK, result[5]);
79}
80
81// Catalan l·l should not break if the word is too short.
82TEST_F(HyphenatorTest, catalanMiddleDotShortWord) {
83    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
84    const uint16_t word[] = {'l', MIDDLE_DOT, 'l'};
85    std::vector<HyphenationType> result;
86    hyphenator->hyphenate(&result, word, NELEM(word), catalanLocale);
87    EXPECT_EQ((size_t) 3, result.size());
88    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
89    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
90    EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
91}
92
93// If we break on a hyphen in Polish, the hyphen should be repeated on the next line.
94TEST_F(HyphenatorTest, polishHyphen) {
95    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
96    const uint16_t word[] = {'x', HYPHEN, 'y'};
97    std::vector<HyphenationType> result;
98    hyphenator->hyphenate(&result, word, NELEM(word), polishLocale);
99    EXPECT_EQ((size_t) 3, result.size());
100    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
101    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
102    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]);
103}
104
105// If the language is Polish but the script is not Latin, don't use Polish rules for hyphenation.
106TEST_F(HyphenatorTest, polishHyphenButNonLatinWord) {
107    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
108    const uint16_t word[] = {GREEK_LOWER_ALPHA, HYPHEN, GREEK_LOWER_ALPHA};
109    std::vector<HyphenationType> result;
110    hyphenator->hyphenate(&result, word, NELEM(word), polishLocale);
111    EXPECT_EQ((size_t) 3, result.size());
112    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
113    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
114    EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
115}
116
117// Polish en dash doesn't repeat on next line (as far as we know), but just provides a break
118// opportunity.
119TEST_F(HyphenatorTest, polishEnDash) {
120    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
121    const uint16_t word[] = {'x', EN_DASH, 'y'};
122    std::vector<HyphenationType> result;
123    hyphenator->hyphenate(&result, word, NELEM(word), polishLocale);
124    EXPECT_EQ((size_t) 3, result.size());
125    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
126    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
127    EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
128}
129
130// In Latin script text, soft hyphens should insert a visible hyphen if broken at.
131TEST_F(HyphenatorTest, latinSoftHyphen) {
132    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
133    const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
134    std::vector<HyphenationType> result;
135    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
136    EXPECT_EQ((size_t) 3, result.size());
137    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
138    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
139    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
140}
141
142// Soft hyphens at the beginning of a word are not useful in linebreaking.
143TEST_F(HyphenatorTest, latinSoftHyphenStartingTheWord) {
144    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
145    const uint16_t word[] = {SOFT_HYPHEN, 'y'};
146    std::vector<HyphenationType> result;
147    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
148    EXPECT_EQ((size_t) 2, result.size());
149    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
150    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
151}
152
153// In Malayalam script text, soft hyphens should not insert a visible hyphen if broken at.
154TEST_F(HyphenatorTest, malayalamSoftHyphen) {
155    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
156    const uint16_t word[] = {MALAYALAM_KA, SOFT_HYPHEN, MALAYALAM_KA};
157    std::vector<HyphenationType> result;
158    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
159    EXPECT_EQ((size_t) 3, result.size());
160    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
161    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
162    EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
163}
164
165// In automatically hyphenated Malayalam script text, we should not insert a visible hyphen.
166TEST_F(HyphenatorTest, malayalamAutomaticHyphenation) {
167    Hyphenator* hyphenator = Hyphenator::loadBinary(readWholeFile(malayalamHyph).data());
168    const uint16_t word[] = {
169            MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA};
170    std::vector<HyphenationType> result;
171    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
172    EXPECT_EQ((size_t) 5, result.size());
173    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
174    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
175    EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
176    EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
177    EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
178}
179
180// In Armenian script text, soft hyphens should insert an Armenian hyphen if broken at.
181TEST_F(HyphenatorTest, aremenianSoftHyphen) {
182    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
183    const uint16_t word[] = {ARMENIAN_AYB, SOFT_HYPHEN, ARMENIAN_AYB};
184    std::vector<HyphenationType> result;
185    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
186    EXPECT_EQ((size_t) 3, result.size());
187    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
188    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
189    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN, result[2]);
190}
191
192// In Hebrew script text, soft hyphens should insert a normal hyphen if broken at, for now.
193// We may need to change this to maqaf later.
194TEST_F(HyphenatorTest, hebrewSoftHyphen) {
195    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
196    const uint16_t word[] = {HEBREW_ALEF, SOFT_HYPHEN, HEBREW_ALEF};
197    std::vector<HyphenationType> result;
198    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
199    EXPECT_EQ((size_t) 3, result.size());
200    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
201    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
202    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
203}
204
205// Soft hyphen between two Arabic letters that join should keep the joining
206// behavior when broken across lines.
207TEST_F(HyphenatorTest, arabicSoftHyphenConnecting) {
208    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
209    const uint16_t word[] = {ARABIC_BEH, SOFT_HYPHEN, ARABIC_BEH};
210    std::vector<HyphenationType> result;
211    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
212    EXPECT_EQ((size_t) 3, result.size());
213    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
214    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
215    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[2]);
216}
217
218// Arabic letters may be joining on one side, but if it's the wrong side, we
219// should use the normal hyphen.
220TEST_F(HyphenatorTest, arabicSoftHyphenNonConnecting) {
221    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
222    const uint16_t word[] = {ARABIC_ALEF, SOFT_HYPHEN, ARABIC_BEH};
223    std::vector<HyphenationType> result;
224    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
225    EXPECT_EQ((size_t) 3, result.size());
226    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
227    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
228    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
229}
230
231// Skip transparent characters until you find a non-transparent one.
232TEST_F(HyphenatorTest, arabicSoftHyphenSkipTransparents) {
233    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
234    const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
235    std::vector<HyphenationType> result;
236    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
237    EXPECT_EQ((size_t) 5, result.size());
238    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
239    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
240    EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
241    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[3]);
242    EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
243}
244
245// Skip transparent characters until you find a non-transparent one. If we get to one end without
246// finding anything, we are still non-joining.
247TEST_F(HyphenatorTest, arabicSoftHyphenTransparentsAtEnd) {
248    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
249    const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY};
250    std::vector<HyphenationType> result;
251    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
252    EXPECT_EQ((size_t) 4, result.size());
253    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
254    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
255    EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
256    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[3]);
257}
258
259// Skip transparent characters until you find a non-transparent one. If we get to one end without
260// finding anything, we are still non-joining.
261TEST_F(HyphenatorTest, arabicSoftHyphenTransparentsAtStart) {
262    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
263    const uint16_t word[] = {ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
264    std::vector<HyphenationType> result;
265    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
266    EXPECT_EQ((size_t) 4, result.size());
267    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
268    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
269    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
270    EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
271}
272
273// In Unified Canadian Aboriginal script (UCAS) text, soft hyphens should insert a UCAS hyphen.
274TEST_F(HyphenatorTest, ucasSoftHyphen) {
275    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
276    const uint16_t word[] = {UCAS_E, SOFT_HYPHEN, UCAS_E};
277    std::vector<HyphenationType> result;
278    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
279    EXPECT_EQ((size_t) 3, result.size());
280    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
281    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
282    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
283}
284
285// Presently, soft hyphen looks at the character after it to determine hyphenation type. This is a
286// little arbitrary, but let's test it anyway.
287TEST_F(HyphenatorTest, mixedScriptSoftHyphen) {
288    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
289    const uint16_t word[] = {'a', SOFT_HYPHEN, UCAS_E};
290    std::vector<HyphenationType> result;
291    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
292    EXPECT_EQ((size_t) 3, result.size());
293    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
294    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
295    EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
296}
297
298// Hard hyphens provide a breaking opportunity with nothing extra inserted.
299TEST_F(HyphenatorTest, hardHyphen) {
300    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
301    const uint16_t word[] = {'x', HYPHEN, 'y'};
302    std::vector<HyphenationType> result;
303    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
304    EXPECT_EQ((size_t) 3, result.size());
305    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
306    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
307    EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
308}
309
310// Hyphen-minuses also provide a breaking opportunity with nothing extra inserted.
311TEST_F(HyphenatorTest, hyphenMinus) {
312    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
313    const uint16_t word[] = {'x', HYPHEN_MINUS, 'y'};
314    std::vector<HyphenationType> result;
315    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
316    EXPECT_EQ((size_t) 3, result.size());
317    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
318    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
319    EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
320}
321
322// If the word starts with a hard hyphen or hyphen-minus, it doesn't make sense to break
323// it at that point.
324TEST_F(HyphenatorTest, startingHyphenMinus) {
325    Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr);
326    const uint16_t word[] = {HYPHEN_MINUS, 'y'};
327    std::vector<HyphenationType> result;
328    hyphenator->hyphenate(&result, word, NELEM(word), usLocale);
329    EXPECT_EQ((size_t) 2, result.size());
330    EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
331    EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
332}
333
334}  // namespace minikin
335
336