HyphenatorTest.cpp revision c7ef4000c1e840c3d3b66e85a40ebd34a5a2a8ee
1/* 2 * Copyright (C) 2017 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <gtest/gtest.h> 18 19#include "ICUTestBase.h" 20#include <minikin/Hyphenator.h> 21#include <FileUtils.h> 22 23#ifndef NELEM 24#define NELEM(x) ((sizeof(x) / sizeof((x)[0]))) 25#endif 26 27namespace minikin { 28 29const char* usHyph = "/system/usr/hyphen-data/hyph-en-us.hyb"; 30const char* malayalamHyph = "/system/usr/hyphen-data/hyph-ml.hyb"; 31 32typedef ICUTestBase HyphenatorTest; 33 34const icu::Locale catalanLocale("ca", "ES", nullptr, nullptr); 35const icu::Locale polishLocale("pl", "PL", nullptr, nullptr); 36const icu::Locale& usLocale = icu::Locale::getUS(); 37 38const uint16_t HYPHEN_MINUS = 0x002D; 39const uint16_t SOFT_HYPHEN = 0x00AD; 40const uint16_t MIDDLE_DOT = 0x00B7; 41const uint16_t GREEK_LOWER_ALPHA = 0x03B1; 42const uint16_t ARMENIAN_AYB = 0x0531; 43const uint16_t HEBREW_ALEF = 0x05D0; 44const uint16_t ARABIC_ALEF = 0x0627; 45const uint16_t ARABIC_BEH = 0x0628; 46const uint16_t ARABIC_ZWARAKAY = 0x0659; 47const uint16_t MALAYALAM_KA = 0x0D15; 48const uint16_t UCAS_E = 0x1401; 49const uint16_t HYPHEN = 0x2010; 50const uint16_t EN_DASH = 0x2013; 51 52// Simple test for US English. This tests "table", which happens to be the in the exceptions list. 53TEST_F(HyphenatorTest, usEnglishAutomaticHyphenation) { 54 Hyphenator* hyphenator = Hyphenator::loadBinary(readWholeFile(usHyph).data()); 55 const uint16_t word[] = {'t', 'a', 'b', 'l', 'e'}; 56 std::vector<HyphenationType> result; 57 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 58 EXPECT_EQ((size_t) 5, result.size()); 59 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 60 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 61 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 62 EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]); 63 EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]); 64} 65 66// Catalan l·l should break as l-/l 67TEST_F(HyphenatorTest, catalanMiddleDot) { 68 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 69 const uint16_t word[] = {'l', 'l', MIDDLE_DOT, 'l', 'l', 'l'}; 70 std::vector<HyphenationType> result; 71 hyphenator->hyphenate(&result, word, NELEM(word), catalanLocale); 72 EXPECT_EQ((size_t) 6, result.size()); 73 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 74 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 75 EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]); 76 EXPECT_EQ(HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN, result[3]); 77 EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]); 78 EXPECT_EQ(HyphenationType::DONT_BREAK, result[5]); 79} 80 81// Catalan l·l should not break if the word is too short. 82TEST_F(HyphenatorTest, catalanMiddleDotShortWord) { 83 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 84 const uint16_t word[] = {'l', MIDDLE_DOT, 'l'}; 85 std::vector<HyphenationType> result; 86 hyphenator->hyphenate(&result, word, NELEM(word), catalanLocale); 87 EXPECT_EQ((size_t) 3, result.size()); 88 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 89 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 90 EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]); 91} 92 93// If we break on a hyphen in Polish, the hyphen should be repeated on the next line. 94TEST_F(HyphenatorTest, polishHyphen) { 95 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 96 const uint16_t word[] = {'x', HYPHEN, 'y'}; 97 std::vector<HyphenationType> result; 98 hyphenator->hyphenate(&result, word, NELEM(word), polishLocale); 99 EXPECT_EQ((size_t) 3, result.size()); 100 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 101 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 102 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]); 103} 104 105// If the language is Polish but the script is not Latin, don't use Polish rules for hyphenation. 106TEST_F(HyphenatorTest, polishHyphenButNonLatinWord) { 107 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 108 const uint16_t word[] = {GREEK_LOWER_ALPHA, HYPHEN, GREEK_LOWER_ALPHA}; 109 std::vector<HyphenationType> result; 110 hyphenator->hyphenate(&result, word, NELEM(word), polishLocale); 111 EXPECT_EQ((size_t) 3, result.size()); 112 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 113 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 114 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 115} 116 117// Polish en dash doesn't repeat on next line (as far as we know), but just provides a break 118// opportunity. 119TEST_F(HyphenatorTest, polishEnDash) { 120 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 121 const uint16_t word[] = {'x', EN_DASH, 'y'}; 122 std::vector<HyphenationType> result; 123 hyphenator->hyphenate(&result, word, NELEM(word), polishLocale); 124 EXPECT_EQ((size_t) 3, result.size()); 125 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 126 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 127 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 128} 129 130// In Latin script text, soft hyphens should insert a visible hyphen if broken at. 131TEST_F(HyphenatorTest, latinSoftHyphen) { 132 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 133 const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'}; 134 std::vector<HyphenationType> result; 135 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 136 EXPECT_EQ((size_t) 3, result.size()); 137 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 138 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 139 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 140} 141 142// Soft hyphens at the beginning of a word are not useful in linebreaking. 143TEST_F(HyphenatorTest, latinSoftHyphenStartingTheWord) { 144 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 145 const uint16_t word[] = {SOFT_HYPHEN, 'y'}; 146 std::vector<HyphenationType> result; 147 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 148 EXPECT_EQ((size_t) 2, result.size()); 149 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 150 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 151} 152 153// In Malayalam script text, soft hyphens should not insert a visible hyphen if broken at. 154TEST_F(HyphenatorTest, malayalamSoftHyphen) { 155 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 156 const uint16_t word[] = {MALAYALAM_KA, SOFT_HYPHEN, MALAYALAM_KA}; 157 std::vector<HyphenationType> result; 158 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 159 EXPECT_EQ((size_t) 3, result.size()); 160 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 161 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 162 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 163} 164 165// In automatically hyphenated Malayalam script text, we should not insert a visible hyphen. 166TEST_F(HyphenatorTest, malayalamAutomaticHyphenation) { 167 Hyphenator* hyphenator = Hyphenator::loadBinary(readWholeFile(malayalamHyph).data()); 168 const uint16_t word[] = { 169 MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA}; 170 std::vector<HyphenationType> result; 171 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 172 EXPECT_EQ((size_t) 5, result.size()); 173 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 174 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 175 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 176 EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]); 177 EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]); 178} 179 180// In Armenian script text, soft hyphens should insert an Armenian hyphen if broken at. 181TEST_F(HyphenatorTest, aremenianSoftHyphen) { 182 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 183 const uint16_t word[] = {ARMENIAN_AYB, SOFT_HYPHEN, ARMENIAN_AYB}; 184 std::vector<HyphenationType> result; 185 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 186 EXPECT_EQ((size_t) 3, result.size()); 187 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 188 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 189 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN, result[2]); 190} 191 192// In Hebrew script text, soft hyphens should insert a normal hyphen if broken at, for now. 193// We may need to change this to maqaf later. 194TEST_F(HyphenatorTest, hebrewSoftHyphen) { 195 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 196 const uint16_t word[] = {HEBREW_ALEF, SOFT_HYPHEN, HEBREW_ALEF}; 197 std::vector<HyphenationType> result; 198 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 199 EXPECT_EQ((size_t) 3, result.size()); 200 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 201 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 202 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 203} 204 205// Soft hyphen between two Arabic letters that join should keep the joining 206// behavior when broken across lines. 207TEST_F(HyphenatorTest, arabicSoftHyphenConnecting) { 208 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 209 const uint16_t word[] = {ARABIC_BEH, SOFT_HYPHEN, ARABIC_BEH}; 210 std::vector<HyphenationType> result; 211 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 212 EXPECT_EQ((size_t) 3, result.size()); 213 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 214 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 215 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[2]); 216} 217 218// Arabic letters may be joining on one side, but if it's the wrong side, we 219// should use the normal hyphen. 220TEST_F(HyphenatorTest, arabicSoftHyphenNonConnecting) { 221 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 222 const uint16_t word[] = {ARABIC_ALEF, SOFT_HYPHEN, ARABIC_BEH}; 223 std::vector<HyphenationType> result; 224 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 225 EXPECT_EQ((size_t) 3, result.size()); 226 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 227 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 228 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 229} 230 231// Skip transparent characters until you find a non-transparent one. 232TEST_F(HyphenatorTest, arabicSoftHyphenSkipTransparents) { 233 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 234 const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH}; 235 std::vector<HyphenationType> result; 236 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 237 EXPECT_EQ((size_t) 5, result.size()); 238 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 239 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 240 EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]); 241 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[3]); 242 EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]); 243} 244 245// Skip transparent characters until you find a non-transparent one. If we get to one end without 246// finding anything, we are still non-joining. 247TEST_F(HyphenatorTest, arabicSoftHyphenTransparentsAtEnd) { 248 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 249 const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY}; 250 std::vector<HyphenationType> result; 251 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 252 EXPECT_EQ((size_t) 4, result.size()); 253 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 254 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 255 EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]); 256 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[3]); 257} 258 259// Skip transparent characters until you find a non-transparent one. If we get to one end without 260// finding anything, we are still non-joining. 261TEST_F(HyphenatorTest, arabicSoftHyphenTransparentsAtStart) { 262 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 263 const uint16_t word[] = {ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH}; 264 std::vector<HyphenationType> result; 265 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 266 EXPECT_EQ((size_t) 4, result.size()); 267 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 268 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 269 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 270 EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]); 271} 272 273// In Unified Canadian Aboriginal script (UCAS) text, soft hyphens should insert a UCAS hyphen. 274TEST_F(HyphenatorTest, ucasSoftHyphen) { 275 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 276 const uint16_t word[] = {UCAS_E, SOFT_HYPHEN, UCAS_E}; 277 std::vector<HyphenationType> result; 278 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 279 EXPECT_EQ((size_t) 3, result.size()); 280 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 281 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 282 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]); 283} 284 285// Presently, soft hyphen looks at the character after it to determine hyphenation type. This is a 286// little arbitrary, but let's test it anyway. 287TEST_F(HyphenatorTest, mixedScriptSoftHyphen) { 288 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 289 const uint16_t word[] = {'a', SOFT_HYPHEN, UCAS_E}; 290 std::vector<HyphenationType> result; 291 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 292 EXPECT_EQ((size_t) 3, result.size()); 293 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 294 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 295 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]); 296} 297 298// Hard hyphens provide a breaking opportunity with nothing extra inserted. 299TEST_F(HyphenatorTest, hardHyphen) { 300 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 301 const uint16_t word[] = {'x', HYPHEN, 'y'}; 302 std::vector<HyphenationType> result; 303 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 304 EXPECT_EQ((size_t) 3, result.size()); 305 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 306 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 307 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 308} 309 310// Hyphen-minuses also provide a breaking opportunity with nothing extra inserted. 311TEST_F(HyphenatorTest, hyphenMinus) { 312 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 313 const uint16_t word[] = {'x', HYPHEN_MINUS, 'y'}; 314 std::vector<HyphenationType> result; 315 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 316 EXPECT_EQ((size_t) 3, result.size()); 317 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 318 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 319 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 320} 321 322// If the word starts with a hard hyphen or hyphen-minus, it doesn't make sense to break 323// it at that point. 324TEST_F(HyphenatorTest, startingHyphenMinus) { 325 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr); 326 const uint16_t word[] = {HYPHEN_MINUS, 'y'}; 327 std::vector<HyphenationType> result; 328 hyphenator->hyphenate(&result, word, NELEM(word), usLocale); 329 EXPECT_EQ((size_t) 2, result.size()); 330 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 331 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 332} 333 334} // namespace minikin 335 336