1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define LOG_TAG "Minikin" 18 19#include <android/log.h> 20#include <gtest/gtest.h> 21 22#include "ICUTestBase.h" 23#include "UnicodeUtils.h" 24#include <minikin/WordBreaker.h> 25#include <unicode/locid.h> 26#include <unicode/uclean.h> 27#include <unicode/udata.h> 28 29#ifndef NELEM 30#define NELEM(x) ((sizeof(x) / sizeof((x)[0]))) 31#endif 32 33#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint) 34 35namespace minikin { 36 37typedef ICUTestBase WordBreakerTest; 38 39TEST_F(WordBreakerTest, basic) { 40 uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 41 WordBreaker breaker; 42 breaker.setLocale(icu::Locale::getUS()); 43 breaker.setText(buf, NELEM(buf)); 44 EXPECT_EQ(0, breaker.current()); 45 EXPECT_EQ(6, breaker.next()); // after "hello " 46 EXPECT_EQ(0, breaker.wordStart()); // "hello" 47 EXPECT_EQ(5, breaker.wordEnd()); 48 EXPECT_EQ(0, breaker.breakBadness()); 49 EXPECT_EQ(6, breaker.current()); 50 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 51 EXPECT_EQ(6, breaker.wordStart()); // "world" 52 EXPECT_EQ(11, breaker.wordEnd()); 53 EXPECT_EQ(0, breaker.breakBadness()); 54 EXPECT_EQ(11, breaker.current()); 55} 56 57TEST_F(WordBreakerTest, softHyphen) { 58 uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 59 WordBreaker breaker; 60 breaker.setLocale(icu::Locale::getUS()); 61 breaker.setText(buf, NELEM(buf)); 62 EXPECT_EQ(0, breaker.current()); 63 EXPECT_EQ(7, breaker.next()); // after "hel{SOFT HYPHEN}lo " 64 EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo" 65 EXPECT_EQ(6, breaker.wordEnd()); 66 EXPECT_EQ(0, breaker.breakBadness()); 67 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 68 EXPECT_EQ(7, breaker.wordStart()); // "world" 69 EXPECT_EQ(12, breaker.wordEnd()); 70 EXPECT_EQ(0, breaker.breakBadness()); 71} 72 73TEST_F(WordBreakerTest, hardHyphen) { 74 // Hyphens should not allow breaks anymore. 75 uint16_t buf[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'}; 76 WordBreaker breaker; 77 breaker.setLocale(icu::Locale::getUS()); 78 breaker.setText(buf, NELEM(buf)); 79 EXPECT_EQ(0, breaker.current()); 80 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); 81 EXPECT_EQ(0, breaker.wordStart()); 82 EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); 83 EXPECT_EQ(0, breaker.breakBadness()); 84} 85 86TEST_F(WordBreakerTest, postfixAndPrefix) { 87 uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US¢ JP¥ 88 WordBreaker breaker; 89 breaker.setLocale(icu::Locale::getUS()); 90 breaker.setText(buf, NELEM(buf)); 91 EXPECT_EQ(0, breaker.current()); 92 93 EXPECT_EQ(4, breaker.next()); // after CENT SIGN 94 EXPECT_EQ(0, breaker.wordStart()); // "US¢" 95 EXPECT_EQ(3, breaker.wordEnd()); 96 97 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of string 98 EXPECT_EQ(4, breaker.wordStart()); // "JP¥" 99 EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); 100} 101 102TEST_F(WordBreakerTest, myanmarKinzi) { 103 uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C}; // NGA, ASAT, VIRAMA, KA, UU 104 WordBreaker breaker; 105 icu::Locale burmese("my"); 106 breaker.setLocale(burmese); 107 breaker.setText(buf, NELEM(buf)); 108 EXPECT_EQ(0, breaker.current()); 109 110 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of string 111 EXPECT_EQ(0, breaker.wordStart()); 112 EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); 113} 114 115TEST_F(WordBreakerTest, zwjEmojiSequences) { 116 uint16_t buf[] = { 117 // man + zwj + heart + zwj + man 118 UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468), 119 // woman + zwj + heart + zwj + kiss mark + zwj + woman 120 UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469), 121 // eye + zwj + left speech bubble 122 UTF16(0x1F441), 0x200D, UTF16(0x1F5E8), 123 // CAT FACE + zwj + BUST IN SILHOUETTE 124 UTF16(0x1F431), 0x200D, UTF16(0x1F464), 125 }; 126 WordBreaker breaker; 127 breaker.setLocale(icu::Locale::getUS()); 128 breaker.setText(buf, NELEM(buf)); 129 EXPECT_EQ(0, breaker.current()); 130 EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man 131 EXPECT_EQ(0, breaker.wordStart()); 132 EXPECT_EQ(7, breaker.wordEnd()); 133 EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman 134 EXPECT_EQ(7, breaker.wordStart()); 135 EXPECT_EQ(17, breaker.wordEnd()); 136 EXPECT_EQ(22, breaker.next()); // after eye + zwj + left speech bubble 137 EXPECT_EQ(17, breaker.wordStart()); 138 EXPECT_EQ(22, breaker.wordEnd()); 139 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 140 EXPECT_EQ(22, breaker.wordStart()); 141 EXPECT_EQ(27, breaker.wordEnd()); 142} 143 144TEST_F(WordBreakerTest, emojiWithModifier) { 145 uint16_t buf[] = { 146 UTF16(0x1F466), UTF16(0x1F3FB), // boy + type 1-2 fitzpatrick modifier 147 0x270C, 0xFE0F, UTF16(0x1F3FF) // victory hand + emoji style + type 6 fitzpatrick modifier 148 }; 149 WordBreaker breaker; 150 breaker.setLocale(icu::Locale::getUS()); 151 breaker.setText(buf, NELEM(buf)); 152 EXPECT_EQ(0, breaker.current()); 153 EXPECT_EQ(4, breaker.next()); // after boy + type 1-2 fitzpatrick modifier 154 EXPECT_EQ(0, breaker.wordStart()); 155 EXPECT_EQ(4, breaker.wordEnd()); 156 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 157 EXPECT_EQ(4, breaker.wordStart()); 158 EXPECT_EQ(8, breaker.wordEnd()); 159} 160 161TEST_F(WordBreakerTest, unicode10Emoji) { 162 // Should break between emojis. 163 uint16_t buf[] = { 164 // SLED + SLED 165 UTF16(0x1F6F7), UTF16(0x1F6F7), 166 // SLED + VS15 + SLED 167 UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7), 168 // WHITE SMILING FACE + SLED 169 0x263A, UTF16(0x1F6F7), 170 // WHITE SMILING FACE + VS16 + SLED 171 0x263A, 0xFE0F, UTF16(0x1F6F7), 172 }; 173 WordBreaker breaker; 174 breaker.setLocale(icu::Locale::getEnglish()); 175 breaker.setText(buf, NELEM(buf)); 176 EXPECT_EQ(0, breaker.current()); 177 EXPECT_EQ(2, breaker.next()); 178 EXPECT_EQ(0, breaker.wordStart()); 179 EXPECT_EQ(2, breaker.wordEnd()); 180 181 EXPECT_EQ(4, breaker.next()); 182 EXPECT_EQ(2, breaker.wordStart()); 183 EXPECT_EQ(4, breaker.wordEnd()); 184 185 EXPECT_EQ(7, breaker.next()); 186 EXPECT_EQ(4, breaker.wordStart()); 187 EXPECT_EQ(7, breaker.wordEnd()); 188 189 EXPECT_EQ(9, breaker.next()); 190 EXPECT_EQ(7, breaker.wordStart()); 191 EXPECT_EQ(9, breaker.wordEnd()); 192 193 EXPECT_EQ(10, breaker.next()); 194 EXPECT_EQ(9, breaker.wordStart()); 195 EXPECT_EQ(10, breaker.wordEnd()); 196 197 EXPECT_EQ(12, breaker.next()); 198 EXPECT_EQ(10, breaker.wordStart()); 199 EXPECT_EQ(12, breaker.wordEnd()); 200 201 EXPECT_EQ(14, breaker.next()); 202 EXPECT_EQ(12, breaker.wordStart()); 203 EXPECT_EQ(14, breaker.wordEnd()); 204 205 EXPECT_EQ(16, breaker.next()); 206 EXPECT_EQ(14, breaker.wordStart()); 207 EXPECT_EQ(16, breaker.wordEnd()); 208} 209 210TEST_F(WordBreakerTest, flagsSequenceSingleFlag) { 211 const std::string kFlag = "U+1F3F4"; 212 const std::string flags = kFlag + " " + kFlag; 213 214 const int kFlagLength = 2; 215 const size_t BUF_SIZE = kFlagLength * 2; 216 217 uint16_t buf[BUF_SIZE]; 218 size_t size; 219 ParseUnicode(buf, BUF_SIZE, flags.c_str(), &size, nullptr); 220 221 WordBreaker breaker; 222 breaker.setLocale(icu::Locale::getUS()); 223 breaker.setText(buf, size); 224 EXPECT_EQ(0, breaker.current()); 225 EXPECT_EQ(kFlagLength, breaker.next()); // end of the first flag 226 EXPECT_EQ(0, breaker.wordStart()); 227 EXPECT_EQ(kFlagLength, breaker.wordEnd()); 228 EXPECT_EQ(static_cast<ssize_t>(size), breaker.next()); 229 EXPECT_EQ(kFlagLength, breaker.wordStart()); 230 EXPECT_EQ(kFlagLength * 2, breaker.wordEnd()); 231} 232 233TEST_F(WordBreakerTest, flagsSequence) { 234 // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag 235 // of Scotland. 236 const std::string kFlagSequence = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F"; 237 const std::string flagSequence = kFlagSequence + " " + kFlagSequence; 238 239 const int kFlagLength = 14; 240 const size_t BUF_SIZE = kFlagLength * 2; 241 242 uint16_t buf[BUF_SIZE]; 243 size_t size; 244 ParseUnicode(buf, BUF_SIZE, flagSequence.c_str(), &size, nullptr); 245 246 WordBreaker breaker; 247 breaker.setLocale(icu::Locale::getUS()); 248 breaker.setText(buf, size); 249 EXPECT_EQ(0, breaker.current()); 250 EXPECT_EQ(kFlagLength, breaker.next()); // end of the first flag sequence 251 EXPECT_EQ(0, breaker.wordStart()); 252 EXPECT_EQ(kFlagLength, breaker.wordEnd()); 253 EXPECT_EQ(static_cast<ssize_t>(size), breaker.next()); 254 EXPECT_EQ(kFlagLength, breaker.wordStart()); 255 EXPECT_EQ(kFlagLength * 2, breaker.wordEnd()); 256} 257 258TEST_F(WordBreakerTest, punct) { 259 uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', 260 '!', '!'}; 261 WordBreaker breaker; 262 breaker.setLocale(icu::Locale::getUS()); 263 breaker.setText(buf, NELEM(buf)); 264 EXPECT_EQ(0, breaker.current()); 265 EXPECT_EQ(9, breaker.next()); // after "¡¡hello, " 266 EXPECT_EQ(2, breaker.wordStart()); // "hello" 267 EXPECT_EQ(7, breaker.wordEnd()); 268 EXPECT_EQ(0, breaker.breakBadness()); 269 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 270 EXPECT_EQ(9, breaker.wordStart()); // "world" 271 EXPECT_EQ(14, breaker.wordEnd()); 272 EXPECT_EQ(0, breaker.breakBadness()); 273} 274 275TEST_F(WordBreakerTest, email) { 276 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 277 ' ', 'x'}; 278 WordBreaker breaker; 279 breaker.setLocale(icu::Locale::getUS()); 280 breaker.setText(buf, NELEM(buf)); 281 EXPECT_EQ(0, breaker.current()); 282 EXPECT_EQ(11, breaker.next()); // after "foo@example" 283 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 284 EXPECT_EQ(1, breaker.breakBadness()); 285 EXPECT_EQ(16, breaker.next()); // after ".com " 286 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 287 EXPECT_EQ(0, breaker.breakBadness()); 288 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 289 EXPECT_EQ(16, breaker.wordStart()); // "x" 290 EXPECT_EQ(17, breaker.wordEnd()); 291 EXPECT_EQ(0, breaker.breakBadness()); 292} 293 294TEST_F(WordBreakerTest, mailto) { 295 uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 296 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'}; 297 WordBreaker breaker; 298 breaker.setLocale(icu::Locale::getUS()); 299 breaker.setText(buf, NELEM(buf)); 300 EXPECT_EQ(0, breaker.current()); 301 EXPECT_EQ(7, breaker.next()); // after "mailto:" 302 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 303 EXPECT_EQ(1, breaker.breakBadness()); 304 EXPECT_EQ(18, breaker.next()); // after "foo@example" 305 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 306 EXPECT_EQ(1, breaker.breakBadness()); 307 EXPECT_EQ(23, breaker.next()); // after ".com " 308 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 309 EXPECT_EQ(0, breaker.breakBadness()); 310 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 311 EXPECT_EQ(23, breaker.wordStart()); // "x" 312 EXPECT_EQ(24, breaker.wordEnd()); 313 EXPECT_EQ(0, breaker.breakBadness()); 314} 315 316// The current logic always places a line break after a detected email address or URL 317// and an immediately following non-ASCII character. 318TEST_F(WordBreakerTest, emailNonAscii) { 319 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 320 0x4E00}; 321 WordBreaker breaker; 322 breaker.setLocale(icu::Locale::getUS()); 323 breaker.setText(buf, NELEM(buf)); 324 EXPECT_EQ(0, breaker.current()); 325 EXPECT_EQ(11, breaker.next()); // after "foo@example" 326 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 327 EXPECT_EQ(1, breaker.breakBadness()); 328 EXPECT_EQ(15, breaker.next()); // after ".com" 329 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 330 EXPECT_EQ(0, breaker.breakBadness()); 331 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 332 EXPECT_EQ(15, breaker.wordStart()); // "一" 333 EXPECT_EQ(16, breaker.wordEnd()); 334 EXPECT_EQ(0, breaker.breakBadness()); 335} 336 337TEST_F(WordBreakerTest, emailCombining) { 338 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 339 0x0303, ' ', 'x'}; 340 WordBreaker breaker; 341 breaker.setLocale(icu::Locale::getUS()); 342 breaker.setText(buf, NELEM(buf)); 343 EXPECT_EQ(0, breaker.current()); 344 EXPECT_EQ(11, breaker.next()); // after "foo@example" 345 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 346 EXPECT_EQ(1, breaker.breakBadness()); 347 EXPECT_EQ(17, breaker.next()); // after ".com̃ " 348 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 349 EXPECT_EQ(0, breaker.breakBadness()); 350 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 351 EXPECT_EQ(17, breaker.wordStart()); // "x" 352 EXPECT_EQ(18, breaker.wordEnd()); 353 EXPECT_EQ(0, breaker.breakBadness()); 354} 355 356TEST_F(WordBreakerTest, lonelyAt) { 357 uint16_t buf[] = {'a', ' ', '@', ' ', 'b'}; 358 WordBreaker breaker; 359 breaker.setLocale(icu::Locale::getUS()); 360 breaker.setText(buf, NELEM(buf)); 361 EXPECT_EQ(0, breaker.current()); 362 EXPECT_EQ(2, breaker.next()); // after "a " 363 EXPECT_EQ(0, breaker.wordStart()); // "a" 364 EXPECT_EQ(1, breaker.wordEnd()); 365 EXPECT_EQ(0, breaker.breakBadness()); 366 EXPECT_EQ(4, breaker.next()); // after "@ " 367 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 368 EXPECT_EQ(0, breaker.breakBadness()); 369 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 370 EXPECT_EQ(4, breaker.wordStart()); // "b" 371 EXPECT_EQ(5, breaker.wordEnd()); 372 EXPECT_EQ(0, breaker.breakBadness()); 373} 374 375TEST_F(WordBreakerTest, url) { 376 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 377 '.', 'c', 'o', 'm', ' ', 'x'}; 378 WordBreaker breaker; 379 breaker.setLocale(icu::Locale::getUS()); 380 breaker.setText(buf, NELEM(buf)); 381 EXPECT_EQ(0, breaker.current()); 382 EXPECT_EQ(5, breaker.next()); // after "http:" 383 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 384 EXPECT_EQ(1, breaker.breakBadness()); 385 EXPECT_EQ(7, breaker.next()); // after "//" 386 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 387 EXPECT_EQ(1, breaker.breakBadness()); 388 EXPECT_EQ(14, breaker.next()); // after "example" 389 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 390 EXPECT_EQ(1, breaker.breakBadness()); 391 EXPECT_EQ(19, breaker.next()); // after ".com " 392 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 393 EXPECT_EQ(0, breaker.breakBadness()); 394 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 395 EXPECT_EQ(19, breaker.wordStart()); // "x" 396 EXPECT_EQ(20, breaker.wordEnd()); 397 EXPECT_EQ(0, breaker.breakBadness()); 398} 399 400// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks* 401TEST_F(WordBreakerTest, urlBreakChars) { 402 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd', 403 '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'}; 404 WordBreaker breaker; 405 breaker.setLocale(icu::Locale::getUS()); 406 breaker.setText(buf, NELEM(buf)); 407 EXPECT_EQ(0, breaker.current()); 408 EXPECT_EQ(5, breaker.next()); // after "http:" 409 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 410 EXPECT_EQ(1, breaker.breakBadness()); 411 EXPECT_EQ(7, breaker.next()); // after "//" 412 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 413 EXPECT_EQ(1, breaker.breakBadness()); 414 EXPECT_EQ(8, breaker.next()); // after "a" 415 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 416 EXPECT_EQ(1, breaker.breakBadness()); 417 EXPECT_EQ(10, breaker.next()); // after ".b" 418 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 419 EXPECT_EQ(1, breaker.breakBadness()); 420 EXPECT_EQ(11, breaker.next()); // after "/" 421 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 422 EXPECT_EQ(1, breaker.breakBadness()); 423 EXPECT_EQ(13, breaker.next()); // after "~c" 424 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 425 EXPECT_EQ(1, breaker.breakBadness()); 426 EXPECT_EQ(15, breaker.next()); // after ",d" 427 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 428 EXPECT_EQ(1, breaker.breakBadness()); 429 EXPECT_EQ(17, breaker.next()); // after "-e" 430 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 431 EXPECT_EQ(1, breaker.breakBadness()); 432 EXPECT_EQ(19, breaker.next()); // after "?f" 433 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 434 EXPECT_EQ(1, breaker.breakBadness()); 435 EXPECT_EQ(20, breaker.next()); // after "=" 436 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 437 EXPECT_EQ(1, breaker.breakBadness()); 438 EXPECT_EQ(21, breaker.next()); // after "g" 439 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 440 EXPECT_EQ(1, breaker.breakBadness()); 441 EXPECT_EQ(22, breaker.next()); // after "&" 442 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 443 EXPECT_EQ(1, breaker.breakBadness()); 444 EXPECT_EQ(23, breaker.next()); // after "h" 445 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 446 EXPECT_EQ(1, breaker.breakBadness()); 447 EXPECT_EQ(25, breaker.next()); // after "#i" 448 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 449 EXPECT_EQ(1, breaker.breakBadness()); 450 EXPECT_EQ(27, breaker.next()); // after "%j" 451 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 452 EXPECT_EQ(1, breaker.breakBadness()); 453 EXPECT_EQ(29, breaker.next()); // after "_k" 454 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 455 EXPECT_EQ(1, breaker.breakBadness()); 456 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 457 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 458 EXPECT_EQ(0, breaker.breakBadness()); 459} 460 461TEST_F(WordBreakerTest, urlNoHyphenBreak) { 462 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'}; 463 WordBreaker breaker; 464 breaker.setLocale(icu::Locale::getUS()); 465 breaker.setText(buf, NELEM(buf)); 466 EXPECT_EQ(0, breaker.current()); 467 EXPECT_EQ(5, breaker.next()); // after "http:" 468 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 469 EXPECT_EQ(7, breaker.next()); // after "//" 470 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 471 EXPECT_EQ(8, breaker.next()); // after "a" 472 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 473 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 474 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 475} 476 477TEST_F(WordBreakerTest, urlEndsWithSlash) { 478 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'}; 479 WordBreaker breaker; 480 breaker.setLocale(icu::Locale::getUS()); 481 breaker.setText(buf, NELEM(buf)); 482 EXPECT_EQ(0, breaker.current()); 483 EXPECT_EQ(5, breaker.next()); // after "http:" 484 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 485 EXPECT_EQ(7, breaker.next()); // after "//" 486 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 487 EXPECT_EQ(8, breaker.next()); // after "a" 488 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 489 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 490 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 491} 492 493TEST_F(WordBreakerTest, emailStartsWithSlash) { 494 uint16_t buf[] = {'/', 'a', '@', 'b'}; 495 WordBreaker breaker; 496 breaker.setLocale(icu::Locale::getUS()); 497 breaker.setText(buf, NELEM(buf)); 498 EXPECT_EQ(0, breaker.current()); 499 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 500 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 501} 502 503} // namespace minikin 504