WordBreakerTests.cpp revision 56840e8006ca2b822adb401fc8a65f3c075cde10
157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/* 257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project 357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License. 657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at 757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software 1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and 1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License. 1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */ 1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <gtest/gtest.h> 1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "ICUTestBase.h" 1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "UnicodeUtils.h" 2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <minikin/WordBreaker.h> 2157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/locid.h> 2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uclean.h> 2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/udata.h> 2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin" 2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h> 2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#ifndef NELEM 2957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define NELEM(x) ((sizeof(x) / sizeof((x)[0]))) 3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#endif 3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint) 3356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien 3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienusing namespace android; 3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levientypedef ICUTestBase WordBreakerTest; 3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3857b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, basic) { 3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.next()); // after "hello " 4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.wordStart()); // "hello" 4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(5, breaker.wordEnd()); 47c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.current()); 4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.wordStart()); // "world" 5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(11, breaker.wordEnd()); 52c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(11, breaker.current()); 5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 5657b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, softHyphen) { 5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 5957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 6057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 6157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 6257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.next()); // after "hel{SOFT HYPHEN}lo " 6357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo" 6457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.wordEnd()); 65c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 6657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 6757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.wordStart()); // "world" 6857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(12, breaker.wordEnd()); 69c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 7057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 7157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 72d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph LevienTEST_F(WordBreakerTest, zwjEmojiSequences) { 73d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien uint16_t buf[] = { 74d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien // man + zwj + heart + zwj + man 7556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468), 7656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // woman + zwj + heart + zwj + kiss mark + zwj + woman 7756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469), 78d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien // eye + zwj + left speech bubble 7956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien UTF16(0x1F441), 0x200D, UTF16(0x1F5E8), 80d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien }; 81d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien WordBreaker breaker; 82d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 83d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien breaker.setText(buf, NELEM(buf)); 84d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(0, breaker.current()); 85d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man 86d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(0, breaker.wordStart()); 87d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(7, breaker.wordEnd()); 88d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman 89d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(7, breaker.wordStart()); 90d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(17, breaker.wordEnd()); 91d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 92d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(17, breaker.wordStart()); 93d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(22, breaker.wordEnd()); 94d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien} 95d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien 9656840e8006ca2b822adb401fc8a65f3c075cde10Raph LevienTEST_F(WordBreakerTest, emojiWithModifier) { 9756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien uint16_t buf[] = { 9856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien UTF16(0x1F466), UTF16(0x1F3FB), // boy + type 1-2 fitzpatrick modifier 9956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien 0x270C, 0xFE0F, UTF16(0x1F3FF) // victory hand + emoji style + type 6 fitzpatrick modifier 10056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien }; 10156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien WordBreaker breaker; 10256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 10356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien breaker.setText(buf, NELEM(buf)); 10456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(0, breaker.current()); 10556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(4, breaker.next()); // after man + type 6 fitzpatrick modifier 10656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(0, breaker.wordStart()); 10756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(4, breaker.wordEnd()); 10856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 10956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(4, breaker.wordStart()); 11056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(8, breaker.wordEnd()); 11156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien} 11256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien 11357b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, punct) { 11457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', 11557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien '!', '!'}; 11657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 11757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 11857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 11957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 12057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(9, breaker.next()); // after "¡¡hello, " 12157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(2, breaker.wordStart()); // "hello" 12257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.wordEnd()); 123c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 12457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 12557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(9, breaker.wordStart()); // "world" 12657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(14, breaker.wordEnd()); 127c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 12857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 1299c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, email) { 1319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 1329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien ' ', 'x'}; 1339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1376d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(11, breaker.next()); // after "foo@example" 1386d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 139c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 1406d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(16, breaker.next()); // after ".com " 1419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 142c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(16, breaker.wordStart()); // "x" 1459c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(17, breaker.wordEnd()); 146c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 1489c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, mailto) { 1509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 1519c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'}; 1529c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1539c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1549c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1559c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1566d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(7, breaker.next()); // after "mailto:" 1576d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 158c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 1596d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(18, breaker.next()); // after "foo@example" 1606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 161c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 1626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(23, breaker.next()); // after ".com " 1639c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 164c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1659c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 1669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(23, breaker.wordStart()); // "x" 1679c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(24, breaker.wordEnd()); 168c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1699c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 1709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// The current logic always places a line break after a detected email address or URL 1726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// and an immediately following non-ASCII character. 1739c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailNonAscii) { 1749c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 1759c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 0x4E00}; 1769c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1779c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1789c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1799c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(11, breaker.next()); // after "foo@example" 1816d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 182c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 1836d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(15, breaker.next()); // after ".com" 1849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 185c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 1879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(15, breaker.wordStart()); // "一" 1889c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(16, breaker.wordEnd()); 189c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1909c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 1919c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1929c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailCombining) { 1939c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 1949c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 0x0303, ' ', 'x'}; 1959c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1969c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1989c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1996d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(11, breaker.next()); // after "foo@example" 2006d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 201c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2026d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(17, breaker.next()); // after ".com̃ " 2039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 204c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2059c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 2069c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(17, breaker.wordStart()); // "x" 2079c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(18, breaker.wordEnd()); 208c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2099c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 2109c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 2116d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, lonelyAt) { 2126d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t buf[] = {'a', ' ', '@', ' ', 'b'}; 2136d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien WordBreaker breaker; 2146d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 2156d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setText(buf, NELEM(buf)); 2166d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.current()); 2176d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(2, breaker.next()); // after "a " 2186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.wordStart()); // "a" 2196d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(1, breaker.wordEnd()); 220c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(4, breaker.next()); // after "@ " 2226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 223c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 2256d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(4, breaker.wordStart()); // "b" 2266d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(5, breaker.wordEnd()); 227c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2286d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 2296d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 2309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, url) { 2319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 2329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien '.', 'c', 'o', 'm', ' ', 'x'}; 2339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 2349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 2359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 2369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 2376d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(5, breaker.next()); // after "http:" 2386d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 239c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2406d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(7, breaker.next()); // after "//" 2416d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 242c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2436d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(14, breaker.next()); // after "example" 2446d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 245c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2466d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(19, breaker.next()); // after ".com " 2479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 248c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2499c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 2509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(19, breaker.wordStart()); // "x" 2519c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(20, breaker.wordEnd()); 252c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2539c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 2546d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 2556d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks* 2566d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlBreakChars) { 2576d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd', 2586d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'}; 2596d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien WordBreaker breaker; 2606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 2616d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setText(buf, NELEM(buf)); 2626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.current()); 2636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(5, breaker.next()); // after "http:" 2646d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 265c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2666d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(7, breaker.next()); // after "//" 2676d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 268c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(8, breaker.next()); // after "a" 2706d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 271c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(10, breaker.next()); // after ".b" 2736d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 274c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(11, breaker.next()); // after "/" 2766d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 277c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(13, breaker.next()); // after "~c" 2796d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 280c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2816d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(15, breaker.next()); // after ",d" 2826d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 283c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2846d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(17, breaker.next()); // after "-e" 2856d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 286c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2876d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(19, breaker.next()); // after "?f" 2886d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 289c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2906d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(20, breaker.next()); // after "=" 2916d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 292c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2936d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(21, breaker.next()); // after "g" 2946d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 295c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(22, breaker.next()); // after "&" 2976d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 298c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2996d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(23, breaker.next()); // after "h" 3006d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 301c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3026d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(25, breaker.next()); // after "#i" 3036d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 304c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3056d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(27, breaker.next()); // after "%j" 3066d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 307c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3086d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(29, breaker.next()); // after "_k" 3096d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 310c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3116d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 3126d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 313c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 3146d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 3156d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 3166d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlNoHyphenBreak) { 3176d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'}; 3186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien WordBreaker breaker; 3196d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 3206d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setText(buf, NELEM(buf)); 3216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.current()); 3226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(5, breaker.next()); // after "http:" 3236d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(7, breaker.next()); // after "//" 3256d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3266d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(8, breaker.next()); // after "a" 3276d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3286d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 3296d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3306d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 3316d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 3326d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlEndsWithSlash) { 3336d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'}; 3346d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien WordBreaker breaker; 3356d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 3366d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setText(buf, NELEM(buf)); 3376d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.current()); 3386d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(5, breaker.next()); // after "http:" 3396d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3406d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(7, breaker.next()); // after "//" 3416d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3426d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(8, breaker.next()); // after "a" 3436d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3446d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 3456d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3466d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 3476d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 3486d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, emailStartsWithSlash) { 3496d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t buf[] = {'/', 'a', '@', 'b'}; 3506d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien WordBreaker breaker; 3516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 3526d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setText(buf, NELEM(buf)); 3536d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.current()); 3546d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 3556d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 3566d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 357