WordBreakerTests.cpp revision 74b56175e5d41c1c1dc992208842b5576973d452
157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/* 257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project 357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License. 657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at 757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * 1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software 1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and 1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License. 
1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */ 1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <gtest/gtest.h> 1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "ICUTestBase.h" 1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "UnicodeUtils.h" 2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <minikin/WordBreaker.h> 2157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/locid.h> 2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uclean.h> 2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/udata.h> 2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin" 2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h> 2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#ifndef NELEM 2957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define NELEM(x) ((sizeof(x) / sizeof((x)[0]))) 3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#endif 3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint) 3356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien 3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienusing namespace android; 3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levientypedef ICUTestBase WordBreakerTest; 3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 3857b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, basic) { 3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.next()); // after "hello " 4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.wordStart()); // "hello" 4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(5, breaker.wordEnd()); 47c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.current()); 4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.wordStart()); // "world" 5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(11, breaker.wordEnd()); 52c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(11, breaker.current()); 5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 5657b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, softHyphen) { 5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 5957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 6057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 6157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 6257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.next()); // after 
"hel{SOFT HYPHEN}lo " 6357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo" 6457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(6, breaker.wordEnd()); 65c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 6657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 6757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.wordStart()); // "world" 6857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(12, breaker.wordEnd()); 69c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 7057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 7157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 72d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh PournaderTEST_F(WordBreakerTest, postfixAndPrefix) { 73d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US¢ JP¥ 74d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader WordBreaker breaker; 75d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader breaker.setLocale(icu::Locale::getEnglish()); 76d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader breaker.setText(buf, NELEM(buf)); 77d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader EXPECT_EQ(0, breaker.current()); 78d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader 79d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader EXPECT_EQ(4, breaker.next()); // after CENT SIGN 80d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader EXPECT_EQ(0, breaker.wordStart()); // "US¢" 81d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader EXPECT_EQ(3, breaker.wordEnd()); 82d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader 83d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of string 
84d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader EXPECT_EQ(4, breaker.wordStart()); // "JP¥" 85d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); 86d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader} 87d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader 8874b56175e5d41c1c1dc992208842b5576973d452Roozbeh PournaderTEST_F(WordBreakerTest, MyanmarKinzi) { 8974b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C}; // NGA, ASAT, VIRAMA, KA, UU 9074b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader WordBreaker breaker; 9174b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader icu::Locale burmese("my"); 9274b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader breaker.setLocale(burmese); 9374b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader breaker.setText(buf, NELEM(buf)); 9474b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader EXPECT_EQ(0, breaker.current()); 9574b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader 9674b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of string 9774b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader EXPECT_EQ(0, breaker.wordStart()); 9874b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); 9974b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader} 10074b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader 101d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph LevienTEST_F(WordBreakerTest, zwjEmojiSequences) { 102d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien uint16_t buf[] = { 103d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien // man + zwj + heart + zwj + man 10456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468), 10556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien // woman + zwj + heart + zwj + 
kiss mark + zwj + woman 10656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469), 107d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien // eye + zwj + left speech bubble 10856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien UTF16(0x1F441), 0x200D, UTF16(0x1F5E8), 10977f488345316fba46c271fc04bea470819ae1712Seigo Nonaka // CAT FACE + zwj + BUST IN SILHOUETTE 11077f488345316fba46c271fc04bea470819ae1712Seigo Nonaka UTF16(0x1F431), 0x200D, UTF16(0x1F464), 111d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien }; 112d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien WordBreaker breaker; 113d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 114d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien breaker.setText(buf, NELEM(buf)); 115d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(0, breaker.current()); 116d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man 117d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(0, breaker.wordStart()); 118d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(7, breaker.wordEnd()); 119d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman 120d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(7, breaker.wordStart()); 121d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(17, breaker.wordEnd()); 12277f488345316fba46c271fc04bea470819ae1712Seigo Nonaka EXPECT_EQ(22, breaker.next()); // after eye + zwj + left speech bubble 123d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(17, breaker.wordStart()); 124d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien EXPECT_EQ(22, breaker.wordEnd()); 12577f488345316fba46c271fc04bea470819ae1712Seigo Nonaka EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 
12677f488345316fba46c271fc04bea470819ae1712Seigo Nonaka EXPECT_EQ(22, breaker.wordStart()); 12777f488345316fba46c271fc04bea470819ae1712Seigo Nonaka EXPECT_EQ(27, breaker.wordEnd()); 128d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien} 129d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien 13056840e8006ca2b822adb401fc8a65f3c075cde10Raph LevienTEST_F(WordBreakerTest, emojiWithModifier) { 13156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien uint16_t buf[] = { 13256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien UTF16(0x1F466), UTF16(0x1F3FB), // boy + type 1-2 fitzpatrick modifier 13356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien 0x270C, 0xFE0F, UTF16(0x1F3FF) // victory hand + emoji style + type 6 fitzpatrick modifier 13456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien }; 13556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien WordBreaker breaker; 13656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 13756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien breaker.setText(buf, NELEM(buf)); 13856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(0, breaker.current()); 13956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(4, breaker.next()); // after man + type 6 fitzpatrick modifier 14056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(0, breaker.wordStart()); 14156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(4, breaker.wordEnd()); 14256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 14356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(4, breaker.wordStart()); 14456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien EXPECT_EQ(8, breaker.wordEnd()); 14556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien} 14656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien 14757b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, punct) { 14857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien 
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', 14957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien '!', '!'}; 15057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien WordBreaker breaker; 15157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 15257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien breaker.setText(buf, NELEM(buf)); 15357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(0, breaker.current()); 15457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(9, breaker.next()); // after "¡¡hello, " 15557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(2, breaker.wordStart()); // "hello" 15657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(7, breaker.wordEnd()); 157c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 15857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 15957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(9, breaker.wordStart()); // "world" 16057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien EXPECT_EQ(14, breaker.wordEnd()); 161c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 16257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien} 1639c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, email) { 1659c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 1669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien ' ', 'x'}; 1679c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1689c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1699c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 
1709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(11, breaker.next()); // after "foo@example" 1726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 173c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 1746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(16, breaker.next()); // after ".com " 1759c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 176c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1779c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 1789c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(16, breaker.wordStart()); // "x" 1799c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(17, breaker.wordEnd()); 180c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 1829c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 1839c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, mailto) { 1849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 1859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'}; 1869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 1879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 1889c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 1899c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 1906d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(7, breaker.next()); // after 
"mailto:" 1916d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 192c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 1936d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(18, breaker.next()); // after "foo@example" 1946d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 195c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 1966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(23, breaker.next()); // after ".com " 1979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 198c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 1999c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 2009c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(23, breaker.wordStart()); // "x" 2019c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(24, breaker.wordEnd()); 202c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 2049c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 2056d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// The current logic always places a line break after a detected email address or URL 2066d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// and an immediately following non-ASCII character. 
2079c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailNonAscii) { 2089c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 2099c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 0x4E00}; 2109c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 2119c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 2129c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 2139c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 2146d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(11, breaker.next()); // after "foo@example" 2156d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 216c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2176d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(15, breaker.next()); // after ".com" 2189c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 219c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2209c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 2219c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(15, breaker.wordStart()); // "一" 2229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(16, breaker.wordEnd()); 223c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 2259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 2269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailCombining) { 2279c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 
'e', '.', 'c', 'o', 'm', 2289c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 0x0303, ' ', 'x'}; 2299c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 2309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 2319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 2329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 2336d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(11, breaker.next()); // after "foo@example" 2346d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 235c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2366d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(17, breaker.next()); // after ".com̃ " 2379c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 238c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2399c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 2409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(17, breaker.wordStart()); // "x" 2419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(18, breaker.wordEnd()); 242c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 2449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien 2456d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, lonelyAt) { 2466d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t buf[] = {'a', ' ', '@', ' ', 'b'}; 2476d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien WordBreaker breaker; 2486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 2496d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 
breaker.setText(buf, NELEM(buf)); 2506d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.current()); 2516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(2, breaker.next()); // after "a " 2526d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.wordStart()); // "a" 2536d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(1, breaker.wordEnd()); 254c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2556d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(4, breaker.next()); // after "@ " 2566d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 257c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2586d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 2596d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(4, breaker.wordStart()); // "b" 2606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(5, breaker.wordEnd()); 261c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 2636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 2649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, url) { 2659c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 2669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien '.', 'c', 'o', 'm', ' ', 'x'}; 2679c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien WordBreaker breaker; 2689c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setLocale(icu::Locale::getEnglish()); 2699c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien breaker.setText(buf, NELEM(buf)); 2709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(0, breaker.current()); 
2716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(5, breaker.next()); // after "http:" 2726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 273c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(7, breaker.next()); // after "//" 2756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 276c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(14, breaker.next()); // after "example" 2786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 279c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 2806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(19, breaker.next()); // after ".com " 2819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 282c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2839c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 2849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(19, breaker.wordStart()); // "x" 2859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien EXPECT_EQ(20, breaker.wordEnd()); 286c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 2879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien} 2886d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 2896d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks* 2906d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlBreakChars) { 
2916d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd', 2926d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'}; 2936d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien WordBreaker breaker; 2946d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setLocale(icu::Locale::getEnglish()); 2956d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien breaker.setText(buf, NELEM(buf)); 2966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(0, breaker.current()); 2976d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(5, breaker.next()); // after "http:" 2986d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 299c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3006d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(7, breaker.next()); // after "//" 3016d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 302c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3036d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(8, breaker.next()); // after "a" 3046d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 305c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3066d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(10, breaker.next()); // after ".b" 3076d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 308c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3096d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(11, breaker.next()); // after "/" 3106d15657e4a3826d4d47d5358f1dde211484527e9Raph 
Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 311c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3126d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(13, breaker.next()); // after "~c" 3136d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 314c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3156d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(15, breaker.next()); // after ",d" 3166d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 317c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(17, breaker.next()); // after "-e" 3196d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 320c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(19, breaker.next()); // after "?f" 3226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 323c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(20, breaker.next()); // after "=" 3256d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 326c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3276d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(21, breaker.next()); // after "g" 3286d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 329c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 
3306d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(22, breaker.next()); // after "&" 3316d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 332c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3336d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(23, breaker.next()); // after "h" 3346d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 335c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3366d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(25, breaker.next()); // after "#i" 3376d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 338c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3396d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(27, breaker.next()); // after "%j" 3406d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 341c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3426d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ(29, breaker.next()); // after "_k" 3436d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 344c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(1, breaker.breakBadness()); 3456d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 3466d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 347c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien EXPECT_EQ(0, breaker.breakBadness()); 3486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien} 3496d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien 3506d15657e4a3826d4d47d5358f1dde211484527e9Raph 
// URL containing a hyphen directly followed by a slash ("http://a-/b").
// The expected break offsets are: after "http:", after "//", after "a", and
// then only the end of the buffer -- i.e. no break is reported after the '-'
// or after the '/' that follows it.
// After every break the test also requires wordStart() >= wordEnd().
// NOTE(review): presumably this means the breaker reports no usable word
// range inside a URL -- confirm against WordBreaker's contract.
TEST_F(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(5, breaker.next());  // after "http:"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(8, breaker.next());  // after "a"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}

// URL whose last character is a slash ("http://a/").
// The expected break offsets are: after "http:", after "//", after "a", and
// then the end of the buffer; the trailing '/' yields no break of its own
// other than the final end-of-text position.
// After every break the test also requires wordStart() >= wordEnd().
TEST_F(WordBreakerTest, urlEndsWithSlash) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(5, breaker.next());  // after "http:"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(8, breaker.next());  // after "a"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}

// Email-like text that begins with a slash ("/a@b").
// The only break expected is the end of the buffer: the first next() call
// must jump straight from offset 0 to NELEM(buf), with no break after the
// leading '/' or around the '@'.
// The test also requires wordStart() >= wordEnd() at that final break.
TEST_F(WordBreakerTest, emailStartsWithSlash) {
    uint16_t buf[] = {'/', 'a', '@', 'b'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}