157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien/*
257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Copyright (C) 2015 The Android Open Source Project
357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien *
457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Licensed under the Apache License, Version 2.0 (the "License");
557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * you may not use this file except in compliance with the License.
657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * You may obtain a copy of the License at
757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien *
857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien *      http://www.apache.org/licenses/LICENSE-2.0
957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien *
1057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * Unless required by applicable law or agreed to in writing, software
1157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * distributed under the License is distributed on an "AS IS" BASIS,
1257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * See the License for the specific language governing permissions and
1457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien * limitations under the License.
1557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien */
1657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <gtest/gtest.h>
1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "ICUTestBase.h"
1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "UnicodeUtils.h"
2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <minikin/WordBreaker.h>
2157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/locid.h>
2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uclean.h>
2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/udata.h>
2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin"
2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h>
2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#ifndef NELEM
2957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
3057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#endif
3157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
3356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien
3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienusing namespace android;
3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levientypedef ICUTestBase WordBreakerTest;
3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3857b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, basic) {
3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.next());  // after "hello "
4557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.wordStart());  // "hello"
4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(5, breaker.wordEnd());
47c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.current());
4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
5057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.wordStart());  // "world"
5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(11, breaker.wordEnd());
52c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(11, breaker.current());
5457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
5657b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, softHyphen) {
5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
5957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
6057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
6157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
6257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.next());  // after "hel{SOFT HYPHEN}lo "
6357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
6457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.wordEnd());
65c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
6657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
6757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.wordStart());  // "world"
6857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(12, breaker.wordEnd());
69c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
7057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
7157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
72d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh PournaderTEST_F(WordBreakerTest, postfixAndPrefix) {
73d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US¢ JP¥
74d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    WordBreaker breaker;
75d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    breaker.setLocale(icu::Locale::getEnglish());
76d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    breaker.setText(buf, NELEM(buf));
77d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    EXPECT_EQ(0, breaker.current());
78d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader
79d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    EXPECT_EQ(4, breaker.next());  // after CENT SIGN
80d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    EXPECT_EQ(0, breaker.wordStart());  // "US¢"
81d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    EXPECT_EQ(3, breaker.wordEnd());
82d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader
83d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end of string
84d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    EXPECT_EQ(4, breaker.wordStart());  // "JP¥"
85d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader    EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
86d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader}
87d8917c69a9f7b7ca52f7ac850922dab4322113f5Roozbeh Pournader
8874b56175e5d41c1c1dc992208842b5576973d452Roozbeh PournaderTEST_F(WordBreakerTest, MyanmarKinzi) {
8974b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};  // NGA, ASAT, VIRAMA, KA, UU
9074b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    WordBreaker breaker;
9174b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    icu::Locale burmese("my");
9274b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    breaker.setLocale(burmese);
9374b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    breaker.setText(buf, NELEM(buf));
9474b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    EXPECT_EQ(0, breaker.current());
9574b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader
9674b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end of string
9774b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    EXPECT_EQ(0, breaker.wordStart());
9874b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader    EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
9974b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader}
10074b56175e5d41c1c1dc992208842b5576973d452Roozbeh Pournader
101d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph LevienTEST_F(WordBreakerTest, zwjEmojiSequences) {
102d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    uint16_t buf[] = {
103d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        // man + zwj + heart + zwj + man
10456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
10556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        // woman + zwj + heart + zwj + kiss mark + zwj + woman
10656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
107d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        // eye + zwj + left speech bubble
10856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
10977f488345316fba46c271fc04bea470819ae1712Seigo Nonaka        // CAT FACE + zwj + BUST IN SILHOUETTE
11077f488345316fba46c271fc04bea470819ae1712Seigo Nonaka        UTF16(0x1F431), 0x200D, UTF16(0x1F464),
111d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    };
112d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    WordBreaker breaker;
113d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
114d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    breaker.setText(buf, NELEM(buf));
115d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(0, breaker.current());
116d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
117d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(0, breaker.wordStart());
118d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(7, breaker.wordEnd());
119d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + woman
120d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(7, breaker.wordStart());
121d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(17, breaker.wordEnd());
12277f488345316fba46c271fc04bea470819ae1712Seigo Nonaka    EXPECT_EQ(22, breaker.next());  // after eye + zwj + left speech bubble
123d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(17, breaker.wordStart());
124d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(22, breaker.wordEnd());
12577f488345316fba46c271fc04bea470819ae1712Seigo Nonaka    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
12677f488345316fba46c271fc04bea470819ae1712Seigo Nonaka    EXPECT_EQ(22, breaker.wordStart());
12777f488345316fba46c271fc04bea470819ae1712Seigo Nonaka    EXPECT_EQ(27, breaker.wordEnd());
128d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien}
129d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien
13056840e8006ca2b822adb401fc8a65f3c075cde10Raph LevienTEST_F(WordBreakerTest, emojiWithModifier) {
13156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    uint16_t buf[] = {
13256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        UTF16(0x1F466), UTF16(0x1F3FB),  // boy + type 1-2 fitzpatrick modifier
13356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien        0x270C, 0xFE0F, UTF16(0x1F3FF)  // victory hand + emoji style + type 6 fitzpatrick modifier
13456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    };
13556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    WordBreaker breaker;
13656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
13756840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    breaker.setText(buf, NELEM(buf));
13856840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    EXPECT_EQ(0, breaker.current());
13956840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    EXPECT_EQ(4, breaker.next());  // after man + type 6 fitzpatrick modifier
14056840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    EXPECT_EQ(0, breaker.wordStart());
14156840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    EXPECT_EQ(4, breaker.wordEnd());
14256840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
14356840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    EXPECT_EQ(4, breaker.wordStart());
14456840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien    EXPECT_EQ(8, breaker.wordEnd());
14556840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien}
14656840e8006ca2b822adb401fc8a65f3c075cde10Raph Levien
14757b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, punct) {
14857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
14957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        '!', '!'};
15057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
15157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
15257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
15357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
15457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(9, breaker.next());  // after "¡¡hello, "
15557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(2, breaker.wordStart());  // "hello"
15657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.wordEnd());
157c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
15857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
15957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(9, breaker.wordStart());  // "world"
16057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(14, breaker.wordEnd());
161c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
16257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
1639c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, email) {
1659c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
1669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        ' ', 'x'};
1679c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1689c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1699c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(11, breaker.next());  // after "foo@example"
1726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
173c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
1746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(16, breaker.next());  // after ".com "
1759c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
176c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1779c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1789c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(16, breaker.wordStart());  // "x"
1799c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(17, breaker.wordEnd());
180c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
1829c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1839c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, mailto) {
1849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
1859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
1869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1889c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1899c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1906d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "mailto:"
1916d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
192c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
1936d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(18, breaker.next());  // after "foo@example"
1946d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
195c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
1966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(23, breaker.next());  // after ".com "
1979c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
198c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1999c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
2009c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(23, breaker.wordStart());  // "x"
2019c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(24, breaker.wordEnd());
202c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2039c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
2049c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
2056d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// The current logic always places a line break after a detected email address or URL
2066d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// and an immediately following non-ASCII character.
2079c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailNonAscii) {
2089c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
2099c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        0x4E00};
2109c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
2119c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
2129c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
2139c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
2146d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(11, breaker.next());  // after "foo@example"
2156d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
216c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2176d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(15, breaker.next());  // after ".com"
2189c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
219c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2209c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
2219c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(15, breaker.wordStart());  // "一"
2229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(16, breaker.wordEnd());
223c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
2259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
2269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailCombining) {
2279c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
2289c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        0x0303, ' ', 'x'};
2299c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
2309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
2319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
2329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
2336d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(11, breaker.next());  // after "foo@example"
2346d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
235c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2366d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(17, breaker.next());  // after ".com̃ "
2379c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
238c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2399c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
2409c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(17, breaker.wordStart());  // "x"
2419c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(18, breaker.wordEnd());
242c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2439c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
2449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
2456d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, lonelyAt) {
2466d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
2476d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
2486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
2496d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
2506d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
2516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(2, breaker.next());  // after "a "
2526d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.wordStart());  // "a"
2536d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(1, breaker.wordEnd());
254c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2556d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(4, breaker.next());  // after "@ "
2566d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
257c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2586d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
2596d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(4, breaker.wordStart());  // "b"
2606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.wordEnd());
261c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
2636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
2649c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, url) {
2659c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
2669c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        '.', 'c', 'o', 'm', ' ', 'x'};
2679c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
2689c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
2699c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
2709c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
2716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.next());  // after "http:"
2726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
273c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "//"
2756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
276c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(14, breaker.next());  // after "example"
2786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
279c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(19, breaker.next());  // after ".com "
2819c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
282c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2839c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
2849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(19, breaker.wordStart());  // "x"
2859c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(20, breaker.wordEnd());
286c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
2886d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
2896d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
2906d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlBreakChars) {
2916d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
2926d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
2936d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
2946d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
2956d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
2966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
2976d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.next());  // after "http:"
2986d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
299c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3006d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "//"
3016d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
302c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3036d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(8, breaker.next());  // after "a"
3046d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
305c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3066d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(10, breaker.next());  // after ".b"
3076d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
308c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3096d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(11, breaker.next());  // after "/"
3106d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
311c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3126d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(13, breaker.next());  // after "~c"
3136d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
314c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3156d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(15, breaker.next());  // after ",d"
3166d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
317c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(17, breaker.next());  // after "-e"
3196d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
320c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(19, breaker.next());  // after "?f"
3226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
323c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(20, breaker.next());  // after "="
3256d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
326c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3276d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(21, breaker.next());  // after "g"
3286d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
329c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3306d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(22, breaker.next());  // after "&"
3316d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
332c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3336d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(23, breaker.next());  // after "h"
3346d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
335c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3366d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(25, breaker.next());  // after "#i"
3376d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
338c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3396d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(27, breaker.next());  // after "%j"
3406d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
341c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3426d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(29, breaker.next());  // after "_k"
3436d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
344c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
3456d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
3466d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
347c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
3486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
3496d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
3506d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlNoHyphenBreak) {
3516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
3526d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
3536d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
3546d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
3556d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
3566d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.next());  // after "http:"
3576d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3586d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "//"
3596d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(8, breaker.next());  // after "a"
3616d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
3636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3646d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
3656d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
3666d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlEndsWithSlash) {
3676d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
3686d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
3696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
3706d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
3716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
3726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.next());  // after "http:"
3736d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "//"
3756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3766d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(8, breaker.next());  // after "a"
3776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
3796d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
3816d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
3826d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, emailStartsWithSlash) {
3836d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'/', 'a', '@', 'b'};
3846d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
3856d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
3866d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
3876d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
3886d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
3896d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3906d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
391