// WordBreakerTests.cpp revision d3f45892c721fb1738bf02fe19a5143a320ca4bf
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

1757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <gtest/gtest.h>
1857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "ICUTestBase.h"
1957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include "UnicodeUtils.h"
2057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <minikin/WordBreaker.h>
2157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/locid.h>
2257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/uclean.h>
2357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <unicode/udata.h>
2457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
2557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#define LOG_TAG "Minikin"
2657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien#include <cutils/log.h>
2757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
// Number of elements in a built-in array: total size divided by the size of the first element.
// Only meaningful for true arrays; the guard keeps any NELEM defined earlier.
#ifndef NELEM
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
#endif

3257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levienusing namespace android;
3357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levientypedef ICUTestBase WordBreakerTest;
3557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
3657b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, basic) {
3757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
3857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
3957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
4057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
4157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
4257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.next());  // after "hello "
4357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.wordStart());  // "hello"
4457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(5, breaker.wordEnd());
45c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
4657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.current());
4757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
4857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.wordStart());  // "world"
4957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(11, breaker.wordEnd());
50c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
5157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(11, breaker.current());
5257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
5357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
5457b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, softHyphen) {
5557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
5657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
5757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
5857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
5957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
6057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.next());  // after "hel{SOFT HYPHEN}lo "
6157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
6257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(6, breaker.wordEnd());
63c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
6457b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
6557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.wordStart());  // "world"
6657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(12, breaker.wordEnd());
67c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
6857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
6957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien
70d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph LevienTEST_F(WordBreakerTest, zwjEmojiSequences) {
71d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    uint16_t buf[] = {
72d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        // man + zwj + heart + zwj + man
73d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
74d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        // woman + zwj + heart + zwj + woman
75d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
76d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        // eye + zwj + left speech bubble
77d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien        0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
78d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    };
79d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    WordBreaker breaker;
80d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
81d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    breaker.setText(buf, NELEM(buf));
82d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(0, breaker.current());
83d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
84d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(0, breaker.wordStart());
85d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(7, breaker.wordEnd());
86d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + woman
87d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(7, breaker.wordStart());
88d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(17, breaker.wordEnd());
89d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
90d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(17, breaker.wordStart());
91d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien    EXPECT_EQ(22, breaker.wordEnd());
92d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien}
93d3f45892c721fb1738bf02fe19a5143a320ca4bfRaph Levien
9457b6dae9894b9362ef04517ff477fd491f9d433bRaph LevienTEST_F(WordBreakerTest, punct) {
9557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
9657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien        '!', '!'};
9757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    WordBreaker breaker;
9857b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
9957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    breaker.setText(buf, NELEM(buf));
10057b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(0, breaker.current());
10157b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(9, breaker.next());  // after "¡¡hello, "
10257b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(2, breaker.wordStart());  // "hello"
10357b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(7, breaker.wordEnd());
104c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
10557b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
10657b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(9, breaker.wordStart());  // "world"
10757b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien    EXPECT_EQ(14, breaker.wordEnd());
108c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
10957b6dae9894b9362ef04517ff477fd491f9d433bRaph Levien}
1109c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1119c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, email) {
1129c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
1139c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        ' ', 'x'};
1149c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1159c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1169c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1179c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(11, breaker.next());  // after "foo@example"
1196d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
120c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
1216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(16, breaker.next());  // after ".com "
1229c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
123c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1249c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1259c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(16, breaker.wordStart());  // "x"
1269c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(17, breaker.wordEnd());
127c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1289c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
1299c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, mailto) {
1319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
1329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
1339c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1359c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1369c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1376d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "mailto:"
1386d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
139c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
1406d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(18, breaker.next());  // after "foo@example"
1416d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
142c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
1436d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(23, breaker.next());  // after ".com "
1449c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
145c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1469c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1479c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(23, breaker.wordStart());  // "x"
1489c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(24, breaker.wordEnd());
149c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1509c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
1519c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1526d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// The current logic always places a line break after a detected email address or URL
1536d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// and an immediately following non-ASCII character.
1549c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailNonAscii) {
1559c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
1569c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        0x4E00};
1579c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1589c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1599c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1609c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1616d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(11, breaker.next());  // after "foo@example"
1626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
163c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
1646d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(15, breaker.next());  // after ".com"
1659c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
166c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1679c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1689c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(15, breaker.wordStart());  // "一"
1699c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(16, breaker.wordEnd());
170c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1719c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
1729c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1739c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, emailCombining) {
1749c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
1759c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        0x0303, ' ', 'x'};
1769c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
1779c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
1789c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
1799c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
1806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(11, breaker.next());  // after "foo@example"
1816d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
182c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
1836d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(17, breaker.next());  // after ".com̃ "
1849c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
185c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1869c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
1879c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(17, breaker.wordStart());  // "x"
1889c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(18, breaker.wordEnd());
189c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
1909c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
1919c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien
1926d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, lonelyAt) {
1936d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
1946d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
1956d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
1966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
1976d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
1986d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(2, breaker.next());  // after "a "
1996d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.wordStart());  // "a"
2006d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(1, breaker.wordEnd());
201c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2026d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(4, breaker.next());  // after "@ "
2036d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
204c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2056d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
2066d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(4, breaker.wordStart());  // "b"
2076d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.wordEnd());
208c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2096d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
2106d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
2119c4cc648abcae144f3b99d612e58ef01d5e52cceRaph LevienTEST_F(WordBreakerTest, url) {
2129c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
2139c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien        '.', 'c', 'o', 'm', ' ', 'x'};
2149c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    WordBreaker breaker;
2159c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setLocale(icu::Locale::getEnglish());
2169c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    breaker.setText(buf, NELEM(buf));
2179c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(0, breaker.current());
2186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.next());  // after "http:"
2196d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
220c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "//"
2226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
223c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(14, breaker.next());  // after "example"
2256d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
226c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2276d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(19, breaker.next());  // after ".com "
2289c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
229c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2309c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
2319c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(19, breaker.wordStart());  // "x"
2329c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien    EXPECT_EQ(20, breaker.wordEnd());
233c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2349c4cc648abcae144f3b99d612e58ef01d5e52cceRaph Levien}
2356d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
2366d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
2376d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlBreakChars) {
2386d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
2396d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien        '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
2406d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
2416d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
2426d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
2436d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
2446d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.next());  // after "http:"
2456d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
246c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2476d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "//"
2486d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
249c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2506d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(8, breaker.next());  // after "a"
2516d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
252c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2536d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(10, breaker.next());  // after ".b"
2546d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
255c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2566d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(11, breaker.next());  // after "/"
2576d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
258c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2596d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(13, breaker.next());  // after "~c"
2606d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
261c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2626d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(15, breaker.next());  // after ",d"
2636d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
264c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2656d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(17, breaker.next());  // after "-e"
2666d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
267c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2686d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(19, breaker.next());  // after "?f"
2696d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
270c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2716d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(20, breaker.next());  // after "="
2726d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
273c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2746d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(21, breaker.next());  // after "g"
2756d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
276c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2776d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(22, breaker.next());  // after "&"
2786d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
279c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2806d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(23, breaker.next());  // after "h"
2816d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
282c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2836d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(25, breaker.next());  // after "#i"
2846d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
285c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2866d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(27, breaker.next());  // after "%j"
2876d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
288c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2896d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(29, breaker.next());  // after "_k"
2906d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
291c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(1, breaker.breakBadness());
2926d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
2936d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
294c88ef135fcc2661ec7addc171ebc60787df38affRaph Levien    EXPECT_EQ(0, breaker.breakBadness());
2956d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
2966d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
2976d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlNoHyphenBreak) {
2986d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
2996d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
3006d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
3016d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
3026d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
3036d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.next());  // after "http:"
3046d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3056d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "//"
3066d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3076d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(8, breaker.next());  // after "a"
3086d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3096d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
3106d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3116d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
3126d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
3136d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, urlEndsWithSlash) {
3146d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
3156d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
3166d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
3176d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
3186d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
3196d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(5, breaker.next());  // after "http:"
3206d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3216d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(7, breaker.next());  // after "//"
3226d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3236d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(8, breaker.next());  // after "a"
3246d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3256d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
3266d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3276d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
3286d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien
3296d15657e4a3826d4d47d5358f1dde211484527e9Raph LevienTEST_F(WordBreakerTest, emailStartsWithSlash) {
3306d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    uint16_t buf[] = {'/', 'a', '@', 'b'};
3316d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    WordBreaker breaker;
3326d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setLocale(icu::Locale::getEnglish());
3336d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    breaker.setText(buf, NELEM(buf));
3346d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ(0, breaker.current());
3356d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
3366d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
3376d15657e4a3826d4d47d5358f1dde211484527e9Raph Levien}
338