1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18
19#include <android/log.h>
20#include <gtest/gtest.h>
21
22#include "ICUTestBase.h"
23#include "UnicodeUtils.h"
24#include <minikin/WordBreaker.h>
25#include <unicode/locid.h>
26#include <unicode/uclean.h>
27#include <unicode/udata.h>
28
// Number of elements in a C array (total byte size divided by the size of one element).
// Only defined here when no earlier header has already provided NELEM.
#ifndef NELEM
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif
32
// Expands to two comma-separated values, U16_LEAD(cp) and U16_TRAIL(cp), so that a code point
// can be written as a pair of UTF-16 code units inside a uint16_t array initializer.
#define UTF16(cp) U16_LEAD(cp), U16_TRAIL(cp)
34
35namespace minikin {
36
37typedef ICUTestBase WordBreakerTest;
38
39TEST_F(WordBreakerTest, basic) {
40    uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
41    WordBreaker breaker;
42    breaker.setLocale(icu::Locale::getUS());
43    breaker.setText(buf, NELEM(buf));
44    EXPECT_EQ(0, breaker.current());
45    EXPECT_EQ(6, breaker.next());  // after "hello "
46    EXPECT_EQ(0, breaker.wordStart());  // "hello"
47    EXPECT_EQ(5, breaker.wordEnd());
48    EXPECT_EQ(0, breaker.breakBadness());
49    EXPECT_EQ(6, breaker.current());
50    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
51    EXPECT_EQ(6, breaker.wordStart());  // "world"
52    EXPECT_EQ(11, breaker.wordEnd());
53    EXPECT_EQ(0, breaker.breakBadness());
54    EXPECT_EQ(11, breaker.current());
55}
56
57TEST_F(WordBreakerTest, softHyphen) {
58    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
59    WordBreaker breaker;
60    breaker.setLocale(icu::Locale::getUS());
61    breaker.setText(buf, NELEM(buf));
62    EXPECT_EQ(0, breaker.current());
63    EXPECT_EQ(7, breaker.next());  // after "hel{SOFT HYPHEN}lo "
64    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
65    EXPECT_EQ(6, breaker.wordEnd());
66    EXPECT_EQ(0, breaker.breakBadness());
67    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
68    EXPECT_EQ(7, breaker.wordStart());  // "world"
69    EXPECT_EQ(12, breaker.wordEnd());
70    EXPECT_EQ(0, breaker.breakBadness());
71}
72
73TEST_F(WordBreakerTest, hardHyphen) {
74    // Hyphens should not allow breaks anymore.
75    uint16_t buf[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
76    WordBreaker breaker;
77    breaker.setLocale(icu::Locale::getUS());
78    breaker.setText(buf, NELEM(buf));
79    EXPECT_EQ(0, breaker.current());
80    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());
81    EXPECT_EQ(0, breaker.wordStart());
82    EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
83    EXPECT_EQ(0, breaker.breakBadness());
84}
85
86TEST_F(WordBreakerTest, postfixAndPrefix) {
87    uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US¢ JP¥
88    WordBreaker breaker;
89    breaker.setLocale(icu::Locale::getUS());
90    breaker.setText(buf, NELEM(buf));
91    EXPECT_EQ(0, breaker.current());
92
93    EXPECT_EQ(4, breaker.next());  // after CENT SIGN
94    EXPECT_EQ(0, breaker.wordStart());  // "US¢"
95    EXPECT_EQ(3, breaker.wordEnd());
96
97    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end of string
98    EXPECT_EQ(4, breaker.wordStart());  // "JP¥"
99    EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
100}
101
102TEST_F(WordBreakerTest, myanmarKinzi) {
103    uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};  // NGA, ASAT, VIRAMA, KA, UU
104    WordBreaker breaker;
105    icu::Locale burmese("my");
106    breaker.setLocale(burmese);
107    breaker.setText(buf, NELEM(buf));
108    EXPECT_EQ(0, breaker.current());
109
110    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end of string
111    EXPECT_EQ(0, breaker.wordStart());
112    EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
113}
114
115TEST_F(WordBreakerTest, zwjEmojiSequences) {
116    uint16_t buf[] = {
117        // man + zwj + heart + zwj + man
118        UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
119        // woman + zwj + heart + zwj + kiss mark + zwj + woman
120        UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
121        // eye + zwj + left speech bubble
122        UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
123        // CAT FACE + zwj + BUST IN SILHOUETTE
124        UTF16(0x1F431), 0x200D, UTF16(0x1F464),
125    };
126    WordBreaker breaker;
127    breaker.setLocale(icu::Locale::getUS());
128    breaker.setText(buf, NELEM(buf));
129    EXPECT_EQ(0, breaker.current());
130    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
131    EXPECT_EQ(0, breaker.wordStart());
132    EXPECT_EQ(7, breaker.wordEnd());
133    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + woman
134    EXPECT_EQ(7, breaker.wordStart());
135    EXPECT_EQ(17, breaker.wordEnd());
136    EXPECT_EQ(22, breaker.next());  // after eye + zwj + left speech bubble
137    EXPECT_EQ(17, breaker.wordStart());
138    EXPECT_EQ(22, breaker.wordEnd());
139    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
140    EXPECT_EQ(22, breaker.wordStart());
141    EXPECT_EQ(27, breaker.wordEnd());
142}
143
144TEST_F(WordBreakerTest, emojiWithModifier) {
145    uint16_t buf[] = {
146        UTF16(0x1F466), UTF16(0x1F3FB),  // boy + type 1-2 fitzpatrick modifier
147        0x270C, 0xFE0F, UTF16(0x1F3FF)  // victory hand + emoji style + type 6 fitzpatrick modifier
148    };
149    WordBreaker breaker;
150    breaker.setLocale(icu::Locale::getUS());
151    breaker.setText(buf, NELEM(buf));
152    EXPECT_EQ(0, breaker.current());
153    EXPECT_EQ(4, breaker.next());  // after boy + type 1-2 fitzpatrick modifier
154    EXPECT_EQ(0, breaker.wordStart());
155    EXPECT_EQ(4, breaker.wordEnd());
156    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
157    EXPECT_EQ(4, breaker.wordStart());
158    EXPECT_EQ(8, breaker.wordEnd());
159}
160
161TEST_F(WordBreakerTest, unicode10Emoji) {
162    // Should break between emojis.
163    uint16_t buf[] = {
164        // SLED + SLED
165        UTF16(0x1F6F7), UTF16(0x1F6F7),
166        // SLED + VS15 + SLED
167        UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
168        // WHITE SMILING FACE + SLED
169        0x263A, UTF16(0x1F6F7),
170        // WHITE SMILING FACE + VS16 + SLED
171        0x263A, 0xFE0F, UTF16(0x1F6F7),
172    };
173    WordBreaker breaker;
174    breaker.setLocale(icu::Locale::getEnglish());
175    breaker.setText(buf, NELEM(buf));
176    EXPECT_EQ(0, breaker.current());
177    EXPECT_EQ(2, breaker.next());
178    EXPECT_EQ(0, breaker.wordStart());
179    EXPECT_EQ(2, breaker.wordEnd());
180
181    EXPECT_EQ(4, breaker.next());
182    EXPECT_EQ(2, breaker.wordStart());
183    EXPECT_EQ(4, breaker.wordEnd());
184
185    EXPECT_EQ(7, breaker.next());
186    EXPECT_EQ(4, breaker.wordStart());
187    EXPECT_EQ(7, breaker.wordEnd());
188
189    EXPECT_EQ(9, breaker.next());
190    EXPECT_EQ(7, breaker.wordStart());
191    EXPECT_EQ(9, breaker.wordEnd());
192
193    EXPECT_EQ(10, breaker.next());
194    EXPECT_EQ(9, breaker.wordStart());
195    EXPECT_EQ(10, breaker.wordEnd());
196
197    EXPECT_EQ(12, breaker.next());
198    EXPECT_EQ(10, breaker.wordStart());
199    EXPECT_EQ(12, breaker.wordEnd());
200
201    EXPECT_EQ(14, breaker.next());
202    EXPECT_EQ(12, breaker.wordStart());
203    EXPECT_EQ(14, breaker.wordEnd());
204
205    EXPECT_EQ(16, breaker.next());
206    EXPECT_EQ(14, breaker.wordStart());
207    EXPECT_EQ(16, breaker.wordEnd());
208}
209
210TEST_F(WordBreakerTest, flagsSequenceSingleFlag) {
211    const std::string kFlag = "U+1F3F4";
212    const std::string flags = kFlag + " " + kFlag;
213
214    const int kFlagLength = 2;
215    const size_t BUF_SIZE = kFlagLength * 2;
216
217    uint16_t buf[BUF_SIZE];
218    size_t size;
219    ParseUnicode(buf, BUF_SIZE, flags.c_str(), &size, nullptr);
220
221    WordBreaker breaker;
222    breaker.setLocale(icu::Locale::getUS());
223    breaker.setText(buf, size);
224    EXPECT_EQ(0, breaker.current());
225    EXPECT_EQ(kFlagLength, breaker.next());  // end of the first flag
226    EXPECT_EQ(0, breaker.wordStart());
227    EXPECT_EQ(kFlagLength, breaker.wordEnd());
228    EXPECT_EQ(static_cast<ssize_t>(size), breaker.next());
229    EXPECT_EQ(kFlagLength, breaker.wordStart());
230    EXPECT_EQ(kFlagLength * 2, breaker.wordEnd());
231}
232
233TEST_F(WordBreakerTest, flagsSequence) {
234    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag
235    // of Scotland.
236    const std::string kFlagSequence = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
237    const std::string flagSequence = kFlagSequence + " " + kFlagSequence;
238
239    const int kFlagLength = 14;
240    const size_t BUF_SIZE = kFlagLength * 2;
241
242    uint16_t buf[BUF_SIZE];
243    size_t size;
244    ParseUnicode(buf, BUF_SIZE, flagSequence.c_str(), &size, nullptr);
245
246    WordBreaker breaker;
247    breaker.setLocale(icu::Locale::getUS());
248    breaker.setText(buf, size);
249    EXPECT_EQ(0, breaker.current());
250    EXPECT_EQ(kFlagLength, breaker.next());  // end of the first flag sequence
251    EXPECT_EQ(0, breaker.wordStart());
252    EXPECT_EQ(kFlagLength, breaker.wordEnd());
253    EXPECT_EQ(static_cast<ssize_t>(size), breaker.next());
254    EXPECT_EQ(kFlagLength, breaker.wordStart());
255    EXPECT_EQ(kFlagLength * 2, breaker.wordEnd());
256}
257
258TEST_F(WordBreakerTest, punct) {
259    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
260        '!', '!'};
261    WordBreaker breaker;
262    breaker.setLocale(icu::Locale::getUS());
263    breaker.setText(buf, NELEM(buf));
264    EXPECT_EQ(0, breaker.current());
265    EXPECT_EQ(9, breaker.next());  // after "¡¡hello, "
266    EXPECT_EQ(2, breaker.wordStart());  // "hello"
267    EXPECT_EQ(7, breaker.wordEnd());
268    EXPECT_EQ(0, breaker.breakBadness());
269    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
270    EXPECT_EQ(9, breaker.wordStart());  // "world"
271    EXPECT_EQ(14, breaker.wordEnd());
272    EXPECT_EQ(0, breaker.breakBadness());
273}
274
275TEST_F(WordBreakerTest, email) {
276    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
277        ' ', 'x'};
278    WordBreaker breaker;
279    breaker.setLocale(icu::Locale::getUS());
280    breaker.setText(buf, NELEM(buf));
281    EXPECT_EQ(0, breaker.current());
282    EXPECT_EQ(11, breaker.next());  // after "foo@example"
283    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
284    EXPECT_EQ(1, breaker.breakBadness());
285    EXPECT_EQ(16, breaker.next());  // after ".com "
286    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
287    EXPECT_EQ(0, breaker.breakBadness());
288    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
289    EXPECT_EQ(16, breaker.wordStart());  // "x"
290    EXPECT_EQ(17, breaker.wordEnd());
291    EXPECT_EQ(0, breaker.breakBadness());
292}
293
294TEST_F(WordBreakerTest, mailto) {
295    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
296        'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
297    WordBreaker breaker;
298    breaker.setLocale(icu::Locale::getUS());
299    breaker.setText(buf, NELEM(buf));
300    EXPECT_EQ(0, breaker.current());
301    EXPECT_EQ(7, breaker.next());  // after "mailto:"
302    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
303    EXPECT_EQ(1, breaker.breakBadness());
304    EXPECT_EQ(18, breaker.next());  // after "foo@example"
305    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
306    EXPECT_EQ(1, breaker.breakBadness());
307    EXPECT_EQ(23, breaker.next());  // after ".com "
308    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
309    EXPECT_EQ(0, breaker.breakBadness());
310    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
311    EXPECT_EQ(23, breaker.wordStart());  // "x"
312    EXPECT_EQ(24, breaker.wordEnd());
313    EXPECT_EQ(0, breaker.breakBadness());
314}
315
316// The current logic always places a line break after a detected email address or URL
317// and an immediately following non-ASCII character.
318TEST_F(WordBreakerTest, emailNonAscii) {
319    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
320        0x4E00};
321    WordBreaker breaker;
322    breaker.setLocale(icu::Locale::getUS());
323    breaker.setText(buf, NELEM(buf));
324    EXPECT_EQ(0, breaker.current());
325    EXPECT_EQ(11, breaker.next());  // after "foo@example"
326    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
327    EXPECT_EQ(1, breaker.breakBadness());
328    EXPECT_EQ(15, breaker.next());  // after ".com"
329    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
330    EXPECT_EQ(0, breaker.breakBadness());
331    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
332    EXPECT_EQ(15, breaker.wordStart());  // "一"
333    EXPECT_EQ(16, breaker.wordEnd());
334    EXPECT_EQ(0, breaker.breakBadness());
335}
336
337TEST_F(WordBreakerTest, emailCombining) {
338    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
339        0x0303, ' ', 'x'};
340    WordBreaker breaker;
341    breaker.setLocale(icu::Locale::getUS());
342    breaker.setText(buf, NELEM(buf));
343    EXPECT_EQ(0, breaker.current());
344    EXPECT_EQ(11, breaker.next());  // after "foo@example"
345    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
346    EXPECT_EQ(1, breaker.breakBadness());
347    EXPECT_EQ(17, breaker.next());  // after ".com̃ "
348    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
349    EXPECT_EQ(0, breaker.breakBadness());
350    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
351    EXPECT_EQ(17, breaker.wordStart());  // "x"
352    EXPECT_EQ(18, breaker.wordEnd());
353    EXPECT_EQ(0, breaker.breakBadness());
354}
355
356TEST_F(WordBreakerTest, lonelyAt) {
357    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
358    WordBreaker breaker;
359    breaker.setLocale(icu::Locale::getUS());
360    breaker.setText(buf, NELEM(buf));
361    EXPECT_EQ(0, breaker.current());
362    EXPECT_EQ(2, breaker.next());  // after "a "
363    EXPECT_EQ(0, breaker.wordStart());  // "a"
364    EXPECT_EQ(1, breaker.wordEnd());
365    EXPECT_EQ(0, breaker.breakBadness());
366    EXPECT_EQ(4, breaker.next());  // after "@ "
367    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
368    EXPECT_EQ(0, breaker.breakBadness());
369    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
370    EXPECT_EQ(4, breaker.wordStart());  // "b"
371    EXPECT_EQ(5, breaker.wordEnd());
372    EXPECT_EQ(0, breaker.breakBadness());
373}
374
375TEST_F(WordBreakerTest, url) {
376    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
377        '.', 'c', 'o', 'm', ' ', 'x'};
378    WordBreaker breaker;
379    breaker.setLocale(icu::Locale::getUS());
380    breaker.setText(buf, NELEM(buf));
381    EXPECT_EQ(0, breaker.current());
382    EXPECT_EQ(5, breaker.next());  // after "http:"
383    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
384    EXPECT_EQ(1, breaker.breakBadness());
385    EXPECT_EQ(7, breaker.next());  // after "//"
386    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
387    EXPECT_EQ(1, breaker.breakBadness());
388    EXPECT_EQ(14, breaker.next());  // after "example"
389    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
390    EXPECT_EQ(1, breaker.breakBadness());
391    EXPECT_EQ(19, breaker.next());  // after ".com "
392    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
393    EXPECT_EQ(0, breaker.breakBadness());
394    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
395    EXPECT_EQ(19, breaker.wordStart());  // "x"
396    EXPECT_EQ(20, breaker.wordEnd());
397    EXPECT_EQ(0, breaker.breakBadness());
398}
399
400// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
401TEST_F(WordBreakerTest, urlBreakChars) {
402    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
403        '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
404    WordBreaker breaker;
405    breaker.setLocale(icu::Locale::getUS());
406    breaker.setText(buf, NELEM(buf));
407    EXPECT_EQ(0, breaker.current());
408    EXPECT_EQ(5, breaker.next());  // after "http:"
409    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
410    EXPECT_EQ(1, breaker.breakBadness());
411    EXPECT_EQ(7, breaker.next());  // after "//"
412    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
413    EXPECT_EQ(1, breaker.breakBadness());
414    EXPECT_EQ(8, breaker.next());  // after "a"
415    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
416    EXPECT_EQ(1, breaker.breakBadness());
417    EXPECT_EQ(10, breaker.next());  // after ".b"
418    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
419    EXPECT_EQ(1, breaker.breakBadness());
420    EXPECT_EQ(11, breaker.next());  // after "/"
421    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
422    EXPECT_EQ(1, breaker.breakBadness());
423    EXPECT_EQ(13, breaker.next());  // after "~c"
424    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
425    EXPECT_EQ(1, breaker.breakBadness());
426    EXPECT_EQ(15, breaker.next());  // after ",d"
427    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
428    EXPECT_EQ(1, breaker.breakBadness());
429    EXPECT_EQ(17, breaker.next());  // after "-e"
430    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
431    EXPECT_EQ(1, breaker.breakBadness());
432    EXPECT_EQ(19, breaker.next());  // after "?f"
433    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
434    EXPECT_EQ(1, breaker.breakBadness());
435    EXPECT_EQ(20, breaker.next());  // after "="
436    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
437    EXPECT_EQ(1, breaker.breakBadness());
438    EXPECT_EQ(21, breaker.next());  // after "g"
439    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
440    EXPECT_EQ(1, breaker.breakBadness());
441    EXPECT_EQ(22, breaker.next());  // after "&"
442    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
443    EXPECT_EQ(1, breaker.breakBadness());
444    EXPECT_EQ(23, breaker.next());  // after "h"
445    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
446    EXPECT_EQ(1, breaker.breakBadness());
447    EXPECT_EQ(25, breaker.next());  // after "#i"
448    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
449    EXPECT_EQ(1, breaker.breakBadness());
450    EXPECT_EQ(27, breaker.next());  // after "%j"
451    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
452    EXPECT_EQ(1, breaker.breakBadness());
453    EXPECT_EQ(29, breaker.next());  // after "_k"
454    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
455    EXPECT_EQ(1, breaker.breakBadness());
456    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
457    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
458    EXPECT_EQ(0, breaker.breakBadness());
459}
460
461TEST_F(WordBreakerTest, urlNoHyphenBreak) {
462    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
463    WordBreaker breaker;
464    breaker.setLocale(icu::Locale::getUS());
465    breaker.setText(buf, NELEM(buf));
466    EXPECT_EQ(0, breaker.current());
467    EXPECT_EQ(5, breaker.next());  // after "http:"
468    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
469    EXPECT_EQ(7, breaker.next());  // after "//"
470    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
471    EXPECT_EQ(8, breaker.next());  // after "a"
472    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
473    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
474    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
475}
476
477TEST_F(WordBreakerTest, urlEndsWithSlash) {
478    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
479    WordBreaker breaker;
480    breaker.setLocale(icu::Locale::getUS());
481    breaker.setText(buf, NELEM(buf));
482    EXPECT_EQ(0, breaker.current());
483    EXPECT_EQ(5, breaker.next());  // after "http:"
484    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
485    EXPECT_EQ(7, breaker.next());  // after "//"
486    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
487    EXPECT_EQ(8, breaker.next());  // after "a"
488    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
489    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
490    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
491}
492
493TEST_F(WordBreakerTest, emailStartsWithSlash) {
494    uint16_t buf[] = {'/', 'a', '@', 'b'};
495    WordBreaker breaker;
496    breaker.setLocale(icu::Locale::getUS());
497    breaker.setText(buf, NELEM(buf));
498    EXPECT_EQ(0, breaker.current());
499    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
500    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
501}
502
503}  // namespace minikin
504