1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <gtest/gtest.h>
18#include "ICUTestBase.h"
19#include "UnicodeUtils.h"
20#include <minikin/WordBreaker.h>
21#include <unicode/locid.h>
22#include <unicode/uclean.h>
23#include <unicode/udata.h>
24
25#define LOG_TAG "Minikin"
26#include <cutils/log.h>
27
28#ifndef NELEM
29#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
30#endif
31
32#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
33
34using namespace android;
35
36typedef ICUTestBase WordBreakerTest;
37
38TEST_F(WordBreakerTest, basic) {
39    uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
40    WordBreaker breaker;
41    breaker.setLocale(icu::Locale::getEnglish());
42    breaker.setText(buf, NELEM(buf));
43    EXPECT_EQ(0, breaker.current());
44    EXPECT_EQ(6, breaker.next());  // after "hello "
45    EXPECT_EQ(0, breaker.wordStart());  // "hello"
46    EXPECT_EQ(5, breaker.wordEnd());
47    EXPECT_EQ(0, breaker.breakBadness());
48    EXPECT_EQ(6, breaker.current());
49    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
50    EXPECT_EQ(6, breaker.wordStart());  // "world"
51    EXPECT_EQ(11, breaker.wordEnd());
52    EXPECT_EQ(0, breaker.breakBadness());
53    EXPECT_EQ(11, breaker.current());
54}
55
56TEST_F(WordBreakerTest, softHyphen) {
57    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
58    WordBreaker breaker;
59    breaker.setLocale(icu::Locale::getEnglish());
60    breaker.setText(buf, NELEM(buf));
61    EXPECT_EQ(0, breaker.current());
62    EXPECT_EQ(7, breaker.next());  // after "hel{SOFT HYPHEN}lo "
63    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
64    EXPECT_EQ(6, breaker.wordEnd());
65    EXPECT_EQ(0, breaker.breakBadness());
66    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
67    EXPECT_EQ(7, breaker.wordStart());  // "world"
68    EXPECT_EQ(12, breaker.wordEnd());
69    EXPECT_EQ(0, breaker.breakBadness());
70}
71
72TEST_F(WordBreakerTest, postfixAndPrefix) {
73    uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US¢ JP¥
74    WordBreaker breaker;
75    breaker.setLocale(icu::Locale::getEnglish());
76    breaker.setText(buf, NELEM(buf));
77    EXPECT_EQ(0, breaker.current());
78
79    EXPECT_EQ(4, breaker.next());  // after CENT SIGN
80    EXPECT_EQ(0, breaker.wordStart());  // "US¢"
81    EXPECT_EQ(3, breaker.wordEnd());
82
83    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end of string
84    EXPECT_EQ(4, breaker.wordStart());  // "JP¥"
85    EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
86}
87
88TEST_F(WordBreakerTest, MyanmarKinzi) {
89    uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};  // NGA, ASAT, VIRAMA, KA, UU
90    WordBreaker breaker;
91    icu::Locale burmese("my");
92    breaker.setLocale(burmese);
93    breaker.setText(buf, NELEM(buf));
94    EXPECT_EQ(0, breaker.current());
95
96    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end of string
97    EXPECT_EQ(0, breaker.wordStart());
98    EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
99}
100
101TEST_F(WordBreakerTest, zwjEmojiSequences) {
102    uint16_t buf[] = {
103        // man + zwj + heart + zwj + man
104        UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
105        // woman + zwj + heart + zwj + kiss mark + zwj + woman
106        UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
107        // eye + zwj + left speech bubble
108        UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
109        // CAT FACE + zwj + BUST IN SILHOUETTE
110        UTF16(0x1F431), 0x200D, UTF16(0x1F464),
111    };
112    WordBreaker breaker;
113    breaker.setLocale(icu::Locale::getEnglish());
114    breaker.setText(buf, NELEM(buf));
115    EXPECT_EQ(0, breaker.current());
116    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
117    EXPECT_EQ(0, breaker.wordStart());
118    EXPECT_EQ(7, breaker.wordEnd());
119    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + woman
120    EXPECT_EQ(7, breaker.wordStart());
121    EXPECT_EQ(17, breaker.wordEnd());
122    EXPECT_EQ(22, breaker.next());  // after eye + zwj + left speech bubble
123    EXPECT_EQ(17, breaker.wordStart());
124    EXPECT_EQ(22, breaker.wordEnd());
125    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
126    EXPECT_EQ(22, breaker.wordStart());
127    EXPECT_EQ(27, breaker.wordEnd());
128}
129
130TEST_F(WordBreakerTest, emojiWithModifier) {
131    uint16_t buf[] = {
132        UTF16(0x1F466), UTF16(0x1F3FB),  // boy + type 1-2 fitzpatrick modifier
133        0x270C, 0xFE0F, UTF16(0x1F3FF)  // victory hand + emoji style + type 6 fitzpatrick modifier
134    };
135    WordBreaker breaker;
136    breaker.setLocale(icu::Locale::getEnglish());
137    breaker.setText(buf, NELEM(buf));
138    EXPECT_EQ(0, breaker.current());
139    EXPECT_EQ(4, breaker.next());  // after man + type 6 fitzpatrick modifier
140    EXPECT_EQ(0, breaker.wordStart());
141    EXPECT_EQ(4, breaker.wordEnd());
142    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
143    EXPECT_EQ(4, breaker.wordStart());
144    EXPECT_EQ(8, breaker.wordEnd());
145}
146
147TEST_F(WordBreakerTest, punct) {
148    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
149        '!', '!'};
150    WordBreaker breaker;
151    breaker.setLocale(icu::Locale::getEnglish());
152    breaker.setText(buf, NELEM(buf));
153    EXPECT_EQ(0, breaker.current());
154    EXPECT_EQ(9, breaker.next());  // after "¡¡hello, "
155    EXPECT_EQ(2, breaker.wordStart());  // "hello"
156    EXPECT_EQ(7, breaker.wordEnd());
157    EXPECT_EQ(0, breaker.breakBadness());
158    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
159    EXPECT_EQ(9, breaker.wordStart());  // "world"
160    EXPECT_EQ(14, breaker.wordEnd());
161    EXPECT_EQ(0, breaker.breakBadness());
162}
163
164TEST_F(WordBreakerTest, email) {
165    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
166        ' ', 'x'};
167    WordBreaker breaker;
168    breaker.setLocale(icu::Locale::getEnglish());
169    breaker.setText(buf, NELEM(buf));
170    EXPECT_EQ(0, breaker.current());
171    EXPECT_EQ(11, breaker.next());  // after "foo@example"
172    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
173    EXPECT_EQ(1, breaker.breakBadness());
174    EXPECT_EQ(16, breaker.next());  // after ".com "
175    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
176    EXPECT_EQ(0, breaker.breakBadness());
177    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
178    EXPECT_EQ(16, breaker.wordStart());  // "x"
179    EXPECT_EQ(17, breaker.wordEnd());
180    EXPECT_EQ(0, breaker.breakBadness());
181}
182
183TEST_F(WordBreakerTest, mailto) {
184    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
185        'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
186    WordBreaker breaker;
187    breaker.setLocale(icu::Locale::getEnglish());
188    breaker.setText(buf, NELEM(buf));
189    EXPECT_EQ(0, breaker.current());
190    EXPECT_EQ(7, breaker.next());  // after "mailto:"
191    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
192    EXPECT_EQ(1, breaker.breakBadness());
193    EXPECT_EQ(18, breaker.next());  // after "foo@example"
194    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
195    EXPECT_EQ(1, breaker.breakBadness());
196    EXPECT_EQ(23, breaker.next());  // after ".com "
197    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
198    EXPECT_EQ(0, breaker.breakBadness());
199    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
200    EXPECT_EQ(23, breaker.wordStart());  // "x"
201    EXPECT_EQ(24, breaker.wordEnd());
202    EXPECT_EQ(0, breaker.breakBadness());
203}
204
205// The current logic always places a line break after a detected email address or URL
206// and an immediately following non-ASCII character.
207TEST_F(WordBreakerTest, emailNonAscii) {
208    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
209        0x4E00};
210    WordBreaker breaker;
211    breaker.setLocale(icu::Locale::getEnglish());
212    breaker.setText(buf, NELEM(buf));
213    EXPECT_EQ(0, breaker.current());
214    EXPECT_EQ(11, breaker.next());  // after "foo@example"
215    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
216    EXPECT_EQ(1, breaker.breakBadness());
217    EXPECT_EQ(15, breaker.next());  // after ".com"
218    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
219    EXPECT_EQ(0, breaker.breakBadness());
220    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
221    EXPECT_EQ(15, breaker.wordStart());  // "一"
222    EXPECT_EQ(16, breaker.wordEnd());
223    EXPECT_EQ(0, breaker.breakBadness());
224}
225
226TEST_F(WordBreakerTest, emailCombining) {
227    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
228        0x0303, ' ', 'x'};
229    WordBreaker breaker;
230    breaker.setLocale(icu::Locale::getEnglish());
231    breaker.setText(buf, NELEM(buf));
232    EXPECT_EQ(0, breaker.current());
233    EXPECT_EQ(11, breaker.next());  // after "foo@example"
234    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
235    EXPECT_EQ(1, breaker.breakBadness());
236    EXPECT_EQ(17, breaker.next());  // after ".com̃ "
237    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
238    EXPECT_EQ(0, breaker.breakBadness());
239    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
240    EXPECT_EQ(17, breaker.wordStart());  // "x"
241    EXPECT_EQ(18, breaker.wordEnd());
242    EXPECT_EQ(0, breaker.breakBadness());
243}
244
245TEST_F(WordBreakerTest, lonelyAt) {
246    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
247    WordBreaker breaker;
248    breaker.setLocale(icu::Locale::getEnglish());
249    breaker.setText(buf, NELEM(buf));
250    EXPECT_EQ(0, breaker.current());
251    EXPECT_EQ(2, breaker.next());  // after "a "
252    EXPECT_EQ(0, breaker.wordStart());  // "a"
253    EXPECT_EQ(1, breaker.wordEnd());
254    EXPECT_EQ(0, breaker.breakBadness());
255    EXPECT_EQ(4, breaker.next());  // after "@ "
256    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
257    EXPECT_EQ(0, breaker.breakBadness());
258    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
259    EXPECT_EQ(4, breaker.wordStart());  // "b"
260    EXPECT_EQ(5, breaker.wordEnd());
261    EXPECT_EQ(0, breaker.breakBadness());
262}
263
264TEST_F(WordBreakerTest, url) {
265    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
266        '.', 'c', 'o', 'm', ' ', 'x'};
267    WordBreaker breaker;
268    breaker.setLocale(icu::Locale::getEnglish());
269    breaker.setText(buf, NELEM(buf));
270    EXPECT_EQ(0, breaker.current());
271    EXPECT_EQ(5, breaker.next());  // after "http:"
272    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
273    EXPECT_EQ(1, breaker.breakBadness());
274    EXPECT_EQ(7, breaker.next());  // after "//"
275    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
276    EXPECT_EQ(1, breaker.breakBadness());
277    EXPECT_EQ(14, breaker.next());  // after "example"
278    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
279    EXPECT_EQ(1, breaker.breakBadness());
280    EXPECT_EQ(19, breaker.next());  // after ".com "
281    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
282    EXPECT_EQ(0, breaker.breakBadness());
283    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
284    EXPECT_EQ(19, breaker.wordStart());  // "x"
285    EXPECT_EQ(20, breaker.wordEnd());
286    EXPECT_EQ(0, breaker.breakBadness());
287}
288
289// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
290TEST_F(WordBreakerTest, urlBreakChars) {
291    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
292        '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
293    WordBreaker breaker;
294    breaker.setLocale(icu::Locale::getEnglish());
295    breaker.setText(buf, NELEM(buf));
296    EXPECT_EQ(0, breaker.current());
297    EXPECT_EQ(5, breaker.next());  // after "http:"
298    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
299    EXPECT_EQ(1, breaker.breakBadness());
300    EXPECT_EQ(7, breaker.next());  // after "//"
301    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
302    EXPECT_EQ(1, breaker.breakBadness());
303    EXPECT_EQ(8, breaker.next());  // after "a"
304    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
305    EXPECT_EQ(1, breaker.breakBadness());
306    EXPECT_EQ(10, breaker.next());  // after ".b"
307    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
308    EXPECT_EQ(1, breaker.breakBadness());
309    EXPECT_EQ(11, breaker.next());  // after "/"
310    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
311    EXPECT_EQ(1, breaker.breakBadness());
312    EXPECT_EQ(13, breaker.next());  // after "~c"
313    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
314    EXPECT_EQ(1, breaker.breakBadness());
315    EXPECT_EQ(15, breaker.next());  // after ",d"
316    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
317    EXPECT_EQ(1, breaker.breakBadness());
318    EXPECT_EQ(17, breaker.next());  // after "-e"
319    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
320    EXPECT_EQ(1, breaker.breakBadness());
321    EXPECT_EQ(19, breaker.next());  // after "?f"
322    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
323    EXPECT_EQ(1, breaker.breakBadness());
324    EXPECT_EQ(20, breaker.next());  // after "="
325    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
326    EXPECT_EQ(1, breaker.breakBadness());
327    EXPECT_EQ(21, breaker.next());  // after "g"
328    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
329    EXPECT_EQ(1, breaker.breakBadness());
330    EXPECT_EQ(22, breaker.next());  // after "&"
331    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
332    EXPECT_EQ(1, breaker.breakBadness());
333    EXPECT_EQ(23, breaker.next());  // after "h"
334    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
335    EXPECT_EQ(1, breaker.breakBadness());
336    EXPECT_EQ(25, breaker.next());  // after "#i"
337    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
338    EXPECT_EQ(1, breaker.breakBadness());
339    EXPECT_EQ(27, breaker.next());  // after "%j"
340    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
341    EXPECT_EQ(1, breaker.breakBadness());
342    EXPECT_EQ(29, breaker.next());  // after "_k"
343    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
344    EXPECT_EQ(1, breaker.breakBadness());
345    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
346    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
347    EXPECT_EQ(0, breaker.breakBadness());
348}
349
350TEST_F(WordBreakerTest, urlNoHyphenBreak) {
351    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
352    WordBreaker breaker;
353    breaker.setLocale(icu::Locale::getEnglish());
354    breaker.setText(buf, NELEM(buf));
355    EXPECT_EQ(0, breaker.current());
356    EXPECT_EQ(5, breaker.next());  // after "http:"
357    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
358    EXPECT_EQ(7, breaker.next());  // after "//"
359    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
360    EXPECT_EQ(8, breaker.next());  // after "a"
361    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
362    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
363    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
364}
365
366TEST_F(WordBreakerTest, urlEndsWithSlash) {
367    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
368    WordBreaker breaker;
369    breaker.setLocale(icu::Locale::getEnglish());
370    breaker.setText(buf, NELEM(buf));
371    EXPECT_EQ(0, breaker.current());
372    EXPECT_EQ(5, breaker.next());  // after "http:"
373    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
374    EXPECT_EQ(7, breaker.next());  // after "//"
375    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
376    EXPECT_EQ(8, breaker.next());  // after "a"
377    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
378    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
379    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
380}
381
382TEST_F(WordBreakerTest, emailStartsWithSlash) {
383    uint16_t buf[] = {'/', 'a', '@', 'b'};
384    WordBreaker breaker;
385    breaker.setLocale(icu::Locale::getEnglish());
386    breaker.setText(buf, NELEM(buf));
387    EXPECT_EQ(0, breaker.current());
388    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
389    EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
390}
391