1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <gtest/gtest.h>
18#include <UnicodeUtils.h>
19#include <minikin/GraphemeBreak.h>
20
21namespace minikin {
22
23bool IsBreak(const char* src) {
24    const size_t BUF_SIZE = 256;
25    uint16_t buf[BUF_SIZE];
26    size_t offset;
27    size_t size;
28    ParseUnicode(buf, BUF_SIZE, src, &size, &offset);
29    return GraphemeBreak::isGraphemeBreak(nullptr, buf, 0, size, offset);
30}
31
32bool IsBreakWithAdvances(const float* advances, const char* src) {
33    const size_t BUF_SIZE = 256;
34    uint16_t buf[BUF_SIZE];
35    size_t offset;
36    size_t size;
37    ParseUnicode(buf, BUF_SIZE, src, &size, &offset);
38    return GraphemeBreak::isGraphemeBreak(advances, buf, 0, size, offset);
39}
40
41TEST(GraphemeBreak, utf16) {
42    EXPECT_FALSE(IsBreak("U+D83C | U+DC31"));  // emoji, U+1F431
43
44    // tests for invalid UTF-16
45    EXPECT_TRUE(IsBreak("U+D800 | U+D800"));  // two leading surrogates
46    EXPECT_TRUE(IsBreak("U+DC00 | U+DC00"));  // two trailing surrogates
47    EXPECT_TRUE(IsBreak("'a' | U+D800"));  // lonely leading surrogate
48    EXPECT_TRUE(IsBreak("U+DC00 | 'a'"));  // lonely trailing surrogate
49    EXPECT_TRUE(IsBreak("U+D800 | 'a'"));  // leading surrogate followed by non-surrogate
50    EXPECT_TRUE(IsBreak("'a' | U+DC00"));  // non-surrogate followed by trailing surrogate
51}
52
53TEST(GraphemeBreak, rules) {
54    // Rule GB1, sot ÷; Rule GB2, ÷ eot
55    EXPECT_TRUE(IsBreak("| 'a'"));
56    EXPECT_TRUE(IsBreak("'a' |"));
57
58    // Rule GB3, CR x LF
59    EXPECT_FALSE(IsBreak("U+000D | U+000A"));  // CR x LF
60
61    // Rule GB4, (Control | CR | LF) ÷
62    EXPECT_TRUE(IsBreak("'a' | U+2028"));  // Line separator
63    EXPECT_TRUE(IsBreak("'a' | U+000D"));  // LF
64    EXPECT_TRUE(IsBreak("'a' | U+000A"));  // CR
65
66    // Rule GB5, ÷ (Control | CR | LF)
67    EXPECT_TRUE(IsBreak("U+2028 | 'a'"));  // Line separator
68    EXPECT_TRUE(IsBreak("U+000D | 'a'"));  // LF
69    EXPECT_TRUE(IsBreak("U+000A | 'a'"));  // CR
70
71    // Rule GB6, L x ( L | V | LV | LVT )
72    EXPECT_FALSE(IsBreak("U+1100 | U+1100"));  // L x L
73    EXPECT_FALSE(IsBreak("U+1100 | U+1161"));  // L x V
74    EXPECT_FALSE(IsBreak("U+1100 | U+AC00"));  // L x LV
75    EXPECT_FALSE(IsBreak("U+1100 | U+AC01"));  // L x LVT
76
77    // Rule GB7, ( LV | V ) x ( V | T )
78    EXPECT_FALSE(IsBreak("U+AC00 | U+1161"));  // LV x V
79    EXPECT_FALSE(IsBreak("U+1161 | U+1161"));  // V x V
80    EXPECT_FALSE(IsBreak("U+AC00 | U+11A8"));  // LV x T
81    EXPECT_FALSE(IsBreak("U+1161 | U+11A8"));  // V x T
82
83    // Rule GB8, ( LVT | T ) x T
84    EXPECT_FALSE(IsBreak("U+AC01 | U+11A8"));  // LVT x T
85    EXPECT_FALSE(IsBreak("U+11A8 | U+11A8"));  // T x T
86
87    // Other hangul pairs not counted above _are_ breaks (GB10)
88    EXPECT_TRUE(IsBreak("U+AC00 | U+1100"));  // LV x L
89    EXPECT_TRUE(IsBreak("U+AC01 | U+1100"));  // LVT x L
90    EXPECT_TRUE(IsBreak("U+11A8 | U+1100"));  // T x L
91    EXPECT_TRUE(IsBreak("U+11A8 | U+AC00"));  // T x LV
92    EXPECT_TRUE(IsBreak("U+11A8 | U+AC01"));  // T x LVT
93
94    // Rule GB12 and Rule GB13, Regional_Indicator x Regional_Indicator
95    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8"));
96    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
97    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
98    EXPECT_FALSE(IsBreak("U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag)
99
100    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA"));  // Regional indicator pair (flag)
101    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)
102    // Same case as the two above, knowing that the first two characters ligate, which is what
103    // would typically happen.
104    const float firstPairLigated[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}; // Two entries per codepoint
105    EXPECT_TRUE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA"));
106    EXPECT_FALSE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA"));
107    // Repeat the tests, But now the font doesn't have a ligature for the first two characters,
108    // while it does have a ligature for the last two. This could happen for fonts that do not
109    // support some (potentially encoded later than they were developed) flags.
110    const float secondPairLigated[] = {1.0, 0.0, 1.0, 0.0, 0.0, 0.0};
111    EXPECT_FALSE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA"));
112    EXPECT_TRUE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA"));
113
114    EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA"));  // Regional indicator pair (flag)
115    EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)
116
117    EXPECT_TRUE(
118            IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
119    EXPECT_FALSE(
120            IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
121    EXPECT_FALSE(
122            IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8"));  // Regional indicator pair (flag)
123
124    // Rule GB9, x (Extend | ZWJ)
125    EXPECT_FALSE(IsBreak("'a' | U+0301"));  // combining accent
126    EXPECT_FALSE(IsBreak("'a' | U+200D"));  // ZWJ
127    // Rule GB9a, x SpacingMark
128    EXPECT_FALSE(IsBreak("U+0915 | U+093E"));  // KA, AA (spacing mark)
129    // Rule GB9b, Prepend x
130    // see tailoring test for prepend, as current ICU doesn't have any characters in the class
131
132    // Rule GB999, Any ÷ Any
133    EXPECT_TRUE(IsBreak("'a' | 'b'"));
134    EXPECT_TRUE(IsBreak("'f' | 'i'"));  // probable ligature
135    EXPECT_TRUE(IsBreak("U+0644 | U+0627"));  // probable ligature, lam + alef
136    EXPECT_TRUE(IsBreak("U+4E00 | U+4E00"));  // CJK ideographs
137    EXPECT_TRUE(IsBreak("'a' | U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
138    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | 'a'"));  // Regional indicator pair (flag)
139
140    // Extended rule for emoji tag sequence.
141    EXPECT_TRUE(IsBreak("'a' | U+1F3F4 'a'"));
142    EXPECT_TRUE(IsBreak("'a' U+1F3F4 | 'a'"));
143
144    // Immediate tag_term after tag_base.
145    EXPECT_TRUE(IsBreak("'a' | U+1F3F4 U+E007F 'a'"));
146    EXPECT_FALSE(IsBreak("U+1F3F4 | U+E007F"));
147    EXPECT_TRUE(IsBreak("'a' U+1F3F4 U+E007F | 'a'"));
148
149    // Flag sequence
150    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag
151    // of Scotland.
152    // U+1F3F4 is WAVING BLACK FLAG. This can be a tag_base character.
153    // U+E0067 is TAG LATIN SMALL LETTER G. This can be a part of tag_spec.
154    // U+E0062 is TAG LATIN SMALL LETTER B. This can be a part of tag_spec.
155    // U+E0073 is TAG LATIN SMALL LETTER S. This can be a part of tag_spec.
156    // U+E0063 is TAG LATIN SMALL LETTER C. This can be a part of tag_spec.
157    // U+E0074 is TAG LATIN SMALL LETTER T. This can be a part of tag_spec.
158    // U+E007F is CANCEL TAG. This is a tag_term character.
159    EXPECT_TRUE(IsBreak("'a' | U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F"));
160    EXPECT_FALSE(IsBreak("U+1F3F4 | U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F"));
161    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 | U+E0062 U+E0073 U+E0063 U+E0074 U+E007F"));
162    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 | U+E0073 U+E0063 U+E0074 U+E007F"));
163    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 | U+E0063 U+E0074 U+E007F"));
164    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 | U+E0074 U+E007F"));
165    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 | U+E007F"));
166    EXPECT_TRUE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F | 'a'"));
167}
168
169TEST(GraphemeBreak, tailoring) {
170    // control characters that we interpret as "extend"
171    EXPECT_FALSE(IsBreak("'a' | U+00AD"));  // soft hyphen
172    EXPECT_FALSE(IsBreak("'a' | U+200B"));  // zwsp
173    EXPECT_FALSE(IsBreak("'a' | U+200E"));  // lrm
174    EXPECT_FALSE(IsBreak("'a' | U+202A"));  // lre
175    EXPECT_FALSE(IsBreak("'a' | U+E0041"));  // tag character
176
177    // UTC-approved characters for the Prepend class
178    EXPECT_FALSE(IsBreak("U+06DD | U+0661"));  // arabic subtending mark + digit one
179
180    EXPECT_TRUE(IsBreak("U+0E01 | U+0E33"));  // Thai sara am
181
182    // virama is not a grapheme break, but "pure killer" is
183    EXPECT_FALSE(IsBreak("U+0915 | U+094D U+0915"));  // Devanagari ka+virama+ka
184    EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915"));  // Devanagari ka+virama+ka
185    EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01"));  // thai phinthu = pure killer
186    EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01"));  // thai phinthu = pure killer
187
188    // Repetition of above tests, but with a given advances array that implies everything
189    // became just one cluster.
190    const float conjoined[] = {1.0, 0.0, 0.0};
191    EXPECT_FALSE(IsBreakWithAdvances(conjoined,
192            "U+0915 | U+094D U+0915"));  // Devanagari ka+virama+ka
193    EXPECT_FALSE(IsBreakWithAdvances(conjoined,
194            "U+0915 U+094D | U+0915"));  // Devanagari ka+virama+ka
195    EXPECT_FALSE(IsBreakWithAdvances(conjoined,
196            "U+0E01 | U+0E3A U+0E01"));  // thai phinthu = pure killer
197    EXPECT_TRUE(IsBreakWithAdvances(conjoined,
198            "U+0E01 U+0E3A | U+0E01"));  // thai phinthu = pure killer
199
200    // Repetition of above tests, but with a given advances array that the virama did not
201    // form a cluster with the following consonant. The difference is that there is now
202    // a grapheme break after the virama in ka+virama+ka.
203    const float separate[] = {1.0, 0.0, 1.0};
204    EXPECT_FALSE(IsBreakWithAdvances(separate,
205            "U+0915 | U+094D U+0915"));  // Devanagari ka+virama+ka
206    EXPECT_TRUE(IsBreakWithAdvances(separate,
207            "U+0915 U+094D | U+0915"));  // Devanagari ka+virama+ka
208    EXPECT_FALSE(IsBreakWithAdvances(separate,
209            "U+0E01 | U+0E3A U+0E01"));  // thai phinthu = pure killer
210    EXPECT_TRUE(IsBreakWithAdvances(separate,
211            "U+0E01 U+0E3A | U+0E01"));  // thai phinthu = pure killer
212
213    // suppress grapheme breaks in zwj emoji sequences
214    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468"));
215    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468"));
216    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468"));
217    EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466"));
218    EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466"));
219    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466"));
220    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466"));
221    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466"));
222    EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8"));
223
224    // Do not break before and after zwj with all kind of emoji characters.
225    EXPECT_FALSE(IsBreak("U+1F431 | U+200D U+1F464"));
226    EXPECT_FALSE(IsBreak("U+1F431 U+200D | U+1F464"));
227
228    // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break
229    EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
230}
231
232TEST(GraphemeBreak, emojiModifiers) {
233    EXPECT_FALSE(IsBreak("U+261D | U+1F3FB"));  // white up pointing index + modifier
234    EXPECT_FALSE(IsBreak("U+270C | U+1F3FB"));  // victory hand + modifier
235    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB"));  // boy + modifier
236    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC"));  // boy + modifier
237    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD"));  // boy + modifier
238    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE"));  // boy + modifier
239    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF"));  // boy + modifier
240    EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF"));  // sign of the horns + modifier
241    EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF"));  // selfie (Unicode 9) + modifier
242    // Reptition of the tests above, with the knowledge that they are ligated.
243    const float ligated1_2[] = {1.0, 0.0, 0.0};
244    const float ligated2_2[] = {1.0, 0.0, 0.0, 0.0};
245    EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+261D | U+1F3FB"));
246    EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+270C | U+1F3FB"));
247    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FB"));
248    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FC"));
249    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FD"));
250    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FE"));
251    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FF"));
252    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F918 | U+1F3FF"));
253    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F933 | U+1F3FF"));
254    // Reptition of the tests above, with the knowledge that they are not ligated.
255    const float unligated1_2[] = {1.0, 1.0, 0.0};
256    const float unligated2_2[] = {1.0, 0.0, 1.0, 0.0};
257    EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+261D | U+1F3FB"));
258    EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+270C | U+1F3FB"));
259    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FB"));
260    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FC"));
261    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FD"));
262    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FE"));
263    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FF"));
264    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F918 | U+1F3FF"));
265    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F933 | U+1F3FF"));
266
267    // adding extend characters between emoji base and modifier doesn't affect grapheme cluster
268    EXPECT_FALSE(IsBreak("U+270C U+FE0E | U+1F3FB"));  // victory hand + text style + modifier
269    EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB"));  // heart + emoji style + modifier
270    // Reptition of the two tests above, with the knowledge that they are ligated.
271    const float ligated1_1_2[] = {1.0, 0.0, 0.0, 0.0};
272    EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0E | U+1F3FB"));
273    EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0F | U+1F3FB"));
274    // Reptition of the first two tests, with the knowledge that they are not ligated.
275    const float unligated1_1_2[] = {1.0, 0.0, 1.0, 0.0};
276    EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0E | U+1F3FB"));
277    EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0F | U+1F3FB"));
278
279    // heart is not an emoji base
280    EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB"));  // heart + modifier
281    EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB"));  // heart + emoji style + modifier
282    EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB"));  // heart + emoji style + modifier
283    EXPECT_TRUE(IsBreak("U+1F3FB | U+1F3FB"));  // modifier + modifier
284
285    // rat is not an emoji modifer
286    EXPECT_TRUE(IsBreak("U+1F466 | U+1F400"));  // boy + rat
287}
288
289TEST(GraphemeBreak, genderBalancedEmoji) {
290    // U+1F469 is WOMAN, U+200D is ZWJ, U+1F4BC is BRIEFCASE.
291    EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+1F4BC"));
292    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F4BC"));
293    // The above two cases, when the ligature is not supported in the font. We now expect a break
294    // between them.
295    const float unligated2_1_2[] = {1.0, 0.0, 0.0, 1.0, 0.0};
296    EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 | U+200D U+1F4BC"));
297    EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 U+200D | U+1F4BC"));
298
299    // U+2695 has now emoji property, so should be part of ZWJ sequence.
300    EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+2695"));
301    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2695"));
302    // The above two cases, when the ligature is not supported in the font. We now expect a break
303    // between them.
304    const float unligated2_1_1[] = {1.0, 0.0, 0.0, 1.0};
305    EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 | U+200D U+2695"));
306    EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 U+200D | U+2695"));
307}
308
309TEST(GraphemeBreak, offsets) {
310    uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 };
311    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 2));
312    EXPECT_FALSE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 3));
313    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 4));
314    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 5));
315}
316
317}  // namespace minikin
318