1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <gtest/gtest.h> 18#include <UnicodeUtils.h> 19#include <minikin/GraphemeBreak.h> 20 21namespace minikin { 22 23bool IsBreak(const char* src) { 24 const size_t BUF_SIZE = 256; 25 uint16_t buf[BUF_SIZE]; 26 size_t offset; 27 size_t size; 28 ParseUnicode(buf, BUF_SIZE, src, &size, &offset); 29 return GraphemeBreak::isGraphemeBreak(nullptr, buf, 0, size, offset); 30} 31 32bool IsBreakWithAdvances(const float* advances, const char* src) { 33 const size_t BUF_SIZE = 256; 34 uint16_t buf[BUF_SIZE]; 35 size_t offset; 36 size_t size; 37 ParseUnicode(buf, BUF_SIZE, src, &size, &offset); 38 return GraphemeBreak::isGraphemeBreak(advances, buf, 0, size, offset); 39} 40 41TEST(GraphemeBreak, utf16) { 42 EXPECT_FALSE(IsBreak("U+D83C | U+DC31")); // emoji, U+1F431 43 44 // tests for invalid UTF-16 45 EXPECT_TRUE(IsBreak("U+D800 | U+D800")); // two leading surrogates 46 EXPECT_TRUE(IsBreak("U+DC00 | U+DC00")); // two trailing surrogates 47 EXPECT_TRUE(IsBreak("'a' | U+D800")); // lonely leading surrogate 48 EXPECT_TRUE(IsBreak("U+DC00 | 'a'")); // lonely trailing surrogate 49 EXPECT_TRUE(IsBreak("U+D800 | 'a'")); // leading surrogate followed by non-surrogate 50 EXPECT_TRUE(IsBreak("'a' | U+DC00")); // non-surrogate followed by trailing surrogate 51} 52 53TEST(GraphemeBreak, rules) { 54 // Rule GB1, sot ÷; Rule GB2, ÷ eot 55 EXPECT_TRUE(IsBreak("| 'a'")); 56 EXPECT_TRUE(IsBreak("'a' |")); 57 58 // Rule GB3, CR x LF 59 EXPECT_FALSE(IsBreak("U+000D | U+000A")); // CR x LF 60 61 // Rule GB4, (Control | CR | LF) ÷ 62 EXPECT_TRUE(IsBreak("'a' | U+2028")); // Line separator 63 EXPECT_TRUE(IsBreak("'a' | U+000D")); // LF 64 EXPECT_TRUE(IsBreak("'a' | U+000A")); // CR 65 66 // Rule GB5, ÷ (Control | CR | LF) 67 EXPECT_TRUE(IsBreak("U+2028 | 'a'")); // Line separator 68 EXPECT_TRUE(IsBreak("U+000D | 'a'")); // LF 69 EXPECT_TRUE(IsBreak("U+000A | 'a'")); // CR 70 71 // Rule GB6, L x ( L | V | LV | LVT ) 72 EXPECT_FALSE(IsBreak("U+1100 | U+1100")); // L x L 73 EXPECT_FALSE(IsBreak("U+1100 | U+1161")); // L x V 74 EXPECT_FALSE(IsBreak("U+1100 | U+AC00")); // L x LV 75 EXPECT_FALSE(IsBreak("U+1100 | U+AC01")); // L x LVT 76 77 // Rule GB7, ( LV | V ) x ( V | T ) 78 EXPECT_FALSE(IsBreak("U+AC00 | U+1161")); // LV x V 79 EXPECT_FALSE(IsBreak("U+1161 | U+1161")); // V x V 80 EXPECT_FALSE(IsBreak("U+AC00 | U+11A8")); // LV x T 81 EXPECT_FALSE(IsBreak("U+1161 | U+11A8")); // V x T 82 83 // Rule GB8, ( LVT | T ) x T 84 EXPECT_FALSE(IsBreak("U+AC01 | U+11A8")); // LVT x T 85 EXPECT_FALSE(IsBreak("U+11A8 | U+11A8")); // T x T 86 87 // Other hangul pairs not counted above _are_ breaks (GB10) 88 EXPECT_TRUE(IsBreak("U+AC00 | U+1100")); // LV x L 89 EXPECT_TRUE(IsBreak("U+AC01 | U+1100")); // LVT x L 90 EXPECT_TRUE(IsBreak("U+11A8 | U+1100")); // T x L 91 EXPECT_TRUE(IsBreak("U+11A8 | U+AC00")); // T x LV 92 EXPECT_TRUE(IsBreak("U+11A8 | U+AC01")); // T x LVT 93 94 // Rule GB12 and Rule GB13, Regional_Indicator x Regional_Indicator 95 EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8")); 96 EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 97 EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 98 EXPECT_FALSE(IsBreak("U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag) 99 100 EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag) 101 EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag) 102 // Same case as the two above, knowing that the first two characters ligate, which is what 103 // would typically happen. 104 const float firstPairLigated[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}; // Two entries per codepoint 105 EXPECT_TRUE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA")); 106 EXPECT_FALSE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA")); 107 // Repeat the tests, But now the font doesn't have a ligature for the first two characters, 108 // while it does have a ligature for the last two. This could happen for fonts that do not 109 // support some (potentially encoded later than they were developed) flags. 110 const float secondPairLigated[] = {1.0, 0.0, 1.0, 0.0, 0.0, 0.0}; 111 EXPECT_FALSE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA")); 112 EXPECT_TRUE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA")); 113 114 EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag) 115 EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag) 116 117 EXPECT_TRUE( 118 IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 119 EXPECT_FALSE( 120 IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 121 EXPECT_FALSE( 122 IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag) 123 124 // Rule GB9, x (Extend | ZWJ) 125 EXPECT_FALSE(IsBreak("'a' | U+0301")); // combining accent 126 EXPECT_FALSE(IsBreak("'a' | U+200D")); // ZWJ 127 // Rule GB9a, x SpacingMark 128 EXPECT_FALSE(IsBreak("U+0915 | U+093E")); // KA, AA (spacing mark) 129 // Rule GB9b, Prepend x 130 // see tailoring test for prepend, as current ICU doesn't have any characters in the class 131 132 // Rule GB999, Any ÷ Any 133 EXPECT_TRUE(IsBreak("'a' | 'b'")); 134 EXPECT_TRUE(IsBreak("'f' | 'i'")); // probable ligature 135 EXPECT_TRUE(IsBreak("U+0644 | U+0627")); // probable ligature, lam + alef 136 EXPECT_TRUE(IsBreak("U+4E00 | U+4E00")); // CJK ideographs 137 EXPECT_TRUE(IsBreak("'a' | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 138 EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | 'a'")); // Regional indicator pair (flag) 139 140 // Extended rule for emoji tag sequence. 141 EXPECT_TRUE(IsBreak("'a' | U+1F3F4 'a'")); 142 EXPECT_TRUE(IsBreak("'a' U+1F3F4 | 'a'")); 143 144 // Immediate tag_term after tag_base. 145 EXPECT_TRUE(IsBreak("'a' | U+1F3F4 U+E007F 'a'")); 146 EXPECT_FALSE(IsBreak("U+1F3F4 | U+E007F")); 147 EXPECT_TRUE(IsBreak("'a' U+1F3F4 U+E007F | 'a'")); 148 149 // Flag sequence 150 // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag 151 // of Scotland. 152 // U+1F3F4 is WAVING BLACK FLAG. This can be a tag_base character. 153 // U+E0067 is TAG LATIN SMALL LETTER G. This can be a part of tag_spec. 154 // U+E0062 is TAG LATIN SMALL LETTER B. This can be a part of tag_spec. 155 // U+E0073 is TAG LATIN SMALL LETTER S. This can be a part of tag_spec. 156 // U+E0063 is TAG LATIN SMALL LETTER C. This can be a part of tag_spec. 157 // U+E0074 is TAG LATIN SMALL LETTER T. This can be a part of tag_spec. 158 // U+E007F is CANCEL TAG. This is a tag_term character. 159 EXPECT_TRUE(IsBreak("'a' | U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F")); 160 EXPECT_FALSE(IsBreak("U+1F3F4 | U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F")); 161 EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 | U+E0062 U+E0073 U+E0063 U+E0074 U+E007F")); 162 EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 | U+E0073 U+E0063 U+E0074 U+E007F")); 163 EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 | U+E0063 U+E0074 U+E007F")); 164 EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 | U+E0074 U+E007F")); 165 EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 | U+E007F")); 166 EXPECT_TRUE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F | 'a'")); 167} 168 169TEST(GraphemeBreak, tailoring) { 170 // control characters that we interpret as "extend" 171 EXPECT_FALSE(IsBreak("'a' | U+00AD")); // soft hyphen 172 EXPECT_FALSE(IsBreak("'a' | U+200B")); // zwsp 173 EXPECT_FALSE(IsBreak("'a' | U+200E")); // lrm 174 EXPECT_FALSE(IsBreak("'a' | U+202A")); // lre 175 EXPECT_FALSE(IsBreak("'a' | U+E0041")); // tag character 176 177 // UTC-approved characters for the Prepend class 178 EXPECT_FALSE(IsBreak("U+06DD | U+0661")); // arabic subtending mark + digit one 179 180 EXPECT_TRUE(IsBreak("U+0E01 | U+0E33")); // Thai sara am 181 182 // virama is not a grapheme break, but "pure killer" is 183 EXPECT_FALSE(IsBreak("U+0915 | U+094D U+0915")); // Devanagari ka+virama+ka 184 EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915")); // Devanagari ka+virama+ka 185 EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01")); // thai phinthu = pure killer 186 EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer 187 188 // Repetition of above tests, but with a given advances array that implies everything 189 // became just one cluster. 190 const float conjoined[] = {1.0, 0.0, 0.0}; 191 EXPECT_FALSE(IsBreakWithAdvances(conjoined, 192 "U+0915 | U+094D U+0915")); // Devanagari ka+virama+ka 193 EXPECT_FALSE(IsBreakWithAdvances(conjoined, 194 "U+0915 U+094D | U+0915")); // Devanagari ka+virama+ka 195 EXPECT_FALSE(IsBreakWithAdvances(conjoined, 196 "U+0E01 | U+0E3A U+0E01")); // thai phinthu = pure killer 197 EXPECT_TRUE(IsBreakWithAdvances(conjoined, 198 "U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer 199 200 // Repetition of above tests, but with a given advances array that the virama did not 201 // form a cluster with the following consonant. The difference is that there is now 202 // a grapheme break after the virama in ka+virama+ka. 203 const float separate[] = {1.0, 0.0, 1.0}; 204 EXPECT_FALSE(IsBreakWithAdvances(separate, 205 "U+0915 | U+094D U+0915")); // Devanagari ka+virama+ka 206 EXPECT_TRUE(IsBreakWithAdvances(separate, 207 "U+0915 U+094D | U+0915")); // Devanagari ka+virama+ka 208 EXPECT_FALSE(IsBreakWithAdvances(separate, 209 "U+0E01 | U+0E3A U+0E01")); // thai phinthu = pure killer 210 EXPECT_TRUE(IsBreakWithAdvances(separate, 211 "U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer 212 213 // suppress grapheme breaks in zwj emoji sequences 214 EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468")); 215 EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468")); 216 EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468")); 217 EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466")); 218 EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466")); 219 EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466")); 220 EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466")); 221 EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466")); 222 EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8")); 223 224 // Do not break before and after zwj with all kind of emoji characters. 225 EXPECT_FALSE(IsBreak("U+1F431 | U+200D U+1F464")); 226 EXPECT_FALSE(IsBreak("U+1F431 U+200D | U+1F464")); 227 228 // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break 229 EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764")); 230} 231 232TEST(GraphemeBreak, emojiModifiers) { 233 EXPECT_FALSE(IsBreak("U+261D | U+1F3FB")); // white up pointing index + modifier 234 EXPECT_FALSE(IsBreak("U+270C | U+1F3FB")); // victory hand + modifier 235 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB")); // boy + modifier 236 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC")); // boy + modifier 237 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD")); // boy + modifier 238 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE")); // boy + modifier 239 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF")); // boy + modifier 240 EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF")); // sign of the horns + modifier 241 EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF")); // selfie (Unicode 9) + modifier 242 // Reptition of the tests above, with the knowledge that they are ligated. 243 const float ligated1_2[] = {1.0, 0.0, 0.0}; 244 const float ligated2_2[] = {1.0, 0.0, 0.0, 0.0}; 245 EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+261D | U+1F3FB")); 246 EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+270C | U+1F3FB")); 247 EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FB")); 248 EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FC")); 249 EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FD")); 250 EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FE")); 251 EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FF")); 252 EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F918 | U+1F3FF")); 253 EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F933 | U+1F3FF")); 254 // Reptition of the tests above, with the knowledge that they are not ligated. 255 const float unligated1_2[] = {1.0, 1.0, 0.0}; 256 const float unligated2_2[] = {1.0, 0.0, 1.0, 0.0}; 257 EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+261D | U+1F3FB")); 258 EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+270C | U+1F3FB")); 259 EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FB")); 260 EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FC")); 261 EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FD")); 262 EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FE")); 263 EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FF")); 264 EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F918 | U+1F3FF")); 265 EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F933 | U+1F3FF")); 266 267 // adding extend characters between emoji base and modifier doesn't affect grapheme cluster 268 EXPECT_FALSE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier 269 EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB")); // heart + emoji style + modifier 270 // Reptition of the two tests above, with the knowledge that they are ligated. 271 const float ligated1_1_2[] = {1.0, 0.0, 0.0, 0.0}; 272 EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0E | U+1F3FB")); 273 EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0F | U+1F3FB")); 274 // Reptition of the first two tests, with the knowledge that they are not ligated. 275 const float unligated1_1_2[] = {1.0, 0.0, 1.0, 0.0}; 276 EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0E | U+1F3FB")); 277 EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0F | U+1F3FB")); 278 279 // heart is not an emoji base 280 EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB")); // heart + modifier 281 EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB")); // heart + emoji style + modifier 282 EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB")); // heart + emoji style + modifier 283 EXPECT_TRUE(IsBreak("U+1F3FB | U+1F3FB")); // modifier + modifier 284 285 // rat is not an emoji modifer 286 EXPECT_TRUE(IsBreak("U+1F466 | U+1F400")); // boy + rat 287} 288 289TEST(GraphemeBreak, genderBalancedEmoji) { 290 // U+1F469 is WOMAN, U+200D is ZWJ, U+1F4BC is BRIEFCASE. 291 EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+1F4BC")); 292 EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F4BC")); 293 // The above two cases, when the ligature is not supported in the font. We now expect a break 294 // between them. 295 const float unligated2_1_2[] = {1.0, 0.0, 0.0, 1.0, 0.0}; 296 EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 | U+200D U+1F4BC")); 297 EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 U+200D | U+1F4BC")); 298 299 // U+2695 has now emoji property, so should be part of ZWJ sequence. 300 EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+2695")); 301 EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2695")); 302 // The above two cases, when the ligature is not supported in the font. We now expect a break 303 // between them. 304 const float unligated2_1_1[] = {1.0, 0.0, 0.0, 1.0}; 305 EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 | U+200D U+2695")); 306 EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 U+200D | U+2695")); 307} 308 309TEST(GraphemeBreak, offsets) { 310 uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 }; 311 EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 2)); 312 EXPECT_FALSE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 3)); 313 EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 4)); 314 EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 5)); 315} 316 317} // namespace minikin 318