GraphemeBreakTests.cpp revision 1934c2c3cb2c93aa12f852f95915190f8ac81fac
1d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien/* 2d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * Copyright (C) 2015 The Android Open Source Project 3d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * 4d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 5d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * you may not use this file except in compliance with the License. 6d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * You may obtain a copy of the License at 7d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * 8d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 9d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * 10d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * Unless required by applicable law or agreed to in writing, software 11d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 12d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * See the License for the specific language governing permissions and 14d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * limitations under the License. 15d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien */ 16d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 17d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <gtest/gtest.h> 18d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <UnicodeUtils.h> 19d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <minikin/GraphemeBreak.h> 20d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 21d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levienusing namespace android; 22d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 23d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levienbool IsBreak(const char* src) { 24d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien const size_t BUF_SIZE = 256; 25d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien uint16_t buf[BUF_SIZE]; 26d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t offset; 27d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t size; 28d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ParseUnicode(buf, BUF_SIZE, src, &size, &offset); 29d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien return GraphemeBreak::isGraphemeBreak(buf, 0, size, offset); 30d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 31d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 32d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph LevienTEST(GraphemeBreak, utf16) { 33d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+D83C | U+DC31")); // emoji, U+1F431 34d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 35d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // tests for invalid UTF-16 36d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+D800 | U+D800")); // two leading surrogates 37d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+DC00 | U+DC00")); // two trailing surrogates 38d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'a' | U+D800")); // lonely leading surrogate 39d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+DC00 | 'a'")); // lonely trailing surrogate 40d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+D800 | 'a'")); // leading surrogate followed by non-surrogate 41d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'a' | U+DC00")); // non-surrogate followed by trailing surrogate 42d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 43d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 44d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph LevienTEST(GraphemeBreak, rules) { 45d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB1, sot ÷; Rule GB2, ÷ eot 46d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("| 'a'")); 47d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'a' |")); 48d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 49d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB3, CR x LF 50d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+000D | U+000A")); // CR x LF 51d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 52d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB4, (Control | CR | LF) ÷ 53d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'a' | U+2028")); // Line separator 54d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'a' | U+000D")); // LF 55d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'a' | U+000A")); // CR 56d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 57d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB5, ÷ (Control | CR | LF) 58d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+2028 | 'a'")); // Line separator 59d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+000D | 'a'")); // LF 60d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+000A | 'a'")); // CR 61d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 62d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB6, L x ( L | V | LV | LVT ) 63d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+1100 | U+1100")); // L x L 64d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+1100 | U+1161")); // L x V 65d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+1100 | U+AC00")); // L x LV 66d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+1100 | U+AC01")); // L x LVT 67d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 68d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB7, ( LV | V ) x ( V | T ) 69d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+AC00 | U+1161")); // LV x V 70d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+1161 | U+1161")); // V x V 71d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+AC00 | U+11A8")); // LV x T 72d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+1161 | U+11A8")); // V x T 73d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 74d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB8, ( LVT | T ) x T 75d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+AC01 | U+11A8")); // LVT x T 76d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+11A8 | U+11A8")); // T x T 77d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 78d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Other hangul pairs not counted above _are_ breaks (GB10) 79d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+AC00 | U+1100")); // LV x L 80d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+AC01 | U+1100")); // LVT x L 81d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+11A8 | U+1100")); // T x L 82d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+11A8 | U+AC00")); // T x LV 83d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+11A8 | U+AC01")); // T x LVT 84d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 85d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB8a, Regional_Indicator x Regional_Indicator 86d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8")); 87d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 88d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB9, x Extend 89d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("'a' | U+0301")); // combining accent 90d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB9a, x SpacingMark 91d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+0915 | U+093E")); // KA, AA (spacing mark) 92d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB9b, Prepend x 93d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // see tailoring test for prepend, as current ICU doesn't have any characters in the class 94d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 95d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Rule GB10, Any ÷ Any 96d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'a' | 'b'")); 97d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'f' | 'i'")); // probable ligature 98d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+0644 | U+0627")); // probable ligature, lam + alef 99d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+4E00 | U+4E00")); // CJK ideographs 100d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("'a' | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 101d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | 'a'")); // Regional indicator pair (flag) 102d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 103d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 104d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph LevienTEST(GraphemeBreak, tailoring) { 105d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // control characters that we interpret as "extend" 106d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("'a' | U+00AD")); // soft hyphen 107d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("'a' | U+200B")); // zwsp 108d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("'a' | U+200E")); // lrm 109d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("'a' | U+202A")); // lre 110d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("'a' | U+E0041")); // tag character 111d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 112d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // UTC-approved characters for the Prepend class 113d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+06DD | U+0661")); // arabic subtending mark + digit one 114d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 115d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+0E01 | U+0E33")); // Thai sara am 116d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 117d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // virama is not a grapheme break, but "pure killer" is 118d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+0915 | U+094D U+0915")); // Devanagari ka+virama+ka 119d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915")); // Devanagari ka+virama+ka 120d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01")); // thai phinthu = pure killer 121d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer 1226638e05ac2de397455c30cae05aca399a567428dRaph Levien 1236638e05ac2de397455c30cae05aca399a567428dRaph Levien // suppress grapheme breaks in zwj emoji sequences, see 1246638e05ac2de397455c30cae05aca399a567428dRaph Levien // http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html 1256638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468")); 1266638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468")); 1276638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468")); 1286638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466")); 1296638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466")); 1306638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466")); 1316638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466")); 1326638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466")); 1336638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8")); 1346638e05ac2de397455c30cae05aca399a567428dRaph Levien 1356638e05ac2de397455c30cae05aca399a567428dRaph Levien // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break 1366638e05ac2de397455c30cae05aca399a567428dRaph Levien EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764")); 137d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 138d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 139adfa580f1f067c846509b4346e5be2cb19177c1bRaph LevienTEST(GraphemeBreak, emojiModifiers) { 140adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+261D | U+1F3FB")); // white up pointing index + modifier 141adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+270C | U+1F3FB")); // victory hand + modifier 142adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB")); // boy + modifier 143adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC")); // boy + modifier 144adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD")); // boy + modifier 145adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE")); // boy + modifier 146adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF")); // boy + modifier 147adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF")); // sign of the horns + modifier 148adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF")); // selfie (Unicode 9) + modifier 149adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien 150adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien // adding emoji style variation selector doesn't affect grapheme cluster 151adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier 152adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB")); // heart + emoji style + modifier 153adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien 154adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien // heart is not an emoji base 155adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB")); // heart + modifier 156adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB")); // heart + emoji style + modifier 157adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB")); // heart + emoji style + modifier 1581934c2c3cb2c93aa12f852f95915190f8ac81facRaph Levien EXPECT_TRUE(IsBreak("U+1F3FB | U+1F3FB")); // modifier + modifier 159adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien 160adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien // rat is not an emoji modifer 161adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien EXPECT_TRUE(IsBreak("U+1F466 | U+1F400")); // boy + rat 162adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien} 163adfa580f1f067c846509b4346e5be2cb19177c1bRaph Levien 164d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph LevienTEST(GraphemeBreak, offsets) { 165d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 }; 166d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 2)); 167d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_FALSE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 3)); 168d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 4)); 169d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 5)); 170d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 171