GraphemeBreak.cpp revision 6638e05ac2de397455c30cae05aca399a567428d
1/* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <stdint.h> 18#include <unicode/uchar.h> 19#include <unicode/utf16.h> 20 21#include <minikin/GraphemeBreak.h> 22 23namespace android { 24 25int32_t tailoredGraphemeClusterBreak(uint32_t c) { 26 // Characters defined as Control that we want to treat them as Extend. 27 // These are curated manually. 28 if (c == 0x00AD // SHY 29 || c == 0x061C // ALM 30 || c == 0x180E // MONGOLIAN VOWEL SEPARATOR 31 || c == 0x200B // ZWSP 32 || c == 0x200E // LRM 33 || c == 0x200F // RLM 34 || (0x202A <= c && c <= 0x202E) // LRE, RLE, PDF, LRO, RLO 35 || ((c | 0xF) == 0x206F) // WJ, invisible math operators, LRI, RLI, FSI, PDI, 36 // and the deprecated invisible format controls 37 || c == 0xFEFF // BOM 38 || ((c | 0x7F) == 0xE007F)) // recently undeprecated tag characters in Plane 14 39 return U_GCB_EXTEND; 40 // UTC-approved characters for the Prepend class, per 41 // http://www.unicode.org/L2/L2015/15183r-graph-cluster-brk.txt 42 // These should be removed when our copy of ICU gets updated to Unicode 9.0 (~2016 or 2017). 43 else if ((0x0600 <= c && c <= 0x0605) // Arabic subtending marks 44 || c == 0x06DD // ARABIC SUBTENDING MARK 45 || c == 0x070F // SYRIAC ABBREVIATION MARK 46 || c == 0x0D4E // MALAYALAM LETTER DOT REPH 47 || c == 0x110BD // KAITHI NUMBER SIGN 48 || c == 0x111C2 // SHARADA SIGN JIHVAMULIYA 49 || c == 0x111C3) // SHARADA SIGN UPADHMANIYA 50 return U_GCB_PREPEND; 51 // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they 52 // allow a grapheme break before it. 53 else if (c == 0x0E33) 54 return U_GCB_OTHER; 55 else 56 return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 57} 58 59// Returns true for all characters whose IndicSyllabicCategory is Pure_Killer. 60// From http://www.unicode.org/Public/8.0.0/ucd/IndicSyllabicCategory.txt 61bool isPureKiller(uint32_t c) { 62 return (c == 0x0E3A || c == 0x0E4E || c == 0x0F84 || c == 0x103A || c == 0x1714 || c == 0x1734 63 || c == 0x17D1 || c == 0x1BAA || c == 0x1BF2 || c == 0x1BF3 || c == 0xA806 64 || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B); 65} 66 67// Returns true if the character appears before or after zwj in a zwj emoji sequence. See 68// http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html 69bool isZwjEmoji(uint32_t c) { 70 return (c == 0x2764 // HEAVY BLACK HEART 71 || c == 0x1F468 // MAN 72 || c == 0x1F469 // WOMAN 73 || c == 0x1F48B // KISS MARK 74 || c == 0x1F466 // BOY 75 || c == 0x1F467 // GIRL 76 || c == 0x1F441 // EYE 77 || c == 0x1F5E8); // LEFT SPEECH BUBBLE 78} 79 80bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count, 81 size_t offset) { 82 // This implementation closely follows Unicode Standard Annex #29 on 83 // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/), 84 // implementing a tailored version of extended grapheme clusters. 85 // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules. 86 87 // Rule GB1, sot ÷; Rule GB2, ÷ eot 88 if (offset <= start || offset >= start + count) { 89 return true; 90 } 91 if (U16_IS_TRAIL(buf[offset])) { 92 // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break 93 return !U16_IS_LEAD(buf[offset - 1]); 94 } 95 uint32_t c1 = 0; 96 uint32_t c2 = 0; 97 size_t offset_back = offset; 98 U16_PREV(buf, start, offset_back, c1); 99 U16_NEXT(buf, offset, start + count, c2); 100 int32_t p1 = tailoredGraphemeClusterBreak(c1); 101 int32_t p2 = tailoredGraphemeClusterBreak(c2); 102 // Rule GB3, CR x LF 103 if (p1 == U_GCB_CR && p2 == U_GCB_LF) { 104 return false; 105 } 106 // Rule GB4, (Control | CR | LF) ÷ 107 if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) { 108 return true; 109 } 110 // Rule GB5, ÷ (Control | CR | LF) 111 if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) { 112 return true; 113 } 114 // Rule GB6, L x ( L | V | LV | LVT ) 115 if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) { 116 return false; 117 } 118 // Rule GB7, ( LV | V ) x ( V | T ) 119 if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) { 120 return false; 121 } 122 // Rule GB8, ( LVT | T ) x T 123 if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) { 124 return false; 125 } 126 // Rule GB8a, Regional_Indicator x Regional_Indicator 127 // 128 // Known limitation: This is overly conservative, and returns no grapheme breaks between two 129 // flags, such as in the character sequence "U+1F1FA U+1F1F8 [potential break] U+1F1FA U+1F1F8". 130 // Also, it assumes that all combinations of Regional Indicators produce a flag, where they 131 // don't. 132 // 133 // There is no easy solution for doing this correctly, except for querying the font and doing 134 // some lookback. 135 if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { 136 return false; 137 } 138 // Rule GB9, x Extend; Rule GB9a, x SpacingMark; Rule GB9b, Prepend x 139 if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) { 140 return false; 141 } 142 // Cluster indic syllables together (tailoring of UAX #29) 143 // Known limitation: this is overly conservative, and assumes that the virama may form a 144 // conjunct with the following letter, which doesn't always happen. 145 // 146 // There is no easy solution to do this correctly. Even querying the font does not help (with 147 // the current font technoloies), since the font may be creating the conjunct using multiple 148 // glyphs, while the user may be perceiving that sequence of glyphs as one conjunct or one 149 // letter. 150 if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama 151 && !isPureKiller(c1) 152 && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) { 153 return false; 154 } 155 // Tailoring: make emoji sequences with ZWJ a single grapheme cluster 156 if (c1 == 0x200D && isZwjEmoji(c2) && offset_back > start) { 157 // look at character before ZWJ to see that both can participate in an emoji zwj sequence 158 uint32_t c0 = 0; 159 U16_PREV(buf, start, offset_back, c0); 160 if (c0 == 0xFE0F && offset_back > start) { 161 // skip over emoji variation selector 162 U16_PREV(buf, start, offset_back, c0); 163 } 164 if (isZwjEmoji(c0)) { 165 return false; 166 } 167 } 168 // Rule GB10, Any ÷ Any 169 return true; 170} 171 172size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count, 173 size_t offset, MoveOpt opt) { 174 switch (opt) { 175 case AFTER: 176 if (offset < start + count) { 177 offset++; 178 } 179 // fall through 180 case AT_OR_AFTER: 181 while (!isGraphemeBreak(buf, start, count, offset)) { 182 offset++; 183 } 184 break; 185 case BEFORE: 186 if (offset > start) { 187 offset--; 188 } 189 // fall through 190 case AT_OR_BEFORE: 191 while (!isGraphemeBreak(buf, start, count, offset)) { 192 offset--; 193 } 194 break; 195 case AT: 196 if (!isGraphemeBreak(buf, start, count, offset)) { 197 offset = (size_t)-1; 198 } 199 break; 200 } 201 return offset; 202} 203 204} // namespace android 205