1/* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <stdint.h> 18#include <algorithm> 19#include <unicode/uchar.h> 20#include <unicode/utf16.h> 21 22#include <minikin/GraphemeBreak.h> 23#include <minikin/Emoji.h> 24#include "MinikinInternal.h" 25 26namespace minikin { 27 28int32_t tailoredGraphemeClusterBreak(uint32_t c) { 29 // Characters defined as Control that we want to treat them as Extend. 30 // These are curated manually. 31 if (c == 0x00AD // SHY 32 || c == 0x061C // ALM 33 || c == 0x180E // MONGOLIAN VOWEL SEPARATOR 34 || c == 0x200B // ZWSP 35 || c == 0x200E // LRM 36 || c == 0x200F // RLM 37 || (0x202A <= c && c <= 0x202E) // LRE, RLE, PDF, LRO, RLO 38 || ((c | 0xF) == 0x206F) // WJ, invisible math operators, LRI, RLI, FSI, PDI, 39 // and the deprecated invisible format controls 40 || c == 0xFEFF // BOM 41 || ((c | 0x7F) == 0xE007F)) // recently undeprecated tag characters in Plane 14 42 return U_GCB_EXTEND; 43 // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they 44 // allow a grapheme break before it. 45 else if (c == 0x0E33) 46 return U_GCB_OTHER; 47 else 48 return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 49} 50 51// Returns true for all characters whose IndicSyllabicCategory is Pure_Killer. 52// From http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory.txt 53bool isPureKiller(uint32_t c) { 54 return (c == 0x0E3A || c == 0x0E4E || c == 0x0F84 || c == 0x103A || c == 0x1714 || c == 0x1734 55 || c == 0x17D1 || c == 0x1BAA || c == 0x1BF2 || c == 0x1BF3 || c == 0xA806 56 || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B); 57} 58 59bool GraphemeBreak::isGraphemeBreak(const float* advances, const uint16_t* buf, size_t start, 60 size_t count, const size_t offset) { 61 // This implementation closely follows Unicode Standard Annex #29 on 62 // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/), 63 // implementing a tailored version of extended grapheme clusters. 64 // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules. 65 66 // Rule GB1, sot ÷; Rule GB2, ÷ eot 67 if (offset <= start || offset >= start + count) { 68 return true; 69 } 70 if (U16_IS_TRAIL(buf[offset])) { 71 // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break 72 return !U16_IS_LEAD(buf[offset - 1]); 73 } 74 uint32_t c1 = 0; 75 uint32_t c2 = 0; 76 size_t offset_back = offset; 77 size_t offset_forward = offset; 78 U16_PREV(buf, start, offset_back, c1); 79 U16_NEXT(buf, offset_forward, start + count, c2); 80 int32_t p1 = tailoredGraphemeClusterBreak(c1); 81 int32_t p2 = tailoredGraphemeClusterBreak(c2); 82 // Rule GB3, CR x LF 83 if (p1 == U_GCB_CR && p2 == U_GCB_LF) { 84 return false; 85 } 86 // Rule GB4, (Control | CR | LF) ÷ 87 if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) { 88 return true; 89 } 90 // Rule GB5, ÷ (Control | CR | LF) 91 if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) { 92 return true; 93 } 94 // Rule GB6, L x ( L | V | LV | LVT ) 95 if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) { 96 return false; 97 } 98 // Rule GB7, ( LV | V ) x ( V | T ) 99 if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) { 100 return false; 101 } 102 // Rule GB8, ( LVT | T ) x T 103 if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) { 104 return false; 105 } 106 // Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x 107 if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) { 108 return false; 109 } 110 111 // This is used to decide font-dependent grapheme clusters. If we don't have the advance 112 // information, we become conservative in grapheme breaking and assume that it has no advance. 113 const bool c2_has_advance = (advances != nullptr && advances[offset - start] != 0.0); 114 115 // All the following rules are font-dependent, in the way that if we know c2 has an advance, 116 // we definitely know that it cannot form a grapheme with the character(s) before it. So we 117 // make the decision in favor a grapheme break early. 118 if (c2_has_advance) { 119 return true; 120 } 121 122 // Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking properties for 123 // determining emoji-ness and carry our own data, because our data could be more fresh than what 124 // ICU provides. 125 // 126 // Tailored version of Rule GB10, (E_Base | EBG) Extend* × E_Modifier. 127 // The rule itself says do not break between emoji base and emoji modifiers, skipping all Extend 128 // characters. Variation selectors are considered Extend, so they are handled fine. 129 // 130 // We tailor this by requiring that an actual ligature is formed. If the font doesn't form a 131 // ligature, we allow a break before the modifier. 132 if (isEmojiModifier(c2)) { 133 uint32_t c0 = c1; 134 size_t offset_backback = offset_back; 135 int32_t p0 = p1; 136 if (p0 == U_GCB_EXTEND && offset_backback > start) { 137 // skip over emoji variation selector 138 U16_PREV(buf, start, offset_backback, c0); 139 p0 = tailoredGraphemeClusterBreak(c0); 140 } 141 if (isEmojiBase(c0)) { 142 return false; 143 } 144 } 145 // Tailored version of Rule GB11, ZWJ × (Glue_After_Zwj | EBG) 146 // We try to make emoji sequences with ZWJ a single grapheme cluster, but only if they actually 147 // merge to one cluster. So we are more relaxed than the UAX #29 rules in accepting any emoji 148 // character after the ZWJ, but are tighter in that we only treat it as one cluster if a 149 // ligature is actually formed and we also require the character before the ZWJ to also be an 150 // emoji. 151 if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) { 152 // look at character before ZWJ to see that both can participate in an emoji zwj sequence 153 uint32_t c0 = 0; 154 size_t offset_backback = offset_back; 155 U16_PREV(buf, start, offset_backback, c0); 156 if (c0 == 0xFE0F && offset_backback > start) { 157 // skip over emoji variation selector 158 U16_PREV(buf, start, offset_backback, c0); 159 } 160 if (isEmoji(c0)) { 161 return false; 162 } 163 } 164 // Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases. 165 // sot (RI RI)* RI x RI 166 // [^RI] (RI RI)* RI x RI 167 // 168 // If we have font information, we have already broken the cluster if and only if the second 169 // character had no advance, which means a ligature was formed. If we don't, we look back like 170 // UAX #29 recommends, but only up to 1000 code units. 171 if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { 172 if (advances != nullptr) { 173 // We have advances information. But if we are here, we already know c2 has no advance. 174 // So we should definitely disallow a break. 175 return false; 176 } else { 177 // Look at up to 1000 code units. 178 const size_t lookback_barrier = std::max((ssize_t)start, (ssize_t)offset_back - 1000); 179 size_t offset_backback = offset_back; 180 while (offset_backback > lookback_barrier) { 181 uint32_t c0 = 0; 182 U16_PREV(buf, lookback_barrier, offset_backback, c0); 183 if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) { 184 offset_backback += U16_LENGTH(c0); 185 break; 186 } 187 } 188 // The number 4 comes from the number of code units in a whole flag. 189 return (offset - offset_backback) % 4 == 0; 190 } 191 } 192 // Cluster Indic syllables together (tailoring of UAX #29). 193 // Immediately after each virama (that is not just a pure killer) followed by a letter, we 194 // disallow grapheme breaks (if we are here, we don't know about advances, or we already know 195 // that c2 has no advance). 196 if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama 197 && !isPureKiller(c1) 198 && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) { 199 return false; 200 } 201 // Rule GB999, Any ÷ Any 202 return true; 203} 204 205size_t GraphemeBreak::getTextRunCursor(const float* advances, const uint16_t* buf, size_t start, 206 size_t count, size_t offset, MoveOpt opt) { 207 switch (opt) { 208 case AFTER: 209 if (offset < start + count) { 210 offset++; 211 } 212 // fall through 213 case AT_OR_AFTER: 214 while (!isGraphemeBreak(advances, buf, start, count, offset)) { 215 offset++; 216 } 217 break; 218 case BEFORE: 219 if (offset > start) { 220 offset--; 221 } 222 // fall through 223 case AT_OR_BEFORE: 224 while (!isGraphemeBreak(advances, buf, start, count, offset)) { 225 offset--; 226 } 227 break; 228 case AT: 229 if (!isGraphemeBreak(advances, buf, start, count, offset)) { 230 offset = (size_t)-1; 231 } 232 break; 233 } 234 return offset; 235} 236 237} // namespace minikin 238