// GraphemeBreak.cpp revision d8dd94b81ea7efd776859fbbdf4a76458e270eab
1/* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <stdint.h> 18#include <unicode/uchar.h> 19#include <unicode/utf16.h> 20 21#include <minikin/GraphemeBreak.h> 22 23namespace android { 24 25int32_t tailoredGraphemeClusterBreak(uint32_t c) { 26 // Characters defined as Control that we want to treat them as Extend. 27 // These are curated manually. 28 if (c == 0x00AD // SHY 29 || c == 0x061C // ALM 30 || c == 0x180E // MONGOLIAN VOWEL SEPARATOR 31 || c == 0x200B // ZWSP 32 || c == 0x200E // LRM 33 || c == 0x200F // RLM 34 || (0x202A <= c && c <= 0x202E) // LRE, RLE, PDF, LRO, RLO 35 || ((c | 0xF) == 0x206F) // WJ, invisible math operators, LRI, RLI, FSI, PDI, 36 // and the deprecated invisible format controls 37 || c == 0xFEFF // BOM 38 || ((c | 0x7F) == 0xE007F)) // recently undeprecated tag characters in Plane 14 39 return U_GCB_EXTEND; 40 // UTC-approved characters for the Prepend class, per 41 // http://www.unicode.org/L2/L2015/15183r-graph-cluster-brk.txt 42 // These should be removed when our copy of ICU gets updated to Unicode 9.0 (~2016 or 2017). 
43 else if ((0x0600 <= c && c <= 0x0605) // Arabic subtending marks 44 || c == 0x06DD // ARABIC SUBTENDING MARK 45 || c == 0x070F // SYRIAC ABBREVIATION MARK 46 || c == 0x0D4E // MALAYALAM LETTER DOT REPH 47 || c == 0x110BD // KAITHI NUMBER SIGN 48 || c == 0x111C2 // SHARADA SIGN JIHVAMULIYA 49 || c == 0x111C3) // SHARADA SIGN UPADHMANIYA 50 return U_GCB_PREPEND; 51 // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they 52 // allow a grapheme break before it. 53 else if (c == 0x0E33) 54 return U_GCB_OTHER; 55 else 56 return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 57} 58 59// Returns true for all characters whose IndicSyllabicCategory is Pure_Killer. 60// From http://www.unicode.org/Public/8.0.0/ucd/IndicSyllabicCategory.txt 61bool isPureKiller(uint32_t c) { 62 return (c == 0x0E3A || c == 0x0E4E || c == 0x0F84 || c == 0x103A || c == 0x1714 || c == 0x1734 63 || c == 0x17D1 || c == 0x1BAA || c == 0x1BF2 || c == 0x1BF3 || c == 0xA806 64 || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B); 65} 66 67bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count, 68 size_t offset) { 69 // This implementation closely follows Unicode Standard Annex #29 on 70 // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/), 71 // implementing a tailored version of extended grapheme clusters. 72 // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules. 
73 74 // Rule GB1, sot ÷; Rule GB2, ÷ eot 75 if (offset <= start || offset >= start + count) { 76 return true; 77 } 78 if (U16_IS_TRAIL(buf[offset])) { 79 // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break 80 return !U16_IS_LEAD(buf[offset - 1]); 81 } 82 uint32_t c1 = 0; 83 uint32_t c2 = 0; 84 size_t offset_back = offset; 85 U16_PREV(buf, start, offset_back, c1); 86 U16_NEXT(buf, offset, start + count, c2); 87 int32_t p1 = tailoredGraphemeClusterBreak(c1); 88 int32_t p2 = tailoredGraphemeClusterBreak(c2); 89 // Rule GB3, CR x LF 90 if (p1 == U_GCB_CR && p2 == U_GCB_LF) { 91 return false; 92 } 93 // Rule GB4, (Control | CR | LF) ÷ 94 if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) { 95 return true; 96 } 97 // Rule GB5, ÷ (Control | CR | LF) 98 if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) { 99 return true; 100 } 101 // Rule GB6, L x ( L | V | LV | LVT ) 102 if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) { 103 return false; 104 } 105 // Rule GB7, ( LV | V ) x ( V | T ) 106 if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) { 107 return false; 108 } 109 // Rule GB8, ( LVT | T ) x T 110 if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) { 111 return false; 112 } 113 // Rule GB8a, Regional_Indicator x Regional_Indicator 114 // 115 // Known limitation: This is overly conservative, and returns no grapheme breaks between two 116 // flags, such as in the character sequence "U+1F1FA U+1F1F8 [potential break] U+1F1FA U+1F1F8". 117 // Also, it assumes that all combinations of Regional Indicators produce a flag, where they 118 // don't. 119 // 120 // There is no easy solution for doing this correctly, except for querying the font and doing 121 // some lookback. 
122 if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { 123 return false; 124 } 125 // Rule GB9, x Extend; Rule GB9a, x SpacingMark; Rule GB9b, Prepend x 126 if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) { 127 return false; 128 } 129 // Cluster indic syllables together (tailoring of UAX #29) 130 // Known limitation: this is overly conservative, and assumes that the virama may form a 131 // conjunct with the following letter, which doesn't always happen. 132 // 133 // There is no easy solution to do this correctly. Even querying the font does not help (with 134 // the current font technoloies), since the font may be creating the conjunct using multiple 135 // glyphs, while the user may be perceiving that sequence of glyphs as one conjunct or one 136 // letter. 137 if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama 138 && !isPureKiller(c1) 139 && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) { 140 return false; 141 } 142 // Rule GB10, Any ÷ Any 143 return true; 144} 145 146size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count, 147 size_t offset, MoveOpt opt) { 148 switch (opt) { 149 case AFTER: 150 if (offset < start + count) { 151 offset++; 152 } 153 // fall through 154 case AT_OR_AFTER: 155 while (!isGraphemeBreak(buf, start, count, offset)) { 156 offset++; 157 } 158 break; 159 case BEFORE: 160 if (offset > start) { 161 offset--; 162 } 163 // fall through 164 case AT_OR_BEFORE: 165 while (!isGraphemeBreak(buf, start, count, offset)) { 166 offset--; 167 } 168 break; 169 case AT: 170 if (!isGraphemeBreak(buf, start, count, offset)) { 171 offset = (size_t)-1; 172 } 173 break; 174 } 175 return offset; 176} 177 178} // namespace android 179