1/* 2 * Copyright (C) 2014 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32#include "platform/fonts/Character.h" 33 34#include "platform/fonts/FontPlatformFeatures.h" 35#include "wtf/StdLibExtras.h" 36#include "wtf/text/StringBuilder.h" 37 38using namespace WTF; 39using namespace Unicode; 40 41namespace blink { 42 43static const UChar32 cjkIsolatedSymbolsArray[] = { 44 // 0x2C7 Caron, Mandarin Chinese 3rd Tone 45 0x2C7, 46 // 0x2CA Modifier Letter Acute Accent, Mandarin Chinese 2nd Tone 47 0x2CA, 48 // 0x2CB Modifier Letter Grave Access, Mandarin Chinese 4th Tone 49 0x2CB, 50 // 0x2D9 Dot Above, Mandarin Chinese 5th Tone 51 0x2D9, 52 0x2020, 0x2021, 0x2030, 0x203B, 0x203C, 0x2042, 0x2047, 0x2048, 0x2049, 0x2051, 53 0x20DD, 0x20DE, 0x2100, 0x2103, 0x2105, 0x2109, 0x210A, 0x2113, 0x2116, 0x2121, 54 0x212B, 0x213B, 0x2150, 0x2151, 0x2152, 0x217F, 0x2189, 0x2307, 0x2312, 0x23CE, 55 0x2423, 0x25A0, 0x25A1, 0x25A2, 0x25AA, 0x25AB, 0x25B1, 0x25B2, 0x25B3, 0x25B6, 56 0x25B7, 0x25BC, 0x25BD, 0x25C0, 0x25C1, 0x25C6, 0x25C7, 0x25C9, 0x25CB, 0x25CC, 57 0x25EF, 0x2605, 0x2606, 0x260E, 0x2616, 0x2617, 0x2640, 0x2642, 0x26A0, 0x26BD, 58 0x26BE, 0x2713, 0x271A, 0x273F, 0x2740, 0x2756, 0x2B1A, 0xFE10, 0xFE11, 0xFE12, 59 0xFE19, 0xFF1D, 60 // Emoji. 61 0x1F100 62}; 63 64// Takes a flattened list of closed intervals 65template <class T, size_t size> 66bool valueInIntervalList(const T (&intervalList)[size], const T& value) 67{ 68 const T* bound = std::upper_bound(&intervalList[0], &intervalList[size], value); 69 if ((bound - intervalList) % 2 == 1) 70 return true; 71 return bound > intervalList && *(bound - 1) == value; 72} 73 74CodePath Character::characterRangeCodePath(const UChar* characters, unsigned len) 75{ 76 static const UChar complexCodePathRanges[] = { 77 // U+02E5 through U+02E9 (Modifier Letters : Tone letters) 78 0x2E5, 0x2E9, 79 // U+0300 through U+036F Combining diacritical marks 80 0x300, 0x36F, 81 // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, ... 82 0x0591, 0x05BD, 83 // ... Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha 84 0x05BF, 0x05CF, 85 // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, 86 // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, 87 // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar 88 0x0600, 0x109F, 89 // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left 90 // here if you precompose; Modern Korean will be precomposed as a result of step A) 91 0x1100, 0x11FF, 92 // U+135D through U+135F Ethiopic combining marks 93 0x135D, 0x135F, 94 // U+1780 through U+18AF Tagalog, Hanunoo, Buhid, Taghanwa,Khmer, Mongolian 95 0x1700, 0x18AF, 96 // U+1900 through U+194F Limbu (Unicode 4.0) 97 0x1900, 0x194F, 98 // U+1980 through U+19DF New Tai Lue 99 0x1980, 0x19DF, 100 // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, Vedic 101 0x1A00, 0x1CFF, 102 // U+1DC0 through U+1DFF Comining diacritical mark supplement 103 0x1DC0, 0x1DFF, 104 // U+20D0 through U+20FF Combining marks for symbols 105 0x20D0, 0x20FF, 106 // U+2CEF through U+2CF1 Combining marks for Coptic 107 0x2CEF, 0x2CF1, 108 // U+302A through U+302F Ideographic and Hangul Tone marks 109 0x302A, 0x302F, 110 // U+A67C through U+A67D Combining marks for old Cyrillic 111 0xA67C, 0xA67D, 112 // U+A6F0 through U+A6F1 Combining mark for Bamum 113 0xA6F0, 0xA6F1, 114 // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended, 115 // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, Meetei Mayek 116 0xA800, 0xABFF, 117 // U+D7B0 through U+D7FF Hangul Jamo Ext. B 118 0xD7B0, 0xD7FF, 119 // U+FE00 through U+FE0F Unicode variation selectors 120 0xFE00, 0xFE0F, 121 // U+FE20 through U+FE2F Combining half marks 122 0xFE20, 0xFE2F 123 }; 124 125 CodePath result = SimplePath; 126 for (unsigned i = 0; i < len; i++) { 127 const UChar c = characters[i]; 128 129 // Shortcut for common case 130 if (c < 0x2E5) 131 continue; 132 133 // U+1E00 through U+2000 characters with diacritics and stacked diacritics 134 if (c >= 0x1E00 && c <= 0x2000) { 135 result = SimpleWithGlyphOverflowPath; 136 continue; 137 } 138 139 // Surrogate pairs 140 if (c > 0xD7FF && c <= 0xDBFF) { 141 if (i == len - 1) 142 continue; 143 144 UChar next = characters[++i]; 145 if (!U16_IS_TRAIL(next)) 146 continue; 147 148 UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next); 149 150 if (supplementaryCharacter < 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols 151 continue; 152 if (supplementaryCharacter <= 0x1F1FF) 153 return ComplexPath; 154 155 if (supplementaryCharacter < 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors. 156 continue; 157 if (supplementaryCharacter <= 0xE01EF) 158 return ComplexPath; 159 160 // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and other complex scripts 161 // in plane 1 or higher. 162 163 continue; 164 } 165 166 // Search for other Complex cases 167 if (valueInIntervalList(complexCodePathRanges, c)) 168 return ComplexPath; 169 } 170 171 return result; 172} 173 174bool Character::isCJKIdeograph(UChar32 c) 175{ 176 static const UChar32 cjkIdeographRanges[] = { 177 // CJK Radicals Supplement and Kangxi Radicals. 178 0x2E80, 0x2FDF, 179 // CJK Strokes. 180 0x31C0, 0x31EF, 181 // CJK Unified Ideographs Extension A. 182 0x3400, 0x4DBF, 183 // The basic CJK Unified Ideographs block. 184 0x4E00, 0x9FFF, 185 // CJK Compatibility Ideographs. 186 0xF900, 0xFAFF, 187 // CJK Unified Ideographs Extension B. 188 0x20000, 0x2A6DF, 189 // CJK Unified Ideographs Extension C. 190 // CJK Unified Ideographs Extension D. 191 0x2A700, 0x2B81F, 192 // CJK Compatibility Ideographs Supplement. 193 0x2F800, 0x2FA1F 194 }; 195 static size_t cjkIdeographRangesCount = WTF_ARRAY_LENGTH(cjkIdeographRanges); 196 197 // Early out 198 if (c < cjkIdeographRanges[0] || c > cjkIdeographRanges[cjkIdeographRangesCount - 1]) 199 return false; 200 201 return valueInIntervalList(cjkIdeographRanges, c); 202} 203 204bool Character::isCJKIdeographOrSymbol(UChar32 c) 205{ 206 // Likely common case 207 if (c < 0x2C7) 208 return false; 209 210 // Hash lookup for isolated symbols (those not part of a contiguous range) 211 static HashSet<UChar32>* cjkIsolatedSymbols = 0; 212 if (!cjkIsolatedSymbols) { 213 cjkIsolatedSymbols = new HashSet<UChar32>(); 214 for (size_t i = 0; i < WTF_ARRAY_LENGTH(cjkIsolatedSymbolsArray); ++i) 215 cjkIsolatedSymbols->add(cjkIsolatedSymbolsArray[i]); 216 } 217 if (cjkIsolatedSymbols->contains(c)) 218 return true; 219 220 if (isCJKIdeograph(c)) 221 return true; 222 223 static const UChar32 cjkSymbolRanges[] = { 224 0x2156, 0x215A, 225 0x2160, 0x216B, 226 0x2170, 0x217B, 227 0x23BE, 0x23CC, 228 0x2460, 0x2492, 229 0x249C, 0x24FF, 230 0x25CE, 0x25D3, 231 0x25E2, 0x25E6, 232 0x2600, 0x2603, 233 0x2660, 0x266F, 234 0x2672, 0x267D, 235 0x2776, 0x277F, 236 // Ideographic Description Characters, with CJK Symbols and Punctuation, excluding 0x3030. 237 // Then Hiragana 0x3040 .. 0x309F, Katakana 0x30A0 .. 0x30FF, Bopomofo 0x3100 .. 0x312F 238 0x2FF0, 0x302F, 239 0x3031, 0x312F, 240 // More Bopomofo and Bopomofo Extended 0x31A0 .. 0x31BF 241 0x3190, 0x31BF, 242 // Enclosed CJK Letters and Months (0x3200 .. 0x32FF). 243 // CJK Compatibility (0x3300 .. 0x33FF). 244 0x3200, 0x33FF, 245 0xF860, 0xF862, 246 // CJK Compatibility Forms. 247 0xFE30, 0xFE4F, 248 // Halfwidth and Fullwidth Forms 249 // Usually only used in CJK 250 0xFF00, 0xFF0C, 251 0xFF0E, 0xFF1A, 252 0xFF1F, 0xFFEF, 253 // Emoji. 254 0x1F110, 0x1F129, 255 0x1F130, 0x1F149, 256 0x1F150, 0x1F169, 257 0x1F170, 0x1F189, 258 0x1F200, 0x1F6FF 259 }; 260 261 return valueInIntervalList(cjkSymbolRanges, c); 262} 263 264unsigned Character::expansionOpportunityCount(const LChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion) 265{ 266 unsigned count = 0; 267 if (direction == LTR) { 268 for (size_t i = 0; i < length; ++i) { 269 if (treatAsSpace(characters[i])) { 270 count++; 271 isAfterExpansion = true; 272 } else { 273 isAfterExpansion = false; 274 } 275 } 276 } else { 277 for (size_t i = length; i > 0; --i) { 278 if (treatAsSpace(characters[i - 1])) { 279 count++; 280 isAfterExpansion = true; 281 } else { 282 isAfterExpansion = false; 283 } 284 } 285 } 286 return count; 287} 288 289unsigned Character::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion) 290{ 291 static bool expandAroundIdeographs = FontPlatformFeatures::canExpandAroundIdeographsInComplexText(); 292 unsigned count = 0; 293 if (direction == LTR) { 294 for (size_t i = 0; i < length; ++i) { 295 UChar32 character = characters[i]; 296 if (treatAsSpace(character)) { 297 count++; 298 isAfterExpansion = true; 299 continue; 300 } 301 if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) { 302 character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]); 303 i++; 304 } 305 if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) { 306 if (!isAfterExpansion) 307 count++; 308 count++; 309 isAfterExpansion = true; 310 continue; 311 } 312 isAfterExpansion = false; 313 } 314 } else { 315 for (size_t i = length; i > 0; --i) { 316 UChar32 character = characters[i - 1]; 317 if (treatAsSpace(character)) { 318 count++; 319 isAfterExpansion = true; 320 continue; 321 } 322 if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) { 323 character = U16_GET_SUPPLEMENTARY(characters[i - 2], character); 324 i--; 325 } 326 if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) { 327 if (!isAfterExpansion) 328 count++; 329 count++; 330 isAfterExpansion = true; 331 continue; 332 } 333 isAfterExpansion = false; 334 } 335 } 336 return count; 337} 338 339bool Character::canReceiveTextEmphasis(UChar32 c) 340{ 341 CharCategory category = Unicode::category(c); 342 if (category & (Separator_Space | Separator_Line | Separator_Paragraph | Other_NotAssigned | Other_Control | Other_Format)) 343 return false; 344 345 // Additional word-separator characters listed in CSS Text Level 3 Editor's Draft 3 November 2010. 346 if (c == ethiopicWordspace || c == aegeanWordSeparatorLine || c == aegeanWordSeparatorDot 347 || c == ugariticWordDivider || c == tibetanMarkIntersyllabicTsheg || c == tibetanMarkDelimiterTshegBstar) 348 return false; 349 350 return true; 351} 352 353template <typename CharacterType> 354static inline String normalizeSpacesInternal(const CharacterType* characters, unsigned length) 355{ 356 StringBuilder normalized; 357 normalized.reserveCapacity(length); 358 359 for (unsigned i = 0; i < length; ++i) 360 normalized.append(Character::normalizeSpaces(characters[i])); 361 362 return normalized.toString(); 363} 364 365String Character::normalizeSpaces(const LChar* characters, unsigned length) 366{ 367 return normalizeSpacesInternal(characters, length); 368} 369 370String Character::normalizeSpaces(const UChar* characters, unsigned length) 371{ 372 return normalizeSpacesInternal(characters, length); 373} 374 375} // namespace blink 376