1/*
2 * Copyright (C) 2014 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "platform/fonts/Character.h"
33
34#include "platform/fonts/FontPlatformFeatures.h"
35#include "wtf/StdLibExtras.h"
36#include "wtf/text/StringBuilder.h"
37
38using namespace WTF;
39using namespace Unicode;
40
41namespace blink {
42
// Individual code points that Character::isCJKIdeographOrSymbol() accepts in
// addition to its contiguous ranges. That function copies every entry into a
// hash set the first time it needs the lookup.
static const UChar32 cjkIsolatedSymbolsArray[] = {
    // 0x2C7 Caron, Mandarin Chinese 3rd Tone
    0x2C7,
    // 0x2CA Modifier Letter Acute Accent, Mandarin Chinese 2nd Tone
    0x2CA,
    // 0x2CB Modifier Letter Grave Accent, Mandarin Chinese 4th Tone
    0x2CB,
    // 0x2D9 Dot Above, Mandarin Chinese 5th Tone
    0x2D9,
    0x2020, 0x2021, 0x2030, 0x203B, 0x203C, 0x2042, 0x2047, 0x2048, 0x2049, 0x2051,
    0x20DD, 0x20DE, 0x2100, 0x2103, 0x2105, 0x2109, 0x210A, 0x2113, 0x2116, 0x2121,
    0x212B, 0x213B, 0x2150, 0x2151, 0x2152, 0x217F, 0x2189, 0x2307, 0x2312, 0x23CE,
    0x2423, 0x25A0, 0x25A1, 0x25A2, 0x25AA, 0x25AB, 0x25B1, 0x25B2, 0x25B3, 0x25B6,
    0x25B7, 0x25BC, 0x25BD, 0x25C0, 0x25C1, 0x25C6, 0x25C7, 0x25C9, 0x25CB, 0x25CC,
    0x25EF, 0x2605, 0x2606, 0x260E, 0x2616, 0x2617, 0x2640, 0x2642, 0x26A0, 0x26BD,
    0x26BE, 0x2713, 0x271A, 0x273F, 0x2740, 0x2756, 0x2B1A, 0xFE10, 0xFE11, 0xFE12,
    0xFE19, 0xFF1D,
    // Emoji.
    0x1F100
};
63
// Reports whether |value| lies inside any interval of |intervalList|.
//
// |intervalList| is a flattened list of closed intervals: elements 2k and
// 2k + 1 are the first and last value of interval k. The binary search used
// here requires the elements to be in ascending order.
template <class T, size_t size>
bool valueInIntervalList(const T (&intervalList)[size], const T& value)
{
    const T* const first = intervalList;
    const T* const last = intervalList + size;

    // First element strictly greater than |value| (or |last| if there is none).
    const T* const firstGreater = std::upper_bound(first, last, value);
    const size_t precedingCount = static_cast<size_t>(firstGreater - first);

    // An odd number of elements <= |value| means the most recent one is an
    // interval start whose matching end is still ahead of |value|.
    if (precedingCount % 2 == 1)
        return true;

    // Nothing precedes |value|, so it is below every interval.
    if (precedingCount == 0)
        return false;

    // Otherwise |value| is inside only if it sits exactly on an interval end.
    return first[precedingCount - 1] == value;
}
73
74CodePath Character::characterRangeCodePath(const UChar* characters, unsigned len)
75{
76    static const UChar complexCodePathRanges[] = {
77        // U+02E5 through U+02E9 (Modifier Letters : Tone letters)
78        0x2E5, 0x2E9,
79        // U+0300 through U+036F Combining diacritical marks
80        0x300, 0x36F,
81        // U+0591 through U+05CF excluding U+05BE Hebrew combining marks, ...
82        0x0591, 0x05BD,
83        // ... Hebrew punctuation Paseq, Sof Pasuq and Nun Hafukha
84        0x05BF, 0x05CF,
85        // U+0600 through U+109F Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic,
86        // Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
87        // Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar
88        0x0600, 0x109F,
89        // U+1100 through U+11FF Hangul Jamo (only Ancient Korean should be left
90        // here if you precompose; Modern Korean will be precomposed as a result of step A)
91        0x1100, 0x11FF,
92        // U+135D through U+135F Ethiopic combining marks
93        0x135D, 0x135F,
94        // U+1780 through U+18AF Tagalog, Hanunoo, Buhid, Taghanwa,Khmer, Mongolian
95        0x1700, 0x18AF,
96        // U+1900 through U+194F Limbu (Unicode 4.0)
97        0x1900, 0x194F,
98        // U+1980 through U+19DF New Tai Lue
99        0x1980, 0x19DF,
100        // U+1A00 through U+1CFF Buginese, Tai Tham, Balinese, Batak, Lepcha, Vedic
101        0x1A00, 0x1CFF,
102        // U+1DC0 through U+1DFF Comining diacritical mark supplement
103        0x1DC0, 0x1DFF,
104        // U+20D0 through U+20FF Combining marks for symbols
105        0x20D0, 0x20FF,
106        // U+2CEF through U+2CF1 Combining marks for Coptic
107        0x2CEF, 0x2CF1,
108        // U+302A through U+302F Ideographic and Hangul Tone marks
109        0x302A, 0x302F,
110        // U+A67C through U+A67D Combining marks for old Cyrillic
111        0xA67C, 0xA67D,
112        // U+A6F0 through U+A6F1 Combining mark for Bamum
113        0xA6F0, 0xA6F1,
114        // U+A800 through U+ABFF Nagri, Phags-pa, Saurashtra, Devanagari Extended,
115        // Hangul Jamo Ext. A, Javanese, Myanmar Extended A, Tai Viet, Meetei Mayek
116        0xA800, 0xABFF,
117        // U+D7B0 through U+D7FF Hangul Jamo Ext. B
118        0xD7B0, 0xD7FF,
119        // U+FE00 through U+FE0F Unicode variation selectors
120        0xFE00, 0xFE0F,
121        // U+FE20 through U+FE2F Combining half marks
122        0xFE20, 0xFE2F
123    };
124
125    CodePath result = SimplePath;
126    for (unsigned i = 0; i < len; i++) {
127        const UChar c = characters[i];
128
129        // Shortcut for common case
130        if (c < 0x2E5)
131            continue;
132
133        // U+1E00 through U+2000 characters with diacritics and stacked diacritics
134        if (c >= 0x1E00 && c <= 0x2000) {
135            result = SimpleWithGlyphOverflowPath;
136            continue;
137        }
138
139        // Surrogate pairs
140        if (c > 0xD7FF && c <= 0xDBFF) {
141            if (i == len - 1)
142                continue;
143
144            UChar next = characters[++i];
145            if (!U16_IS_TRAIL(next))
146                continue;
147
148            UChar32 supplementaryCharacter = U16_GET_SUPPLEMENTARY(c, next);
149
150            if (supplementaryCharacter < 0x1F1E6) // U+1F1E6 through U+1F1FF Regional Indicator Symbols
151                continue;
152            if (supplementaryCharacter <= 0x1F1FF)
153                return ComplexPath;
154
155            if (supplementaryCharacter < 0xE0100) // U+E0100 through U+E01EF Unicode variation selectors.
156                continue;
157            if (supplementaryCharacter <= 0xE01EF)
158                return ComplexPath;
159
160            // FIXME: Check for Brahmi (U+11000 block), Kaithi (U+11080 block) and other complex scripts
161            // in plane 1 or higher.
162
163            continue;
164        }
165
166        // Search for other Complex cases
167        if (valueInIntervalList(complexCodePathRanges, c))
168            return ComplexPath;
169    }
170
171    return result;
172}
173
174bool Character::isCJKIdeograph(UChar32 c)
175{
176    static const UChar32 cjkIdeographRanges[] = {
177        // CJK Radicals Supplement and Kangxi Radicals.
178        0x2E80, 0x2FDF,
179        // CJK Strokes.
180        0x31C0, 0x31EF,
181        // CJK Unified Ideographs Extension A.
182        0x3400, 0x4DBF,
183        // The basic CJK Unified Ideographs block.
184        0x4E00, 0x9FFF,
185        // CJK Compatibility Ideographs.
186        0xF900, 0xFAFF,
187        // CJK Unified Ideographs Extension B.
188        0x20000, 0x2A6DF,
189        // CJK Unified Ideographs Extension C.
190        // CJK Unified Ideographs Extension D.
191        0x2A700, 0x2B81F,
192        // CJK Compatibility Ideographs Supplement.
193        0x2F800, 0x2FA1F
194    };
195    static size_t cjkIdeographRangesCount = WTF_ARRAY_LENGTH(cjkIdeographRanges);
196
197    // Early out
198    if (c < cjkIdeographRanges[0] || c > cjkIdeographRanges[cjkIdeographRangesCount - 1])
199        return false;
200
201    return valueInIntervalList(cjkIdeographRanges, c);
202}
203
204bool Character::isCJKIdeographOrSymbol(UChar32 c)
205{
206    // Likely common case
207    if (c < 0x2C7)
208        return false;
209
210    // Hash lookup for isolated symbols (those not part of a contiguous range)
211    static HashSet<UChar32>* cjkIsolatedSymbols = 0;
212    if (!cjkIsolatedSymbols) {
213        cjkIsolatedSymbols = new HashSet<UChar32>();
214        for (size_t i = 0; i < WTF_ARRAY_LENGTH(cjkIsolatedSymbolsArray); ++i)
215            cjkIsolatedSymbols->add(cjkIsolatedSymbolsArray[i]);
216    }
217    if (cjkIsolatedSymbols->contains(c))
218        return true;
219
220    if (isCJKIdeograph(c))
221        return true;
222
223    static const UChar32 cjkSymbolRanges[] = {
224        0x2156, 0x215A,
225        0x2160, 0x216B,
226        0x2170, 0x217B,
227        0x23BE, 0x23CC,
228        0x2460, 0x2492,
229        0x249C, 0x24FF,
230        0x25CE, 0x25D3,
231        0x25E2, 0x25E6,
232        0x2600, 0x2603,
233        0x2660, 0x266F,
234        0x2672, 0x267D,
235        0x2776, 0x277F,
236        // Ideographic Description Characters, with CJK Symbols and Punctuation, excluding 0x3030.
237        // Then Hiragana 0x3040 .. 0x309F, Katakana 0x30A0 .. 0x30FF, Bopomofo 0x3100 .. 0x312F
238        0x2FF0, 0x302F,
239        0x3031, 0x312F,
240        // More Bopomofo and Bopomofo Extended 0x31A0 .. 0x31BF
241        0x3190, 0x31BF,
242        // Enclosed CJK Letters and Months (0x3200 .. 0x32FF).
243        // CJK Compatibility (0x3300 .. 0x33FF).
244        0x3200, 0x33FF,
245        0xF860, 0xF862,
246        // CJK Compatibility Forms.
247        0xFE30, 0xFE4F,
248        // Halfwidth and Fullwidth Forms
249        // Usually only used in CJK
250        0xFF00, 0xFF0C,
251        0xFF0E, 0xFF1A,
252        0xFF1F, 0xFFEF,
253        // Emoji.
254        0x1F110, 0x1F129,
255        0x1F130, 0x1F149,
256        0x1F150, 0x1F169,
257        0x1F170, 0x1F189,
258        0x1F200, 0x1F6FF
259    };
260
261    return valueInIntervalList(cjkSymbolRanges, c);
262}
263
264unsigned Character::expansionOpportunityCount(const LChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion)
265{
266    unsigned count = 0;
267    if (direction == LTR) {
268        for (size_t i = 0; i < length; ++i) {
269            if (treatAsSpace(characters[i])) {
270                count++;
271                isAfterExpansion = true;
272            } else {
273                isAfterExpansion = false;
274            }
275        }
276    } else {
277        for (size_t i = length; i > 0; --i) {
278            if (treatAsSpace(characters[i - 1])) {
279                count++;
280                isAfterExpansion = true;
281            } else {
282                isAfterExpansion = false;
283            }
284        }
285    }
286    return count;
287}
288
289unsigned Character::expansionOpportunityCount(const UChar* characters, size_t length, TextDirection direction, bool& isAfterExpansion)
290{
291    static bool expandAroundIdeographs = FontPlatformFeatures::canExpandAroundIdeographsInComplexText();
292    unsigned count = 0;
293    if (direction == LTR) {
294        for (size_t i = 0; i < length; ++i) {
295            UChar32 character = characters[i];
296            if (treatAsSpace(character)) {
297                count++;
298                isAfterExpansion = true;
299                continue;
300            }
301            if (U16_IS_LEAD(character) && i + 1 < length && U16_IS_TRAIL(characters[i + 1])) {
302                character = U16_GET_SUPPLEMENTARY(character, characters[i + 1]);
303                i++;
304            }
305            if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) {
306                if (!isAfterExpansion)
307                    count++;
308                count++;
309                isAfterExpansion = true;
310                continue;
311            }
312            isAfterExpansion = false;
313        }
314    } else {
315        for (size_t i = length; i > 0; --i) {
316            UChar32 character = characters[i - 1];
317            if (treatAsSpace(character)) {
318                count++;
319                isAfterExpansion = true;
320                continue;
321            }
322            if (U16_IS_TRAIL(character) && i > 1 && U16_IS_LEAD(characters[i - 2])) {
323                character = U16_GET_SUPPLEMENTARY(characters[i - 2], character);
324                i--;
325            }
326            if (expandAroundIdeographs && isCJKIdeographOrSymbol(character)) {
327                if (!isAfterExpansion)
328                    count++;
329                count++;
330                isAfterExpansion = true;
331                continue;
332            }
333            isAfterExpansion = false;
334        }
335    }
336    return count;
337}
338
339bool Character::canReceiveTextEmphasis(UChar32 c)
340{
341    CharCategory category = Unicode::category(c);
342    if (category & (Separator_Space | Separator_Line | Separator_Paragraph | Other_NotAssigned | Other_Control | Other_Format))
343        return false;
344
345    // Additional word-separator characters listed in CSS Text Level 3 Editor's Draft 3 November 2010.
346    if (c == ethiopicWordspace || c == aegeanWordSeparatorLine || c == aegeanWordSeparatorDot
347        || c == ugariticWordDivider || c == tibetanMarkIntersyllabicTsheg || c == tibetanMarkDelimiterTshegBstar)
348        return false;
349
350    return true;
351}
352
353template <typename CharacterType>
354static inline String normalizeSpacesInternal(const CharacterType* characters, unsigned length)
355{
356    StringBuilder normalized;
357    normalized.reserveCapacity(length);
358
359    for (unsigned i = 0; i < length; ++i)
360        normalized.append(Character::normalizeSpaces(characters[i]));
361
362    return normalized.toString();
363}
364
365String Character::normalizeSpaces(const LChar* characters, unsigned length)
366{
367    return normalizeSpacesInternal(characters, length);
368}
369
370String Character::normalizeSpaces(const UChar* characters, unsigned length)
371{
372    return normalizeSpacesInternal(characters, length);
373}
374
375} // namespace blink
376