GraphemeBreak.cpp revision 6638e05ac2de397455c30cae05aca399a567428d
1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <stdint.h>
18#include <unicode/uchar.h>
19#include <unicode/utf16.h>
20
21#include <minikin/GraphemeBreak.h>
22
23namespace android {
24
25int32_t tailoredGraphemeClusterBreak(uint32_t c) {
26    // Characters defined as Control that we want to treat them as Extend.
27    // These are curated manually.
28    if (c == 0x00AD                         // SHY
29            || c == 0x061C                  // ALM
30            || c == 0x180E                  // MONGOLIAN VOWEL SEPARATOR
31            || c == 0x200B                  // ZWSP
32            || c == 0x200E                  // LRM
33            || c == 0x200F                  // RLM
34            || (0x202A <= c && c <= 0x202E) // LRE, RLE, PDF, LRO, RLO
35            || ((c | 0xF) == 0x206F)        // WJ, invisible math operators, LRI, RLI, FSI, PDI,
36                                            // and the deprecated invisible format controls
37            || c == 0xFEFF                  // BOM
38            || ((c | 0x7F) == 0xE007F))     // recently undeprecated tag characters in Plane 14
39        return U_GCB_EXTEND;
40    // UTC-approved characters for the Prepend class, per
41    // http://www.unicode.org/L2/L2015/15183r-graph-cluster-brk.txt
42    // These should be removed when our copy of ICU gets updated to Unicode 9.0 (~2016 or 2017).
43    else if ((0x0600 <= c && c <= 0x0605) // Arabic subtending marks
44            || c == 0x06DD                // ARABIC SUBTENDING MARK
45            || c == 0x070F                // SYRIAC ABBREVIATION MARK
46            || c == 0x0D4E                // MALAYALAM LETTER DOT REPH
47            || c == 0x110BD               // KAITHI NUMBER SIGN
48            || c == 0x111C2               // SHARADA SIGN JIHVAMULIYA
49            || c == 0x111C3)              // SHARADA SIGN UPADHMANIYA
50        return U_GCB_PREPEND;
51    // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they
52    // allow a grapheme break before it.
53    else if (c == 0x0E33)
54        return U_GCB_OTHER;
55    else
56        return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
57}
58
// Tells whether |c| is one of the code points whose IndicSyllabicCategory is Pure_Killer.
// The list is taken from http://www.unicode.org/Public/8.0.0/ucd/IndicSyllabicCategory.txt
bool isPureKiller(uint32_t c) {
    switch (c) {
    case 0x0E3A:
    case 0x0E4E:
    case 0x0F84:
    case 0x103A:
    case 0x1714:
    case 0x1734:
    case 0x17D1:
    case 0x1BAA:
    case 0x1BF2:
    case 0x1BF3:
    case 0xA806:
    case 0xA953:
    case 0xABED:
    case 0x11134:
    case 0x112EA:
    case 0x1172B:
        return true;
    default:
        return false;
    }
}
66
// Tells whether |c| is a character that appears before or after ZWJ in a ZWJ emoji sequence.
// See http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
bool isZwjEmoji(uint32_t c) {
    switch (c) {
    case 0x2764:   // HEAVY BLACK HEART
    case 0x1F468:  // MAN
    case 0x1F469:  // WOMAN
    case 0x1F48B:  // KISS MARK
    case 0x1F466:  // BOY
    case 0x1F467:  // GIRL
    case 0x1F441:  // EYE
    case 0x1F5E8:  // LEFT SPEECH BUBBLE
        return true;
    default:
        return false;
    }
}
79
80bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
81        size_t offset) {
82    // This implementation closely follows Unicode Standard Annex #29 on
83    // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
84    // implementing a tailored version of extended grapheme clusters.
85    // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
86
87    // Rule GB1, sot ÷; Rule GB2, ÷ eot
88    if (offset <= start || offset >= start + count) {
89        return true;
90    }
91    if (U16_IS_TRAIL(buf[offset])) {
92        // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break
93        return !U16_IS_LEAD(buf[offset - 1]);
94    }
95    uint32_t c1 = 0;
96    uint32_t c2 = 0;
97    size_t offset_back = offset;
98    U16_PREV(buf, start, offset_back, c1);
99    U16_NEXT(buf, offset, start + count, c2);
100    int32_t p1 = tailoredGraphemeClusterBreak(c1);
101    int32_t p2 = tailoredGraphemeClusterBreak(c2);
102    // Rule GB3, CR x LF
103    if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
104        return false;
105    }
106    // Rule GB4, (Control | CR | LF) ÷
107    if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
108        return true;
109    }
110    // Rule GB5, ÷ (Control | CR | LF)
111    if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
112        return true;
113    }
114    // Rule GB6, L x ( L | V | LV | LVT )
115    if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
116        return false;
117    }
118    // Rule GB7, ( LV | V ) x ( V | T )
119    if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
120        return false;
121    }
122    // Rule GB8, ( LVT | T ) x T
123    if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
124        return false;
125    }
126    // Rule GB8a, Regional_Indicator x Regional_Indicator
127    //
128    // Known limitation: This is overly conservative, and returns no grapheme breaks between two
129    // flags, such as in the character sequence "U+1F1FA U+1F1F8 [potential break] U+1F1FA U+1F1F8".
130    // Also, it assumes that all combinations of Regional Indicators produce a flag, where they
131    // don't.
132    //
133    // There is no easy solution for doing this correctly, except for querying the font and doing
134    // some lookback.
135    if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
136        return false;
137    }
138    // Rule GB9, x Extend; Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
139    if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) {
140        return false;
141    }
142    // Cluster indic syllables together (tailoring of UAX #29)
143    // Known limitation: this is overly conservative, and assumes that the virama may form a
144    // conjunct with the following letter, which doesn't always happen.
145    //
146    // There is no easy solution to do this correctly. Even querying the font does not help (with
147    // the current font technoloies), since the font may be creating the conjunct using multiple
148    // glyphs, while the user may be perceiving that sequence of glyphs as one conjunct or one
149    // letter.
150    if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
151            && !isPureKiller(c1)
152            && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
153        return false;
154    }
155    // Tailoring: make emoji sequences with ZWJ a single grapheme cluster
156    if (c1 == 0x200D && isZwjEmoji(c2) && offset_back > start) {
157        // look at character before ZWJ to see that both can participate in an emoji zwj sequence
158        uint32_t c0 = 0;
159        U16_PREV(buf, start, offset_back, c0);
160        if (c0 == 0xFE0F && offset_back > start) {
161            // skip over emoji variation selector
162            U16_PREV(buf, start, offset_back, c0);
163        }
164        if (isZwjEmoji(c0)) {
165            return false;
166        }
167    }
168    // Rule GB10, Any ÷ Any
169    return true;
170}
171
172size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count,
173        size_t offset, MoveOpt opt) {
174    switch (opt) {
175    case AFTER:
176        if (offset < start + count) {
177            offset++;
178        }
179        // fall through
180    case AT_OR_AFTER:
181        while (!isGraphemeBreak(buf, start, count, offset)) {
182            offset++;
183        }
184        break;
185    case BEFORE:
186        if (offset > start) {
187            offset--;
188        }
189        // fall through
190    case AT_OR_BEFORE:
191        while (!isGraphemeBreak(buf, start, count, offset)) {
192            offset--;
193        }
194        break;
195    case AT:
196        if (!isGraphemeBreak(buf, start, count, offset)) {
197            offset = (size_t)-1;
198        }
199        break;
200    }
201    return offset;
202}
203
204}  // namespace android
205