1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6*   Copyright (C) 1999-2012, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9******************************************************************************
10*   file name:  utf_impl.cpp
11*   encoding:   UTF-8
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 1999sep13
16*   created by: Markus W. Scherer
17*
18*   This file provides implementation functions for macros in the utfXX.h
19*   that would otherwise be too long as macros.
20*/
21
22/* set import/export definitions */
23#ifndef U_UTF8_IMPL
24#   define U_UTF8_IMPL
25#endif
26
27#include "unicode/utypes.h"
28#include "unicode/utf.h"
29#include "unicode/utf8.h"
30#include "uassert.h"
31
32/*
33 * Table of the number of utf8 trail bytes, indexed by the lead byte.
34 * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h
35 *
36 * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
37 *
38 * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
39 * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
40 * may exist in old client code that must continue to run with newer icu library versions.
41 *
42 * This table could be replaced on many machines by
43 * a few lines of assembler code using an
44 * "index of first 0-bit from msb" instruction and
45 * one or two more integer instructions.
46 *
47 * For example, on an i386, do something like
48 * - MOV AL, leadByte
49 * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
50 * - MOV AH, 0
51 * - BSR BX, AX     (16-bit)
52 * - MOV AX, 6      (result)
53 * - JZ finish      (ZF==1 if leadByte==0xff)
54 * - SUB AX, BX (result)
55 * -finish:
56 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
57 */
58extern "C" U_EXPORT const uint8_t
59utf8_countTrailBytes[256]={
60    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64
65    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69
70    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
72    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74
75    // illegal C0 & C1
76    // 2-byte lead bytes C2..DF
77    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
78    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
79
80    // 3-byte lead bytes E0..EF
81    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
82    // 4-byte lead bytes F0..F4
83    // illegal F5..FF
84    3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
85};
86
87static const UChar32
88utf8_errorValue[6]={
89    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
90    // but without relying on the obsolete unicode/utf_old.h.
91    0x15, 0x9f, 0xffff,
92    0x10ffff
93};
94
95static UChar32
96errorValue(int32_t count, int8_t strict) {
97    if(strict>=0) {
98        return utf8_errorValue[count];
99    } else if(strict==-3) {
100        return 0xfffd;
101    } else {
102        return U_SENTINEL;
103    }
104}
105
106/*
107 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
108 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
109 *
110 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
111 *
112 * The "strict" parameter controls the error behavior:
113 * <0  "Safe" behavior of U8_NEXT():
114 *     -1: All illegal byte sequences yield U_SENTINEL=-1.
115 *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
116 *         Some implementations use this for roundtripping of
117 *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
118 *         contain unpaired surrogates.
119 *     -3: All illegal byte sequences yield U+FFFD.
120 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
121 *     All illegal byte sequences yield a positive code point such that this
122 *     result code point would be encoded with the same number of bytes as
123 *     the illegal sequence.
124 * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
125 *     Same as the obsolete "safe" behavior, but non-characters are also treated
126 *     like illegal sequences.
127 *
128 * Note that a UBool is the same as an int8_t.
129 */
130U_CAPI UChar32 U_EXPORT2
131utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
132    // *pi is one after byte c.
133    int32_t i=*pi;
134    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
135    if(i==length || c>0xf4) {
136        // end of string, or not a lead byte
137    } else if(c>=0xf0) {
138        // Test for 4-byte sequences first because
139        // U8_NEXT() handles shorter valid sequences inline.
140        uint8_t t1=s[i], t2, t3;
141        c&=7;
142        if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
143                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
144                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
145            ++i;
146            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
147            // strict: forbid non-characters like U+fffe
148            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
149                *pi=i;
150                return c;
151            }
152        }
153    } else if(c>=0xe0) {
154        c&=0xf;
155        if(strict!=-2) {
156            uint8_t t1=s[i], t2;
157            if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
158                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
159                ++i;
160                c=(c<<12)|((t1&0x3f)<<6)|t2;
161                // strict: forbid non-characters like U+fffe
162                if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
163                    *pi=i;
164                    return c;
165                }
166            }
167        } else {
168            // strict=-2 -> lenient: allow surrogates
169            uint8_t t1=s[i]-0x80, t2;
170            if(t1<=0x3f && (c>0 || t1>=0x20) &&
171                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
172                *pi=i+1;
173                return (c<<12)|(t1<<6)|t2;
174            }
175        }
176    } else if(c>=0xc2) {
177        uint8_t t1=s[i]-0x80;
178        if(t1<=0x3f) {
179            *pi=i+1;
180            return ((c-0xc0)<<6)|t1;
181        }
182    }  // else 0x80<=c<0xc2 is not a lead byte
183
184    /* error handling */
185    c=errorValue(i-*pi, strict);
186    *pi=i;
187    return c;
188}
189
190U_CAPI int32_t U_EXPORT2
191utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
192    if((uint32_t)(c)<=0x7ff) {
193        if((i)+1<(length)) {
194            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
195            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
196            return i;
197        }
198    } else if((uint32_t)(c)<=0xffff) {
199        /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
200        if((i)+2<(length) && !U_IS_SURROGATE(c)) {
201            (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
202            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
203            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
204            return i;
205        }
206    } else if((uint32_t)(c)<=0x10ffff) {
207        if((i)+3<(length)) {
208            (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
209            (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
210            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
211            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
212            return i;
213        }
214    }
215    /* c>0x10ffff or not enough space, write an error value */
216    if(pIsError!=NULL) {
217        *pIsError=TRUE;
218    } else {
219        length-=i;
220        if(length>0) {
221            int32_t offset;
222            if(length>3) {
223                length=3;
224            }
225            s+=i;
226            offset=0;
227            c=utf8_errorValue[length-1];
228            U8_APPEND_UNSAFE(s, offset, c);
229            i=i+offset;
230        }
231    }
232    return i;
233}
234
235U_CAPI UChar32 U_EXPORT2
236utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) {
237    // *pi is the index of byte c.
238    int32_t i=*pi;
239    if(U8_IS_TRAIL(c) && i>start) {
240        uint8_t b1=s[--i];
241        if(0xc2<=b1 && b1<0xe0) {
242            *pi=i;
243            return ((b1-0xc0)<<6)|(c&0x3f);
244        } else if(U8_IS_TRAIL(b1) && i>start) {
245            // Extract the value bits from the last trail byte.
246            c&=0x3f;
247            uint8_t b2=s[--i];
248            if(0xe0<=b2 && b2<0xf0) {
249                b2&=0xf;
250                if(strict!=-2) {
251                    if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
252                        *pi=i;
253                        c=(b2<<12)|((b1&0x3f)<<6)|c;
254                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
255                            return c;
256                        } else {
257                            // strict: forbid non-characters like U+fffe
258                            return errorValue(2, strict);
259                        }
260                    }
261                } else {
262                    // strict=-2 -> lenient: allow surrogates
263                    b1-=0x80;
264                    if((b2>0 || b1>=0x20)) {
265                        *pi=i;
266                        return (b2<<12)|(b1<<6)|c;
267                    }
268                }
269            } else if(U8_IS_TRAIL(b2) && i>start) {
270                uint8_t b3=s[--i];
271                if(0xf0<=b3 && b3<=0xf4) {
272                    b3&=7;
273                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
274                        *pi=i;
275                        c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
276                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
277                            return c;
278                        } else {
279                            // strict: forbid non-characters like U+fffe
280                            return errorValue(3, strict);
281                        }
282                    }
283                }
284            } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
285                // Truncated 4-byte sequence.
286                *pi=i;
287                return errorValue(2, strict);
288            }
289        } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
290                (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
291            // Truncated 3- or 4-byte sequence.
292            *pi=i;
293            return errorValue(1, strict);
294        }
295    }
296    return errorValue(0, strict);
297}
298
299U_CAPI int32_t U_EXPORT2
300utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
301    // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
302    int32_t orig_i=i;
303    uint8_t c=s[i];
304    if(U8_IS_TRAIL(c) && i>start) {
305        uint8_t b1=s[--i];
306        if(0xc2<=b1 && b1<0xe0) {
307            return i;
308        } else if(U8_IS_TRAIL(b1) && i>start) {
309            uint8_t b2=s[--i];
310            if(0xe0<=b2 && b2<0xf0) {
311                if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
312                    return i;
313                }
314            } else if(U8_IS_TRAIL(b2) && i>start) {
315                uint8_t b3=s[--i];
316                if(0xf0<=b3 && b3<=0xf4) {
317                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
318                        return i;
319                    }
320                }
321            } else if(0xf0<=b2 && b2<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
322                // Truncated 4-byte sequence.
323                return i;
324            }
325        } else if((0xe0<=b1 && b1<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b1, c)) ||
326                (0xf0<=b1 && b1<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
327            // Truncated 3- or 4-byte sequence.
328            return i;
329        }
330    }
331    return orig_i;
332}
333