1/*
2*******************************************************************************
3*
4*   Copyright (C) 1999-2004, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  utf.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 1999sep09
14*   created by: Markus W. Scherer
15*/
16
17#ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_
18#define BASE_THIRD_PARTY_ICU_ICU_UTF_H_
19
20#include <stdint.h>
21
22namespace base_icu {
23
// Signed 32-bit integer that holds a single code point. Being signed lets
// negative values such as CBU_SENTINEL act as error/"done" markers.
typedef int32_t UChar32;
// Unsigned 16-bit value; the UTF-16 macros in this header document their
// string arguments as arrays of UChar.
typedef uint16_t UChar;
// Signed 8-bit integer; used in this header as the type of the |strict|
// argument of utf8_nextCharSafeBody(). Judging by the name it is meant for
// boolean-style flags.
typedef int8_t UBool;
27
28// General ---------------------------------------------------------------------
29// from utf.h
30
/**
 * Sentinel value for APIs that (take or) return a single code point
 * (UChar32). It lies outside the Unicode code point range 0..0x10ffff, so it
 * can never be confused with real text.
 *
 * For example, a "done" or "error" result in a new API can be reported as
 * CBU_SENTINEL; CBU8_NEXT in this header does so for bytes that cannot start
 * a sequence.
 *
 * ICU APIs designed before ICU 2.4 usually define service-specific "done"
 * values, mostly 0xffff. Those may need to be distinguished from actual
 * U+ffff text contents by calling functions like
 * CharacterIterator::hasNext() or UnicodeString::length().
 *
 * @return -1
 * @see UChar32
 * @stable ICU 2.4
 */
#define CBU_SENTINEL (-1)
50
51/**
52 * Is this code point a Unicode noncharacter?
53 * @param c 32-bit code point
54 * @return TRUE or FALSE
55 * @stable ICU 2.4
56 */
57#define CBU_IS_UNICODE_NONCHAR(c)                                          \
58  ((c) >= 0xfdd0 && ((uint32_t)(c) <= 0xfdef || ((c)&0xfffe) == 0xfffe) && \
59   (uint32_t)(c) <= 0x10ffff)
60
/**
 * Tests whether c is a Unicode code point (0..U+10ffff) to which a character
 * can be assigned.
 *
 * The following are rejected:
 * - single surrogate code points (U+d800..U+dfff, 2048 code points)
 * - the last two code points on each plane (U+__fffe and U+__ffff,
 *   34 code points)
 * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
 * - anything above the highest Unicode code point, U+10ffff
 *
 * Every code point below U+d800 is a character code point, so that boundary
 * is checked first as a fast path.
 *
 * The argument is evaluated more than once and must not have side effects.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_UNICODE_CHAR(c)   \
  ((uint32_t)(c) < 0xd800 ||     \
   ((uint32_t)(c) > 0xdfff &&    \
    (uint32_t)(c) <= 0x10ffff && \
    !CBU_IS_UNICODE_NONCHAR(c)))
82
/**
 * Tests whether a code point is a surrogate (U+d800..U+dfff).
 *
 * Masking off the low 11 bits leaves 0xd800 exactly for the 2048 values in
 * that range.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_SURROGATE(c) \
  (((c) & 0xfffff800) == 0xd800)
90
/**
 * Given that c is already known to be a surrogate code point
 * (CBU_IS_SURROGATE(c)), tests whether it is a lead surrogate.
 * Only bit 10 is inspected, so the result is meaningless for non-surrogates.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_SURROGATE_LEAD(c) \
  (((c) & 0x400) == 0)
99
100
101// UTF-8 macros ----------------------------------------------------------------
102// from utf8.h
103
// Table with one entry per possible byte value (256 entries), defined outside
// this header. CBU8_COUNT_TRAIL_BYTES indexes it with a UTF-8 lead byte to
// obtain the number of trail bytes that follow.
extern const uint8_t utf8_countTrailBytes[256];
105
/**
 * Count the trail bytes for a UTF-8 lead byte by looking it up in
 * base_icu::utf8_countTrailBytes.
 *
 * The argument is parenthesized before the cast to uint8_t so that an
 * expression argument (for example "b + 1") is truncated to a byte as a
 * whole and the index always stays inside the 256-entry table.
 *
 * @param leadByte UTF-8 lead byte; only its low 8 bits are used
 * @return the table entry for that byte
 * @internal
 */
#define CBU8_COUNT_TRAIL_BYTES(leadByte) \
  (base_icu::utf8_countTrailBytes[(uint8_t)(leadByte)])
112
/**
 * Strip the length-marker bits from a UTF-8 lead byte so that only the low
 * bits contributing to the code point value remain.
 *
 * The mask keeps the lowest (6 - countTrailBytes) bits. leadByte is modified
 * in place (compound assignment), so it must be a modifiable lvalue; the
 * macro also yields the masked value.
 *
 * @param leadByte        lvalue holding the lead byte, updated in place
 * @param countTrailBytes number of trail bytes that follow the lead byte
 * @internal
 */
#define CBU8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
  ((leadByte) &= (1 << (6 - (countTrailBytes))) - 1)
118
/**
 * Tests whether a code unit (byte) encodes a code point all by itself,
 * i.e. is US-ASCII (0..0x7f) with the top bit clear.
 *
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_SINGLE(c) \
  (((c) & 0x80) == 0)
126
/**
 * Tests whether a code unit (byte) is treated as a UTF-8 lead byte.
 *
 * The byte is shifted down by 0xc0 and truncated to 8 bits, so a single
 * unsigned comparison accepts exactly the byte values 0xc0..0xfd.
 *
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_LEAD(c) \
  ((uint8_t)((c) - 0xc0) < 0x3e)
134
/**
 * Tests whether a code unit (byte) is a UTF-8 trail byte, i.e. its top two
 * bits are "10" (0x80..0xbf).
 *
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_TRAIL(c) \
  (((c) & 0xc0) == 0x80)
142
/**
 * Number of code units (bytes) needed to encode a Unicode code point in
 * UTF-8.
 *
 * The ranges are tested in ascending order; surrogates (U+d800..U+dfff) and
 * values above U+10ffff (including negative values, which become large after
 * the unsigned cast) yield 0.
 *
 * The argument is evaluated more than once and must not have side effects.
 *
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define CBU8_LENGTH(c)                                         \
  ((uint32_t)(c) <= 0x7f                                   ? 1 \
   : (uint32_t)(c) <= 0x7ff                                ? 2 \
   : (uint32_t)(c) <= 0xd7ff                               ? 3 \
   : ((uint32_t)(c) <= 0xdfff || (uint32_t)(c) > 0x10ffff) ? 0 \
   : (uint32_t)(c) <= 0xffff                               ? 3 \
                                                           : 4)
160
/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point
 * (U+0000..U+10ffff). This matches the largest value CBU8_LENGTH can return.
 * @return 4
 * @stable ICU 2.4
 */
#define CBU8_MAX_LENGTH 4
167
/**
 * Function for handling "next code point" with error-checking.
 * Only declared here; the definition lives outside this header.
 *
 * The parameter notes below are inferred from the CBU8_NEXT call site in this
 * header and should be confirmed against the implementation:
 *
 * @param s      the UTF-8 string being read
 * @param pi     pointer to the caller's offset; CBU8_NEXT passes it after
 *               having already advanced past the lead byte
 * @param length string length
 * @param c      the lead byte that was just read
 * @param strict CBU8_NEXT passes -1; the meaning of other values is not
 *               visible from this header
 * @return the value CBU8_NEXT stores into its output variable, documented
 *         there as negative for an illegal sequence
 * @internal
 */
UChar32 utf8_nextCharSafeBody(const uint8_t* s,
                              int32_t* pi,
                              int32_t length,
                              UChar32 c,
                              UBool strict);
177
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The byte at the offset is read and the offset is incremented. Then:
 * - a byte below 0x80 is returned as is;
 * - a lead byte (CBU8_IS_LEAD) is handed to
 *   base_icu::utf8_nextCharSafeBody(), which reads the rest of the sequence
 *   and updates the offset;
 * - any other byte (a trail byte, or 0xfe/0xff) sets c to CBU_SENTINEL.
 *
 * Every macro argument is parenthesized where it is used, so expression
 * arguments such as "buf + n" are cast and passed as a whole. The arguments
 * are still evaluated more than once and must not have side effects; i and c
 * must be modifiable lvalues, and the address of i is taken.
 *
 * @param s const uint8_t * string
 * @param i string offset, i<length
 * @param length string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @stable ICU 2.4
 */
#define CBU8_NEXT(s, i, length, c)                                         \
  {                                                                        \
    (c) = (s)[(i)++];                                                      \
    if (((uint8_t)(c)) >= 0x80) {                                          \
      if (CBU8_IS_LEAD(c)) {                                               \
        (c) = base_icu::utf8_nextCharSafeBody((const uint8_t*)(s), &(i),   \
                                              (int32_t)(length), (c), -1); \
      } else {                                                             \
        (c) = CBU_SENTINEL;                                                \
      }                                                                    \
    }                                                                      \
  }
208
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) once per byte written.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the
 * string. Otherwise, the result is undefined.
 *
 * Encoded length by range: up to 0x7f one byte, up to 0x7ff two bytes,
 * up to 0xffff three bytes, anything larger four bytes.
 *
 * The arguments are evaluated more than once and must not have side effects.
 *
 * @param s uint8_t * string buffer (written to, so it must not be const)
 * @param i string offset, must be a modifiable lvalue
 * @param c code point to append
 * @stable ICU 2.4
 */
#define CBU8_APPEND_UNSAFE(s, i, c)                        \
  {                                                        \
    if ((uint32_t)(c) <= 0x7f) {                           \
      (s)[(i)++] = (uint8_t)(c);                           \
    } else if ((uint32_t)(c) <= 0x7ff) {                   \
      (s)[(i)++] = (uint8_t)(((c) >> 6) | 0xc0);           \
      (s)[(i)++] = (uint8_t)(((c) & 0x3f) | 0x80);         \
    } else if ((uint32_t)(c) <= 0xffff) {                  \
      (s)[(i)++] = (uint8_t)(((c) >> 12) | 0xe0);          \
      (s)[(i)++] = (uint8_t)((((c) >> 6) & 0x3f) | 0x80);  \
      (s)[(i)++] = (uint8_t)(((c) & 0x3f) | 0x80);         \
    } else {                                               \
      (s)[(i)++] = (uint8_t)(((c) >> 18) | 0xf0);          \
      (s)[(i)++] = (uint8_t)((((c) >> 12) & 0x3f) | 0x80); \
      (s)[(i)++] = (uint8_t)((((c) >> 6) & 0x3f) | 0x80);  \
      (s)[(i)++] = (uint8_t)(((c) & 0x3f) | 0x80);         \
    }                                                      \
  }
242
243// UTF-16 macros ---------------------------------------------------------------
244// from utf16.h
245
/**
 * Does this code unit alone encode a code point (BMP, not a surrogate)?
 * Implemented as the negation of CBU_IS_SURROGATE.
 *
 * The whole replacement is wrapped in parentheses, like the other predicates
 * in this header, so the macro behaves as a single operand wherever it is
 * expanded.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SINGLE(c) (!CBU_IS_SURROGATE(c))
253
/**
 * Tests whether a code unit is a lead surrogate (U+d800..U+dbff).
 * Masking off the low 10 bits leaves 0xd800 exactly for that range.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_LEAD(c) \
  (((c) & 0xfffffc00) == 0xd800)
261
/**
 * Tests whether a code unit is a trail surrogate (U+dc00..U+dfff).
 * Masking off the low 10 bits leaves 0xdc00 exactly for that range.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_TRAIL(c) \
  (((c) & 0xfffffc00) == 0xdc00)
269
/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 * Plain alias for CBU_IS_SURROGATE, provided under the UTF-16 naming scheme.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c)
277
/**
 * Given that c is already known to be a surrogate code unit
 * (CBU16_IS_SURROGATE(c)), tests whether it is a lead surrogate.
 * Only bit 10 is inspected, so the result is meaningless for non-surrogates.
 *
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SURROGATE_LEAD(c) \
  (((c) & 0x400) == 0)
286
/**
 * Helper constant for CBU16_GET_SUPPLEMENTARY.
 *
 * It combines the lead surrogate base shifted left by 10, the trail
 * surrogate base, and the 0x10000 supplementary start, so that a single
 * subtraction converts "(lead << 10) + trail" into the code point.
 * @internal
 */
#define CBU16_SURROGATE_OFFSET \
  ((0xd800 << 10UL) + 0xdc00 - 0x10000)
292
/**
 * Combine a lead and a trail surrogate into the supplementary code point
 * (U+10000..U+10ffff) they encode.
 *
 * Both inputs are widened to base_icu::UChar32 before the arithmetic. The
 * result is undefined if the inputs are not a lead and a trail surrogate.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define CBU16_GET_SUPPLEMENTARY(lead, trail) \
  (((base_icu::UChar32)(lead) << 10UL) +     \
   (base_icu::UChar32)(trail) - CBU16_SURROGATE_OFFSET)
306
307
/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 *
 * The value is computed as (supplementary >> 10) + 0xd7c0 and converted to
 * base_icu::UChar. The whole replacement is wrapped in parentheses, like the
 * other macros in this header, so the cast expression behaves as a single
 * operand wherever it is expanded.
 *
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define CBU16_LEAD(supplementary) \
    ((base_icu::UChar)(((supplementary) >> 10) + 0xd7c0))
317
/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 *
 * The value is the low 10 bits of the code point OR-ed with 0xdc00,
 * converted to base_icu::UChar. The whole replacement is wrapped in
 * parentheses, like the other macros in this header, so the cast expression
 * behaves as a single operand wherever it is expanded.
 *
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define CBU16_TRAIL(supplementary) \
    ((base_icu::UChar)(((supplementary) & 0x3ff) | 0xdc00))
327
/**
 * Number of 16-bit code units needed to encode a Unicode code point in
 * UTF-16: 2 for anything above U+ffff, otherwise 1.
 * The result is not defined if c is not a Unicode code point
 * (U+0000..U+10ffff).
 *
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define CBU16_LENGTH(c) \
  ((uint32_t)(c) > 0xffff ? 2 : 1)
336
/**
 * The maximum number of 16-bit code units per Unicode code point
 * (U+0000..U+10ffff). This matches the largest value CBU16_LENGTH can return.
 * @return 2
 * @stable ICU 2.4
 */
#define CBU16_MAX_LENGTH 2
343
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The unit at the offset is read and the offset is incremented. If that unit
 * is a lead surrogate, the string has not ended, and the next unit is a
 * trail surrogate, the pair is combined into a supplementary code point and
 * the offset moves past the trail unit as well. In every other case
 * (including a trail surrogate or an unpaired lead surrogate) the unit
 * itself is returned as the code point.
 *
 * The arguments are evaluated more than once and must not have side effects;
 * i and c must be modifiable lvalues.
 *
 * @param s const UChar * string
 * @param i string offset, i<length
 * @param length string length
 * @param c output UChar32 variable
 * @stable ICU 2.4
 */
#define CBU16_NEXT(s, i, length, c)                             \
  {                                                             \
    (c) = (s)[(i)++];                                           \
    if (CBU16_IS_LEAD(c) && (i) < (length)) {                   \
      const uint16_t cbu16_next_trail_ = (s)[(i)];              \
      if (CBU16_IS_TRAIL(cbu16_next_trail_)) {                  \
        ++(i);                                                  \
        (c) = CBU16_GET_SUPPLEMENTARY((c), cbu16_next_trail_);  \
      }                                                         \
    }                                                           \
  }
374
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) once per code unit written.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the
 * string. Otherwise, the result is undefined.
 *
 * Code points above U+ffff are written as a lead/trail surrogate pair;
 * everything else is written as a single unit.
 *
 * The arguments are evaluated more than once and must not have side effects.
 *
 * @param s UChar * string buffer (written to, so it must not be const)
 * @param i string offset, must be a modifiable lvalue
 * @param c code point to append
 * @stable ICU 2.4
 */
#define CBU16_APPEND_UNSAFE(s, i, c)                   \
  {                                                    \
    if ((uint32_t)(c) > 0xffff) {                      \
      (s)[(i)++] = (uint16_t)(((c) >> 10) + 0xd7c0);   \
      (s)[(i)++] = (uint16_t)(((c) & 0x3ff) | 0xdc00); \
    } else {                                           \
      (s)[(i)++] = (uint16_t)(c);                      \
    }                                                  \
  }
397
}  // namespace base_icu
399
400#endif  // BASE_THIRD_PARTY_ICU_ICU_UTF_H_
401