1/*
2*******************************************************************************
3*
4*   Copyright (C) 1999-2004, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  utf16.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 1999sep09
14*   created by: Markus W. Scherer
15*/
16
/**
 * \file
 * \brief C API: 16-bit Unicode handling macros
 *
 * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
 * utf16.h is included by utf.h after unicode/umachine.h
 * and some common definitions.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (http://oss.software.ibm.com/icu/userguide/).
 *
 * <em>Usage:</em>
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies and all macro statements should be terminated with a semicolon.
 */
33
34#ifndef __UTF16_H__
35#define __UTF16_H__
36
37/* utf.h must be included first. */
38#ifndef __UTF_H__
39#   include "unicode/utf.h"
40#endif
41
/* single-code point definitions -------------------------------------------- */
43
/**
 * Does this code unit encode a code point all by itself?
 * That is the case for every BMP code unit that is not a surrogate,
 * i.e. the result is the negation of U_IS_SURROGATE(c).
 * The expansion is fully parenthesized so it can be embedded in any expression.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))
51
/**
 * Is this code unit a lead surrogate (U+d800..U+dbff)?
 * The low 10 bits are masked off; what remains equals 0xd800
 * exactly for the lead surrogate range.
 * The argument is evaluated once.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
59
/**
 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
 * The low 10 bits are masked off; what remains equals 0xdc00
 * exactly for the trail surrogate range.
 * The argument is evaluated once.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
67
/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 * UTF-16 alias for U_IS_SURROGATE(c), which does the actual test.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) (U_IS_SURROGATE(c))
75
/**
 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
 * is it a lead surrogate?
 * Only bit 10 (0x400) is inspected: it is clear for lead surrogates
 * and set for trail surrogates. The result is meaningless if c is
 * not a surrogate.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
84
/**
 * Helper constant for U16_GET_SUPPLEMENTARY.
 * It is the value of (lead<<10)+trail for the first surrogate pair
 * (0xd800, 0xdc00) minus 0x10000, the first supplementary code point.
 * Subtracting it from (lead<<10)+trail therefore yields the code point.
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
90
/**
 * Combine a lead and a trail surrogate into the supplementary
 * code point (U+10000..U+10ffff) that they encode.
 * Both arguments are cast to UChar32 before the arithmetic and
 * each is evaluated exactly once.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
104
105
/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * 0xd7c0 is 0xd800-(0x10000>>10), so adding it to the shifted code point
 * lands in the lead surrogate range.
 * The expansion is fully parenthesized so the cast cannot be split off
 * by a neighbouring operator (e.g. sizeof).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
114
/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The low 10 bits of the code point are placed on top of 0xdc00.
 * The expansion is fully parenthesized so the cast cannot be split off
 * by a neighbouring operator (e.g. sizeof).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
123
/**
 * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
 * Code points up to U+ffff take one unit, everything above takes two.
 * The comparison is done on the value cast to uint32_t.
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
132
/**
 * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff):
 * a surrogate pair, i.e. the larger of the two values U16_LENGTH can return.
 * @return 2
 * @stable ICU 2.4
 */
#define U16_MAX_LENGTH 2
139
/**
 * Get the code point that contains the code unit at a random-access offset,
 * without changing the offset.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point: for a lead surrogate the unit at i+1
 * is combined with it, for a trail surrogate the unit at i-1.
 * No boundary or pairing checks are made, so the result is undefined
 * if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_GET
 * @stable ICU 2.4
 */
#define U16_GET_UNSAFE(s, i, c) { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            /* c is the lead unit, the trail unit follows it */ \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } else { \
            /* c is the trail unit, the lead unit precedes it */ \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } \
    } \
}
167
/**
 * Get the code point that contains the code unit at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 * The neighbour is only read if it lies inside [start, length[;
 * the lower bound is tested as i>start so that unsigned offsets
 * cannot wrap around at i==0.
 * If the offset points to a single, unpaired surrogate, then that itself
 * will be returned as the code point.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U16_GET(s, start, i, length, c) { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            /* lead unit: pair it with a following trail unit, if any */ \
            if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } else { \
            /* trail unit: pair it with a preceding lead unit, if any */ \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } \
        } \
    } \
}
203
/* definitions with forward iteration --------------------------------------- */
205
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following unit as the trail surrogate, without checking it,
 * and advance past both.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        /* the next unit is taken to be the matching trail surrogate */ \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
}
231
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well, provided that it lies
 * before length.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point and the offset advances by one.
 *
 * @param s const UChar * string
 * @param i string offset, i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        /* peek at the next unit; consume it only if it completes the pair */ \
        if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
}
262
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) by the number of units written.
 * Code points up to U+ffff are stored as a single unit,
 * larger ones as a lead/trail surrogate pair.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U16_APPEND
 * @stable ICU 2.4
 */
#define U16_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        /* same arithmetic as U16_LEAD and U16_TRAIL */ \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}
284
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) by the number of units written.
 * "Safe" macro, checks for a valid code point.
 * If a surrogate pair is written, checks for sufficient space in the string.
 * If the code point is not valid or a trail surrogate does not fit,
 * then isError is set to TRUE and neither the buffer nor the offset
 * is modified.
 * Space for a single unit is not checked; the caller guarantees i<capacity.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset, i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U16_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U16_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
        /* supplementary code point and both units fit */ \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=TRUE; \
    } \
}
312
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * The offset moves by two units if the unit at i is a lead surrogate,
 * otherwise by one; the following unit is not inspected.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_FWD_1
 * @stable ICU 2.4
 */
#define U16_FWD_1_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        /* skip the trail unit as well */ \
        ++(i); \
    } \
}
328
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * The offset moves by two units only if the unit at i is a lead surrogate
 * and it is followed, before length, by a trail surrogate;
 * otherwise it moves by one.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param i string offset, i<length
 * @param length string length
 * @see U16_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_1(s, i, length) { \
    if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \
        /* complete surrogate pair: skip the trail unit as well */ \
        ++(i); \
    } \
}
345
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * n is evaluated once and copied into a local counter;
 * nothing happens if n is not positive.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_FWD_N
 * @stable ICU 2.4
 */
#define U16_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}
365
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * n is evaluated once and copied into a local counter.
 * Iteration stops early when the offset reaches length;
 * nothing happens if n is not positive.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param i string offset, i<length
 * @param length string length
 * @param n number of code points to skip
 * @see U16_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_N(s, i, length, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)<(length)) { \
        U16_FWD_1(s, i, length); \
        --__N; \
    } \
}
386
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a trail surrogate,
 * then the offset is decremented; the preceding unit is not inspected.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_START
 * @stable ICU 2.4
 */
#define U16_SET_CP_START_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[i])) { \
        /* step back onto the lead unit */ \
        --(i); \
    } \
}
405
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * i.e. a trail surrogate that is preceded, at or after start,
 * by a lead surrogate, then the offset is decremented.
 * Otherwise, it is not modified.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) { \
    if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        /* step back onto the lead unit of the pair */ \
        --(i); \
    } \
}
425
/* definitions with backward iteration -------------------------------------- */
427
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding unit as the lead surrogate, without checking it,
 * and move before both.
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        /* the preceding unit is taken to be the matching lead surrogate */ \
        (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    } \
}
454
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well, provided that it lies
 * at or after start.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then that itself
 * will be returned as the code point and the offset moves back by one.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U16_PREV(s, start, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        uint16_t __c2; \
        /* peek at the previous unit; consume it only if it completes the pair */ \
        if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
}
486
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * The offset moves back by two units if the unit before it is a trail
 * surrogate, otherwise by one; the unit before that is not inspected.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[--(i)])) { \
        /* step over the lead unit as well */ \
        --(i); \
    } \
}
503
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * The offset moves back by two units only if the unit before it is a trail
 * surrogate that is preceded, at or after start, by a lead surrogate;
 * otherwise it moves back by one.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i
 * @see U16_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_1(s, start, i) { \
    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        /* complete surrogate pair: step over the lead unit as well */ \
        --(i); \
    } \
}
521
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * n is evaluated once and copied into a local counter;
 * nothing happens if n is not positive.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}
542
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * n is evaluated once and copied into a local counter.
 * Iteration stops early when the offset reaches start;
 * nothing happens if n is not positive.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, start<=i
 * @param n number of code points to skip
 * @see U16_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_N(s, start, i, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U16_BACK_1(s, start, i); \
        --__N; \
    } \
}
564
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a lead surrogate,
 * then the offset is incremented; the unit at the offset is not inspected.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * The unit at i-1 is always read.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        /* step forward over the trail unit */ \
        ++(i); \
    } \
}
583
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * i.e. strictly between start and length with a lead surrogate before it
 * and a trail surrogate at it, then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i<=length
 * @param length string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) { \
    if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
        /* step forward over the trail unit of the pair */ \
        ++(i); \
    } \
}
604
605#endif
606