1e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *
3e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *                     The LLVM Compiler Infrastructure
4e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *
5e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * This file is distributed under the University of Illinois Open Source
6e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * License. See LICENSE.TXT for details.
7e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *
8e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *===------------------------------------------------------------------------=*/
9e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/*
10e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Copyright 2001-2004 Unicode, Inc.
11e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *
12e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Disclaimer
13e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *
14e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * This source code is provided as is by Unicode, Inc. No claims are
15e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * made as to fitness for any particular purpose. No warranties of any
16e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * kind are expressed or implied. The recipient agrees to determine
17e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * applicability of information provided. If this file has been
18e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * purchased on magnetic or optical media from Unicode, Inc., the
19e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * sole remedy for any claim will be exchange of defective media
20e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * within 90 days of receipt.
21e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *
22e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Limitations on Rights to Redistribute This Code
23e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *
24e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Unicode, Inc. hereby grants the right to freely use the information
25e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * supplied in this file in the creation of products supporting the
26e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Unicode Standard, and to make copies of this file in any form
27e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * for internal or external distribution as long as this notice
28e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * remains attached.
29e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff */
30e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
31e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* ---------------------------------------------------------------------
32e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
33e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
34e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    Author: Mark E. Davis, 1994.
35e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    Rev History: Rick McGowan, fixes & updates May 2001.
36e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    Sept 2001: fixed const & error conditions per
371eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        mods suggested by S. Parent & A. Lillich.
38e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    June 2002: Tim Dodd added detection and handling of incomplete
391eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        source sequences, enhanced error detection, added casts
401eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        to eliminate compiler warnings.
41e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    July 2003: slight mods to back out aggressive FFFE detection.
42e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    Jan 2004: updated switches in from-UTF8 conversions.
43e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
44e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
45e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    See the header file "ConvertUTF.h" for complete documentation.
46e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
47e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff------------------------------------------------------------------------ */
48e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
49e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
50e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff#include "clang/Basic/ConvertUTF.h"
51e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff#ifdef CVTUTF_DEBUG
52e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff#include <stdio.h>
53e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff#endif
54e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
55e9b7d8ace8674585818990cff585daae7745bd88Steve Naroffstatic const int halfShift  = 10; /* used for shifting by 10 bits */
56e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
57e9b7d8ace8674585818990cff585daae7745bd88Steve Naroffstatic const UTF32 halfBase = 0x0010000UL;
58e9b7d8ace8674585818990cff585daae7745bd88Steve Naroffstatic const UTF32 halfMask = 0x3FFUL;
59e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
/* Inclusive bounds of the UTF-16 high (leading) and low (trailing)
 * surrogate ranges. */
#define UNI_SUR_HIGH_START  ((UTF32)0xD800)
#define UNI_SUR_HIGH_END    ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START   ((UTF32)0xDC00)
#define UNI_SUR_LOW_END     ((UTF32)0xDFFF)

/* Boolean constants used by this file. */
#define false 0
#define true  1
66e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
67e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* --------------------------------------------------------------------- */
68e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
/*
 * Number of trailing bytes that are supposed to follow a UTF-8 lead byte;
 * index the table with the first byte of the sequence.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries are
 * kept for anyone who wants to handle the longer forms that earlier
 * algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
86e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
87e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/*
88e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Magic values subtracted from a buffer value during UTF8 conversion.
89e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * This table contains as many values as there might be trailing bytes
90e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * in a UTF-8 sequence.
91e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff */
92e9b7d8ace8674585818990cff585daae7745bd88Steve Naroffstatic const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
931eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
94e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
95e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/*
96e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
97e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * into the first byte, depending on how many bytes follow.  There are
98e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * as many entries in this table as there are UTF-8 sequence types.
99e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
100e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * for *legal* UTF-8 will be 4 or fewer bytes total.
101e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff */
102e9b7d8ace8674585818990cff585daae7745bd88Steve Naroffstatic const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
103e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
104e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* --------------------------------------------------------------------- */
105e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
106e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* The interface converts a whole buffer to avoid function-call overhead.
107e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Constants have been gathered. Loops & conditionals have been removed as
108e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * much as possible for efficiency, in favor of drop-through switches.
109e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * (See "Note A" at the bottom of the file for equivalent code.)
110e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * If your compiler supports it, the "isLegalUTF8" call can be turned
111e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * into an inline function.
112e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff */
113e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
114e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
115e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* --------------------------------------------------------------------- */
116e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
117e9b7d8ace8674585818990cff585daae7745bd88Steve NaroffConversionResult ConvertUTF32toUTF16 (
1181eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF32** sourceStart, const UTF32* sourceEnd,
1191eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
120e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    ConversionResult result = conversionOK;
121e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    const UTF32* source = *sourceStart;
122e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    UTF16* target = *targetStart;
123e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    while (source < sourceEnd) {
1241eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF32 ch;
1251eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (target >= targetEnd) {
1261eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            result = targetExhausted; break;
1271eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
1281eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        ch = *source++;
1291eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
1301eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
1311eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
1321eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                if (flags == strictConversion) {
1331eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    --source; /* return to the illegal value itself */
1341eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    result = sourceIllegal;
1351eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    break;
1361eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                } else {
1371eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    *target++ = UNI_REPLACEMENT_CHAR;
1381eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                }
1391eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            } else {
1401eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                *target++ = (UTF16)ch; /* normal case */
1411eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
1421eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (ch > UNI_MAX_LEGAL_UTF32) {
1431eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (flags == strictConversion) {
1441eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = sourceIllegal;
1451eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            } else {
1461eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                *target++ = UNI_REPLACEMENT_CHAR;
1471eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
1481eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else {
1491eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* target is a character in range 0xFFFF - 0x10FFFF. */
1501eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (target + 1 >= targetEnd) {
1511eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                --source; /* Back up source pointer! */
1521eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = targetExhausted; break;
1531eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
1541eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            ch -= halfBase;
1551eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
1561eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
1571eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
158e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    }
159e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *sourceStart = source;
160e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *targetStart = target;
161e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    return result;
162e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff}
163e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
164e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* --------------------------------------------------------------------- */
165e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
166e9b7d8ace8674585818990cff585daae7745bd88Steve NaroffConversionResult ConvertUTF16toUTF32 (
1671eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF16** sourceStart, const UTF16* sourceEnd,
1681eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
169e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    ConversionResult result = conversionOK;
170e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    const UTF16* source = *sourceStart;
171e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    UTF32* target = *targetStart;
172e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    UTF32 ch, ch2;
173e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    while (source < sourceEnd) {
1741eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
1751eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        ch = *source++;
1761eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        /* If we have a surrogate pair, convert to UTF32 first. */
1771eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1781eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* If the 16 bits following the high surrogate are in the source buffer... */
1791eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (source < sourceEnd) {
1801eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                ch2 = *source;
1811eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                /* If it's a low surrogate, convert to UTF32. */
1821eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
1831eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1841eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
1851eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    ++source;
1861eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
1871eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    --source; /* return to the illegal value itself */
1881eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    result = sourceIllegal;
1891eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    break;
1901eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                }
1911eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            } else { /* We don't have the 16 bits following the high surrogate. */
1921eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                --source; /* return to the high surrogate */
1931eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = sourceExhausted;
1941eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                break;
1951eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
1961eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (flags == strictConversion) {
1971eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* UTF-16 surrogate values are illegal in UTF-32 */
1981eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
1991eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                --source; /* return to the illegal value itself */
2001eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = sourceIllegal;
2011eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                break;
2021eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
2031eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
2041eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (target >= targetEnd) {
2051eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            source = oldSource; /* Back up source pointer! */
2061eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            result = targetExhausted; break;
2071eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
2081eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        *target++ = ch;
209e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    }
210e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *sourceStart = source;
211e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *targetStart = target;
212e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff#ifdef CVTUTF_DEBUG
213e9b7d8ace8674585818990cff585daae7745bd88Steve Naroffif (result == sourceIllegal) {
214e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
215e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    fflush(stderr);
216e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff}
217e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff#endif
218e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    return result;
219e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff}
220e9b7d8ace8674585818990cff585daae7745bd88Steve NaroffConversionResult ConvertUTF16toUTF8 (
2211eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF16** sourceStart, const UTF16* sourceEnd,
2221eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
223e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    ConversionResult result = conversionOK;
224e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    const UTF16* source = *sourceStart;
225e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    UTF8* target = *targetStart;
226e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    while (source < sourceEnd) {
2271eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF32 ch;
2281eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        unsigned short bytesToWrite = 0;
2291eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF32 byteMask = 0xBF;
2301eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF32 byteMark = 0x80;
2311eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
2321eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        ch = *source++;
2331eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        /* If we have a surrogate pair, convert to UTF32 first. */
2341eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
2351eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* If the 16 bits following the high surrogate are in the source buffer... */
2361eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (source < sourceEnd) {
2371eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                UTF32 ch2 = *source;
2381eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                /* If it's a low surrogate, convert to UTF32. */
2391eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
2401eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
2411eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
2421eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    ++source;
2431eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
2441eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    --source; /* return to the illegal value itself */
2451eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    result = sourceIllegal;
2461eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    break;
2471eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                }
2481eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            } else { /* We don't have the 16 bits following the high surrogate. */
2491eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                --source; /* return to the high surrogate */
2501eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = sourceExhausted;
2511eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                break;
2521eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
2531eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (flags == strictConversion) {
2541eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* UTF-16 surrogate values are illegal in UTF-32 */
2551eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
2561eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                --source; /* return to the illegal value itself */
2571eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = sourceIllegal;
2581eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                break;
2591eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
2601eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
2611eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        /* Figure out how many bytes the result will require */
2621eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
2631eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
2641eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
2651eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
2661eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else {                            bytesToWrite = 3;
2671eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                                            ch = UNI_REPLACEMENT_CHAR;
2681eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
2691eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump
2701eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        target += bytesToWrite;
2711eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (target > targetEnd) {
2721eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            source = oldSource; /* Back up source pointer! */
2731eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            target -= bytesToWrite; result = targetExhausted; break;
2741eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
2751eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        switch (bytesToWrite) { /* note: everything falls through. */
2761eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
2771eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
2781eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
2791eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
2801eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
2811eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        target += bytesToWrite;
282e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    }
283e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *sourceStart = source;
284e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *targetStart = target;
285e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    return result;
286e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff}
287e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
288e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* --------------------------------------------------------------------- */
289e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
290e9b7d8ace8674585818990cff585daae7745bd88Steve NaroffConversionResult ConvertUTF32toUTF8 (
2911eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF32** sourceStart, const UTF32* sourceEnd,
2921eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
293e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    ConversionResult result = conversionOK;
294e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    const UTF32* source = *sourceStart;
295e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    UTF8* target = *targetStart;
296e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    while (source < sourceEnd) {
2971eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF32 ch;
2981eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        unsigned short bytesToWrite = 0;
2991eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF32 byteMask = 0xBF;
3001eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF32 byteMark = 0x80;
3011eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        ch = *source++;
3021eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (flags == strictConversion ) {
3031eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* UTF-16 surrogate values are illegal in UTF-32 */
3041eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
3051eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                --source; /* return to the illegal value itself */
3061eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = sourceIllegal;
3071eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                break;
3081eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
3091eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
3101eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        /*
3111eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump         * Figure out how many bytes the result will require. Turn any
3121eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump         * illegally large UTF32 things (> Plane 17) into replacement chars.
3131eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump         */
3141eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
3151eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
3161eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
3171eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
3181eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else {                            bytesToWrite = 3;
3191eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                                            ch = UNI_REPLACEMENT_CHAR;
3201eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                                            result = sourceIllegal;
3211eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
3221eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump
3231eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        target += bytesToWrite;
3241eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (target > targetEnd) {
3251eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            --source; /* Back up source pointer! */
3261eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            target -= bytesToWrite; result = targetExhausted; break;
3271eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
3281eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        switch (bytesToWrite) { /* note: everything falls through. */
3291eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3301eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3311eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3321eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
3331eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
3341eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        target += bytesToWrite;
335e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    }
336e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *sourceStart = source;
337e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *targetStart = target;
338e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    return result;
339e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff}
340e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
341e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* --------------------------------------------------------------------- */
342e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
343e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/*
344e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Utility routine to tell whether a sequence of bytes is legal UTF-8.
345e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * This must be called with the length pre-determined by the first byte.
346e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * If not calling this from ConvertUTF8to*, then the length can be set by:
347e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff *  length = trailingBytesForUTF8[*source]+1;
348e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * and the sequence is illegal right away if there aren't that many bytes
349e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * available.
350e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * If presented with a length > 4, this returns false.  The Unicode
351e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * definition of UTF-8 goes up to 4-byte sequences.
352e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff */
353e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
354e9b7d8ace8674585818990cff585daae7745bd88Steve Naroffstatic Boolean isLegalUTF8(const UTF8 *source, int length) {
355e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    UTF8 a;
356e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    const UTF8 *srcptr = source+length;
357e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    switch (length) {
358e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    default: return false;
3591eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        /* Everything else falls through when "true"... */
360e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
361e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
362e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    case 2: if ((a = (*--srcptr)) > 0xBF) return false;
363e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
3641eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        switch (*source) {
3651eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* no fall-through in this inner switch */
3661eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 0xE0: if (a < 0xA0) return false; break;
3671eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 0xED: if (a > 0x9F) return false; break;
3681eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 0xF0: if (a < 0x90) return false; break;
3691eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 0xF4: if (a > 0x8F) return false; break;
3701eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            default:   if (a < 0x80) return false;
3711eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
372e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
373e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
374e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    }
375e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    if (*source > 0xF4) return false;
376e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    return true;
377e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff}
378e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
379e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* --------------------------------------------------------------------- */
380e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
381e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/*
382e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * Exported function to return whether a UTF-8 sequence is legal or not.
383e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff * This is not used here; it's just exported.
384e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff */
385e9b7d8ace8674585818990cff585daae7745bd88Steve NaroffBoolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
386e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    int length = trailingBytesForUTF8[*source]+1;
38749d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith    if (length > sourceEnd - source) {
3881eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        return false;
389e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    }
390e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    return isLegalUTF8(source, length);
391e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff}
392e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
393e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff/* --------------------------------------------------------------------- */
394e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
39549d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith/*
396e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith * Exported function to return the total number of bytes in a codepoint
397e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith * represented in UTF-8, given the value of the first byte.
398e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith */
399e5f0588840b20897631cc8110344fd2745ef4caaRichard Smithunsigned getNumBytesForUTF8(UTF8 first) {
400e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith  return trailingBytesForUTF8[first] + 1;
401e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith}
402e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith
403e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith/* --------------------------------------------------------------------- */
404e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith
405e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith/*
40649d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith * Exported function to return whether a UTF-8 string is legal or not.
40749d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith * This is not used here; it's just exported.
40849d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith */
409e5f0588840b20897631cc8110344fd2745ef4caaRichard SmithBoolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
410e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith    while (*source != sourceEnd) {
411e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith        int length = trailingBytesForUTF8[**source] + 1;
412e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
41349d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith            return false;
414e5f0588840b20897631cc8110344fd2745ef4caaRichard Smith        *source += length;
41549d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith    }
41649d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith    return true;
41749d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith}
41849d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith
41949d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith/* --------------------------------------------------------------------- */
42049d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith
421e9b7d8ace8674585818990cff585daae7745bd88Steve NaroffConversionResult ConvertUTF8toUTF16 (
4221eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        const UTF8** sourceStart, const UTF8* sourceEnd,
4231eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
424e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    ConversionResult result = conversionOK;
425e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    const UTF8* source = *sourceStart;
426e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    UTF16* target = *targetStart;
427e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    while (source < sourceEnd) {
4281eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        UTF32 ch = 0;
4291eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
43049d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith        if (extraBytesToRead >= sourceEnd - source) {
4311eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            result = sourceExhausted; break;
4321eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
4331eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        /* Do this check whether lenient or strict */
4341eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (!isLegalUTF8(source, extraBytesToRead+1)) {
4351eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            result = sourceIllegal;
4361eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            break;
4371eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
4381eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        /*
4391eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump         * The cases all fall through. See "Note A" below.
4401eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump         */
4411eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        switch (extraBytesToRead) {
4421eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
4431eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
4441eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 3: ch += *source++; ch <<= 6;
4451eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 2: ch += *source++; ch <<= 6;
4461eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 1: ch += *source++; ch <<= 6;
4471eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            case 0: ch += *source++;
4481eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
4491eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        ch -= offsetsFromUTF8[extraBytesToRead];
4501eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump
4511eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (target >= targetEnd) {
4521eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            source -= (extraBytesToRead+1); /* Back up source pointer! */
4531eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            result = targetExhausted; break;
4541eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
4551eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
4561eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* UTF-16 surrogate values are illegal in UTF-32 */
4571eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
4581eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                if (flags == strictConversion) {
4591eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
4601eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    result = sourceIllegal;
4611eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    break;
4621eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                } else {
4631eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                    *target++ = UNI_REPLACEMENT_CHAR;
4641eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                }
4651eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            } else {
4661eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                *target++ = (UTF16)ch; /* normal case */
4671eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
4681eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else if (ch > UNI_MAX_UTF16) {
4691eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (flags == strictConversion) {
4701eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = sourceIllegal;
4711eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                source -= (extraBytesToRead+1); /* return to the start */
4721eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                break; /* Bail out; shouldn't continue */
4731eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            } else {
4741eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                *target++ = UNI_REPLACEMENT_CHAR;
4751eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
4761eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        } else {
4771eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            /* target is a character in range 0xFFFF - 0x10FFFF. */
4781eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            if (target + 1 >= targetEnd) {
4791eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                source -= (extraBytesToRead+1); /* Back up source pointer! */
4801eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump                result = targetExhausted; break;
4811eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            }
4821eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            ch -= halfBase;
4831eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
4841eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
4851eb4433ac451dc16f4133a88af2d002ac26c58efMike Stump        }
486e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    }
487e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *sourceStart = source;
488e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    *targetStart = target;
489e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff    return result;
490e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff}
491e9b7d8ace8674585818990cff585daae7745bd88Steve Naroff
492436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman/* --------------------------------------------------------------------- */
493436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman
494436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli FriedmanConversionResult ConvertUTF8toUTF32 (
495436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        const UTF8** sourceStart, const UTF8* sourceEnd,
496436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
497436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman    ConversionResult result = conversionOK;
498436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman    const UTF8* source = *sourceStart;
499436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman    UTF32* target = *targetStart;
500436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman    while (source < sourceEnd) {
501436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        UTF32 ch = 0;
502436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
50349d517467c3dcd2c67e8a6c740ba5160e37625f7Richard Smith        if (extraBytesToRead >= sourceEnd - source) {
504436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            result = sourceExhausted; break;
505436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        }
506436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        /* Do this check whether lenient or strict */
507436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        if (!isLegalUTF8(source, extraBytesToRead+1)) {
508436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            result = sourceIllegal;
509436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            break;
510436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        }
511436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        /*
512436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman         * The cases all fall through. See "Note A" below.
513436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman         */
514436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        switch (extraBytesToRead) {
515436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            case 5: ch += *source++; ch <<= 6;
516436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            case 4: ch += *source++; ch <<= 6;
517436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            case 3: ch += *source++; ch <<= 6;
518436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            case 2: ch += *source++; ch <<= 6;
519436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            case 1: ch += *source++; ch <<= 6;
520436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            case 0: ch += *source++;
521436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        }
522436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        ch -= offsetsFromUTF8[extraBytesToRead];
523436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman
524436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        if (target >= targetEnd) {
525436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            source -= (extraBytesToRead+1); /* Back up the source pointer! */
526436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            result = targetExhausted; break;
527436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        }
528436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        if (ch <= UNI_MAX_LEGAL_UTF32) {
529436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            /*
530436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman             * UTF-16 surrogate values are illegal in UTF-32, and anything
531436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman             * over Plane 17 (> 0x10FFFF) is illegal.
532436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman             */
533436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
534436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman                if (flags == strictConversion) {
535436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
536436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman                    result = sourceIllegal;
537436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman                    break;
538436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman                } else {
539436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman                    *target++ = UNI_REPLACEMENT_CHAR;
540436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman                }
541436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            } else {
542436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman                *target++ = ch;
543436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            }
544436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
545436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            result = sourceIllegal;
546436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman            *target++ = UNI_REPLACEMENT_CHAR;
547436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman        }
548436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman    }
549436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman    *sourceStart = source;
550436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman    *targetStart = target;
551436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman    return result;
552436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman}
553436ecd959954db0e11c8daf64b3d6b6b6d0eba55Eli Friedman
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */
572