1f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *
3f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *                     The LLVM Compiler Infrastructure
4f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *
5f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * This file is distributed under the University of Illinois Open Source
6f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * License. See LICENSE.TXT for details.
7f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *
8f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *===------------------------------------------------------------------------=*/
9f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/*
10f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Copyright 2001-2004 Unicode, Inc.
11f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *
12f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Disclaimer
13f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *
14f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * This source code is provided as is by Unicode, Inc. No claims are
15f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * made as to fitness for any particular purpose. No warranties of any
16f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * kind are expressed or implied. The recipient agrees to determine
17f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * applicability of information provided. If this file has been
18f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * purchased on magnetic or optical media from Unicode, Inc., the
19f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * sole remedy for any claim will be exchange of defective media
20f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * within 90 days of receipt.
21f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *
22f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Limitations on Rights to Redistribute This Code
23f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *
24f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Unicode, Inc. hereby grants the right to freely use the information
25f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * supplied in this file in the creation of products supporting the
26f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Unicode Standard, and to make copies of this file in any form
27f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * for internal or external distribution as long as this notice
28f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * remains attached.
29f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */
30f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
31f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* ---------------------------------------------------------------------
32f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
33f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
34f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    Author: Mark E. Davis, 1994.
35f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    Rev History: Rick McGowan, fixes & updates May 2001.
36f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    Sept 2001: fixed const & error conditions per
37f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        mods suggested by S. Parent & A. Lillich.
38f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    June 2002: Tim Dodd added detection and handling of incomplete
39f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        source sequences, enhanced error detection, added casts
40f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        to eliminate compiler warnings.
41f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    July 2003: slight mods to back out aggressive FFFE detection.
42f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    Jan 2004: updated switches in from-UTF8 conversions.
43f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
44f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
45f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    See the header file "ConvertUTF.h" for complete documentation.
46f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
47f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)------------------------------------------------------------------------ */
48f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
49f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
50f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "llvm/Support/ConvertUTF.h"
51f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#ifdef CVTUTF_DEBUG
52f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include <stdio.h>
53f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#endif
54f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include <assert.h>
55f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
56f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const int halfShift  = 10; /* used for shifting by 10 bits */
57f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
58f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const UTF32 halfBase = 0x0010000UL;
59f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const UTF32 halfMask = 0x3FFUL;
60f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
61f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define UNI_SUR_HIGH_START  (UTF32)0xD800
62f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
63f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define UNI_SUR_LOW_START   (UTF32)0xDC00
64f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define UNI_SUR_LOW_END     (UTF32)0xDFFF
65f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define false      0
66f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define true        1
67f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
68f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */
69f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
/*
 * Lookup table: index it with the first byte of a UTF-8 sequence to get
 * the number of trailing bytes that are supposed to follow that byte.
 *
 * Bytes 0x00-0xBF map to 0.  Note that *legal* UTF-8 never has 4 or 5
 * trailing bytes; the entries for 0xF8-0xFF are left as-is for anyone who
 * may want to do such conversion, which was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,5,5
};
87f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
88f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/*
89f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Magic values subtracted from a buffer value during UTF8 conversion.
90f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * This table contains as many values as there might be trailing bytes
91f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * in a UTF-8 sequence.
92f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */
93f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
94f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
95f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
96f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/*
97f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
98f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * into the first byte, depending on how many bytes follow.  There are
99f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * as many entries in this table as there are UTF-8 sequence types.
100f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
101f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * for *legal* UTF-8 will be 4 or fewer bytes total.
102f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */
103f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
104f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
105f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */
106f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
107f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* The interface converts a whole buffer to avoid function-call overhead.
108f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Constants have been gathered. Loops & conditionals have been removed as
109f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * much as possible for efficiency, in favor of drop-through switches.
110f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * (See "Note A" at the bottom of the file for equivalent code.)
111f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * If your compiler supports it, the "isLegalUTF8" call can be turned
112f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * into an inline function.
113f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */
114f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
115f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
116f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */
117f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
118f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)ConversionResult ConvertUTF32toUTF16 (
119f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF32** sourceStart, const UTF32* sourceEnd,
120f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
121f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    ConversionResult result = conversionOK;
122f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const UTF32* source = *sourceStart;
123f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    UTF16* target = *targetStart;
124f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    while (source < sourceEnd) {
125f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        UTF32 ch;
126f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (target >= targetEnd) {
127f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            result = targetExhausted; break;
128f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
129f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        ch = *source++;
130f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
131f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
132f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
133f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                if (flags == strictConversion) {
134f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    --source; /* return to the illegal value itself */
135f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    result = sourceIllegal;
136f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    break;
137f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                } else {
138f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    *target++ = UNI_REPLACEMENT_CHAR;
139f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                }
140f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            } else {
141f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                *target++ = (UTF16)ch; /* normal case */
142f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            }
143f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (ch > UNI_MAX_LEGAL_UTF32) {
144f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            if (flags == strictConversion) {
145f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                result = sourceIllegal;
146f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            } else {
147f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                *target++ = UNI_REPLACEMENT_CHAR;
148f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            }
149f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else {
150f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            /* target is a character in range 0xFFFF - 0x10FFFF. */
151f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            if (target + 1 >= targetEnd) {
152f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                --source; /* Back up source pointer! */
153f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                result = targetExhausted; break;
154f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            }
155f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            ch -= halfBase;
156f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
157f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
158f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
159f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
160f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    *sourceStart = source;
161f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    *targetStart = target;
162f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return result;
163f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}
164f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
165f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */
166f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
167f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)ConversionResult ConvertUTF16toUTF32 (
168f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF16** sourceStart, const UTF16* sourceEnd,
169f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
170f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    ConversionResult result = conversionOK;
171f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const UTF16* source = *sourceStart;
172f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    UTF32* target = *targetStart;
173f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    UTF32 ch, ch2;
174f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    while (source < sourceEnd) {
175f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
176f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        ch = *source++;
177f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        /* If we have a surrogate pair, convert to UTF32 first. */
178f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
179f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            /* If the 16 bits following the high surrogate are in the source buffer... */
180f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            if (source < sourceEnd) {
181f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                ch2 = *source;
182f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                /* If it's a low surrogate, convert to UTF32. */
183f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
184f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
185f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
186f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    ++source;
187f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
188f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    --source; /* return to the illegal value itself */
189f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    result = sourceIllegal;
190f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    break;
191f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                }
192f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            } else { /* We don't have the 16 bits following the high surrogate. */
193f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                --source; /* return to the high surrogate */
194f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                result = sourceExhausted;
195f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                break;
196f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            }
197f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (flags == strictConversion) {
198f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            /* UTF-16 surrogate values are illegal in UTF-32 */
199f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
200f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                --source; /* return to the illegal value itself */
201f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                result = sourceIllegal;
202f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                break;
203f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            }
204f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
205f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (target >= targetEnd) {
206f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            source = oldSource; /* Back up source pointer! */
207f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            result = targetExhausted; break;
208f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
209f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        *target++ = ch;
210f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
211f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    *sourceStart = source;
212f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    *targetStart = target;
213f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#ifdef CVTUTF_DEBUG
214f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)if (result == sourceIllegal) {
215f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
216f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    fflush(stderr);
217f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}
218f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#endif
219f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return result;
220f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}
221f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)ConversionResult ConvertUTF16toUTF8 (
222f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF16** sourceStart, const UTF16* sourceEnd,
223f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
224f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    ConversionResult result = conversionOK;
225f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const UTF16* source = *sourceStart;
226f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    UTF8* target = *targetStart;
227f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    while (source < sourceEnd) {
228f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        UTF32 ch;
229f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        unsigned short bytesToWrite = 0;
230f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF32 byteMask = 0xBF;
231f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF32 byteMark = 0x80;
232f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
233f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        ch = *source++;
234f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        /* If we have a surrogate pair, convert to UTF32 first. */
235f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
236f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            /* If the 16 bits following the high surrogate are in the source buffer... */
237f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            if (source < sourceEnd) {
238f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                UTF32 ch2 = *source;
239f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                /* If it's a low surrogate, convert to UTF32. */
240f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
241f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
242f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
243f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    ++source;
244f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
245f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    --source; /* return to the illegal value itself */
246f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    result = sourceIllegal;
247f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                    break;
248f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                }
249f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            } else { /* We don't have the 16 bits following the high surrogate. */
250f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                --source; /* return to the high surrogate */
251f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                result = sourceExhausted;
252f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                break;
253f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            }
254f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (flags == strictConversion) {
255f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            /* UTF-16 surrogate values are illegal in UTF-32 */
256f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
257f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                --source; /* return to the illegal value itself */
258f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                result = sourceIllegal;
259f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                break;
260f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            }
261f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
262f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        /* Figure out how many bytes the result will require */
263f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
264f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
265f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
266f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
267f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else {                            bytesToWrite = 3;
268f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                                            ch = UNI_REPLACEMENT_CHAR;
269f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
270f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
271f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        target += bytesToWrite;
272f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (target > targetEnd) {
273f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            source = oldSource; /* Back up source pointer! */
274f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            target -= bytesToWrite; result = targetExhausted; break;
275f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
276f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        switch (bytesToWrite) { /* note: everything falls through. */
277f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
278f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
279f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
280f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
281f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
282f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        target += bytesToWrite;
283f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
284f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    *sourceStart = source;
285f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    *targetStart = target;
286f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return result;
287f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}
288f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
289f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */
290f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
291f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)ConversionResult ConvertUTF32toUTF8 (
292f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF32** sourceStart, const UTF32* sourceEnd,
293f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
294f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    ConversionResult result = conversionOK;
295f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const UTF32* source = *sourceStart;
296f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    UTF8* target = *targetStart;
297f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    while (source < sourceEnd) {
298f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        UTF32 ch;
299f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        unsigned short bytesToWrite = 0;
300f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF32 byteMask = 0xBF;
301f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        const UTF32 byteMark = 0x80;
302f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        ch = *source++;
303f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (flags == strictConversion ) {
304f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            /* UTF-16 surrogate values are illegal in UTF-32 */
305f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
306f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                --source; /* return to the illegal value itself */
307f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                result = sourceIllegal;
308f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                break;
309f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            }
310f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
311f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        /*
312f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)         * Figure out how many bytes the result will require. Turn any
313f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)         * illegally large UTF32 things (> Plane 17) into replacement chars.
314f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)         */
315f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
316f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
317f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
318f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
319f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        } else {                            bytesToWrite = 3;
320f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                                            ch = UNI_REPLACEMENT_CHAR;
321f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                                            result = sourceIllegal;
322f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
323f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
324f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        target += bytesToWrite;
325f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        if (target > targetEnd) {
326f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            --source; /* Back up source pointer! */
327f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            target -= bytesToWrite; result = targetExhausted; break;
328f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
329f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        switch (bytesToWrite) { /* note: everything falls through. */
330f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
331f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
332f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
333f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
334f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
335f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        target += bytesToWrite;
336f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
337f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    *sourceStart = source;
338f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    *targetStart = target;
339f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return result;
340f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}
341f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
342f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */
343f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
344f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/*
345f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Utility routine to tell whether a sequence of bytes is legal UTF-8.
346f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * This must be called with the length pre-determined by the first byte.
347f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * If not calling this from ConvertUTF8to*, then the length can be set by:
348f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *  length = trailingBytesForUTF8[*source]+1;
349f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * and the sequence is illegal right away if there aren't that many bytes
350f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * available.
351f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * If presented with a length > 4, this returns false.  The Unicode
352f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * definition of UTF-8 goes up to 4-byte sequences.
353f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */
354f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
355f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static Boolean isLegalUTF8(const UTF8 *source, int length) {
356f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    UTF8 a;
357f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    const UTF8 *srcptr = source+length;
358f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    switch (length) {
359f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    default: return false;
360f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        /* Everything else falls through when "true"... */
361f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
362f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
363f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
364f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
365f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        switch (*source) {
366f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            /* no fall-through in this inner switch */
367f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 0xE0: if (a < 0xA0) return false; break;
368f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 0xED: if (a > 0x9F) return false; break;
369f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 0xF0: if (a < 0x90) return false; break;
370f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            case 0xF4: if (a > 0x8F) return false; break;
371f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            default:   if (a < 0x80) return false;
372f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        }
373f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
374f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
375f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
376f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    if (*source > 0xF4) return false;
377f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return true;
378f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}
379f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
380f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */
381f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
382f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/*
383f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Exported function to return whether a UTF-8 sequence is legal or not.
384f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * This is not used here; it's just exported.
385f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */
386f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
387f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    int length = trailingBytesForUTF8[*source]+1;
388f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    if (length > sourceEnd - source) {
389f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        return false;
390f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    }
391f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return isLegalUTF8(source, length);
392f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}
393f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
394f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */
395f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
396f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static unsigned
397f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
398f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                                          const UTF8 *sourceEnd) {
399f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  UTF8 b1, b2, b3;
400f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
401f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  assert(!isLegalUTF8Sequence(source, sourceEnd));
402f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)
403f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  /*
404f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)   * Unicode 6.3.0, D93b:
405f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)   *
406f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)   *   Maximal subpart of an ill-formed subsequence: The longest code unit
407f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)   *   subsequence starting at an unconvertible offset that is either:
408   *   a. the initial subsequence of a well-formed code unit sequence, or
409   *   b. a subsequence of length one.
410   */
411
412  if (source == sourceEnd)
413    return 0;
414
415  /*
416   * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
417   * Byte Sequences.
418   */
419
420  b1 = *source;
421  ++source;
422  if (b1 >= 0xC2 && b1 <= 0xDF) {
423    /*
424     * First byte is valid, but we know that this code unit sequence is
425     * invalid, so the maximal subpart has to end after the first byte.
426     */
427    return 1;
428  }
429
430  if (source == sourceEnd)
431    return 1;
432
433  b2 = *source;
434  ++source;
435
436  if (b1 == 0xE0) {
437    return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
438  }
439  if (b1 >= 0xE1 && b1 <= 0xEC) {
440    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
441  }
442  if (b1 == 0xED) {
443    return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
444  }
445  if (b1 >= 0xEE && b1 <= 0xEF) {
446    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
447  }
448  if (b1 == 0xF0) {
449    if (b2 >= 0x90 && b2 <= 0xBF) {
450      if (source == sourceEnd)
451        return 2;
452
453      b3 = *source;
454      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
455    }
456    return 1;
457  }
458  if (b1 >= 0xF1 && b1 <= 0xF3) {
459    if (b2 >= 0x80 && b2 <= 0xBF) {
460      if (source == sourceEnd)
461        return 2;
462
463      b3 = *source;
464      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
465    }
466    return 1;
467  }
468  if (b1 == 0xF4) {
469    if (b2 >= 0x80 && b2 <= 0x8F) {
470      if (source == sourceEnd)
471        return 2;
472
473      b3 = *source;
474      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
475    }
476    return 1;
477  }
478
479  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
480  /*
481   * There are no valid sequences that start with these bytes.  Maximal subpart
482   * is defined to have length 1 in these cases.
483   */
484  return 1;
485}
486
487/* --------------------------------------------------------------------- */
488
489/*
490 * Exported function to return the total number of bytes in a codepoint
491 * represented in UTF-8, given the value of the first byte.
492 */
493unsigned getNumBytesForUTF8(UTF8 first) {
494  return trailingBytesForUTF8[first] + 1;
495}
496
497/* --------------------------------------------------------------------- */
498
499/*
500 * Exported function to return whether a UTF-8 string is legal or not.
501 * This is not used here; it's just exported.
502 */
503Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
504    while (*source != sourceEnd) {
505        int length = trailingBytesForUTF8[**source] + 1;
506        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
507            return false;
508        *source += length;
509    }
510    return true;
511}
512
513/* --------------------------------------------------------------------- */
514
515ConversionResult ConvertUTF8toUTF16 (
516        const UTF8** sourceStart, const UTF8* sourceEnd,
517        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
518    ConversionResult result = conversionOK;
519    const UTF8* source = *sourceStart;
520    UTF16* target = *targetStart;
521    while (source < sourceEnd) {
522        UTF32 ch = 0;
523        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
524        if (extraBytesToRead >= sourceEnd - source) {
525            result = sourceExhausted; break;
526        }
527        /* Do this check whether lenient or strict */
528        if (!isLegalUTF8(source, extraBytesToRead+1)) {
529            result = sourceIllegal;
530            break;
531        }
532        /*
533         * The cases all fall through. See "Note A" below.
534         */
535        switch (extraBytesToRead) {
536            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
537            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
538            case 3: ch += *source++; ch <<= 6;
539            case 2: ch += *source++; ch <<= 6;
540            case 1: ch += *source++; ch <<= 6;
541            case 0: ch += *source++;
542        }
543        ch -= offsetsFromUTF8[extraBytesToRead];
544
545        if (target >= targetEnd) {
546            source -= (extraBytesToRead+1); /* Back up source pointer! */
547            result = targetExhausted; break;
548        }
549        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
550            /* UTF-16 surrogate values are illegal in UTF-32 */
551            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
552                if (flags == strictConversion) {
553                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
554                    result = sourceIllegal;
555                    break;
556                } else {
557                    *target++ = UNI_REPLACEMENT_CHAR;
558                }
559            } else {
560                *target++ = (UTF16)ch; /* normal case */
561            }
562        } else if (ch > UNI_MAX_UTF16) {
563            if (flags == strictConversion) {
564                result = sourceIllegal;
565                source -= (extraBytesToRead+1); /* return to the start */
566                break; /* Bail out; shouldn't continue */
567            } else {
568                *target++ = UNI_REPLACEMENT_CHAR;
569            }
570        } else {
571            /* target is a character in range 0xFFFF - 0x10FFFF. */
572            if (target + 1 >= targetEnd) {
573                source -= (extraBytesToRead+1); /* Back up source pointer! */
574                result = targetExhausted; break;
575            }
576            ch -= halfBase;
577            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
578            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
579        }
580    }
581    *sourceStart = source;
582    *targetStart = target;
583    return result;
584}
585
586/* --------------------------------------------------------------------- */
587
588static ConversionResult ConvertUTF8toUTF32Impl(
589        const UTF8** sourceStart, const UTF8* sourceEnd,
590        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
591        Boolean InputIsPartial) {
592    ConversionResult result = conversionOK;
593    const UTF8* source = *sourceStart;
594    UTF32* target = *targetStart;
595    while (source < sourceEnd) {
596        UTF32 ch = 0;
597        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
598        if (extraBytesToRead >= sourceEnd - source) {
599            if (flags == strictConversion || InputIsPartial) {
600                result = sourceExhausted;
601                break;
602            } else {
603                result = sourceIllegal;
604
605                /*
606                 * Replace the maximal subpart of ill-formed sequence with
607                 * replacement character.
608                 */
609                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
610                                                                    sourceEnd);
611                *target++ = UNI_REPLACEMENT_CHAR;
612                continue;
613            }
614        }
615        if (target >= targetEnd) {
616            result = targetExhausted; break;
617        }
618
619        /* Do this check whether lenient or strict */
620        if (!isLegalUTF8(source, extraBytesToRead+1)) {
621            result = sourceIllegal;
622            if (flags == strictConversion) {
623                /* Abort conversion. */
624                break;
625            } else {
626                /*
627                 * Replace the maximal subpart of ill-formed sequence with
628                 * replacement character.
629                 */
630                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
631                                                                    sourceEnd);
632                *target++ = UNI_REPLACEMENT_CHAR;
633                continue;
634            }
635        }
636        /*
637         * The cases all fall through. See "Note A" below.
638         */
639        switch (extraBytesToRead) {
640            case 5: ch += *source++; ch <<= 6;
641            case 4: ch += *source++; ch <<= 6;
642            case 3: ch += *source++; ch <<= 6;
643            case 2: ch += *source++; ch <<= 6;
644            case 1: ch += *source++; ch <<= 6;
645            case 0: ch += *source++;
646        }
647        ch -= offsetsFromUTF8[extraBytesToRead];
648
649        if (ch <= UNI_MAX_LEGAL_UTF32) {
650            /*
651             * UTF-16 surrogate values are illegal in UTF-32, and anything
652             * over Plane 17 (> 0x10FFFF) is illegal.
653             */
654            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
655                if (flags == strictConversion) {
656                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
657                    result = sourceIllegal;
658                    break;
659                } else {
660                    *target++ = UNI_REPLACEMENT_CHAR;
661                }
662            } else {
663                *target++ = ch;
664            }
665        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
666            result = sourceIllegal;
667            *target++ = UNI_REPLACEMENT_CHAR;
668        }
669    }
670    *sourceStart = source;
671    *targetStart = target;
672    return result;
673}
674
675ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
676                                           const UTF8 *sourceEnd,
677                                           UTF32 **targetStart,
678                                           UTF32 *targetEnd,
679                                           ConversionFlags flags) {
680  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
681                                flags, /*InputIsPartial=*/true);
682}
683
684ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
685                                    const UTF8 *sourceEnd, UTF32 **targetStart,
686                                    UTF32 *targetEnd, ConversionFlags flags) {
687  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
688                                flags, /*InputIsPartial=*/false);
689}
690
691/* ---------------------------------------------------------------------
692
693    Note A.
694    The fall-through switches in UTF-8 reading code save a
695    temp variable, some decrements & conditionals.  The switches
696    are equivalent to the following loop:
697        {
698            int tmpBytesToRead = extraBytesToRead+1;
699            do {
700                ch += *source++;
701                --tmpBytesToRead;
702                if (tmpBytesToRead) ch <<= 6;
703            } while (tmpBytesToRead > 0);
704        }
705    In UTF-8 writing code, the switches on "bytesToWrite" are
706    similarly unrolled loops.
707
708   --------------------------------------------------------------------- */
709