1dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/*
2dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Copyright 2001-2004 Unicode, Inc.
3dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels *
4dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Disclaimer
5dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels *
6dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * This source code is provided as is by Unicode, Inc. No claims are
7dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * made as to fitness for any particular purpose. No warranties of any
8dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * kind are expressed or implied. The recipient agrees to determine
9dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * applicability of information provided. If this file has been
10dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * purchased on magnetic or optical media from Unicode, Inc., the
11dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * sole remedy for any claim will be exchange of defective media
12dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * within 90 days of receipt.
13dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels *
14dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Limitations on Rights to Redistribute This Code
15dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels *
16dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Unicode, Inc. hereby grants the right to freely use the information
17dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * supplied in this file in the creation of products supporting the
18dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Unicode Standard, and to make copies of this file in any form
19dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * for internal or external distribution as long as this notice
20dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * remains attached.
21dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels */
22dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
23dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* ---------------------------------------------------------------------
24dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
25dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    Author: Mark E. Davis, 1994.
27dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    Rev History: Rick McGowan, fixes & updates May 2001.
28dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    Sept 2001: fixed const & error conditions per
29dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	mods suggested by S. Parent & A. Lillich.
30dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    June 2002: Tim Dodd added detection and handling of incomplete
31dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	source sequences, enhanced error detection, added casts
32dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	to eliminate compiler warnings.
33dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    July 2003: slight mods to back out aggressive FFFE detection.
34dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    Jan 2004: updated switches in from-UTF8 conversions.
35dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
37dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    See the header file "ConvertUTF.h" for complete documentation.
38dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
39dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels------------------------------------------------------------------------ */
40dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
41dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
42dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#include "ConvertUTF.h"
43dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#ifdef CVTUTF_DEBUG
44dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#include <stdio.h>
45dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#endif
46dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
47dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckelsstatic const int halfShift  = 10; /* used for shifting by 10 bits */
48dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
49dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckelsstatic const UTF32 halfBase = 0x0010000UL;
50dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckelsstatic const UTF32 halfMask = 0x3FFUL;
51dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
52dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#define UNI_SUR_HIGH_START  (UTF32)0xD800
53dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
54dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#define UNI_SUR_LOW_START   (UTF32)0xDC00
55dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#define UNI_SUR_LOW_END     (UTF32)0xDFFF
56dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#define false	   0
57dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#define true	    1
58dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
59dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
60dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
61dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas EckelsConversionResult ConvertUTF32toUTF16 (
62dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF32** sourceStart, const UTF32* sourceEnd,
63dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
64dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    ConversionResult result = conversionOK;
65dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    const UTF32* source = *sourceStart;
66dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    UTF16* target = *targetStart;
67dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    while (source < sourceEnd) {
68dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF32 ch;
69dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (target >= targetEnd) {
70dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = targetExhausted; break;
71dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
72dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	ch = *source++;
73dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
74dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
75dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
76dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		if (flags == strictConversion) {
77dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    --source; /* return to the illegal value itself */
78dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    result = sourceIllegal;
79dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    break;
80dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		} else {
81dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    *target++ = UNI_REPLACEMENT_CHAR;
82dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		}
83dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    } else {
84dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		*target++ = (UTF16)ch; /* normal case */
85dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
86dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (ch > UNI_MAX_LEGAL_UTF32) {
87dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (flags == strictConversion) {
88dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = sourceIllegal;
89dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    } else {
90dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		*target++ = UNI_REPLACEMENT_CHAR;
91dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
92dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else {
93dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* target is a character in range 0xFFFF - 0x10FFFF. */
94dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (target + 1 >= targetEnd) {
95dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		--source; /* Back up source pointer! */
96dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = targetExhausted; break;
97dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
98dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    ch -= halfBase;
99dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
100dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
101dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
102dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    }
103dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *sourceStart = source;
104dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *targetStart = target;
105dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    return result;
106dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
107dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
108dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
109dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
110dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas EckelsConversionResult ConvertUTF16toUTF32 (
111dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF16** sourceStart, const UTF16* sourceEnd,
112dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
113dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    ConversionResult result = conversionOK;
114dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    const UTF16* source = *sourceStart;
115dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    UTF32* target = *targetStart;
116dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    UTF32 ch, ch2;
117dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    while (source < sourceEnd) {
118dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
119dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	ch = *source++;
120dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/* If we have a surrogate pair, convert to UTF32 first. */
121dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
122dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* If the 16 bits following the high surrogate are in the source buffer... */
123dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (source < sourceEnd) {
124dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		ch2 = *source;
125dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		/* If it's a low surrogate, convert to UTF32. */
126dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
127dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
128dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
129dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    ++source;
130dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
131dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    --source; /* return to the illegal value itself */
132dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    result = sourceIllegal;
133dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    break;
134dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		}
135dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    } else { /* We don't have the 16 bits following the high surrogate. */
136dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		--source; /* return to the high surrogate */
137dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = sourceExhausted;
138dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		break;
139dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
140dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (flags == strictConversion) {
141dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* UTF-16 surrogate values are illegal in UTF-32 */
142dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
143dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		--source; /* return to the illegal value itself */
144dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = sourceIllegal;
145dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		break;
146dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
147dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
148dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (target >= targetEnd) {
149dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    source = oldSource; /* Back up source pointer! */
150dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = targetExhausted; break;
151dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
152dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	*target++ = ch;
153dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    }
154dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *sourceStart = source;
155dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *targetStart = target;
156dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#ifdef CVTUTF_DEBUG
157dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckelsif (result == sourceIllegal) {
158dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
159dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    fflush(stderr);
160dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
161dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels#endif
162dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    return result;
163dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
164dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
165dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
166dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
/*
 * Number of trailing bytes expected after a given UTF-8 lead byte; index the
 * table with the first byte of a sequence.
 *
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. Those
 * entries are kept for anyone who wants to handle the longer forms that
 * earlier definitions of UTF-8 allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 - 0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 - 0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 - 0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 - 0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 - 0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 - 0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 - 0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
184dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
185dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/*
186dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Magic values subtracted from a buffer value during UTF8 conversion.
187dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * This table contains as many values as there might be trailing bytes
188dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * in a UTF-8 sequence.
189dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels */
190dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckelsstatic const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
191dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
192dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
193dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/*
194dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
195dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * into the first byte, depending on how many bytes follow.  There are
196dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * as many entries in this table as there are UTF-8 sequence types.
197dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
198dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * for *legal* UTF-8 will be 4 or fewer bytes total.
199dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels */
200dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckelsstatic const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
201dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
202dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
203dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
204dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* The interface converts a whole buffer to avoid function-call overhead.
205dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Constants have been gathered. Loops & conditionals have been removed as
206dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * much as possible for efficiency, in favor of drop-through switches.
207dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * (See "Note A" at the bottom of the file for equivalent code.)
208dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * If your compiler supports it, the "isLegalUTF8" call can be turned
209dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * into an inline function.
210dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels */
211dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
212dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
213dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
214dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas EckelsConversionResult ConvertUTF16toUTF8 (
215dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF16** sourceStart, const UTF16* sourceEnd,
216dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
217dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    ConversionResult result = conversionOK;
218dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    const UTF16* source = *sourceStart;
219dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    UTF8* target = *targetStart;
220dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    while (source < sourceEnd) {
221dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF32 ch;
222dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	unsigned short bytesToWrite = 0;
223dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF32 byteMask = 0xBF;
224dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF32 byteMark = 0x80;
225dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
226dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	ch = *source++;
227dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/* If we have a surrogate pair, convert to UTF32 first. */
228dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
229dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* If the 16 bits following the high surrogate are in the source buffer... */
230dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (source < sourceEnd) {
231dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		UTF32 ch2 = *source;
232dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		/* If it's a low surrogate, convert to UTF32. */
233dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
234dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
235dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
236dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    ++source;
237dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
238dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    --source; /* return to the illegal value itself */
239dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    result = sourceIllegal;
240dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    break;
241dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		}
242dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    } else { /* We don't have the 16 bits following the high surrogate. */
243dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		--source; /* return to the high surrogate */
244dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = sourceExhausted;
245dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		break;
246dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
247dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (flags == strictConversion) {
248dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* UTF-16 surrogate values are illegal in UTF-32 */
249dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
250dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		--source; /* return to the illegal value itself */
251dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = sourceIllegal;
252dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		break;
253dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
254dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
255dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
256dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	// TPN: substitute all control characters except for NULL, TAB, LF or CR
257dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (ch && (ch != (UTF32)0x09)  && (ch != (UTF32)0x0a)  && (ch != (UTF32)0x0d)  && (ch < (UTF32)0x20) )  {
258dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		ch = (UTF32)0x3f;
259dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
260dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	// TPN: filter out byte order marks and invalid character 0xFFFF
261dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if((ch == (UTF32)0xFEFF) || (ch == (UTF32)0xFFFE)|| (ch == (UTF32)0xFFFF)) {
262dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		continue;
263dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
264dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
265dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/* Figure out how many bytes the result will require */
266dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (ch < (UTF32)0x80) {	    bytesToWrite = 1;
267dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
268dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
269dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
270dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else {			    bytesToWrite = 3;
271dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels					    ch = UNI_REPLACEMENT_CHAR;
272dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
273dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
274dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	target += bytesToWrite;
275dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (target > targetEnd) {
276dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    source = oldSource; /* Back up source pointer! */
277dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    target -= bytesToWrite; result = targetExhausted; break;
278dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
279dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	switch (bytesToWrite) { /* note: everything falls through. */
280dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
281dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
282dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
283dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
284dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
285dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	target += bytesToWrite;
286dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    }
287dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *sourceStart = source;
288dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *targetStart = target;
289dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    return result;
290dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
291dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
292dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
293dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
294dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/*
295dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Utility routine to tell whether a sequence of bytes is legal UTF-8.
296dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * This must be called with the length pre-determined by the first byte.
297dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * If not calling this from ConvertUTF8to*, then the length can be set by:
298dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels *  length = trailingBytesForUTF8[*source]+1;
299dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * and the sequence is illegal right away if there aren't that many bytes
300dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * available.
301dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * If presented with a length > 4, this returns false.  The Unicode
302dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * definition of UTF-8 goes up to 4-byte sequences.
303dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels */
304dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
305dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckelsinline Boolean isLegalUTF8(const UTF8 *source, int length) {
306dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    UTF8 a;
307dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    const UTF8 *srcptr = source+length;
308dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    switch (length) {
309dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    default: return false;
310dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/* Everything else falls through when "true"... */
311dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
312dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
313dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    case 2: if ((a = (*--srcptr)) > 0xBF) return false;
314dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
315dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	switch (*source) {
316dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* no fall-through in this inner switch */
317dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 0xE0: if (a < 0xA0) return false; break;
318dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 0xED: if (a > 0x9F) return false; break;
319dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 0xF0: if (a < 0x90) return false; break;
320dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 0xF4: if (a > 0x8F) return false; break;
321dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    default:   if (a < 0x80) return false;
322dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
323dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
324dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
325dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    }
326dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    if (*source > 0xF4) return false;
327dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    return true;
328dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
329dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
330dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
331dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
332dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/*
333dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * Exported function to return whether a UTF-8 sequence is legal or not.
334dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels * This is not used here; it's just exported.
335dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels */
336dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas EckelsBoolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
337dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    int length = trailingBytesForUTF8[*source]+1;
338dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    if (source+length > sourceEnd) {
339dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	return false;
340dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    }
341dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    return isLegalUTF8(source, length);
342dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
343dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
344dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
345dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
346dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas EckelsConversionResult ConvertUTF8toUTF16 (
347dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF8** sourceStart, const UTF8* sourceEnd,
348dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
349dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    ConversionResult result = conversionOK;
350dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    const UTF8* source = *sourceStart;
351dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    UTF16* target = *targetStart;
352dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    while (source < sourceEnd) {
353dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF32 ch = 0;
354dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
355dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (source + extraBytesToRead >= sourceEnd) {
356dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = sourceExhausted; break;
357dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
358dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/* Do this check whether lenient or strict */
359dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (! isLegalUTF8(source, extraBytesToRead+1)) {
360dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = sourceIllegal;
361dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    break;
362dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
363dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/*
364dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	 * The cases all fall through. See "Note A" below.
365dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	 */
366dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	switch (extraBytesToRead) {
367dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
368dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
369dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 3: ch += *source++; ch <<= 6;
370dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 2: ch += *source++; ch <<= 6;
371dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 1: ch += *source++; ch <<= 6;
372dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 0: ch += *source++;
373dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
374dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	ch -= offsetsFromUTF8[extraBytesToRead];
375dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
376dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (target >= targetEnd) {
377dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    source -= (extraBytesToRead+1); /* Back up source pointer! */
378dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = targetExhausted; break;
379dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
380dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
381dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* UTF-16 surrogate values are illegal in UTF-32 */
382dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
383dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		if (flags == strictConversion) {
384dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
385dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    result = sourceIllegal;
386dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    break;
387dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		} else {
388dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    *target++ = UNI_REPLACEMENT_CHAR;
389dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		}
390dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    } else {
391dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		*target++ = (UTF16)ch; /* normal case */
392dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
393dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (ch > UNI_MAX_UTF16) {
394dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (flags == strictConversion) {
395dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = sourceIllegal;
396dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		source -= (extraBytesToRead+1); /* return to the start */
397dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		break; /* Bail out; shouldn't continue */
398dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    } else {
399dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		*target++ = UNI_REPLACEMENT_CHAR;
400dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
401dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else {
402dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* target is a character in range 0xFFFF - 0x10FFFF. */
403dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (target + 1 >= targetEnd) {
404dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		source -= (extraBytesToRead+1); /* Back up source pointer! */
405dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = targetExhausted; break;
406dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
407dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    ch -= halfBase;
408dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
409dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
410dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
411dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    }
412dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *sourceStart = source;
413dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *targetStart = target;
414dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    return result;
415dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
416dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
417dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
418dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
419dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas EckelsConversionResult ConvertUTF32toUTF8 (
420dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF32** sourceStart, const UTF32* sourceEnd,
421dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
422dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    ConversionResult result = conversionOK;
423dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    const UTF32* source = *sourceStart;
424dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    UTF8* target = *targetStart;
425dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    while (source < sourceEnd) {
426dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF32 ch;
427dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	unsigned short bytesToWrite = 0;
428dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF32 byteMask = 0xBF;
429dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF32 byteMark = 0x80;
430dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	ch = *source++;
431dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (flags == strictConversion ) {
432dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /* UTF-16 surrogate values are illegal in UTF-32 */
433dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
434dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		--source; /* return to the illegal value itself */
435dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		result = sourceIllegal;
436dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		break;
437dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
438dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
439dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/*
440dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	 * Figure out how many bytes the result will require. Turn any
441dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	 * illegally large UTF32 things (> Plane 17) into replacement chars.
442dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	 */
443dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
444dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
445dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
446dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
447dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else {			    bytesToWrite = 3;
448dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels					    ch = UNI_REPLACEMENT_CHAR;
449dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels					    result = sourceIllegal;
450dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
451dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
452dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	target += bytesToWrite;
453dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (target > targetEnd) {
454dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    --source; /* Back up source pointer! */
455dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    target -= bytesToWrite; result = targetExhausted; break;
456dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
457dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	switch (bytesToWrite) { /* note: everything falls through. */
458dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
459dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
460dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
461dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
462dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
463dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	target += bytesToWrite;
464dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    }
465dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *sourceStart = source;
466dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *targetStart = target;
467dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    return result;
468dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
469dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
470dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* --------------------------------------------------------------------- */
471dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
472dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas EckelsConversionResult ConvertUTF8toUTF32 (
473dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	const UTF8** sourceStart, const UTF8* sourceEnd,
474dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
475dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    ConversionResult result = conversionOK;
476dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    const UTF8* source = *sourceStart;
477dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    UTF32* target = *targetStart;
478dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    while (source < sourceEnd) {
479dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	UTF32 ch = 0;
480dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
481dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (source + extraBytesToRead >= sourceEnd) {
482dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = sourceExhausted; break;
483dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
484dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/* Do this check whether lenient or strict */
485dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (! isLegalUTF8(source, extraBytesToRead+1)) {
486dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = sourceIllegal;
487dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    break;
488dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
489dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	/*
490dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	 * The cases all fall through. See "Note A" below.
491dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	 */
492dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	switch (extraBytesToRead) {
493dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 5: ch += *source++; ch <<= 6;
494dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 4: ch += *source++; ch <<= 6;
495dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 3: ch += *source++; ch <<= 6;
496dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 2: ch += *source++; ch <<= 6;
497dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 1: ch += *source++; ch <<= 6;
498dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    case 0: ch += *source++;
499dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
500dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	ch -= offsetsFromUTF8[extraBytesToRead];
501dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
502dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (target >= targetEnd) {
503dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
504dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = targetExhausted; break;
505dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
506dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	if (ch <= UNI_MAX_LEGAL_UTF32) {
507dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    /*
508dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	     * UTF-16 surrogate values are illegal in UTF-32, and anything
509dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	     * over Plane 17 (> 0x10FFFF) is illegal.
510dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	     */
511dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
512dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		if (flags == strictConversion) {
513dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
514dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    result = sourceIllegal;
515dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    break;
516dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		} else {
517dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		    *target++ = UNI_REPLACEMENT_CHAR;
518dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		}
519dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    } else {
520dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		*target++ = ch;
521dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    }
522dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
523dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    result = sourceIllegal;
524dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    *target++ = UNI_REPLACEMENT_CHAR;
525dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
526dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    }
527dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *sourceStart = source;
528dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    *targetStart = target;
529dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    return result;
530dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels}
531dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
532dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels/* ---------------------------------------------------------------------
533dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
534dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    Note A.
535dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    The fall-through switches in UTF-8 reading code save a
536dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    temp variable, some decrements & conditionals.  The switches
537dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    are equivalent to the following loop:
538dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	{
539dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    int tmpBytesToRead = extraBytesToRead+1;
540dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    do {
541dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		ch += *source++;
542dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		--tmpBytesToRead;
543dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels		if (tmpBytesToRead) ch <<= 6;
544dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	    } while (tmpBytesToRead > 0);
545dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels	}
546dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    In UTF-8 writing code, the switches on "bytesToWrite" are
547dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels    similarly unrolled loops.
548dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels
549dc4699f0a7c0f7f76ce37935c58ffd1f638a0525Lucas Eckels   --------------------------------------------------------------------- */
550