1/*
2 * Copyright 2001-2004 Unicode, Inc.
3 *
4 * Disclaimer
5 *
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
13 *
14 * Limitations on Rights to Redistribute This Code
15 *
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
20 * remains attached.
21 */
22
23/* ---------------------------------------------------------------------
24
25    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26    Author: Mark E. Davis, 1994.
27    Rev History: Rick McGowan, fixes & updates May 2001.
28    Sept 2001: fixed const & error conditions per
29	mods suggested by S. Parent & A. Lillich.
30    June 2002: Tim Dodd added detection and handling of incomplete
31	source sequences, enhanced error detection, added casts
32	to eliminate compiler warnings.
33    July 2003: slight mods to back out aggressive FFFE detection.
34    Jan 2004: updated switches in from-UTF8 conversions.
35    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37    See the header file "ConvertUTF.h" for complete documentation.
38
39------------------------------------------------------------------------ */
40
41
42#include "antlr3convertutf.h"
43
44#ifdef CVTUTF_DEBUG
45#include <stdio.h>
46#endif
47
48
49
50/* --------------------------------------------------------------------- */
51
52ConversionResult ConvertUTF32toUTF16 (
53	const UTF32** sourceStart, const UTF32* sourceEnd,
54	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
55    ConversionResult result = conversionOK;
56    const UTF32* source = *sourceStart;
57    UTF16* target = *targetStart;
58    while (source < sourceEnd) {
59	UTF32 ch;
60	if (target >= targetEnd) {
61	    result = targetExhausted; break;
62	}
63	ch = *source++;
64	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
65	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
66	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
67		if (flags == strictConversion) {
68		    --source; /* return to the illegal value itself */
69		    result = sourceIllegal;
70		    break;
71		} else {
72		    *target++ = UNI_REPLACEMENT_CHAR;
73		}
74	    } else {
75		*target++ = (UTF16)ch; /* normal case */
76	    }
77	} else if (ch > UNI_MAX_LEGAL_UTF32) {
78	    if (flags == strictConversion) {
79		result = sourceIllegal;
80	    } else {
81		*target++ = UNI_REPLACEMENT_CHAR;
82	    }
83	} else {
84	    /* target is a character in range 0xFFFF - 0x10FFFF. */
85	    if (target + 1 >= targetEnd) {
86		--source; /* Back up source pointer! */
87		result = targetExhausted; break;
88	    }
89	    ch -= halfBase;
90	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
91	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
92	}
93    }
94    *sourceStart = source;
95    *targetStart = target;
96    return result;
97}
98
99/* --------------------------------------------------------------------- */
100
101ConversionResult ConvertUTF16toUTF32 (
102	const UTF16** sourceStart, const UTF16* sourceEnd,
103	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
104    ConversionResult result = conversionOK;
105    const UTF16* source = *sourceStart;
106    UTF32* target = *targetStart;
107    UTF32 ch, ch2;
108    while (source < sourceEnd) {
109	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
110	ch = *source++;
111	/* If we have a surrogate pair, convert to UTF32 first. */
112	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
113	    /* If the 16 bits following the high surrogate are in the source buffer... */
114	    if (source < sourceEnd) {
115		ch2 = *source;
116		/* If it's a low surrogate, convert to UTF32. */
117		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
118		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
119			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
120		    ++source;
121		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
122		    --source; /* return to the illegal value itself */
123		    result = sourceIllegal;
124		    break;
125		}
126	    } else { /* We don't have the 16 bits following the high surrogate. */
127		--source; /* return to the high surrogate */
128		result = sourceExhausted;
129		break;
130	    }
131	} else if (flags == strictConversion) {
132	    /* UTF-16 surrogate values are illegal in UTF-32 */
133	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
134		--source; /* return to the illegal value itself */
135		result = sourceIllegal;
136		break;
137	    }
138	}
139	if (target >= targetEnd) {
140	    source = oldSource; /* Back up source pointer! */
141	    result = targetExhausted; break;
142	}
143	*target++ = ch;
144    }
145    *sourceStart = source;
146    *targetStart = target;
147#ifdef CVTUTF_DEBUG
148if (result == sourceIllegal) {
149    ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
150    fflush(stderr);
151}
152#endif
153    return result;
154}
155
156/* --------------------------------------------------------------------- */
157
/*
 * Number of trailing bytes that are supposed to follow a given first byte
 * of a UTF-8 sequence; index the table with that first byte.
 *
 * The entries 4 and 5 (first bytes 0xF8-0xFF) describe 5- and 6-byte
 * sequences, which are not *legal* UTF-8. They are kept for anyone who
 * wants to handle the longer forms allowed by earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xF7, 0xF8 - 0xFB, 0xFC - 0xFF */
    3,3,3,3,3,3,3,3, 4,4,4,4, 5,5,5,5
};
175
176/*
177 * Magic values subtracted from a buffer value during UTF8 conversion.
178 * This table contains as many values as there might be trailing bytes
179 * in a UTF-8 sequence.
180 */
181static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
182		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
183
184/*
185 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
186 * into the first byte, depending on how many bytes follow.  There are
187 * as many entries in this table as there are UTF-8 sequence types.
188 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
189 * for *legal* UTF-8 will be 4 or fewer bytes total.
190 */
191static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
192
193/* --------------------------------------------------------------------- */
194
195/* The interface converts a whole buffer to avoid function-call overhead.
196 * Constants have been gathered. Loops & conditionals have been removed as
197 * much as possible for efficiency, in favor of drop-through switches.
198 * (See "Note A" at the bottom of the file for equivalent code.)
199 * If your compiler supports it, the "isLegalUTF8" call can be turned
200 * into an inline function.
201 */
202
203/* --------------------------------------------------------------------- */
204
205ConversionResult ConvertUTF16toUTF8 (
206	const UTF16** sourceStart, const UTF16* sourceEnd,
207	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
208    ConversionResult result = conversionOK;
209    const UTF16* source = *sourceStart;
210    UTF8* target = *targetStart;
211    while (source < sourceEnd) {
212	UTF32 ch;
213	unsigned short bytesToWrite = 0;
214	const UTF32 byteMask = 0xBF;
215	const UTF32 byteMark = 0x80;
216	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
217	ch = *source++;
218	/* If we have a surrogate pair, convert to UTF32 first. */
219	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
220	    /* If the 16 bits following the high surrogate are in the source buffer... */
221	    if (source < sourceEnd) {
222		UTF32 ch2 = *source;
223		/* If it's a low surrogate, convert to UTF32. */
224		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
225		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
226			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
227		    ++source;
228		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
229		    --source; /* return to the illegal value itself */
230		    result = sourceIllegal;
231		    break;
232		}
233	    } else { /* We don't have the 16 bits following the high surrogate. */
234		--source; /* return to the high surrogate */
235		result = sourceExhausted;
236		break;
237	    }
238        } else if (flags == strictConversion) {
239	    /* UTF-16 surrogate values are illegal in UTF-32 */
240	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
241		--source; /* return to the illegal value itself */
242		result = sourceIllegal;
243		break;
244	    }
245	}
246	/* Figure out how many bytes the result will require */
247	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
248	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
249	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
250	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
251	} else {			    bytesToWrite = 3;
252					    ch = UNI_REPLACEMENT_CHAR;
253	}
254
255	target += bytesToWrite;
256	if (target > targetEnd) {
257	    source = oldSource; /* Back up source pointer! */
258	    target -= bytesToWrite; result = targetExhausted; break;
259	}
260	switch (bytesToWrite) { /* note: everything falls through. */
261	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
262	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
263	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
264	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
265	}
266	target += bytesToWrite;
267    }
268    *sourceStart = source;
269    *targetStart = target;
270    return result;
271}
272
273/* --------------------------------------------------------------------- */
274
275/*
276 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
277 * This must be called with the length pre-determined by the first byte.
278 * If not calling this from ConvertUTF8to*, then the length can be set by:
279 *  length = trailingBytesForUTF8[*source]+1;
280 * and the sequence is illegal right away if there aren't that many bytes
281 * available.
282 * If presented with a length > 4, this returns false.  The Unicode
283 * definition of UTF-8 goes up to 4-byte sequences.
284 */
285
286static ANTLR3_BOOLEAN
287isLegalUTF8(const UTF8 *source, int length) {
288    UTF8 a;
289    const UTF8 *srcptr = source+length;
290    switch (length) {
291    default: return false;
292	/* Everything else falls through when "true"... */
293    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
294    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
295    case 2: if ((a = (*--srcptr)) > 0xBF) return false;
296
297	switch (*source) {
298	    /* no fall-through in this inner switch */
299	    case 0xE0: if (a < 0xA0) return false; break;
300	    case 0xED: if (a > 0x9F) return false; break;
301	    case 0xF0: if (a < 0x90) return false; break;
302	    case 0xF4: if (a > 0x8F) return false; break;
303	    default:   if (a < 0x80) return false;
304	}
305
306    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
307    }
308    if (*source > 0xF4) return false;
309    return true;
310}
311
312/* --------------------------------------------------------------------- */
313
314/*
315 * Exported function to return whether a UTF-8 sequence is legal or not.
316 * This is not used here; it's just exported.
317 */
318ANTLR3_BOOLEAN
319isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
320    int length = trailingBytesForUTF8[*source]+1;
321    if (source+length > sourceEnd) {
322	return false;
323    }
324    return isLegalUTF8(source, length);
325}
326
327/* --------------------------------------------------------------------- */
328
329ConversionResult ConvertUTF8toUTF16 (
330	const UTF8** sourceStart, const UTF8* sourceEnd,
331	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
332    ConversionResult result = conversionOK;
333    const UTF8* source = *sourceStart;
334    UTF16* target = *targetStart;
335    while (source < sourceEnd) {
336	UTF32 ch = 0;
337	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
338	if (source + extraBytesToRead >= sourceEnd) {
339	    result = sourceExhausted; break;
340	}
341	/* Do this check whether lenient or strict */
342	if (! isLegalUTF8(source, extraBytesToRead+1)) {
343	    result = sourceIllegal;
344	    break;
345	}
346	/*
347	 * The cases all fall through. See "Note A" below.
348	 */
349	switch (extraBytesToRead) {
350	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
351	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
352	    case 3: ch += *source++; ch <<= 6;
353	    case 2: ch += *source++; ch <<= 6;
354	    case 1: ch += *source++; ch <<= 6;
355	    case 0: ch += *source++;
356	}
357	ch -= offsetsFromUTF8[extraBytesToRead];
358
359	if (target >= targetEnd) {
360	    source -= (extraBytesToRead+1); /* Back up source pointer! */
361	    result = targetExhausted; break;
362	}
363	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
364	    /* UTF-16 surrogate values are illegal in UTF-32 */
365	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
366		if (flags == strictConversion) {
367		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
368		    result = sourceIllegal;
369		    break;
370		} else {
371		    *target++ = UNI_REPLACEMENT_CHAR;
372		}
373	    } else {
374		*target++ = (UTF16)ch; /* normal case */
375	    }
376	} else if (ch > UNI_MAX_UTF16) {
377	    if (flags == strictConversion) {
378		result = sourceIllegal;
379		source -= (extraBytesToRead+1); /* return to the start */
380		break; /* Bail out; shouldn't continue */
381	    } else {
382		*target++ = UNI_REPLACEMENT_CHAR;
383	    }
384	} else {
385	    /* target is a character in range 0xFFFF - 0x10FFFF. */
386	    if (target + 1 >= targetEnd) {
387		source -= (extraBytesToRead+1); /* Back up source pointer! */
388		result = targetExhausted; break;
389	    }
390	    ch -= halfBase;
391	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
392	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
393	}
394    }
395    *sourceStart = source;
396    *targetStart = target;
397    return result;
398}
399
400/* --------------------------------------------------------------------- */
401
402ConversionResult ConvertUTF32toUTF8 (
403	const UTF32** sourceStart, const UTF32* sourceEnd,
404	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
405    ConversionResult result = conversionOK;
406    const UTF32* source = *sourceStart;
407    UTF8* target = *targetStart;
408    while (source < sourceEnd) {
409	UTF32 ch;
410	unsigned short bytesToWrite = 0;
411	const UTF32 byteMask = 0xBF;
412	const UTF32 byteMark = 0x80;
413	ch = *source++;
414	if (flags == strictConversion ) {
415	    /* UTF-16 surrogate values are illegal in UTF-32 */
416	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
417		--source; /* return to the illegal value itself */
418		result = sourceIllegal;
419		break;
420	    }
421	}
422	/*
423	 * Figure out how many bytes the result will require. Turn any
424	 * illegally large UTF32 things (> Plane 17) into replacement chars.
425	 */
426	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
427	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
428	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
429	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
430	} else {			    bytesToWrite = 3;
431					    ch = UNI_REPLACEMENT_CHAR;
432					    result = sourceIllegal;
433	}
434
435	target += bytesToWrite;
436	if (target > targetEnd) {
437	    --source; /* Back up source pointer! */
438	    target -= bytesToWrite; result = targetExhausted; break;
439	}
440	switch (bytesToWrite) { /* note: everything falls through. */
441	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
442	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
443	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
444	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
445	}
446	target += bytesToWrite;
447    }
448    *sourceStart = source;
449    *targetStart = target;
450    return result;
451}
452
453/* --------------------------------------------------------------------- */
454
455ConversionResult ConvertUTF8toUTF32 (
456	const UTF8** sourceStart, const UTF8* sourceEnd,
457	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
458    ConversionResult result = conversionOK;
459    const UTF8* source = *sourceStart;
460    UTF32* target = *targetStart;
461    while (source < sourceEnd) {
462	UTF32 ch = 0;
463	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
464	if (source + extraBytesToRead >= sourceEnd) {
465	    result = sourceExhausted; break;
466	}
467	/* Do this check whether lenient or strict */
468	if (! isLegalUTF8(source, extraBytesToRead+1)) {
469	    result = sourceIllegal;
470	    break;
471	}
472	/*
473	 * The cases all fall through. See "Note A" below.
474	 */
475	switch (extraBytesToRead) {
476	    case 5: ch += *source++; ch <<= 6;
477	    case 4: ch += *source++; ch <<= 6;
478	    case 3: ch += *source++; ch <<= 6;
479	    case 2: ch += *source++; ch <<= 6;
480	    case 1: ch += *source++; ch <<= 6;
481	    case 0: ch += *source++;
482	}
483	ch -= offsetsFromUTF8[extraBytesToRead];
484
485	if (target >= targetEnd) {
486	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
487	    result = targetExhausted; break;
488	}
489	if (ch <= UNI_MAX_LEGAL_UTF32) {
490	    /*
491	     * UTF-16 surrogate values are illegal in UTF-32, and anything
492	     * over Plane 17 (> 0x10FFFF) is illegal.
493	     */
494	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
495		if (flags == strictConversion) {
496		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
497		    result = sourceIllegal;
498		    break;
499		} else {
500		    *target++ = UNI_REPLACEMENT_CHAR;
501		}
502	    } else {
503		*target++ = ch;
504	    }
505	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
506	    result = sourceIllegal;
507	    *target++ = UNI_REPLACEMENT_CHAR;
508	}
509    }
510    *sourceStart = source;
511    *targetStart = target;
512    return result;
513}
514
515/* ---------------------------------------------------------------------
516
517    Note A.
518    The fall-through switches in UTF-8 reading code save a
519    temp variable, some decrements & conditionals.  The switches
520    are equivalent to the following loop:
521	{
522	    int tmpBytesToRead = extraBytesToRead+1;
523	    do {
524		ch += *source++;
525		--tmpBytesToRead;
526		if (tmpBytesToRead) ch <<= 6;
527	    } while (tmpBytesToRead > 0);
528	}
529    In UTF-8 writing code, the switches on "bytesToWrite" are
530    similarly unrolled loops.
531
532   --------------------------------------------------------------------- */
533