/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
        mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
        source sequences, enhanced error detection, added casts
        to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "ConvertUTF.h" for complete documentation.
38324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 39324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver------------------------------------------------------------------------ */ 40324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 41324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 42324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver#include "antlr3convertutf.h" 43324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 44324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver#ifdef CVTUTF_DEBUG 45324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver#include <stdio.h> 46324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver#endif 47324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 48324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 49324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 50324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* --------------------------------------------------------------------- */ 51324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 52324c4644fee44b9898524c09511bd33c3f12e2dfBen GruverConversionResult ConvertUTF32toUTF16 ( 53324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF32** sourceStart, const UTF32* sourceEnd, 54324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 55324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ConversionResult result = conversionOK; 56324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF32* source = *sourceStart; 57324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF16* target = *targetStart; 58324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver while (source < sourceEnd) { 59324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF32 ch; 60324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (target >= targetEnd) { 61324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = targetExhausted; break; 62324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 63324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch = *source++; 
64324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 65324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 66324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 67324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (flags == strictConversion) { 68324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver --source; /* return to the illegal value itself */ 69324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 70324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 71324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { 72324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = UNI_REPLACEMENT_CHAR; 73324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 74324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { 75324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = (UTF16)ch; /* normal case */ 76324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 77324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (ch > UNI_MAX_LEGAL_UTF32) { 78324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (flags == strictConversion) { 79324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 80324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { 81324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = UNI_REPLACEMENT_CHAR; 82324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 83324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { 84324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* target is a character in range 0xFFFF - 0x10FFFF. */ 85324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (target + 1 >= targetEnd) { 86324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver --source; /* Back up source pointer! 
*/ 87324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = targetExhausted; break; 88324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 89324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch -= halfBase; 90324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 91324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 92324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 93324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 94324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *sourceStart = source; 95324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *targetStart = target; 96324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver return result; 97324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver} 98324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 99324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* --------------------------------------------------------------------- */ 100324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 101324c4644fee44b9898524c09511bd33c3f12e2dfBen GruverConversionResult ConvertUTF16toUTF32 ( 102324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF16** sourceStart, const UTF16* sourceEnd, 103324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 104324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ConversionResult result = conversionOK; 105324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF16* source = *sourceStart; 106324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF32* target = *targetStart; 107324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF32 ch, ch2; 108324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver while (source < sourceEnd) { 109324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF16* oldSource = source; /* In case we have to back up because of target overflow. 
*/ 110324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch = *source++; 111324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* If we have a surrogate pair, convert to UTF32 first. */ 112324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 113324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* If the 16 bits following the high surrogate are in the source buffer... */ 114324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (source < sourceEnd) { 115324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch2 = *source; 116324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* If it's a low surrogate, convert to UTF32. */ 117324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 118324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 119324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver + (ch2 - UNI_SUR_LOW_START) + halfBase; 120324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ++source; 121324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 122324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver --source; /* return to the illegal value itself */ 123324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 124324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 125324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 126324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { /* We don't have the 16 bits following the high surrogate. 
*/ 127324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver --source; /* return to the high surrogate */ 128324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceExhausted; 129324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 130324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 131324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (flags == strictConversion) { 132324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* UTF-16 surrogate values are illegal in UTF-32 */ 133324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 134324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver --source; /* return to the illegal value itself */ 135324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 136324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 137324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 138324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 139324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (target >= targetEnd) { 140324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver source = oldSource; /* Back up source pointer! 
*/ 141324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = targetExhausted; break; 142324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 143324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = ch; 144324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 145324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *sourceStart = source; 146324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *targetStart = target; 147324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver#ifdef CVTUTF_DEBUG 148324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverif (result == sourceIllegal) { 149324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 150324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver fflush(stderr); 151324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver} 152324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver#endif 153324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver return result; 154324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver} 155324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 156324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* --------------------------------------------------------------------- */ 157324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 158324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* 159324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * Index into the table below with the first byte of a UTF-8 sequence to 160324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * get the number of trailing bytes that are supposed to follow it. 161324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 162324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * left as-is for anyone who may want to do such conversion, which was 163324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * allowed in earlier algorithms. 
164324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver */ 165324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverstatic const char trailingBytesForUTF8[256] = { 166324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 167324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 168324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 169324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 170324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 171324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 172324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 173324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 174324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver}; 175324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 176324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* 177324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * Magic values subtracted from a buffer value during UTF8 conversion. 178324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * This table contains as many values as there might be trailing bytes 179324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * in a UTF-8 sequence. 
180324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver */ 181324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverstatic const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 182324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 183324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 184324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* 185324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 186324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * into the first byte, depending on how many bytes follow. There are 187324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * as many entries in this table as there are UTF-8 sequence types. 188324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 189324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * for *legal* UTF-8 will be 4 or fewer bytes total. 190324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver */ 191324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverstatic const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 192324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 193324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* --------------------------------------------------------------------- */ 194324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 195324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* The interface converts a whole buffer to avoid function-call overhead. 196324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * Constants have been gathered. Loops & conditionals have been removed as 197324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * much as possible for efficiency, in favor of drop-through switches. 198324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * (See "Note A" at the bottom of the file for equivalent code.) 
199324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * If your compiler supports it, the "isLegalUTF8" call can be turned 200324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * into an inline function. 201324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver */ 202324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 203324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* --------------------------------------------------------------------- */ 204324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 205324c4644fee44b9898524c09511bd33c3f12e2dfBen GruverConversionResult ConvertUTF16toUTF8 ( 206324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF16** sourceStart, const UTF16* sourceEnd, 207324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 208324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ConversionResult result = conversionOK; 209324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF16* source = *sourceStart; 210324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF8* target = *targetStart; 211324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver while (source < sourceEnd) { 212324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF32 ch; 213324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver unsigned short bytesToWrite = 0; 214324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF32 byteMask = 0xBF; 215324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF32 byteMark = 0x80; 216324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 217324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch = *source++; 218324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* If we have a surrogate pair, convert to UTF32 first. 
*/ 219324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 220324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* If the 16 bits following the high surrogate are in the source buffer... */ 221324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (source < sourceEnd) { 222324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF32 ch2 = *source; 223324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* If it's a low surrogate, convert to UTF32. */ 224324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 225324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 226324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver + (ch2 - UNI_SUR_LOW_START) + halfBase; 227324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ++source; 228324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 229324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver --source; /* return to the illegal value itself */ 230324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 231324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 232324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 233324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { /* We don't have the 16 bits following the high surrogate. 
*/ 234324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver --source; /* return to the high surrogate */ 235324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceExhausted; 236324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 237324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 238324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (flags == strictConversion) { 239324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* UTF-16 surrogate values are illegal in UTF-32 */ 240324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 241324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver --source; /* return to the illegal value itself */ 242324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 243324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 244324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 245324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 246324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* Figure out how many bytes the result will require */ 247324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch < (UTF32)0x80) { bytesToWrite = 1; 248324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 249324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 250324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 251324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { bytesToWrite = 3; 252324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch = UNI_REPLACEMENT_CHAR; 253324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 254324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 255324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver target += bytesToWrite; 256324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (target > targetEnd) { 257324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver source = oldSource; /* Back 
up source pointer! */ 258324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver target -= bytesToWrite; result = targetExhausted; break; 259324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 260324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver switch (bytesToWrite) { /* note: everything falls through. */ 261324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 262324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 263324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 264324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 265324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 266324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver target += bytesToWrite; 267324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 268324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *sourceStart = source; 269324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *targetStart = target; 270324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver return result; 271324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver} 272324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 273324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* --------------------------------------------------------------------- */ 274324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 275324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* 276324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * Utility routine to tell whether a sequence of bytes is legal UTF-8. 277324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * This must be called with the length pre-determined by the first byte. 
278324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * If not calling this from ConvertUTF8to*, then the length can be set by: 279324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * length = trailingBytesForUTF8[*source]+1; 280324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * and the sequence is illegal right away if there aren't that many bytes 281324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * available. 282324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * If presented with a length > 4, this returns false. The Unicode 283324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * definition of UTF-8 goes up to 4-byte sequences. 284324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver */ 285324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 286324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruverstatic ANTLR3_BOOLEAN 287324c4644fee44b9898524c09511bd33c3f12e2dfBen GruverisLegalUTF8(const UTF8 *source, int length) { 288324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF8 a; 289324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF8 *srcptr = source+length; 290324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver switch (length) { 291324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver default: return false; 292324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* Everything else falls through when "true"... 
*/ 293324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 294324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 295324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 2: if ((a = (*--srcptr)) > 0xBF) return false; 296324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 297324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver switch (*source) { 298324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* no fall-through in this inner switch */ 299324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 0xE0: if (a < 0xA0) return false; break; 300324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 0xED: if (a > 0x9F) return false; break; 301324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 0xF0: if (a < 0x90) return false; break; 302324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 0xF4: if (a > 0x8F) return false; break; 303324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver default: if (a < 0x80) return false; 304324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 305324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 306324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 1: if (*source >= 0x80 && *source < 0xC2) return false; 307324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 308324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (*source > 0xF4) return false; 309324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver return true; 310324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver} 311324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 312324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* --------------------------------------------------------------------- */ 313324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 314324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* 315324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * Exported function to return whether a UTF-8 sequence is legal or not. 
316324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * This is not used here; it's just exported. 317324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver */ 318324c4644fee44b9898524c09511bd33c3f12e2dfBen GruverANTLR3_BOOLEAN 319324c4644fee44b9898524c09511bd33c3f12e2dfBen GruverisLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 320324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver int length = trailingBytesForUTF8[*source]+1; 321324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (source+length > sourceEnd) { 322324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver return false; 323324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 324324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver return isLegalUTF8(source, length); 325324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver} 326324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 327324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver/* --------------------------------------------------------------------- */ 328324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 329324c4644fee44b9898524c09511bd33c3f12e2dfBen GruverConversionResult ConvertUTF8toUTF16 ( 330324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF8** sourceStart, const UTF8* sourceEnd, 331324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 332324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ConversionResult result = conversionOK; 333324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver const UTF8* source = *sourceStart; 334324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF16* target = *targetStart; 335324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver while (source < sourceEnd) { 336324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver UTF32 ch = 0; 337324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 338324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (source + extraBytesToRead >= sourceEnd) { 
339324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceExhausted; break; 340324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 341324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* Do this check whether lenient or strict */ 342324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (! isLegalUTF8(source, extraBytesToRead+1)) { 343324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 344324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 345324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 346324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* 347324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver * The cases all fall through. See "Note A" below. 348324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver */ 349324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver switch (extraBytesToRead) { 350324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 351324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 352324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 3: ch += *source++; ch <<= 6; 353324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 2: ch += *source++; ch <<= 6; 354324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 1: ch += *source++; ch <<= 6; 355324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver case 0: ch += *source++; 356324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 357324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch -= offsetsFromUTF8[extraBytesToRead]; 358324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver 359324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (target >= targetEnd) { 360324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver source -= (extraBytesToRead+1); /* Back up source pointer! 
*/ 361324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = targetExhausted; break; 362324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 363324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 364324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* UTF-16 surrogate values are illegal in UTF-32 */ 365324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 366324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (flags == strictConversion) { 367324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver source -= (extraBytesToRead+1); /* return to the illegal value itself */ 368324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 369324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; 370324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { 371324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = UNI_REPLACEMENT_CHAR; 372324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 373324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { 374324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = (UTF16)ch; /* normal case */ 375324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 376324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else if (ch > UNI_MAX_UTF16) { 377324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (flags == strictConversion) { 378324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = sourceIllegal; 379324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver source -= (extraBytesToRead+1); /* return to the start */ 380324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver break; /* Bail out; shouldn't continue */ 381324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { 382324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = UNI_REPLACEMENT_CHAR; 383324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 384324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } else { 
385324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver /* target is a character in range 0xFFFF - 0x10FFFF. */ 386324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver if (target + 1 >= targetEnd) { 387324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver source -= (extraBytesToRead+1); /* Back up source pointer! */ 388324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver result = targetExhausted; break; 389324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 390324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver ch -= halfBase; 391324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 392324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 393324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 394324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver } 395324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *sourceStart = source; 396324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver *targetStart = target; 397324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver return result; 398324c4644fee44b9898524c09511bd33c3f12e2dfBen Gruver}

/* --------------------------------------------------------------------- */

/*
 * ConvertUTF32toUTF8
 *
 * Converts the UTF-32 code units in [*sourceStart, sourceEnd) to UTF-8,
 * writing the bytes to [*targetStart, targetEnd).
 *
 * On return, *sourceStart and *targetStart are advanced past the code
 * units consumed and the bytes written, so a caller can resume after
 * enlarging the output buffer or repairing the input.
 *
 * Result codes (the value set last is the one returned):
 *   conversionOK     every code unit was converted.
 *   sourceIllegal    strictConversion only: a surrogate value was found;
 *                    conversion stops with *sourceStart pointing at it.
 *                    Either mode: a value above UNI_MAX_LEGAL_UTF32 was
 *                    replaced by UNI_REPLACEMENT_CHAR; conversion
 *                    continues with the following code units.
 *   targetExhausted  the next character does not fit in the output;
 *                    *sourceStart points at that character and nothing
 *                    was written for it.
 */
ConversionResult ConvertUTF32toUTF8 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF8* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;
        ch = *source++;
        if (flags == strictConversion ) {
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        /*
         * Figure out how many bytes the result will require. Turn any
         * illegally large UTF32 value (> UNI_MAX_LEGAL_UTF32, i.e. beyond
         * the last plane) into the replacement character.
         */
        if (ch < (UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch <= UNI_MAX_LEGAL_UTF32) {
            bytesToWrite = 4;
        } else {
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
            result = sourceIllegal;
        }

        /*
         * Compare sizes instead of advancing the pointer first: forming
         * "target + bytesToWrite" when it lies beyond one-past-the-end of
         * the output buffer is undefined behaviour.
         */
        if (bytesToWrite > targetEnd - target) {
            --source; /* Back up source pointer! */
            result = targetExhausted;
            break;
        }
        /* Bytes are emitted last-to-first, so start from the far end. */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
                /* fall through */
            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
                /* fall through */
            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
                /* fall through */
            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
                break;
            default:
                break; /* bytesToWrite is always 1..4 here */
        }
        target += bytesToWrite;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

/* --------------------------------------------------------------------- */

/*
 * ConvertUTF8toUTF32
 *
 * Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-32 code
 * units written to [*targetStart, targetEnd).
 *
 * On return, *sourceStart and *targetStart are advanced past the bytes
 * consumed and the code units written.
 *
 * Result codes (the value set last is the one returned):
 *   conversionOK     every sequence was converted.
 *   sourceExhausted  the input ends in the middle of a sequence;
 *                    *sourceStart points at its lead byte.
 *   sourceIllegal    isLegalUTF8() rejected a sequence (both modes), or
 *                    strictConversion met an encoded surrogate; in both
 *                    cases conversion stops with *sourceStart at the start
 *                    of the offending sequence. Also set, without
 *                    stopping, when a decoded value exceeds
 *                    UNI_MAX_LEGAL_UTF32 and is replaced by
 *                    UNI_REPLACEMENT_CHAR.
 *   targetExhausted  no room for the next code unit; *sourceStart points
 *                    at the start of the sequence that did not fit.
 *
 * In lenient mode an encoded surrogate is replaced by
 * UNI_REPLACEMENT_CHAR and conversion continues.
 */
ConversionResult ConvertUTF8toUTF32 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        /*
         * Compare lengths rather than pointers: "source + extraBytesToRead"
         * may lie beyond one-past-the-end of the input for a truncated
         * sequence, and forming such a pointer is undefined behaviour.
         */
        if (extraBytesToRead >= sourceEnd - source) {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if (! isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }
        /*
         * The cases all fall through. See "Note A" below.
         */
        switch (extraBytesToRead) {
            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
                /* fall through */
            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
                /* fall through */
            case 3: ch += *source++; ch <<= 6;
                /* fall through */
            case 2: ch += *source++; ch <<= 6;
                /* fall through */
            case 1: ch += *source++; ch <<= 6;
                /* fall through */
            case 0: ch += *source++;
                break;
            default:
                break;
        }
        /* Remove the lead/continuation marker bits accumulated above. */
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead+1); /* Back up the source pointer! */
            result = targetExhausted;
            break;
        }
        if (ch <= UNI_MAX_LEGAL_UTF32) {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * above UNI_MAX_LEGAL_UTF32 is illegal.
             */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = ch;
            }
        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */