1/* 2 * Copyright 2001-2004 Unicode, Inc. 3 * 4 * Disclaimer 5 * 6 * This source code is provided as is by Unicode, Inc. No claims are 7 * made as to fitness for any particular purpose. No warranties of any 8 * kind are expressed or implied. The recipient agrees to determine 9 * applicability of information provided. If this file has been 10 * purchased on magnetic or optical media from Unicode, Inc., the 11 * sole remedy for any claim will be exchange of defective media 12 * within 90 days of receipt. 13 * 14 * Limitations on Rights to Redistribute This Code 15 * 16 * Unicode, Inc. hereby grants the right to freely use the information 17 * supplied in this file in the creation of products supporting the 18 * Unicode Standard, and to make copies of this file in any form 19 * for internal or external distribution as long as this notice 20 * remains attached. 21 */ 22 23/* --------------------------------------------------------------------- 24 25 Conversions between UTF32, UTF-16, and UTF-8. Source code file. 26 Author: Mark E. Davis, 1994. 27 Rev History: Rick McGowan, fixes & updates May 2001. 28 Sept 2001: fixed const & error conditions per 29 mods suggested by S. Parent & A. Lillich. 30 June 2002: Tim Dodd added detection and handling of incomplete 31 source sequences, enhanced error detection, added casts 32 to eliminate compiler warnings. 33 July 2003: slight mods to back out aggressive FFFE detection. 34 Jan 2004: updated switches in from-UTF8 conversions. 35 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 36 37 See the header file "ConvertUTF.h" for complete documentation. 38 39------------------------------------------------------------------------ */ 40 41 42#include "antlr3convertutf.h" 43 44#ifdef CVTUTF_DEBUG 45#include <stdio.h> 46#endif 47 48 49 50/* --------------------------------------------------------------------- */ 51 52ConversionResult ConvertUTF32toUTF16 ( 53 const UTF32** sourceStart, const UTF32* sourceEnd, 54 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 55 ConversionResult result = conversionOK; 56 const UTF32* source = *sourceStart; 57 UTF16* target = *targetStart; 58 while (source < sourceEnd) { 59 UTF32 ch; 60 if (target >= targetEnd) { 61 result = targetExhausted; break; 62 } 63 ch = *source++; 64 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 65 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 66 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 67 if (flags == strictConversion) { 68 --source; /* return to the illegal value itself */ 69 result = sourceIllegal; 70 break; 71 } else { 72 *target++ = UNI_REPLACEMENT_CHAR; 73 } 74 } else { 75 *target++ = (UTF16)ch; /* normal case */ 76 } 77 } else if (ch > UNI_MAX_LEGAL_UTF32) { 78 if (flags == strictConversion) { 79 result = sourceIllegal; 80 } else { 81 *target++ = UNI_REPLACEMENT_CHAR; 82 } 83 } else { 84 /* target is a character in range 0xFFFF - 0x10FFFF. */ 85 if (target + 1 >= targetEnd) { 86 --source; /* Back up source pointer! */ 87 result = targetExhausted; break; 88 } 89 ch -= halfBase; 90 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 91 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 92 } 93 } 94 *sourceStart = source; 95 *targetStart = target; 96 return result; 97} 98 99/* --------------------------------------------------------------------- */ 100 101ConversionResult ConvertUTF16toUTF32 ( 102 const UTF16** sourceStart, const UTF16* sourceEnd, 103 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 104 ConversionResult result = conversionOK; 105 const UTF16* source = *sourceStart; 106 UTF32* target = *targetStart; 107 UTF32 ch, ch2; 108 while (source < sourceEnd) { 109 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 110 ch = *source++; 111 /* If we have a surrogate pair, convert to UTF32 first. */ 112 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 113 /* If the 16 bits following the high surrogate are in the source buffer... */ 114 if (source < sourceEnd) { 115 ch2 = *source; 116 /* If it's a low surrogate, convert to UTF32. */ 117 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 118 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 119 + (ch2 - UNI_SUR_LOW_START) + halfBase; 120 ++source; 121 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 122 --source; /* return to the illegal value itself */ 123 result = sourceIllegal; 124 break; 125 } 126 } else { /* We don't have the 16 bits following the high surrogate. */ 127 --source; /* return to the high surrogate */ 128 result = sourceExhausted; 129 break; 130 } 131 } else if (flags == strictConversion) { 132 /* UTF-16 surrogate values are illegal in UTF-32 */ 133 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 134 --source; /* return to the illegal value itself */ 135 result = sourceIllegal; 136 break; 137 } 138 } 139 if (target >= targetEnd) { 140 source = oldSource; /* Back up source pointer! */ 141 result = targetExhausted; break; 142 } 143 *target++ = ch; 144 } 145 *sourceStart = source; 146 *targetStart = target; 147#ifdef CVTUTF_DEBUG 148if (result == sourceIllegal) { 149 ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 150 fflush(stderr); 151} 152#endif 153 return result; 154} 155 156/* --------------------------------------------------------------------- */ 157 158/* 159 * Index into the table below with the first byte of a UTF-8 sequence to 160 * get the number of trailing bytes that are supposed to follow it. 161 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 162 * left as-is for anyone who may want to do such conversion, which was 163 * allowed in earlier algorithms. 164 */ 165static const char trailingBytesForUTF8[256] = { 166 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 167 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 168 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 169 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 170 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 171 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 172 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 173 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 174}; 175 176/* 177 * Magic values subtracted from a buffer value during UTF8 conversion. 178 * This table contains as many values as there might be trailing bytes 179 * in a UTF-8 sequence. 180 */ 181static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 182 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 183 184/* 185 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 186 * into the first byte, depending on how many bytes follow. There are 187 * as many entries in this table as there are UTF-8 sequence types. 188 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 189 * for *legal* UTF-8 will be 4 or fewer bytes total. 190 */ 191static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 192 193/* --------------------------------------------------------------------- */ 194 195/* The interface converts a whole buffer to avoid function-call overhead. 196 * Constants have been gathered. Loops & conditionals have been removed as 197 * much as possible for efficiency, in favor of drop-through switches. 198 * (See "Note A" at the bottom of the file for equivalent code.) 199 * If your compiler supports it, the "isLegalUTF8" call can be turned 200 * into an inline function. 201 */ 202 203/* --------------------------------------------------------------------- */ 204 205ConversionResult ConvertUTF16toUTF8 ( 206 const UTF16** sourceStart, const UTF16* sourceEnd, 207 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 208 ConversionResult result = conversionOK; 209 const UTF16* source = *sourceStart; 210 UTF8* target = *targetStart; 211 while (source < sourceEnd) { 212 UTF32 ch; 213 unsigned short bytesToWrite = 0; 214 const UTF32 byteMask = 0xBF; 215 const UTF32 byteMark = 0x80; 216 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 217 ch = *source++; 218 /* If we have a surrogate pair, convert to UTF32 first. */ 219 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 220 /* If the 16 bits following the high surrogate are in the source buffer... */ 221 if (source < sourceEnd) { 222 UTF32 ch2 = *source; 223 /* If it's a low surrogate, convert to UTF32. */ 224 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 225 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 226 + (ch2 - UNI_SUR_LOW_START) + halfBase; 227 ++source; 228 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 229 --source; /* return to the illegal value itself */ 230 result = sourceIllegal; 231 break; 232 } 233 } else { /* We don't have the 16 bits following the high surrogate. */ 234 --source; /* return to the high surrogate */ 235 result = sourceExhausted; 236 break; 237 } 238 } else if (flags == strictConversion) { 239 /* UTF-16 surrogate values are illegal in UTF-32 */ 240 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 241 --source; /* return to the illegal value itself */ 242 result = sourceIllegal; 243 break; 244 } 245 } 246 /* Figure out how many bytes the result will require */ 247 if (ch < (UTF32)0x80) { bytesToWrite = 1; 248 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 249 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 250 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 251 } else { bytesToWrite = 3; 252 ch = UNI_REPLACEMENT_CHAR; 253 } 254 255 target += bytesToWrite; 256 if (target > targetEnd) { 257 source = oldSource; /* Back up source pointer! */ 258 target -= bytesToWrite; result = targetExhausted; break; 259 } 260 switch (bytesToWrite) { /* note: everything falls through. */ 261 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 262 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 263 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 264 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 265 } 266 target += bytesToWrite; 267 } 268 *sourceStart = source; 269 *targetStart = target; 270 return result; 271} 272 273/* --------------------------------------------------------------------- */ 274 275/* 276 * Utility routine to tell whether a sequence of bytes is legal UTF-8. 277 * This must be called with the length pre-determined by the first byte. 278 * If not calling this from ConvertUTF8to*, then the length can be set by: 279 * length = trailingBytesForUTF8[*source]+1; 280 * and the sequence is illegal right away if there aren't that many bytes 281 * available. 282 * If presented with a length > 4, this returns false. The Unicode 283 * definition of UTF-8 goes up to 4-byte sequences. 284 */ 285 286static ANTLR3_BOOLEAN 287isLegalUTF8(const UTF8 *source, int length) { 288 UTF8 a; 289 const UTF8 *srcptr = source+length; 290 switch (length) { 291 default: return false; 292 /* Everything else falls through when "true"... */ 293 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 294 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 295 case 2: if ((a = (*--srcptr)) > 0xBF) return false; 296 297 switch (*source) { 298 /* no fall-through in this inner switch */ 299 case 0xE0: if (a < 0xA0) return false; break; 300 case 0xED: if (a > 0x9F) return false; break; 301 case 0xF0: if (a < 0x90) return false; break; 302 case 0xF4: if (a > 0x8F) return false; break; 303 default: if (a < 0x80) return false; 304 } 305 306 case 1: if (*source >= 0x80 && *source < 0xC2) return false; 307 } 308 if (*source > 0xF4) return false; 309 return true; 310} 311 312/* --------------------------------------------------------------------- */ 313 314/* 315 * Exported function to return whether a UTF-8 sequence is legal or not. 316 * This is not used here; it's just exported. 317 */ 318ANTLR3_BOOLEAN 319isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 320 int length = trailingBytesForUTF8[*source]+1; 321 if (source+length > sourceEnd) { 322 return false; 323 } 324 return isLegalUTF8(source, length); 325} 326 327/* --------------------------------------------------------------------- */ 328 329ConversionResult ConvertUTF8toUTF16 ( 330 const UTF8** sourceStart, const UTF8* sourceEnd, 331 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 332 ConversionResult result = conversionOK; 333 const UTF8* source = *sourceStart; 334 UTF16* target = *targetStart; 335 while (source < sourceEnd) { 336 UTF32 ch = 0; 337 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 338 if (source + extraBytesToRead >= sourceEnd) { 339 result = sourceExhausted; break; 340 } 341 /* Do this check whether lenient or strict */ 342 if (! isLegalUTF8(source, extraBytesToRead+1)) { 343 result = sourceIllegal; 344 break; 345 } 346 /* 347 * The cases all fall through. See "Note A" below. 348 */ 349 switch (extraBytesToRead) { 350 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 351 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 352 case 3: ch += *source++; ch <<= 6; 353 case 2: ch += *source++; ch <<= 6; 354 case 1: ch += *source++; ch <<= 6; 355 case 0: ch += *source++; 356 } 357 ch -= offsetsFromUTF8[extraBytesToRead]; 358 359 if (target >= targetEnd) { 360 source -= (extraBytesToRead+1); /* Back up source pointer! */ 361 result = targetExhausted; break; 362 } 363 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 364 /* UTF-16 surrogate values are illegal in UTF-32 */ 365 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 366 if (flags == strictConversion) { 367 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 368 result = sourceIllegal; 369 break; 370 } else { 371 *target++ = UNI_REPLACEMENT_CHAR; 372 } 373 } else { 374 *target++ = (UTF16)ch; /* normal case */ 375 } 376 } else if (ch > UNI_MAX_UTF16) { 377 if (flags == strictConversion) { 378 result = sourceIllegal; 379 source -= (extraBytesToRead+1); /* return to the start */ 380 break; /* Bail out; shouldn't continue */ 381 } else { 382 *target++ = UNI_REPLACEMENT_CHAR; 383 } 384 } else { 385 /* target is a character in range 0xFFFF - 0x10FFFF. */ 386 if (target + 1 >= targetEnd) { 387 source -= (extraBytesToRead+1); /* Back up source pointer! */ 388 result = targetExhausted; break; 389 } 390 ch -= halfBase; 391 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 392 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 393 } 394 } 395 *sourceStart = source; 396 *targetStart = target; 397 return result; 398} 399 400/* --------------------------------------------------------------------- */ 401 402ConversionResult ConvertUTF32toUTF8 ( 403 const UTF32** sourceStart, const UTF32* sourceEnd, 404 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 405 ConversionResult result = conversionOK; 406 const UTF32* source = *sourceStart; 407 UTF8* target = *targetStart; 408 while (source < sourceEnd) { 409 UTF32 ch; 410 unsigned short bytesToWrite = 0; 411 const UTF32 byteMask = 0xBF; 412 const UTF32 byteMark = 0x80; 413 ch = *source++; 414 if (flags == strictConversion ) { 415 /* UTF-16 surrogate values are illegal in UTF-32 */ 416 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 417 --source; /* return to the illegal value itself */ 418 result = sourceIllegal; 419 break; 420 } 421 } 422 /* 423 * Figure out how many bytes the result will require. Turn any 424 * illegally large UTF32 things (> Plane 17) into replacement chars. 425 */ 426 if (ch < (UTF32)0x80) { bytesToWrite = 1; 427 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 428 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 429 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 430 } else { bytesToWrite = 3; 431 ch = UNI_REPLACEMENT_CHAR; 432 result = sourceIllegal; 433 } 434 435 target += bytesToWrite; 436 if (target > targetEnd) { 437 --source; /* Back up source pointer! */ 438 target -= bytesToWrite; result = targetExhausted; break; 439 } 440 switch (bytesToWrite) { /* note: everything falls through. */ 441 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 442 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 443 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 444 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 445 } 446 target += bytesToWrite; 447 } 448 *sourceStart = source; 449 *targetStart = target; 450 return result; 451} 452 453/* --------------------------------------------------------------------- */ 454 455ConversionResult ConvertUTF8toUTF32 ( 456 const UTF8** sourceStart, const UTF8* sourceEnd, 457 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 458 ConversionResult result = conversionOK; 459 const UTF8* source = *sourceStart; 460 UTF32* target = *targetStart; 461 while (source < sourceEnd) { 462 UTF32 ch = 0; 463 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 464 if (source + extraBytesToRead >= sourceEnd) { 465 result = sourceExhausted; break; 466 } 467 /* Do this check whether lenient or strict */ 468 if (! isLegalUTF8(source, extraBytesToRead+1)) { 469 result = sourceIllegal; 470 break; 471 } 472 /* 473 * The cases all fall through. See "Note A" below. 474 */ 475 switch (extraBytesToRead) { 476 case 5: ch += *source++; ch <<= 6; 477 case 4: ch += *source++; ch <<= 6; 478 case 3: ch += *source++; ch <<= 6; 479 case 2: ch += *source++; ch <<= 6; 480 case 1: ch += *source++; ch <<= 6; 481 case 0: ch += *source++; 482 } 483 ch -= offsetsFromUTF8[extraBytesToRead]; 484 485 if (target >= targetEnd) { 486 source -= (extraBytesToRead+1); /* Back up the source pointer! */ 487 result = targetExhausted; break; 488 } 489 if (ch <= UNI_MAX_LEGAL_UTF32) { 490 /* 491 * UTF-16 surrogate values are illegal in UTF-32, and anything 492 * over Plane 17 (> 0x10FFFF) is illegal. 493 */ 494 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 495 if (flags == strictConversion) { 496 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 497 result = sourceIllegal; 498 break; 499 } else { 500 *target++ = UNI_REPLACEMENT_CHAR; 501 } 502 } else { 503 *target++ = ch; 504 } 505 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 506 result = sourceIllegal; 507 *target++ = UNI_REPLACEMENT_CHAR; 508 } 509 } 510 *sourceStart = source; 511 *targetStart = target; 512 return result; 513} 514 515/* --------------------------------------------------------------------- 516 517 Note A. 518 The fall-through switches in UTF-8 reading code save a 519 temp variable, some decrements & conditionals. The switches 520 are equivalent to the following loop: 521 { 522 int tmpBytesToRead = extraBytesToRead+1; 523 do { 524 ch += *source++; 525 --tmpBytesToRead; 526 if (tmpBytesToRead) ch <<= 6; 527 } while (tmpBytesToRead > 0); 528 } 529 In UTF-8 writing code, the switches on "bytesToWrite" are 530 similarly unrolled loops. 531 532 --------------------------------------------------------------------- */ 533