1/* 2********************************************************************** 3* Copyright (C) 2002-2009, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* file name: ucnv_u32.c 7* encoding: US-ASCII 8* tab size: 8 (not used) 9* indentation:4 10* 11* created on: 2002jul01 12* created by: Markus W. Scherer 13* 14* UTF-32 converter implementation. Used to be in ucnv_utf.c. 15*/ 16 17#include "unicode/utypes.h" 18 19#if !UCONFIG_NO_CONVERSION 20 21#include "unicode/ucnv.h" 22#include "ucnv_bld.h" 23#include "ucnv_cnv.h" 24#include "cmemory.h" 25 26#define MAXIMUM_UCS2 0x0000FFFF 27#define MAXIMUM_UTF 0x0010FFFF 28#define HALF_SHIFT 10 29#define HALF_BASE 0x0010000 30#define HALF_MASK 0x3FF 31#define SURROGATE_HIGH_START 0xD800 32#define SURROGATE_LOW_START 0xDC00 33 34/* -SURROGATE_LOW_START + HALF_BASE */ 35#define SURROGATE_LOW_BASE 9216 36 37enum { 38 UCNV_NEED_TO_WRITE_BOM=1 39}; 40 41/* UTF-32BE ----------------------------------------------------------------- */ 42 43static void 44T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args, 45 UErrorCode * err) 46{ 47 const unsigned char *mySource = (unsigned char *) args->source; 48 UChar *myTarget = args->target; 49 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 50 const UChar *targetLimit = args->targetLimit; 51 unsigned char *toUBytes = args->converter->toUBytes; 52 uint32_t ch, i; 53 54 /* Restore state of current sequence */ 55 if (args->converter->toUnicodeStatus && myTarget < targetLimit) { 56 i = args->converter->toULength; /* restore # of bytes consumed */ 57 args->converter->toULength = 0; 58 59 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ 60 args->converter->toUnicodeStatus = 0; 61 goto morebytes; 62 } 63 64 while (mySource < sourceLimit && myTarget < targetLimit) { 65 i = 0; 66 ch = 0; 67morebytes: 68 while (i < sizeof(uint32_t)) { 69 if (mySource < sourceLimit) { 70 ch = (ch << 8) | (uint8_t)(*mySource); 71 toUBytes[i++] = (char) *(mySource++); 72 } 73 else { 74 /* stores a partially calculated target*/ 75 /* + 1 to make 0 a valid character */ 76 args->converter->toUnicodeStatus = ch + 1; 77 args->converter->toULength = (int8_t) i; 78 goto donefornow; 79 } 80 } 81 82 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 83 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 84 if (ch <= MAXIMUM_UCS2) 85 { 86 /* fits in 16 bits */ 87 *(myTarget++) = (UChar) ch; 88 } 89 else { 90 /* write out the surrogates */ 91 *(myTarget++) = U16_LEAD(ch); 92 ch = U16_TRAIL(ch); 93 if (myTarget < targetLimit) { 94 *(myTarget++) = (UChar)ch; 95 } 96 else { 97 /* Put in overflow buffer (not handled here) */ 98 args->converter->UCharErrorBuffer[0] = (UChar) ch; 99 args->converter->UCharErrorBufferLength = 1; 100 *err = U_BUFFER_OVERFLOW_ERROR; 101 break; 102 } 103 } 104 } 105 else { 106 args->converter->toULength = (int8_t)i; 107 *err = U_ILLEGAL_CHAR_FOUND; 108 break; 109 } 110 } 111 112donefornow: 113 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 114 /* End of target buffer */ 115 *err = U_BUFFER_OVERFLOW_ERROR; 116 } 117 118 args->target = myTarget; 119 args->source = (const char *) mySource; 120} 121 122static void 123T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, 124 UErrorCode * err) 125{ 126 const unsigned char *mySource = (unsigned char *) args->source; 127 UChar *myTarget = args->target; 128 int32_t *myOffsets = args->offsets; 129 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 130 const UChar *targetLimit = args->targetLimit; 131 unsigned char *toUBytes = args->converter->toUBytes; 132 uint32_t ch, i; 133 int32_t offsetNum = 0; 134 135 /* Restore state of current sequence */ 136 if (args->converter->toUnicodeStatus && myTarget < targetLimit) { 137 i = args->converter->toULength; /* restore # of bytes consumed */ 138 args->converter->toULength = 0; 139 140 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ 141 args->converter->toUnicodeStatus = 0; 142 goto morebytes; 143 } 144 145 while (mySource < sourceLimit && myTarget < targetLimit) { 146 i = 0; 147 ch = 0; 148morebytes: 149 while (i < sizeof(uint32_t)) { 150 if (mySource < sourceLimit) { 151 ch = (ch << 8) | (uint8_t)(*mySource); 152 toUBytes[i++] = (char) *(mySource++); 153 } 154 else { 155 /* stores a partially calculated target*/ 156 /* + 1 to make 0 a valid character */ 157 args->converter->toUnicodeStatus = ch + 1; 158 args->converter->toULength = (int8_t) i; 159 goto donefornow; 160 } 161 } 162 163 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 164 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 165 if (ch <= MAXIMUM_UCS2) { 166 /* fits in 16 bits */ 167 *(myTarget++) = (UChar) ch; 168 *(myOffsets++) = offsetNum; 169 } 170 else { 171 /* write out the surrogates */ 172 *(myTarget++) = U16_LEAD(ch); 173 *myOffsets++ = offsetNum; 174 ch = U16_TRAIL(ch); 175 if (myTarget < targetLimit) 176 { 177 *(myTarget++) = (UChar)ch; 178 *(myOffsets++) = offsetNum; 179 } 180 else { 181 /* Put in overflow buffer (not handled here) */ 182 args->converter->UCharErrorBuffer[0] = (UChar) ch; 183 args->converter->UCharErrorBufferLength = 1; 184 *err = U_BUFFER_OVERFLOW_ERROR; 185 break; 186 } 187 } 188 } 189 else { 190 args->converter->toULength = (int8_t)i; 191 *err = U_ILLEGAL_CHAR_FOUND; 192 break; 193 } 194 offsetNum += i; 195 } 196 197donefornow: 198 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 199 { 200 /* End of target buffer */ 201 *err = U_BUFFER_OVERFLOW_ERROR; 202 } 203 204 args->target = myTarget; 205 args->source = (const char *) mySource; 206 args->offsets = myOffsets; 207} 208 209static void 210T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, 211 UErrorCode * err) 212{ 213 const UChar *mySource = args->source; 214 unsigned char *myTarget; 215 const UChar *sourceLimit = args->sourceLimit; 216 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 217 UChar32 ch, ch2; 218 unsigned int indexToWrite; 219 unsigned char temp[sizeof(uint32_t)]; 220 221 if(mySource >= sourceLimit) { 222 /* no input, nothing to do */ 223 return; 224 } 225 226 /* write the BOM if necessary */ 227 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 228 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; 229 ucnv_fromUWriteBytes(args->converter, 230 bom, 4, 231 &args->target, args->targetLimit, 232 &args->offsets, -1, 233 err); 234 args->converter->fromUnicodeStatus=0; 235 } 236 237 myTarget = (unsigned char *) args->target; 238 temp[0] = 0; 239 240 if (args->converter->fromUChar32) { 241 ch = args->converter->fromUChar32; 242 args->converter->fromUChar32 = 0; 243 goto lowsurogate; 244 } 245 246 while (mySource < sourceLimit && myTarget < targetLimit) { 247 ch = *(mySource++); 248 249 if (UTF_IS_SURROGATE(ch)) { 250 if (U_IS_LEAD(ch)) { 251lowsurogate: 252 if (mySource < sourceLimit) { 253 ch2 = *mySource; 254 if (U_IS_TRAIL(ch2)) { 255 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 256 mySource++; 257 } 258 else { 259 /* this is an unmatched trail code unit (2nd surrogate) */ 260 /* callback(illegal) */ 261 args->converter->fromUChar32 = ch; 262 *err = U_ILLEGAL_CHAR_FOUND; 263 break; 264 } 265 } 266 else { 267 /* ran out of source */ 268 args->converter->fromUChar32 = ch; 269 if (args->flush) { 270 /* this is an unmatched trail code unit (2nd surrogate) */ 271 /* callback(illegal) */ 272 *err = U_ILLEGAL_CHAR_FOUND; 273 } 274 break; 275 } 276 } 277 else { 278 /* this is an unmatched trail code unit (2nd surrogate) */ 279 /* callback(illegal) */ 280 args->converter->fromUChar32 = ch; 281 *err = U_ILLEGAL_CHAR_FOUND; 282 break; 283 } 284 } 285 286 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 287 temp[1] = (uint8_t) (ch >> 16 & 0x1F); 288 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 289 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 290 291 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { 292 if (myTarget < targetLimit) { 293 *(myTarget++) = temp[indexToWrite]; 294 } 295 else { 296 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 297 *err = U_BUFFER_OVERFLOW_ERROR; 298 } 299 } 300 } 301 302 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 303 *err = U_BUFFER_OVERFLOW_ERROR; 304 } 305 306 args->target = (char *) myTarget; 307 args->source = mySource; 308} 309 310static void 311T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, 312 UErrorCode * err) 313{ 314 const UChar *mySource = args->source; 315 unsigned char *myTarget; 316 int32_t *myOffsets; 317 const UChar *sourceLimit = args->sourceLimit; 318 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 319 UChar32 ch, ch2; 320 int32_t offsetNum = 0; 321 unsigned int indexToWrite; 322 unsigned char temp[sizeof(uint32_t)]; 323 324 if(mySource >= sourceLimit) { 325 /* no input, nothing to do */ 326 return; 327 } 328 329 /* write the BOM if necessary */ 330 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 331 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; 332 ucnv_fromUWriteBytes(args->converter, 333 bom, 4, 334 &args->target, args->targetLimit, 335 &args->offsets, -1, 336 err); 337 args->converter->fromUnicodeStatus=0; 338 } 339 340 myTarget = (unsigned char *) args->target; 341 myOffsets = args->offsets; 342 temp[0] = 0; 343 344 if (args->converter->fromUChar32) { 345 ch = args->converter->fromUChar32; 346 args->converter->fromUChar32 = 0; 347 goto lowsurogate; 348 } 349 350 while (mySource < sourceLimit && myTarget < targetLimit) { 351 ch = *(mySource++); 352 353 if (UTF_IS_SURROGATE(ch)) { 354 if (U_IS_LEAD(ch)) { 355lowsurogate: 356 if (mySource < sourceLimit) { 357 ch2 = *mySource; 358 if (U_IS_TRAIL(ch2)) { 359 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 360 mySource++; 361 } 362 else { 363 /* this is an unmatched trail code unit (2nd surrogate) */ 364 /* callback(illegal) */ 365 args->converter->fromUChar32 = ch; 366 *err = U_ILLEGAL_CHAR_FOUND; 367 break; 368 } 369 } 370 else { 371 /* ran out of source */ 372 args->converter->fromUChar32 = ch; 373 if (args->flush) { 374 /* this is an unmatched trail code unit (2nd surrogate) */ 375 /* callback(illegal) */ 376 *err = U_ILLEGAL_CHAR_FOUND; 377 } 378 break; 379 } 380 } 381 else { 382 /* this is an unmatched trail code unit (2nd surrogate) */ 383 /* callback(illegal) */ 384 args->converter->fromUChar32 = ch; 385 *err = U_ILLEGAL_CHAR_FOUND; 386 break; 387 } 388 } 389 390 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 391 temp[1] = (uint8_t) (ch >> 16 & 0x1F); 392 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 393 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 394 395 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { 396 if (myTarget < targetLimit) { 397 *(myTarget++) = temp[indexToWrite]; 398 *(myOffsets++) = offsetNum; 399 } 400 else { 401 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 402 *err = U_BUFFER_OVERFLOW_ERROR; 403 } 404 } 405 offsetNum = offsetNum + 1 + (temp[1] != 0); 406 } 407 408 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 409 *err = U_BUFFER_OVERFLOW_ERROR; 410 } 411 412 args->target = (char *) myTarget; 413 args->source = mySource; 414 args->offsets = myOffsets; 415} 416 417static UChar32 418T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, 419 UErrorCode* err) 420{ 421 const uint8_t *mySource; 422 UChar32 myUChar; 423 int32_t length; 424 425 mySource = (const uint8_t *)args->source; 426 if (mySource >= (const uint8_t *)args->sourceLimit) 427 { 428 /* no input */ 429 *err = U_INDEX_OUTOFBOUNDS_ERROR; 430 return 0xffff; 431 } 432 433 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); 434 if (length < 4) 435 { 436 /* got a partial character */ 437 uprv_memcpy(args->converter->toUBytes, mySource, length); 438 args->converter->toULength = (int8_t)length; 439 args->source = (const char *)(mySource + length); 440 *err = U_TRUNCATED_CHAR_FOUND; 441 return 0xffff; 442 } 443 444 /* Don't even try to do a direct cast because the value may be on an odd address. */ 445 myUChar = ((UChar32)mySource[0] << 24) 446 | ((UChar32)mySource[1] << 16) 447 | ((UChar32)mySource[2] << 8) 448 | ((UChar32)mySource[3]); 449 450 args->source = (const char *)(mySource + 4); 451 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { 452 return myUChar; 453 } 454 455 uprv_memcpy(args->converter->toUBytes, mySource, 4); 456 args->converter->toULength = 4; 457 458 *err = U_ILLEGAL_CHAR_FOUND; 459 return 0xffff; 460} 461 462static const UConverterImpl _UTF32BEImpl = { 463 UCNV_UTF32_BigEndian, 464 465 NULL, 466 NULL, 467 468 NULL, 469 NULL, 470 NULL, 471 472 T_UConverter_toUnicode_UTF32_BE, 473 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC, 474 T_UConverter_fromUnicode_UTF32_BE, 475 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, 476 T_UConverter_getNextUChar_UTF32_BE, 477 478 NULL, 479 NULL, 480 NULL, 481 NULL, 482 ucnv_getNonSurrogateUnicodeSet 483}; 484 485/* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ 486static const UConverterStaticData _UTF32BEStaticData = { 487 sizeof(UConverterStaticData), 488 "UTF-32BE", 489 1232, 490 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, 491 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, 492 0, 493 0, 494 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 495}; 496 497const UConverterSharedData _UTF32BEData = { 498 sizeof(UConverterSharedData), ~((uint32_t) 0), 499 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl, 500 0 501}; 502 503/* UTF-32LE ---------------------------------------------------------- */ 504 505static void 506T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args, 507 UErrorCode * err) 508{ 509 const unsigned char *mySource = (unsigned char *) args->source; 510 UChar *myTarget = args->target; 511 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 512 const UChar *targetLimit = args->targetLimit; 513 unsigned char *toUBytes = args->converter->toUBytes; 514 uint32_t ch, i; 515 516 /* Restore state of current sequence */ 517 if (args->converter->toUnicodeStatus && myTarget < targetLimit) 518 { 519 i = args->converter->toULength; /* restore # of bytes consumed */ 520 args->converter->toULength = 0; 521 522 /* Stores the previously calculated ch from a previous call*/ 523 ch = args->converter->toUnicodeStatus - 1; 524 args->converter->toUnicodeStatus = 0; 525 goto morebytes; 526 } 527 528 while (mySource < sourceLimit && myTarget < targetLimit) 529 { 530 i = 0; 531 ch = 0; 532morebytes: 533 while (i < sizeof(uint32_t)) 534 { 535 if (mySource < sourceLimit) 536 { 537 ch |= ((uint8_t)(*mySource)) << (i * 8); 538 toUBytes[i++] = (char) *(mySource++); 539 } 540 else 541 { 542 /* stores a partially calculated target*/ 543 /* + 1 to make 0 a valid character */ 544 args->converter->toUnicodeStatus = ch + 1; 545 args->converter->toULength = (int8_t) i; 546 goto donefornow; 547 } 548 } 549 550 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 551 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 552 if (ch <= MAXIMUM_UCS2) { 553 /* fits in 16 bits */ 554 *(myTarget++) = (UChar) ch; 555 } 556 else { 557 /* write out the surrogates */ 558 *(myTarget++) = U16_LEAD(ch); 559 ch = U16_TRAIL(ch); 560 if (myTarget < targetLimit) { 561 *(myTarget++) = (UChar)ch; 562 } 563 else { 564 /* Put in overflow buffer (not handled here) */ 565 args->converter->UCharErrorBuffer[0] = (UChar) ch; 566 args->converter->UCharErrorBufferLength = 1; 567 *err = U_BUFFER_OVERFLOW_ERROR; 568 break; 569 } 570 } 571 } 572 else { 573 args->converter->toULength = (int8_t)i; 574 *err = U_ILLEGAL_CHAR_FOUND; 575 break; 576 } 577 } 578 579donefornow: 580 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 581 { 582 /* End of target buffer */ 583 *err = U_BUFFER_OVERFLOW_ERROR; 584 } 585 586 args->target = myTarget; 587 args->source = (const char *) mySource; 588} 589 590static void 591T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, 592 UErrorCode * err) 593{ 594 const unsigned char *mySource = (unsigned char *) args->source; 595 UChar *myTarget = args->target; 596 int32_t *myOffsets = args->offsets; 597 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 598 const UChar *targetLimit = args->targetLimit; 599 unsigned char *toUBytes = args->converter->toUBytes; 600 uint32_t ch, i; 601 int32_t offsetNum = 0; 602 603 /* Restore state of current sequence */ 604 if (args->converter->toUnicodeStatus && myTarget < targetLimit) 605 { 606 i = args->converter->toULength; /* restore # of bytes consumed */ 607 args->converter->toULength = 0; 608 609 /* Stores the previously calculated ch from a previous call*/ 610 ch = args->converter->toUnicodeStatus - 1; 611 args->converter->toUnicodeStatus = 0; 612 goto morebytes; 613 } 614 615 while (mySource < sourceLimit && myTarget < targetLimit) 616 { 617 i = 0; 618 ch = 0; 619morebytes: 620 while (i < sizeof(uint32_t)) 621 { 622 if (mySource < sourceLimit) 623 { 624 ch |= ((uint8_t)(*mySource)) << (i * 8); 625 toUBytes[i++] = (char) *(mySource++); 626 } 627 else 628 { 629 /* stores a partially calculated target*/ 630 /* + 1 to make 0 a valid character */ 631 args->converter->toUnicodeStatus = ch + 1; 632 args->converter->toULength = (int8_t) i; 633 goto donefornow; 634 } 635 } 636 637 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) 638 { 639 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 640 if (ch <= MAXIMUM_UCS2) 641 { 642 /* fits in 16 bits */ 643 *(myTarget++) = (UChar) ch; 644 *(myOffsets++) = offsetNum; 645 } 646 else { 647 /* write out the surrogates */ 648 *(myTarget++) = U16_LEAD(ch); 649 *(myOffsets++) = offsetNum; 650 ch = U16_TRAIL(ch); 651 if (myTarget < targetLimit) 652 { 653 *(myTarget++) = (UChar)ch; 654 *(myOffsets++) = offsetNum; 655 } 656 else 657 { 658 /* Put in overflow buffer (not handled here) */ 659 args->converter->UCharErrorBuffer[0] = (UChar) ch; 660 args->converter->UCharErrorBufferLength = 1; 661 *err = U_BUFFER_OVERFLOW_ERROR; 662 break; 663 } 664 } 665 } 666 else 667 { 668 args->converter->toULength = (int8_t)i; 669 *err = U_ILLEGAL_CHAR_FOUND; 670 break; 671 } 672 offsetNum += i; 673 } 674 675donefornow: 676 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 677 { 678 /* End of target buffer */ 679 *err = U_BUFFER_OVERFLOW_ERROR; 680 } 681 682 args->target = myTarget; 683 args->source = (const char *) mySource; 684 args->offsets = myOffsets; 685} 686 687static void 688T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, 689 UErrorCode * err) 690{ 691 const UChar *mySource = args->source; 692 unsigned char *myTarget; 693 const UChar *sourceLimit = args->sourceLimit; 694 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 695 UChar32 ch, ch2; 696 unsigned int indexToWrite; 697 unsigned char temp[sizeof(uint32_t)]; 698 699 if(mySource >= sourceLimit) { 700 /* no input, nothing to do */ 701 return; 702 } 703 704 /* write the BOM if necessary */ 705 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 706 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; 707 ucnv_fromUWriteBytes(args->converter, 708 bom, 4, 709 &args->target, args->targetLimit, 710 &args->offsets, -1, 711 err); 712 args->converter->fromUnicodeStatus=0; 713 } 714 715 myTarget = (unsigned char *) args->target; 716 temp[3] = 0; 717 718 if (args->converter->fromUChar32) 719 { 720 ch = args->converter->fromUChar32; 721 args->converter->fromUChar32 = 0; 722 goto lowsurogate; 723 } 724 725 while (mySource < sourceLimit && myTarget < targetLimit) 726 { 727 ch = *(mySource++); 728 729 if (UTF_IS_SURROGATE(ch)) { 730 if (U_IS_LEAD(ch)) 731 { 732lowsurogate: 733 if (mySource < sourceLimit) 734 { 735 ch2 = *mySource; 736 if (U_IS_TRAIL(ch2)) { 737 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 738 mySource++; 739 } 740 else { 741 /* this is an unmatched trail code unit (2nd surrogate) */ 742 /* callback(illegal) */ 743 args->converter->fromUChar32 = ch; 744 *err = U_ILLEGAL_CHAR_FOUND; 745 break; 746 } 747 } 748 else { 749 /* ran out of source */ 750 args->converter->fromUChar32 = ch; 751 if (args->flush) { 752 /* this is an unmatched trail code unit (2nd surrogate) */ 753 /* callback(illegal) */ 754 *err = U_ILLEGAL_CHAR_FOUND; 755 } 756 break; 757 } 758 } 759 else { 760 /* this is an unmatched trail code unit (2nd surrogate) */ 761 /* callback(illegal) */ 762 args->converter->fromUChar32 = ch; 763 *err = U_ILLEGAL_CHAR_FOUND; 764 break; 765 } 766 } 767 768 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 769 temp[2] = (uint8_t) (ch >> 16 & 0x1F); 770 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 771 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 772 773 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) 774 { 775 if (myTarget < targetLimit) 776 { 777 *(myTarget++) = temp[indexToWrite]; 778 } 779 else 780 { 781 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 782 *err = U_BUFFER_OVERFLOW_ERROR; 783 } 784 } 785 } 786 787 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 788 { 789 *err = U_BUFFER_OVERFLOW_ERROR; 790 } 791 792 args->target = (char *) myTarget; 793 args->source = mySource; 794} 795 796static void 797T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, 798 UErrorCode * err) 799{ 800 const UChar *mySource = args->source; 801 unsigned char *myTarget; 802 int32_t *myOffsets; 803 const UChar *sourceLimit = args->sourceLimit; 804 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 805 UChar32 ch, ch2; 806 unsigned int indexToWrite; 807 unsigned char temp[sizeof(uint32_t)]; 808 int32_t offsetNum = 0; 809 810 if(mySource >= sourceLimit) { 811 /* no input, nothing to do */ 812 return; 813 } 814 815 /* write the BOM if necessary */ 816 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 817 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; 818 ucnv_fromUWriteBytes(args->converter, 819 bom, 4, 820 &args->target, args->targetLimit, 821 &args->offsets, -1, 822 err); 823 args->converter->fromUnicodeStatus=0; 824 } 825 826 myTarget = (unsigned char *) args->target; 827 myOffsets = args->offsets; 828 temp[3] = 0; 829 830 if (args->converter->fromUChar32) 831 { 832 ch = args->converter->fromUChar32; 833 args->converter->fromUChar32 = 0; 834 goto lowsurogate; 835 } 836 837 while (mySource < sourceLimit && myTarget < targetLimit) 838 { 839 ch = *(mySource++); 840 841 if (UTF_IS_SURROGATE(ch)) { 842 if (U_IS_LEAD(ch)) 843 { 844lowsurogate: 845 if (mySource < sourceLimit) 846 { 847 ch2 = *mySource; 848 if (U_IS_TRAIL(ch2)) 849 { 850 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 851 mySource++; 852 } 853 else { 854 /* this is an unmatched trail code unit (2nd surrogate) */ 855 /* callback(illegal) */ 856 args->converter->fromUChar32 = ch; 857 *err = U_ILLEGAL_CHAR_FOUND; 858 break; 859 } 860 } 861 else { 862 /* ran out of source */ 863 args->converter->fromUChar32 = ch; 864 if (args->flush) { 865 /* this is an unmatched trail code unit (2nd surrogate) */ 866 /* callback(illegal) */ 867 *err = U_ILLEGAL_CHAR_FOUND; 868 } 869 break; 870 } 871 } 872 else { 873 /* this is an unmatched trail code unit (2nd surrogate) */ 874 /* callback(illegal) */ 875 args->converter->fromUChar32 = ch; 876 *err = U_ILLEGAL_CHAR_FOUND; 877 break; 878 } 879 } 880 881 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 882 temp[2] = (uint8_t) (ch >> 16 & 0x1F); 883 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 884 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 885 886 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) 887 { 888 if (myTarget < targetLimit) 889 { 890 *(myTarget++) = temp[indexToWrite]; 891 *(myOffsets++) = offsetNum; 892 } 893 else 894 { 895 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 896 *err = U_BUFFER_OVERFLOW_ERROR; 897 } 898 } 899 offsetNum = offsetNum + 1 + (temp[2] != 0); 900 } 901 902 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 903 { 904 *err = U_BUFFER_OVERFLOW_ERROR; 905 } 906 907 args->target = (char *) myTarget; 908 args->source = mySource; 909 args->offsets = myOffsets; 910} 911 912static UChar32 913T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, 914 UErrorCode* err) 915{ 916 const uint8_t *mySource; 917 UChar32 myUChar; 918 int32_t length; 919 920 mySource = (const uint8_t *)args->source; 921 if (mySource >= (const uint8_t *)args->sourceLimit) 922 { 923 /* no input */ 924 *err = U_INDEX_OUTOFBOUNDS_ERROR; 925 return 0xffff; 926 } 927 928 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); 929 if (length < 4) 930 { 931 /* got a partial character */ 932 uprv_memcpy(args->converter->toUBytes, mySource, length); 933 args->converter->toULength = (int8_t)length; 934 args->source = (const char *)(mySource + length); 935 *err = U_TRUNCATED_CHAR_FOUND; 936 return 0xffff; 937 } 938 939 /* Don't even try to do a direct cast because the value may be on an odd address. */ 940 myUChar = ((UChar32)mySource[3] << 24) 941 | ((UChar32)mySource[2] << 16) 942 | ((UChar32)mySource[1] << 8) 943 | ((UChar32)mySource[0]); 944 945 args->source = (const char *)(mySource + 4); 946 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { 947 return myUChar; 948 } 949 950 uprv_memcpy(args->converter->toUBytes, mySource, 4); 951 args->converter->toULength = 4; 952 953 *err = U_ILLEGAL_CHAR_FOUND; 954 return 0xffff; 955} 956 957static const UConverterImpl _UTF32LEImpl = { 958 UCNV_UTF32_LittleEndian, 959 960 NULL, 961 NULL, 962 963 NULL, 964 NULL, 965 NULL, 966 967 T_UConverter_toUnicode_UTF32_LE, 968 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC, 969 T_UConverter_fromUnicode_UTF32_LE, 970 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, 971 T_UConverter_getNextUChar_UTF32_LE, 972 973 NULL, 974 NULL, 975 NULL, 976 NULL, 977 ucnv_getNonSurrogateUnicodeSet 978}; 979 980/* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ 981static const UConverterStaticData _UTF32LEStaticData = { 982 sizeof(UConverterStaticData), 983 "UTF-32LE", 984 1234, 985 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, 986 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE, 987 0, 988 0, 989 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 990}; 991 992 993const UConverterSharedData _UTF32LEData = { 994 sizeof(UConverterSharedData), ~((uint32_t) 0), 995 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl, 996 0 997}; 998 999/* UTF-32 (Detect BOM) ------------------------------------------------------ */ 1000 1001/* 1002 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE 1003 * accordingly. 1004 * 1005 * State values: 1006 * 0 initial state 1007 * 1 saw 00 1008 * 2 saw 00 00 1009 * 3 saw 00 00 FE 1010 * 4 - 1011 * 5 saw FF 1012 * 6 saw FF FE 1013 * 7 saw FF FE 00 1014 * 8 UTF-32BE mode 1015 * 9 UTF-32LE mode 1016 * 1017 * During detection: state&3==number of matching bytes so far. 1018 * 1019 * On output, emit U+FEFF as the first code point. 1020 */ 1021 1022static void 1023_UTF32Reset(UConverter *cnv, UConverterResetChoice choice) { 1024 if(choice<=UCNV_RESET_TO_UNICODE) { 1025 /* reset toUnicode: state=0 */ 1026 cnv->mode=0; 1027 } 1028 if(choice!=UCNV_RESET_TO_UNICODE) { 1029 /* reset fromUnicode: prepare to output the UTF-32PE BOM */ 1030 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1031 } 1032} 1033 1034static void 1035_UTF32Open(UConverter *cnv, 1036 UConverterLoadArgs *pArgs, 1037 UErrorCode *pErrorCode) { 1038 _UTF32Reset(cnv, UCNV_RESET_BOTH); 1039} 1040 1041static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 }; 1042 1043static void 1044_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1045 UErrorCode *pErrorCode) { 1046 UConverter *cnv=pArgs->converter; 1047 const char *source=pArgs->source; 1048 const char *sourceLimit=pArgs->sourceLimit; 1049 int32_t *offsets=pArgs->offsets; 1050 1051 int32_t state, offsetDelta; 1052 char b; 1053 1054 state=cnv->mode; 1055 1056 /* 1057 * If we detect a BOM in this buffer, then we must add the BOM size to the 1058 * offsets because the actual converter function will not see and count the BOM. 1059 * offsetDelta will have the number of the BOM bytes that are in the current buffer. 1060 */ 1061 offsetDelta=0; 1062 1063 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { 1064 switch(state) { 1065 case 0: 1066 b=*source; 1067 if(b==0) { 1068 state=1; /* could be 00 00 FE FF */ 1069 } else if(b==(char)0xff) { 1070 state=5; /* could be FF FE 00 00 */ 1071 } else { 1072 state=8; /* default to UTF-32BE */ 1073 continue; 1074 } 1075 ++source; 1076 break; 1077 case 1: 1078 case 2: 1079 case 3: 1080 case 5: 1081 case 6: 1082 case 7: 1083 if(*source==utf32BOM[state]) { 1084 ++state; 1085 ++source; 1086 if(state==4) { 1087 state=8; /* detect UTF-32BE */ 1088 offsetDelta=(int32_t)(source-pArgs->source); 1089 } else if(state==8) { 1090 state=9; /* detect UTF-32LE */ 1091 offsetDelta=(int32_t)(source-pArgs->source); 1092 } 1093 } else { 1094 /* switch to UTF-32BE and pass the previous bytes */ 1095 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */ 1096 1097 /* reset the source */ 1098 source=pArgs->source; 1099 1100 if(count==(state&3)) { 1101 /* simple: all in the same buffer, just reset source */ 1102 } else { 1103 UBool oldFlush=pArgs->flush; 1104 1105 /* some of the bytes are from a previous buffer, replay those first */ 1106 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ 1107 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */ 1108 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */ 1109 1110 /* no offsets: bytes from previous buffer, and not enough for output */ 1111 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1112 1113 /* restore real pointers; pArgs->source will be set in case 8/9 */ 1114 pArgs->sourceLimit=sourceLimit; 1115 pArgs->flush=oldFlush; 1116 } 1117 state=8; 1118 continue; 1119 } 1120 break; 1121 case 8: 1122 /* call UTF-32BE */ 1123 pArgs->source=source; 1124 if(offsets==NULL) { 1125 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1126 } else { 1127 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode); 1128 } 1129 source=pArgs->source; 1130 break; 1131 case 9: 1132 /* call UTF-32LE */ 1133 pArgs->source=source; 1134 if(offsets==NULL) { 1135 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); 1136 } else { 1137 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode); 1138 } 1139 source=pArgs->source; 1140 break; 1141 default: 1142 break; /* does not occur */ 1143 } 1144 } 1145 1146 /* add BOM size to offsets - see comment at offsetDelta declaration */ 1147 if(offsets!=NULL && offsetDelta!=0) { 1148 int32_t *offsetsLimit=pArgs->offsets; 1149 while(offsets<offsetsLimit) { 1150 *offsets++ += offsetDelta; 1151 } 1152 } 1153 1154 pArgs->source=source; 1155 1156 if(source==sourceLimit && pArgs->flush) { 1157 /* handle truncated input */ 1158 switch(state) { 1159 case 0: 1160 break; /* no input at all, nothing to do */ 1161 case 8: 1162 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1163 break; 1164 case 9: 1165 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); 1166 break; 1167 default: 1168 /* handle 0<state<8: call UTF-32BE with too-short input */ 1169 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ 1170 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */ 1171 1172 /* no offsets: not enough for output */ 1173 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1174 pArgs->source=source; 1175 pArgs->sourceLimit=sourceLimit; 1176 state=8; 1177 break; 1178 } 1179 } 1180 1181 cnv->mode=state; 1182} 1183 1184static UChar32 1185_UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs, 1186 UErrorCode *pErrorCode) { 1187 switch(pArgs->converter->mode) { 1188 case 8: 1189 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode); 1190 case 9: 1191 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode); 1192 default: 1193 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1194 } 1195} 1196 1197static const UConverterImpl _UTF32Impl = { 1198 UCNV_UTF32, 1199 1200 NULL, 1201 NULL, 1202 1203 _UTF32Open, 1204 NULL, 1205 _UTF32Reset, 1206 1207 _UTF32ToUnicodeWithOffsets, 1208 _UTF32ToUnicodeWithOffsets, 1209#if U_IS_BIG_ENDIAN 1210 T_UConverter_fromUnicode_UTF32_BE, 1211 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, 1212#else 1213 T_UConverter_fromUnicode_UTF32_LE, 1214 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, 1215#endif 1216 _UTF32GetNextUChar, 1217 1218 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1219 NULL, 1220 NULL, 1221 NULL, 1222 ucnv_getNonSurrogateUnicodeSet 1223}; 1224 1225/* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */ 1226static const UConverterStaticData _UTF32StaticData = { 1227 sizeof(UConverterStaticData), 1228 "UTF-32", 1229 1236, 1230 UCNV_IBM, UCNV_UTF32, 4, 4, 1231#if U_IS_BIG_ENDIAN 1232 { 0, 0, 0xff, 0xfd }, 4, 1233#else 1234 { 0xfd, 0xff, 0, 0 }, 4, 1235#endif 1236 FALSE, FALSE, 1237 0, 1238 0, 1239 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1240}; 1241 1242const UConverterSharedData _UTF32Data = { 1243 sizeof(UConverterSharedData), ~((uint32_t) 0), 1244 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl, 1245 0 1246}; 1247 1248#endif 1249