1/************************************************************************** 2* 3* Copyright (C) 2000-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5* 6*************************************************************************** 7* file name: convsamp.c 8* encoding: ASCII (7-bit) 9* 10* created on: 2000may30 11* created by: Steven R. Loomis 12* 13* Sample code for the ICU conversion routines. 14* 15* Note: Nothing special is needed to build this sample. Link with 16* the icu UC and icu I18N libraries. 17* 18* I use 'assert' for error checking, you probably will want 19* something more flexible. '***BEGIN SAMPLE***' and 20* '***END SAMPLE***' mark pieces suitable for stand alone 21* code snippets. 22* 23* 24* Each test can define it's own BUFFERSIZE 25* 26*/ 27 28#define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ 29 30#include <stdio.h> 31#include <ctype.h> /* for isspace, etc. */ 32#include <assert.h> 33#include <string.h> 34#include <stdlib.h> /* malloc */ 35 36#include "unicode/utypes.h" /* Basic ICU data types */ 37#include "unicode/ucnv.h" /* C Converter API */ 38#include "unicode/ustring.h" /* some more string fcns*/ 39#include "unicode/uchar.h" /* char names */ 40#include "unicode/uloc.h" 41#include "unicode/unistr.h" 42 43#include "flagcb.h" 44 45/* Some utility functions */ 46 47static const UChar kNone[] = { 0x0000 }; 48 49#define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} 50 51/* Print a UChar if possible, in seven characters. */ 52void prettyPrintUChar(UChar c) 53{ 54 if( (c <= 0x007F) && 55 (isgraph(c)) ) { 56 printf(" '%c' ", (char)(0x00FF&c)); 57 } else if ( c > 0x007F ) { 58 char buf[1000]; 59 UErrorCode status = U_ZERO_ERROR; 60 int32_t o; 61 62 o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 1000, &status); 63 if(U_SUCCESS(status) && (o>0) ) { 64 buf[6] = 0; 65 printf("%7s", buf); 66 } else { 67 o = u_charName(c, U_UNICODE_10_CHAR_NAME, buf, 1000, &status); 68 if(U_SUCCESS(status) && (o>0)) { 69 buf[5] = 0; 70 printf("~%6s", buf); 71 } 72 else { 73 printf(" ??????"); 74 } 75 } 76 } else { 77 switch((char)(c & 0x007F)) { 78 case ' ': 79 printf(" ' ' "); 80 break; 81 case '\t': 82 printf(" \\t "); 83 break; 84 case '\n': 85 printf(" \\n "); 86 break; 87 default: 88 printf(" _ "); 89 break; 90 } 91 } 92} 93 94 95void printUChars(const char *name = "?", 96 const UChar *uch = kNone, 97 int32_t len = -1 ) 98{ 99 int32_t i; 100 101 if( (len == -1) && (uch) ) { 102 len = u_strlen(uch); 103 } 104 105 printf("%5s: ", name); 106 for( i = 0; i <len; i++) { 107 printf("%-6d ", i); 108 } 109 printf("\n"); 110 111 printf("%5s: ", "uni"); 112 for( i = 0; i <len; i++) { 113 printf("\\u%04X ", (int)uch[i]); 114 } 115 printf("\n"); 116 117 printf("%5s:", "ch"); 118 for( i = 0; i <len; i++) { 119 prettyPrintUChar(uch[i]); 120 } 121 printf("\n"); 122} 123 124void printBytes(const char *name = "?", 125 const char *uch = "", 126 int32_t len = -1 ) 127{ 128 int32_t i; 129 130 if( (len == -1) && (uch) ) { 131 len = strlen(uch); 132 } 133 134 printf("%5s: ", name); 135 for( i = 0; i <len; i++) { 136 printf("%-4d ", i); 137 } 138 printf("\n"); 139 140 printf("%5s: ", "uni"); 141 for( i = 0; i <len; i++) { 142 printf("\\x%02X ", 0x00FF & (int)uch[i]); 143 } 144 printf("\n"); 145 146 printf("%5s:", "ch"); 147 for( i = 0; i <len; i++) { 148 if(isgraph(0x00FF & (int)uch[i])) { 149 printf(" '%c' ", (char)uch[i]); 150 } else { 151 printf(" "); 152 } 153 } 154 printf("\n"); 155} 156 157void printUChar(UChar32 ch32) 158{ 159 if(ch32 > 0xFFFF) { 160 printf("ch: U+%06X\n", ch32); 161 } 162 else { 163 UChar ch = (UChar)ch32; 164 printUChars("C", &ch, 1); 165 } 166} 167 168/******************************************************************* 169 Very simple C sample to convert the word 'Moscow' in Russian in Unicode, 170 followed by an exclamation mark (!) into the KOI8-R Russian code page. 171 172 This example first creates a UChar String out of the Unicode chars. 173 174 targetSize must be set to the amount of space available in the target 175 buffer. After fromUChars is called, 176 len will contain the number of bytes in target[] which were 177 used in the resulting codepage. In this case, there is a 1:1 mapping 178 between the input and output characters. The exclamation mark has the 179 same value in both KOI8-R and Unicode. 180 181 src: 0 1 2 3 4 5 6 182 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 183 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' 184 185 targ: 0 1 2 3 4 5 6 186 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 187 ch: '!' 188 189 190Converting FROM unicode 191 to koi8-r. 192 You must call ucnv_close to clean up the memory used by the 193 converter. 194 195 'len' returns the number of OUTPUT bytes resulting from the 196 conversion. 197 */ 198 199UErrorCode convsample_02() 200{ 201 printf("\n\n==============================================\n" 202 "Sample 02: C: simple Unicode -> koi8-r conversion\n"); 203 204 205 // **************************** START SAMPLE ******************* 206 // "cat<cat>OK" 207 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, 208 0x0430, 0x0021, 0x0000 }; 209 char target[100]; 210 UErrorCode status = U_ZERO_ERROR; 211 UConverter *conv; 212 int32_t len; 213 214 // set up the converter 215 conv = ucnv_open("koi8-r", &status); 216 assert(U_SUCCESS(status)); 217 218 // convert to koi8-r 219 len = ucnv_fromUChars(conv, target, 100, source, -1, &status); 220 assert(U_SUCCESS(status)); 221 222 // close the converter 223 ucnv_close(conv); 224 225 // ***************************** END SAMPLE ******************** 226 227 // Print it out 228 printUChars("src", source); 229 printf("\n"); 230 printBytes("targ", target, len); 231 232 return U_ZERO_ERROR; 233} 234 235 236UErrorCode convsample_03() 237{ 238 printf("\n\n==============================================\n" 239 "Sample 03: C: print out all converters\n"); 240 241 int32_t count; 242 int32_t i; 243 244 // **************************** START SAMPLE ******************* 245 count = ucnv_countAvailable(); 246 printf("Available converters: %d\n", count); 247 248 for(i=0;i<count;i++) 249 { 250 printf("%s ", ucnv_getAvailableName(i)); 251 } 252 253 // ***************************** END SAMPLE ******************** 254 255 printf("\n"); 256 257 return U_ZERO_ERROR; 258} 259 260 261 262#define BUFFERSIZE 17 /* make it interesting :) */ 263 264/* 265 Converting from a codepage to Unicode in bulk.. 266 What is the best way to determine the buffer size? 267 268 The 'buffersize' is in bytes of input. 269 For a given converter, divinding this by the minimum char size 270 give you the maximum number of Unicode characters that could be 271 expected for a given number of input bytes. 272 see: ucnv_getMinCharSize() 273 274 For example, a single byte codepage like 'Latin-3' has a 275 minimum char size of 1. (It takes at least 1 byte to represent 276 each Unicode char.) So the unicode buffer has the same number of 277 UChars as the input buffer has bytes. 278 279 In a strictly double byte codepage such as cp1362 (Windows 280 Korean), the minimum char size is 2. So, only half as many Unicode 281 chars as bytes are needed. 282 283 This work to calculate the buffer size is an optimization. Any 284 size of input and output buffer can be used, as long as the 285 program handles the following cases: If the input buffer is empty, 286 the source pointer will be equal to sourceLimit. If the output 287 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. 288 */ 289 290UErrorCode convsample_05() 291{ 292 printf("\n\n==============================================\n" 293 "Sample 05: C: count the number of letters in a UTF-8 document\n"); 294 295 FILE *f; 296 int32_t count; 297 char inBuf[BUFFERSIZE]; 298 const char *source; 299 const char *sourceLimit; 300 UChar *uBuf; 301 UChar *target; 302 UChar *targetLimit; 303 UChar *p; 304 int32_t uBufSize = 0; 305 UConverter *conv; 306 UErrorCode status = U_ZERO_ERROR; 307 uint32_t letters=0, total=0; 308 309 f = fopen("data01.txt", "r"); 310 if(!f) 311 { 312 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); 313 return U_FILE_ACCESS_ERROR; 314 } 315 316 // **************************** START SAMPLE ******************* 317 conv = ucnv_open("utf-8", &status); 318 assert(U_SUCCESS(status)); 319 320 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 321 printf("input bytes %d / min chars %d = %d UChars\n", 322 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 323 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 324 assert(uBuf!=NULL); 325 326 // grab another buffer's worth 327 while((!feof(f)) && 328 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 329 { 330 // Convert bytes to unicode 331 source = inBuf; 332 sourceLimit = inBuf + count; 333 334 do 335 { 336 target = uBuf; 337 targetLimit = uBuf + uBufSize; 338 339 ucnv_toUnicode(conv, &target, targetLimit, 340 &source, sourceLimit, NULL, 341 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 342 /* is true (when no more data will come) */ 343 &status); 344 345 if(status == U_BUFFER_OVERFLOW_ERROR) 346 { 347 // simply ran out of space - we'll reset the target ptr the next 348 // time through the loop. 349 status = U_ZERO_ERROR; 350 } 351 else 352 { 353 // Check other errors here. 354 assert(U_SUCCESS(status)); 355 // Break out of the loop (by force) 356 } 357 358 // Process the Unicode 359 // Todo: handle UTF-16/surrogates 360 361 for(p = uBuf; p<target; p++) 362 { 363 if(u_isalpha(*p)) 364 letters++; 365 total++; 366 } 367 } while (source < sourceLimit); // while simply out of space 368 } 369 370 printf("%d letters out of %d total UChars.\n", letters, total); 371 372 // ***************************** END SAMPLE ******************** 373 ucnv_close(conv); 374 375 printf("\n"); 376 377 fclose(f); 378 379 return U_ZERO_ERROR; 380} 381#undef BUFFERSIZE 382 383#define BUFFERSIZE 1024 384typedef struct 385{ 386 UChar32 codepoint; 387 uint32_t frequency; 388} CharFreqInfo; 389 390UErrorCode convsample_06() 391{ 392 printf("\n\n==============================================\n" 393 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); 394 395 FILE *f; 396 int32_t count; 397 char inBuf[BUFFERSIZE]; 398 const char *source; 399 const char *sourceLimit; 400 int32_t uBufSize = 0; 401 UConverter *conv; 402 UErrorCode status = U_ZERO_ERROR; 403 uint32_t letters=0, total=0; 404 405 CharFreqInfo *info; 406 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ 407 UChar32 p; 408 409 uint32_t ie = 0; 410 uint32_t gh = 0; 411 UChar32 l = 0; 412 413 f = fopen("data06.txt", "r"); 414 if(!f) 415 { 416 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); 417 return U_FILE_ACCESS_ERROR; 418 } 419 420 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); 421 if(!info) 422 { 423 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); 424 } 425 426 /* reset frequencies */ 427 for(p=0;p<charCount;p++) 428 { 429 info[p].codepoint = p; 430 info[p].frequency = 0; 431 } 432 433 // **************************** START SAMPLE ******************* 434 conv = ucnv_open("utf-8", &status); 435 assert(U_SUCCESS(status)); 436 437 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 438 printf("input bytes %d / min chars %d = %d UChars\n", 439 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 440 441 // grab another buffer's worth 442 while((!feof(f)) && 443 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 444 { 445 // Convert bytes to unicode 446 source = inBuf; 447 sourceLimit = inBuf + count; 448 449 while(source < sourceLimit) 450 { 451 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 452 if(U_FAILURE(status)) 453 { 454 fprintf(stderr, "%s @ %d\n", u_errorName(status), total); 455 status = U_ZERO_ERROR; 456 continue; 457 } 458 U_ASSERT(status); 459 total++; 460 461 if(u_isalpha(p)) 462 letters++; 463 464 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) 465 ie++; 466 467 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) 468 gh++; 469 470 if(p>charCount) 471 { 472 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); 473 free(info); 474 fclose(f); 475 ucnv_close(conv); 476 return U_UNSUPPORTED_ERROR; 477 } 478 info[p].frequency++; 479 l = p; 480 } 481 } 482 483 fclose(f); 484 ucnv_close(conv); 485 486 printf("%d letters out of %d total UChars.\n", letters, total); 487 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); 488 489 // now, we could sort it.. 490 491 // qsort(info, charCount, sizeof(info[0]), charfreq_compare); 492 493 for(p=0;p<charCount;p++) 494 { 495 if(info[p].frequency) 496 { 497 printf("% 5d U+%06X ", info[p].frequency, p); 498 if(p <= 0xFFFF) 499 { 500 prettyPrintUChar((UChar)p); 501 } 502 printf("\n"); 503 } 504 } 505 free(info); 506 // ***************************** END SAMPLE ******************** 507 508 printf("\n"); 509 510 return U_ZERO_ERROR; 511} 512#undef BUFFERSIZE 513 514 515/****************************************************** 516 You must call ucnv_close to clean up the memory used by the 517 converter. 518 519 'len' returns the number of OUTPUT bytes resulting from the 520 conversion. 521 */ 522 523UErrorCode convsample_12() 524{ 525 printf("\n\n==============================================\n" 526 "Sample 12: C: simple sjis -> unicode conversion\n"); 527 528 529 // **************************** START SAMPLE ******************* 530 531 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; 532 UChar target[100]; 533 UErrorCode status = U_ZERO_ERROR; 534 UConverter *conv; 535 int32_t len; 536 537 // set up the converter 538 conv = ucnv_open("shift_jis", &status); 539 assert(U_SUCCESS(status)); 540 541 // convert to Unicode 542 // Note: we can use strlen, we know it's an 8 bit null terminated codepage 543 target[6] = 0xFDCA; 544 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status); 545 U_ASSERT(status); 546 // close the converter 547 ucnv_close(conv); 548 549 // ***************************** END SAMPLE ******************** 550 551 // Print it out 552 printBytes("src", source, strlen(source) ); 553 printf("\n"); 554 printUChars("targ", target, len); 555 556 return U_ZERO_ERROR; 557} 558 559/****************************************************************** 560 C: Convert from codepage to Unicode one at a time. 561*/ 562 563UErrorCode convsample_13() 564{ 565 printf("\n\n==============================================\n" 566 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); 567 568 569 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; 570 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; 571 const char *source, *sourceLimit; 572 UChar32 target; 573 UErrorCode status = U_ZERO_ERROR; 574 UConverter *conv = NULL; 575 int32_t srcCount=0; 576 int32_t dstCount=0; 577 578 srcCount = sizeof(sourceChars); 579 580 conv = ucnv_open("Big5", &status); 581 U_ASSERT(status); 582 583 source = sourceChars; 584 sourceLimit = sourceChars + sizeof(sourceChars); 585 586 // **************************** START SAMPLE ******************* 587 588 589 printBytes("src",source,sourceLimit-source); 590 591 while(source < sourceLimit) 592 { 593 puts(""); 594 target = ucnv_getNextUChar (conv, 595 &source, 596 sourceLimit, 597 &status); 598 599 // printBytes("src",source,sourceLimit-source); 600 U_ASSERT(status); 601 printUChar(target); 602 dstCount++; 603 } 604 605 606 // ************************** END SAMPLE ************************* 607 608 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); 609 ucnv_close(conv); 610 611 return U_ZERO_ERROR; 612} 613 614 615 616 617UBool convsample_20_didSubstitute(const char *source) 618{ 619 UChar uchars[100]; 620 char bytes[100]; 621 UConverter *conv = NULL; 622 UErrorCode status = U_ZERO_ERROR; 623 uint32_t len, len2; 624 UBool flagVal; 625 626 FromUFLAGContext * context = NULL; 627 628 printf("\n\n==============================================\n" 629 "Sample 20: C: Test for substitution using callbacks\n"); 630 631 /* print out the original source */ 632 printBytes("src", source); 633 printf("\n"); 634 635 /* First, convert from UTF8 to unicode */ 636 conv = ucnv_open("utf-8", &status); 637 U_ASSERT(status); 638 639 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 640 U_ASSERT(status); 641 642 printUChars("uch", uchars, len); 643 printf("\n"); 644 645 /* Now, close the converter */ 646 ucnv_close(conv); 647 648 /* Now, convert to windows-1252 */ 649 conv = ucnv_open("windows-1252", &status); 650 U_ASSERT(status); 651 652 /* Converter starts out with the SUBSTITUTE callback set. */ 653 654 /* initialize our callback */ 655 context = flagCB_fromU_openContext(); 656 657 /* Set our special callback */ 658 ucnv_setFromUCallBack(conv, 659 flagCB_fromU, 660 context, 661 &(context->subCallback), 662 &(context->subContext), 663 &status); 664 665 U_ASSERT(status); 666 667 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); 668 U_ASSERT(status); 669 670 flagVal = context->flag; /* it's about to go away when we close the cnv */ 671 672 ucnv_close(conv); 673 674 /* print out the original source */ 675 printBytes("bytes", bytes, len2); 676 677 return flagVal; /* true if callback was called */ 678} 679 680UErrorCode convsample_20() 681{ 682 const char *sample1 = "abc\xdf\xbf"; 683 const char *sample2 = "abc_def"; 684 685 686 if(convsample_20_didSubstitute(sample1)) 687 { 688 printf("DID substitute.\n******\n"); 689 } 690 else 691 { 692 printf("Did NOT substitute.\n*****\n"); 693 } 694 695 if(convsample_20_didSubstitute(sample2)) 696 { 697 printf("DID substitute.\n******\n"); 698 } 699 else 700 { 701 printf("Did NOT substitute.\n*****\n"); 702 } 703 704 return U_ZERO_ERROR; 705} 706 707// 21 - C, callback, with clone and debug 708 709 710 711UBool convsample_21_didSubstitute(const char *source) 712{ 713 UChar uchars[100]; 714 char bytes[100]; 715 UConverter *conv = NULL, *cloneCnv = NULL; 716 UErrorCode status = U_ZERO_ERROR; 717 uint32_t len, len2; 718 int32_t cloneLen; 719 UBool flagVal = FALSE; 720 UConverterFromUCallback junkCB; 721 722 FromUFLAGContext *flagCtx = NULL, 723 *cloneFlagCtx = NULL; 724 725 debugCBContext *debugCtx1 = NULL, 726 *debugCtx2 = NULL, 727 *cloneDebugCtx = NULL; 728 729 printf("\n\n==============================================\n" 730 "Sample 21: C: Test for substitution w/ callbacks & clones \n"); 731 732 /* print out the original source */ 733 printBytes("src", source); 734 printf("\n"); 735 736 /* First, convert from UTF8 to unicode */ 737 conv = ucnv_open("utf-8", &status); 738 U_ASSERT(status); 739 740 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 741 U_ASSERT(status); 742 743 printUChars("uch", uchars, len); 744 printf("\n"); 745 746 /* Now, close the converter */ 747 ucnv_close(conv); 748 749 /* Now, convert to windows-1252 */ 750 conv = ucnv_open("windows-1252", &status); 751 U_ASSERT(status); 752 753 /* Converter starts out with the SUBSTITUTE callback set. */ 754 755 /* initialize our callback */ 756 /* from the 'bottom' innermost, out 757 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ 758 759#if DEBUG_TMI 760 printf("flagCB_fromU = %p\n", &flagCB_fromU); 761 printf("debugCB_fromU = %p\n", &debugCB_fromU); 762#endif 763 764 debugCtx1 = debugCB_openContext(); 765 flagCtx = flagCB_fromU_openContext(); 766 debugCtx2 = debugCB_openContext(); 767 768 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ 769 debugCtx1->subContext = flagCtx; 770 771 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ 772 flagCtx->subContext = debugCtx2; 773 774 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; 775 debugCtx2->subContext = NULL; 776 777 /* Set our special callback */ 778 779 ucnv_setFromUCallBack(conv, 780 debugCB_fromU, 781 debugCtx1, 782 &(debugCtx2->subCallback), 783 &(debugCtx2->subContext), 784 &status); 785 786 U_ASSERT(status); 787 788#if DEBUG_TMI 789 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", 790 conv, debugCtx1, debugCtx1->subCallback, 791 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); 792#endif 793 794 cloneLen = 1; /* but passing in null so it will clone */ 795 cloneCnv = ucnv_safeClone(conv, NULL, &cloneLen, &status); 796 797 U_ASSERT(status); 798 799#if DEBUG_TMI 800 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv); 801#endif 802 803 ucnv_close(conv); 804 805#if DEBUG_TMI 806 printf("%p closed.\n", conv); 807#endif 808 809 U_ASSERT(status); 810 /* Now, we have to extract the context */ 811 cloneDebugCtx = NULL; 812 cloneFlagCtx = NULL; 813 814 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); 815 if(cloneDebugCtx != NULL) { 816 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; 817 } 818 819 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", 820 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); 821 822 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); 823 U_ASSERT(status); 824 825 if(cloneFlagCtx != NULL) { 826 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ 827 } else { 828 printf("** Warning, couldn't get the subcallback \n"); 829 } 830 831 ucnv_close(cloneCnv); 832 833 /* print out the original source */ 834 printBytes("bytes", bytes, len2); 835 836 return flagVal; /* true if callback was called */ 837} 838 839UErrorCode convsample_21() 840{ 841 const char *sample1 = "abc\xdf\xbf"; 842 const char *sample2 = "abc_def"; 843 844 if(convsample_21_didSubstitute(sample1)) 845 { 846 printf("DID substitute.\n******\n"); 847 } 848 else 849 { 850 printf("Did NOT substitute.\n*****\n"); 851 } 852 853 if(convsample_21_didSubstitute(sample2)) 854 { 855 printf("DID substitute.\n******\n"); 856 } 857 else 858 { 859 printf("Did NOT substitute.\n*****\n"); 860 } 861 862 return U_ZERO_ERROR; 863} 864 865 866// 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] 867 868#define BUFFERSIZE 17 /* make it interesting :) */ 869 870UErrorCode convsample_40() 871{ 872 printf("\n\n==============================================\n" 873 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); 874 875 FILE *f; 876 FILE *out; 877 int32_t count; 878 char inBuf[BUFFERSIZE]; 879 const char *source; 880 const char *sourceLimit; 881 UChar *uBuf; 882 UChar *target; 883 UChar *targetLimit; 884 int32_t uBufSize = 0; 885 UConverter *conv = NULL; 886 UErrorCode status = U_ZERO_ERROR; 887 uint32_t inbytes=0, total=0; 888 889 f = fopen("data02.bin", "rb"); 890 if(!f) 891 { 892 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); 893 return U_FILE_ACCESS_ERROR; 894 } 895 896 out = fopen("data40.utf16", "wb"); 897 if(!out) 898 { 899 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); 900 fclose(f); 901 return U_FILE_ACCESS_ERROR; 902 } 903 904 // **************************** START SAMPLE ******************* 905 conv = ucnv_openCCSID(37, UCNV_IBM, &status); 906 assert(U_SUCCESS(status)); 907 908 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 909 printf("input bytes %d / min chars %d = %d UChars\n", 910 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 911 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 912 assert(uBuf!=NULL); 913 914 // grab another buffer's worth 915 while((!feof(f)) && 916 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 917 { 918 inbytes += count; 919 920 // Convert bytes to unicode 921 source = inBuf; 922 sourceLimit = inBuf + count; 923 924 do 925 { 926 target = uBuf; 927 targetLimit = uBuf + uBufSize; 928 929 ucnv_toUnicode( conv, &target, targetLimit, 930 &source, sourceLimit, NULL, 931 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 932 /* is true (when no more data will come) */ 933 &status); 934 935 if(status == U_BUFFER_OVERFLOW_ERROR) 936 { 937 // simply ran out of space - we'll reset the target ptr the next 938 // time through the loop. 939 status = U_ZERO_ERROR; 940 } 941 else 942 { 943 // Check other errors here. 944 assert(U_SUCCESS(status)); 945 // Break out of the loop (by force) 946 } 947 948 // Process the Unicode 949 // Todo: handle UTF-16/surrogates 950 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == 951 (size_t)(target-uBuf)); 952 total += (target-uBuf); 953 } while (source < sourceLimit); // while simply out of space 954 } 955 956 printf("%d bytes in, %d UChars out.\n", inbytes, total); 957 958 // ***************************** END SAMPLE ******************** 959 ucnv_close(conv); 960 961 fclose(f); 962 fclose(out); 963 printf("\n"); 964 965 return U_ZERO_ERROR; 966} 967#undef BUFFERSIZE 968 969 970 971// 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] 972 973#define BUFFERSIZE 24 /* make it interesting :) */ 974 975UErrorCode convsample_46() 976{ 977 printf("\n\n==============================================\n" 978 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); 979 980 FILE *f; 981 FILE *out; 982 int32_t count; 983 UChar inBuf[BUFFERSIZE]; 984 const UChar *source; 985 const UChar *sourceLimit; 986 char *buf; 987 char *target; 988 char *targetLimit; 989 990 int32_t bufSize = 0; 991 UConverter *conv = NULL; 992 UErrorCode status = U_ZERO_ERROR; 993 uint32_t inchars=0, total=0; 994 995 f = fopen("data40.utf16", "rb"); 996 if(!f) 997 { 998 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); 999 return U_FILE_ACCESS_ERROR; 1000 } 1001 1002 out = fopen("data46.out", "wb"); 1003 if(!out) 1004 { 1005 fprintf(stderr, "Couldn't create file 'data46.out'.\n"); 1006 fclose(f); 1007 return U_FILE_ACCESS_ERROR; 1008 } 1009 1010 // **************************** START SAMPLE ******************* 1011 conv = ucnv_open( "iso-8859-2", &status); 1012 assert(U_SUCCESS(status)); 1013 1014 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); 1015 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", 1016 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); 1017 buf = (char*)malloc(bufSize * sizeof(char)); 1018 assert(buf!=NULL); 1019 1020 // grab another buffer's worth 1021 while((!feof(f)) && 1022 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) ) 1023 { 1024 inchars += count; 1025 1026 // Convert bytes to unicode 1027 source = inBuf; 1028 sourceLimit = inBuf + count; 1029 1030 do 1031 { 1032 target = buf; 1033 targetLimit = buf + bufSize; 1034 1035 ucnv_fromUnicode( conv, &target, targetLimit, 1036 &source, sourceLimit, NULL, 1037 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 1038 /* is true (when no more data will come) */ 1039 &status); 1040 1041 if(status == U_BUFFER_OVERFLOW_ERROR) 1042 { 1043 // simply ran out of space - we'll reset the target ptr the next 1044 // time through the loop. 1045 status = U_ZERO_ERROR; 1046 } 1047 else 1048 { 1049 // Check other errors here. 1050 assert(U_SUCCESS(status)); 1051 // Break out of the loop (by force) 1052 } 1053 1054 // Process the Unicode 1055 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == 1056 (size_t)(target-buf)); 1057 total += (target-buf); 1058 } while (source < sourceLimit); // while simply out of space 1059 } 1060 1061 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total); 1062 1063 // ***************************** END SAMPLE ******************** 1064 ucnv_close(conv); 1065 1066 fclose(f); 1067 fclose(out); 1068 printf("\n"); 1069 1070 return U_ZERO_ERROR; 1071} 1072#undef BUFFERSIZE 1073 1074#define BUFFERSIZE 219 1075 1076 1077/* main */ 1078 1079int main() 1080{ 1081 1082 printf("Default Converter=%s\n", ucnv_getDefaultName() ); 1083 1084 convsample_02(); // C , u->koi8r, conv 1085 convsample_03(); // C, iterate 1086 1087 convsample_05(); // C, utf8->u, getNextUChar 1088 convsample_06(); // C freq counter thingy 1089 1090 convsample_12(); // C, sjis->u, conv 1091 convsample_13(); // C, big5->u, getNextU 1092 1093 convsample_20(); // C, callback 1094 convsample_21(); // C, callback debug 1095 1096 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] 1097 1098 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out] 1099 1100 printf("End of converter samples.\n"); 1101 1102 fflush(stdout); 1103 fflush(stderr); 1104 1105 return 0; 1106} 1107