1/************************************************************************** 2* 3* Copyright (C) 2000-2013, International Business Machines 4* Corporation and others. All Rights Reserved. 5* 6*************************************************************************** 7* file name: convsamp.c 8* encoding: ASCII (7-bit) 9* 10* created on: 2000may30 11* created by: Steven R. Loomis 12* 13* Sample code for the ICU conversion routines. 14* 15* Note: Nothing special is needed to build this sample. Link with 16* the icu UC and icu I18N libraries. 17* 18* I use 'assert' for error checking, you probably will want 19* something more flexible. '***BEGIN SAMPLE***' and 20* '***END SAMPLE***' mark pieces suitable for stand alone 21* code snippets. 22* 23* 24* Each test can define it's own BUFFERSIZE 25* 26*/ 27 28#define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ 29 30#include <stdio.h> 31#include <ctype.h> /* for isspace, etc. */ 32#include <assert.h> 33#include <string.h> 34#include <stdlib.h> /* malloc */ 35 36#include "unicode/utypes.h" /* Basic ICU data types */ 37#include "unicode/ucnv.h" /* C Converter API */ 38#include "unicode/ustring.h" /* some more string fcns*/ 39#include "unicode/uchar.h" /* char names */ 40#include "unicode/uloc.h" 41#include "unicode/unistr.h" 42 43#include "flagcb.h" 44 45/* Some utility functions */ 46 47static const UChar kNone[] = { 0x0000 }; 48 49#define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} 50 51/* Print a UChar if possible, in seven characters. */ 52void prettyPrintUChar(UChar c) 53{ 54 if( (c <= 0x007F) && 55 (isgraph(c)) ) { 56 printf(" '%c' ", (char)(0x00FF&c)); 57 } else if ( c > 0x007F ) { 58 char buf[1000]; 59 UErrorCode status = U_ZERO_ERROR; 60 int32_t o; 61 62 o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status); 63 if(U_SUCCESS(status) && (o>0) ) { 64 buf[6] = 0; 65 printf("%7s", buf); 66 } else { 67 printf(" ??????"); 68 } 69 } else { 70 switch((char)(c & 0x007F)) { 71 case ' ': 72 printf(" ' ' "); 73 break; 74 case '\t': 75 printf(" \\t "); 76 break; 77 case '\n': 78 printf(" \\n "); 79 break; 80 default: 81 printf(" _ "); 82 break; 83 } 84 } 85} 86 87 88void printUChars(const char *name = "?", 89 const UChar *uch = kNone, 90 int32_t len = -1 ) 91{ 92 int32_t i; 93 94 if( (len == -1) && (uch) ) { 95 len = u_strlen(uch); 96 } 97 98 printf("%5s: ", name); 99 for( i = 0; i <len; i++) { 100 printf("%-6d ", i); 101 } 102 printf("\n"); 103 104 printf("%5s: ", "uni"); 105 for( i = 0; i <len; i++) { 106 printf("\\u%04X ", (int)uch[i]); 107 } 108 printf("\n"); 109 110 printf("%5s:", "ch"); 111 for( i = 0; i <len; i++) { 112 prettyPrintUChar(uch[i]); 113 } 114 printf("\n"); 115} 116 117void printBytes(const char *name = "?", 118 const char *uch = "", 119 int32_t len = -1 ) 120{ 121 int32_t i; 122 123 if( (len == -1) && (uch) ) { 124 len = strlen(uch); 125 } 126 127 printf("%5s: ", name); 128 for( i = 0; i <len; i++) { 129 printf("%-4d ", i); 130 } 131 printf("\n"); 132 133 printf("%5s: ", "uni"); 134 for( i = 0; i <len; i++) { 135 printf("\\x%02X ", 0x00FF & (int)uch[i]); 136 } 137 printf("\n"); 138 139 printf("%5s:", "ch"); 140 for( i = 0; i <len; i++) { 141 if(isgraph(0x00FF & (int)uch[i])) { 142 printf(" '%c' ", (char)uch[i]); 143 } else { 144 printf(" "); 145 } 146 } 147 printf("\n"); 148} 149 150void printUChar(UChar32 ch32) 151{ 152 if(ch32 > 0xFFFF) { 153 printf("ch: U+%06X\n", ch32); 154 } 155 else { 156 UChar ch = (UChar)ch32; 157 printUChars("C", &ch, 1); 158 } 159} 160 161/******************************************************************* 162 Very simple C sample to convert the word 'Moscow' in Russian in Unicode, 163 followed by an exclamation mark (!) into the KOI8-R Russian code page. 164 165 This example first creates a UChar String out of the Unicode chars. 166 167 targetSize must be set to the amount of space available in the target 168 buffer. After fromUChars is called, 169 len will contain the number of bytes in target[] which were 170 used in the resulting codepage. In this case, there is a 1:1 mapping 171 between the input and output characters. The exclamation mark has the 172 same value in both KOI8-R and Unicode. 173 174 src: 0 1 2 3 4 5 6 175 uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 176 ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' 177 178 targ: 0 1 2 3 4 5 6 179 uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 180 ch: '!' 181 182 183Converting FROM unicode 184 to koi8-r. 185 You must call ucnv_close to clean up the memory used by the 186 converter. 187 188 'len' returns the number of OUTPUT bytes resulting from the 189 conversion. 190 */ 191 192UErrorCode convsample_02() 193{ 194 printf("\n\n==============================================\n" 195 "Sample 02: C: simple Unicode -> koi8-r conversion\n"); 196 197 198 // **************************** START SAMPLE ******************* 199 // "cat<cat>OK" 200 UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, 201 0x0430, 0x0021, 0x0000 }; 202 char target[100]; 203 UErrorCode status = U_ZERO_ERROR; 204 UConverter *conv; 205 int32_t len; 206 207 // set up the converter 208 //! [ucnv_open] 209 conv = ucnv_open("koi8-r", &status); 210 //! [ucnv_open] 211 assert(U_SUCCESS(status)); 212 213 // convert to koi8-r 214 len = ucnv_fromUChars(conv, target, 100, source, -1, &status); 215 assert(U_SUCCESS(status)); 216 217 // close the converter 218 ucnv_close(conv); 219 220 // ***************************** END SAMPLE ******************** 221 222 // Print it out 223 printUChars("src", source); 224 printf("\n"); 225 printBytes("targ", target, len); 226 227 return U_ZERO_ERROR; 228} 229 230 231UErrorCode convsample_03() 232{ 233 printf("\n\n==============================================\n" 234 "Sample 03: C: print out all converters\n"); 235 236 int32_t count; 237 int32_t i; 238 239 // **************************** START SAMPLE ******************* 240 count = ucnv_countAvailable(); 241 printf("Available converters: %d\n", count); 242 243 for(i=0;i<count;i++) 244 { 245 printf("%s ", ucnv_getAvailableName(i)); 246 } 247 248 // ***************************** END SAMPLE ******************** 249 250 printf("\n"); 251 252 return U_ZERO_ERROR; 253} 254 255 256 257#define BUFFERSIZE 17 /* make it interesting :) */ 258 259/* 260 Converting from a codepage to Unicode in bulk.. 261 What is the best way to determine the buffer size? 262 263 The 'buffersize' is in bytes of input. 264 For a given converter, divinding this by the minimum char size 265 give you the maximum number of Unicode characters that could be 266 expected for a given number of input bytes. 267 see: ucnv_getMinCharSize() 268 269 For example, a single byte codepage like 'Latin-3' has a 270 minimum char size of 1. (It takes at least 1 byte to represent 271 each Unicode char.) So the unicode buffer has the same number of 272 UChars as the input buffer has bytes. 273 274 In a strictly double byte codepage such as cp1362 (Windows 275 Korean), the minimum char size is 2. So, only half as many Unicode 276 chars as bytes are needed. 277 278 This work to calculate the buffer size is an optimization. Any 279 size of input and output buffer can be used, as long as the 280 program handles the following cases: If the input buffer is empty, 281 the source pointer will be equal to sourceLimit. If the output 282 buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. 283 */ 284 285UErrorCode convsample_05() 286{ 287 printf("\n\n==============================================\n" 288 "Sample 05: C: count the number of letters in a UTF-8 document\n"); 289 290 FILE *f; 291 int32_t count; 292 char inBuf[BUFFERSIZE]; 293 const char *source; 294 const char *sourceLimit; 295 UChar *uBuf; 296 UChar *target; 297 UChar *targetLimit; 298 UChar *p; 299 int32_t uBufSize = 0; 300 UConverter *conv; 301 UErrorCode status = U_ZERO_ERROR; 302 uint32_t letters=0, total=0; 303 304 f = fopen("data01.txt", "r"); 305 if(!f) 306 { 307 fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); 308 return U_FILE_ACCESS_ERROR; 309 } 310 311 // **************************** START SAMPLE ******************* 312 conv = ucnv_open("utf-8", &status); 313 assert(U_SUCCESS(status)); 314 315 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 316 printf("input bytes %d / min chars %d = %d UChars\n", 317 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 318 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 319 assert(uBuf!=NULL); 320 321 // grab another buffer's worth 322 while((!feof(f)) && 323 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 324 { 325 // Convert bytes to unicode 326 source = inBuf; 327 sourceLimit = inBuf + count; 328 329 do 330 { 331 target = uBuf; 332 targetLimit = uBuf + uBufSize; 333 334 ucnv_toUnicode(conv, &target, targetLimit, 335 &source, sourceLimit, NULL, 336 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 337 /* is true (when no more data will come) */ 338 &status); 339 340 if(status == U_BUFFER_OVERFLOW_ERROR) 341 { 342 // simply ran out of space - we'll reset the target ptr the next 343 // time through the loop. 344 status = U_ZERO_ERROR; 345 } 346 else 347 { 348 // Check other errors here. 349 assert(U_SUCCESS(status)); 350 // Break out of the loop (by force) 351 } 352 353 // Process the Unicode 354 // Todo: handle UTF-16/surrogates 355 356 for(p = uBuf; p<target; p++) 357 { 358 if(u_isalpha(*p)) 359 letters++; 360 total++; 361 } 362 } while (source < sourceLimit); // while simply out of space 363 } 364 365 printf("%d letters out of %d total UChars.\n", letters, total); 366 367 // ***************************** END SAMPLE ******************** 368 ucnv_close(conv); 369 370 printf("\n"); 371 372 fclose(f); 373 374 return U_ZERO_ERROR; 375} 376#undef BUFFERSIZE 377 378#define BUFFERSIZE 1024 379typedef struct 380{ 381 UChar32 codepoint; 382 uint32_t frequency; 383} CharFreqInfo; 384 385UErrorCode convsample_06() 386{ 387 printf("\n\n==============================================\n" 388 "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); 389 390 FILE *f; 391 int32_t count; 392 char inBuf[BUFFERSIZE]; 393 const char *source; 394 const char *sourceLimit; 395 int32_t uBufSize = 0; 396 UConverter *conv; 397 UErrorCode status = U_ZERO_ERROR; 398 uint32_t letters=0, total=0; 399 400 CharFreqInfo *info; 401 UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. */ 402 UChar32 p; 403 404 uint32_t ie = 0; 405 uint32_t gh = 0; 406 UChar32 l = 0; 407 408 f = fopen("data06.txt", "r"); 409 if(!f) 410 { 411 fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); 412 return U_FILE_ACCESS_ERROR; 413 } 414 415 info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); 416 if(!info) 417 { 418 fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); 419 } 420 421 /* reset frequencies */ 422 for(p=0;p<charCount;p++) 423 { 424 info[p].codepoint = p; 425 info[p].frequency = 0; 426 } 427 428 // **************************** START SAMPLE ******************* 429 conv = ucnv_open("utf-8", &status); 430 assert(U_SUCCESS(status)); 431 432 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 433 printf("input bytes %d / min chars %d = %d UChars\n", 434 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 435 436 // grab another buffer's worth 437 while((!feof(f)) && 438 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 439 { 440 // Convert bytes to unicode 441 source = inBuf; 442 sourceLimit = inBuf + count; 443 444 while(source < sourceLimit) 445 { 446 p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); 447 if(U_FAILURE(status)) 448 { 449 fprintf(stderr, "%s @ %d\n", u_errorName(status), total); 450 status = U_ZERO_ERROR; 451 continue; 452 } 453 U_ASSERT(status); 454 total++; 455 456 if(u_isalpha(p)) 457 letters++; 458 459 if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) 460 ie++; 461 462 if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) 463 gh++; 464 465 if(p>charCount) 466 { 467 fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); 468 free(info); 469 fclose(f); 470 ucnv_close(conv); 471 return U_UNSUPPORTED_ERROR; 472 } 473 info[p].frequency++; 474 l = p; 475 } 476 } 477 478 fclose(f); 479 ucnv_close(conv); 480 481 printf("%d letters out of %d total UChars.\n", letters, total); 482 printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); 483 484 // now, we could sort it.. 485 486 // qsort(info, charCount, sizeof(info[0]), charfreq_compare); 487 488 for(p=0;p<charCount;p++) 489 { 490 if(info[p].frequency) 491 { 492 printf("% 5d U+%06X ", info[p].frequency, p); 493 if(p <= 0xFFFF) 494 { 495 prettyPrintUChar((UChar)p); 496 } 497 printf("\n"); 498 } 499 } 500 free(info); 501 // ***************************** END SAMPLE ******************** 502 503 printf("\n"); 504 505 return U_ZERO_ERROR; 506} 507#undef BUFFERSIZE 508 509 510/****************************************************** 511 You must call ucnv_close to clean up the memory used by the 512 converter. 513 514 'len' returns the number of OUTPUT bytes resulting from the 515 conversion. 516 */ 517 518UErrorCode convsample_12() 519{ 520 printf("\n\n==============================================\n" 521 "Sample 12: C: simple sjis -> unicode conversion\n"); 522 523 524 // **************************** START SAMPLE ******************* 525 526 char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; 527 UChar target[100]; 528 UErrorCode status = U_ZERO_ERROR; 529 UConverter *conv; 530 int32_t len; 531 532 // set up the converter 533 conv = ucnv_open("shift_jis", &status); 534 assert(U_SUCCESS(status)); 535 536 // convert to Unicode 537 // Note: we can use strlen, we know it's an 8 bit null terminated codepage 538 target[6] = 0xFDCA; 539 len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status); 540 U_ASSERT(status); 541 // close the converter 542 ucnv_close(conv); 543 544 // ***************************** END SAMPLE ******************** 545 546 // Print it out 547 printBytes("src", source, strlen(source) ); 548 printf("\n"); 549 printUChars("targ", target, len); 550 551 return U_ZERO_ERROR; 552} 553 554/****************************************************************** 555 C: Convert from codepage to Unicode one at a time. 556*/ 557 558UErrorCode convsample_13() 559{ 560 printf("\n\n==============================================\n" 561 "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); 562 563 564 const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; 565 // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; 566 const char *source, *sourceLimit; 567 UChar32 target; 568 UErrorCode status = U_ZERO_ERROR; 569 UConverter *conv = NULL; 570 int32_t srcCount=0; 571 int32_t dstCount=0; 572 573 srcCount = sizeof(sourceChars); 574 575 conv = ucnv_open("Big5", &status); 576 U_ASSERT(status); 577 578 source = sourceChars; 579 sourceLimit = sourceChars + sizeof(sourceChars); 580 581 // **************************** START SAMPLE ******************* 582 583 584 printBytes("src",source,sourceLimit-source); 585 586 while(source < sourceLimit) 587 { 588 puts(""); 589 target = ucnv_getNextUChar (conv, 590 &source, 591 sourceLimit, 592 &status); 593 594 // printBytes("src",source,sourceLimit-source); 595 U_ASSERT(status); 596 printUChar(target); 597 dstCount++; 598 } 599 600 601 // ************************** END SAMPLE ************************* 602 603 printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); 604 ucnv_close(conv); 605 606 return U_ZERO_ERROR; 607} 608 609 610 611 612UBool convsample_20_didSubstitute(const char *source) 613{ 614 UChar uchars[100]; 615 char bytes[100]; 616 UConverter *conv = NULL; 617 UErrorCode status = U_ZERO_ERROR; 618 uint32_t len, len2; 619 UBool flagVal; 620 621 FromUFLAGContext * context = NULL; 622 623 printf("\n\n==============================================\n" 624 "Sample 20: C: Test for substitution using callbacks\n"); 625 626 /* print out the original source */ 627 printBytes("src", source); 628 printf("\n"); 629 630 /* First, convert from UTF8 to unicode */ 631 conv = ucnv_open("utf-8", &status); 632 U_ASSERT(status); 633 634 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 635 U_ASSERT(status); 636 637 printUChars("uch", uchars, len); 638 printf("\n"); 639 640 /* Now, close the converter */ 641 ucnv_close(conv); 642 643 /* Now, convert to windows-1252 */ 644 conv = ucnv_open("windows-1252", &status); 645 U_ASSERT(status); 646 647 /* Converter starts out with the SUBSTITUTE callback set. */ 648 649 /* initialize our callback */ 650 context = flagCB_fromU_openContext(); 651 652 /* Set our special callback */ 653 ucnv_setFromUCallBack(conv, 654 flagCB_fromU, 655 context, 656 &(context->subCallback), 657 &(context->subContext), 658 &status); 659 660 U_ASSERT(status); 661 662 len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); 663 U_ASSERT(status); 664 665 flagVal = context->flag; /* it's about to go away when we close the cnv */ 666 667 ucnv_close(conv); 668 669 /* print out the original source */ 670 printBytes("bytes", bytes, len2); 671 672 return flagVal; /* true if callback was called */ 673} 674 675UErrorCode convsample_20() 676{ 677 const char *sample1 = "abc\xdf\xbf"; 678 const char *sample2 = "abc_def"; 679 680 681 if(convsample_20_didSubstitute(sample1)) 682 { 683 printf("DID substitute.\n******\n"); 684 } 685 else 686 { 687 printf("Did NOT substitute.\n*****\n"); 688 } 689 690 if(convsample_20_didSubstitute(sample2)) 691 { 692 printf("DID substitute.\n******\n"); 693 } 694 else 695 { 696 printf("Did NOT substitute.\n*****\n"); 697 } 698 699 return U_ZERO_ERROR; 700} 701 702// 21 - C, callback, with clone and debug 703 704 705 706UBool convsample_21_didSubstitute(const char *source) 707{ 708 UChar uchars[100]; 709 char bytes[100]; 710 UConverter *conv = NULL, *cloneCnv = NULL; 711 UErrorCode status = U_ZERO_ERROR; 712 uint32_t len, len2; 713 int32_t cloneLen; 714 UBool flagVal = FALSE; 715 UConverterFromUCallback junkCB; 716 717 FromUFLAGContext *flagCtx = NULL, 718 *cloneFlagCtx = NULL; 719 720 debugCBContext *debugCtx1 = NULL, 721 *debugCtx2 = NULL, 722 *cloneDebugCtx = NULL; 723 724 printf("\n\n==============================================\n" 725 "Sample 21: C: Test for substitution w/ callbacks & clones \n"); 726 727 /* print out the original source */ 728 printBytes("src", source); 729 printf("\n"); 730 731 /* First, convert from UTF8 to unicode */ 732 conv = ucnv_open("utf-8", &status); 733 U_ASSERT(status); 734 735 len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); 736 U_ASSERT(status); 737 738 printUChars("uch", uchars, len); 739 printf("\n"); 740 741 /* Now, close the converter */ 742 ucnv_close(conv); 743 744 /* Now, convert to windows-1252 */ 745 conv = ucnv_open("windows-1252", &status); 746 U_ASSERT(status); 747 748 /* Converter starts out with the SUBSTITUTE callback set. */ 749 750 /* initialize our callback */ 751 /* from the 'bottom' innermost, out 752 * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ 753 754#if DEBUG_TMI 755 printf("flagCB_fromU = %p\n", &flagCB_fromU); 756 printf("debugCB_fromU = %p\n", &debugCB_fromU); 757#endif 758 759 debugCtx1 = debugCB_openContext(); 760 flagCtx = flagCB_fromU_openContext(); 761 debugCtx2 = debugCB_openContext(); 762 763 debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ 764 debugCtx1->subContext = flagCtx; 765 766 flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ 767 flagCtx->subContext = debugCtx2; 768 769 debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; 770 debugCtx2->subContext = NULL; 771 772 /* Set our special callback */ 773 774 ucnv_setFromUCallBack(conv, 775 debugCB_fromU, 776 debugCtx1, 777 &(debugCtx2->subCallback), 778 &(debugCtx2->subContext), 779 &status); 780 781 U_ASSERT(status); 782 783#if DEBUG_TMI 784 printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", 785 conv, debugCtx1, debugCtx1->subCallback, 786 debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); 787#endif 788 789 cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status); 790 791 U_ASSERT(status); 792 793#if DEBUG_TMI 794 printf("Cloned converter from %p -> %p. Closing %p.\n", conv, cloneCnv, conv); 795#endif 796 797 ucnv_close(conv); 798 799#if DEBUG_TMI 800 printf("%p closed.\n", conv); 801#endif 802 803 U_ASSERT(status); 804 /* Now, we have to extract the context */ 805 cloneDebugCtx = NULL; 806 cloneFlagCtx = NULL; 807 808 ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); 809 if(cloneDebugCtx != NULL) { 810 cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; 811 } 812 813 printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", 814 cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); 815 816 len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); 817 U_ASSERT(status); 818 819 if(cloneFlagCtx != NULL) { 820 flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ 821 } else { 822 printf("** Warning, couldn't get the subcallback \n"); 823 } 824 825 ucnv_close(cloneCnv); 826 827 /* print out the original source */ 828 printBytes("bytes", bytes, len2); 829 830 return flagVal; /* true if callback was called */ 831} 832 833UErrorCode convsample_21() 834{ 835 const char *sample1 = "abc\xdf\xbf"; 836 const char *sample2 = "abc_def"; 837 838 if(convsample_21_didSubstitute(sample1)) 839 { 840 printf("DID substitute.\n******\n"); 841 } 842 else 843 { 844 printf("Did NOT substitute.\n*****\n"); 845 } 846 847 if(convsample_21_didSubstitute(sample2)) 848 { 849 printf("DID substitute.\n******\n"); 850 } 851 else 852 { 853 printf("Did NOT substitute.\n*****\n"); 854 } 855 856 return U_ZERO_ERROR; 857} 858 859 860// 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] 861 862#define BUFFERSIZE 17 /* make it interesting :) */ 863 864UErrorCode convsample_40() 865{ 866 printf("\n\n==============================================\n" 867 "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); 868 869 FILE *f; 870 FILE *out; 871 int32_t count; 872 char inBuf[BUFFERSIZE]; 873 const char *source; 874 const char *sourceLimit; 875 UChar *uBuf; 876 UChar *target; 877 UChar *targetLimit; 878 int32_t uBufSize = 0; 879 UConverter *conv = NULL; 880 UErrorCode status = U_ZERO_ERROR; 881 uint32_t inbytes=0, total=0; 882 883 f = fopen("data02.bin", "rb"); 884 if(!f) 885 { 886 fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); 887 return U_FILE_ACCESS_ERROR; 888 } 889 890 out = fopen("data40.utf16", "wb"); 891 if(!out) 892 { 893 fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); 894 fclose(f); 895 return U_FILE_ACCESS_ERROR; 896 } 897 898 // **************************** START SAMPLE ******************* 899 conv = ucnv_openCCSID(37, UCNV_IBM, &status); 900 assert(U_SUCCESS(status)); 901 902 uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); 903 printf("input bytes %d / min chars %d = %d UChars\n", 904 BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); 905 uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); 906 assert(uBuf!=NULL); 907 908 // grab another buffer's worth 909 while((!feof(f)) && 910 ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) 911 { 912 inbytes += count; 913 914 // Convert bytes to unicode 915 source = inBuf; 916 sourceLimit = inBuf + count; 917 918 do 919 { 920 target = uBuf; 921 targetLimit = uBuf + uBufSize; 922 923 ucnv_toUnicode( conv, &target, targetLimit, 924 &source, sourceLimit, NULL, 925 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 926 /* is true (when no more data will come) */ 927 &status); 928 929 if(status == U_BUFFER_OVERFLOW_ERROR) 930 { 931 // simply ran out of space - we'll reset the target ptr the next 932 // time through the loop. 933 status = U_ZERO_ERROR; 934 } 935 else 936 { 937 // Check other errors here. 938 assert(U_SUCCESS(status)); 939 // Break out of the loop (by force) 940 } 941 942 // Process the Unicode 943 // Todo: handle UTF-16/surrogates 944 assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == 945 (size_t)(target-uBuf)); 946 total += (target-uBuf); 947 } while (source < sourceLimit); // while simply out of space 948 } 949 950 printf("%d bytes in, %d UChars out.\n", inbytes, total); 951 952 // ***************************** END SAMPLE ******************** 953 ucnv_close(conv); 954 955 fclose(f); 956 fclose(out); 957 printf("\n"); 958 959 return U_ZERO_ERROR; 960} 961#undef BUFFERSIZE 962 963 964 965// 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] 966 967#define BUFFERSIZE 24 /* make it interesting :) */ 968 969UErrorCode convsample_46() 970{ 971 printf("\n\n==============================================\n" 972 "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); 973 974 FILE *f; 975 FILE *out; 976 int32_t count; 977 UChar inBuf[BUFFERSIZE]; 978 const UChar *source; 979 const UChar *sourceLimit; 980 char *buf; 981 char *target; 982 char *targetLimit; 983 984 int32_t bufSize = 0; 985 UConverter *conv = NULL; 986 UErrorCode status = U_ZERO_ERROR; 987 uint32_t inchars=0, total=0; 988 989 f = fopen("data40.utf16", "rb"); 990 if(!f) 991 { 992 fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); 993 return U_FILE_ACCESS_ERROR; 994 } 995 996 out = fopen("data46.out", "wb"); 997 if(!out) 998 { 999 fprintf(stderr, "Couldn't create file 'data46.out'.\n"); 1000 fclose(f); 1001 return U_FILE_ACCESS_ERROR; 1002 } 1003 1004 // **************************** START SAMPLE ******************* 1005 conv = ucnv_open( "iso-8859-2", &status); 1006 assert(U_SUCCESS(status)); 1007 1008 bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); 1009 printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", 1010 BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); 1011 buf = (char*)malloc(bufSize * sizeof(char)); 1012 assert(buf!=NULL); 1013 1014 // grab another buffer's worth 1015 while((!feof(f)) && 1016 ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) ) 1017 { 1018 inchars += count; 1019 1020 // Convert bytes to unicode 1021 source = inBuf; 1022 sourceLimit = inBuf + count; 1023 1024 do 1025 { 1026 target = buf; 1027 targetLimit = buf + bufSize; 1028 1029 ucnv_fromUnicode( conv, &target, targetLimit, 1030 &source, sourceLimit, NULL, 1031 feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ 1032 /* is true (when no more data will come) */ 1033 &status); 1034 1035 if(status == U_BUFFER_OVERFLOW_ERROR) 1036 { 1037 // simply ran out of space - we'll reset the target ptr the next 1038 // time through the loop. 1039 status = U_ZERO_ERROR; 1040 } 1041 else 1042 { 1043 // Check other errors here. 1044 assert(U_SUCCESS(status)); 1045 // Break out of the loop (by force) 1046 } 1047 1048 // Process the Unicode 1049 assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == 1050 (size_t)(target-buf)); 1051 total += (target-buf); 1052 } while (source < sourceLimit); // while simply out of space 1053 } 1054 1055 printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total); 1056 1057 // ***************************** END SAMPLE ******************** 1058 ucnv_close(conv); 1059 1060 fclose(f); 1061 fclose(out); 1062 printf("\n"); 1063 1064 return U_ZERO_ERROR; 1065} 1066#undef BUFFERSIZE 1067 1068#define BUFFERSIZE 219 1069 1070void convsample_50() { 1071 printf("\n\n==============================================\n" 1072 "Sample 50: C: ucnv_detectUnicodeSignature\n"); 1073 1074 //! [ucnv_detectUnicodeSignature] 1075 UErrorCode err = U_ZERO_ERROR; 1076 UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */ 1077 char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' }; 1078 int32_t signatureLength = 0; 1079 const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err); 1080 UConverter *conv = NULL; 1081 UChar output[100]; 1082 UChar *target = output, *out; 1083 const char *source = input; 1084 if(encoding!=NULL && U_SUCCESS(err)){ 1085 // should signature be discarded ? 1086 conv = ucnv_open(encoding, &err); 1087 // do the conversion 1088 ucnv_toUnicode(conv, 1089 &target, output + sizeof(output)/U_SIZEOF_UCHAR, 1090 &source, input + sizeof(input), 1091 NULL, TRUE, &err); 1092 out = output; 1093 if (discardSignature){ 1094 ++out; // ignore initial U+FEFF 1095 } 1096 while(out != target) { 1097 printf("%04x ", *out++); 1098 } 1099 puts(""); 1100 } 1101 //! [ucnv_detectUnicodeSignature] 1102 puts(""); 1103} 1104 1105 1106 1107/* main */ 1108 1109int main() 1110{ 1111 1112 printf("Default Converter=%s\n", ucnv_getDefaultName() ); 1113 1114 convsample_02(); // C , u->koi8r, conv 1115 convsample_03(); // C, iterate 1116 1117 convsample_05(); // C, utf8->u, getNextUChar 1118 convsample_06(); // C freq counter thingy 1119 1120 convsample_12(); // C, sjis->u, conv 1121 convsample_13(); // C, big5->u, getNextU 1122 1123 convsample_20(); // C, callback 1124 convsample_21(); // C, callback debug 1125 1126 convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] 1127 1128 convsample_46(); // C, UTF16 -> latin3 [data41.utf16 -> data46.out] 1129 1130 convsample_50(); // C, detect unicode signature 1131 1132 printf("End of converter samples.\n"); 1133 1134 fflush(stdout); 1135 fflush(stderr); 1136 1137 return 0; 1138} 1139