1/**************************************************************************
2*
3*   Copyright (C) 2000-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*
6***************************************************************************
7*   file name:  convsamp.c
8*   encoding:   ASCII (7-bit)
9*
10*   created on: 2000may30
11*   created by: Steven R. Loomis
12*
13*   Sample code for the ICU conversion routines.
14*
15* Note: Nothing special is needed to build this sample. Link with
16*       the icu UC and icu I18N libraries.
17*
18*       I use 'assert' for error checking, you probably will want
19*       something more flexible.  '***BEGIN SAMPLE***' and
20*       '***END SAMPLE***' mark pieces suitable for stand alone
21*       code snippets.
22*
23*
*  Each test can define its own BUFFERSIZE
25*
26*/
27
28#define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
29
30#include <stdio.h>
31#include <ctype.h>            /* for isspace, etc.    */
32#include <assert.h>
33#include <string.h>
34#include <stdlib.h>  /* malloc */
35
36#include "unicode/utypes.h"   /* Basic ICU data types */
37#include "unicode/ucnv.h"     /* C   Converter API    */
38#include "unicode/ustring.h"  /* some more string fcns*/
39#include "unicode/uchar.h"    /* char names           */
40#include "unicode/uloc.h"
41#include "unicode/unistr.h"
42
43#include "flagcb.h"
44
45/* Some utility functions */
46
47static const UChar kNone[] = { 0x0000 };
48
49#define U_ASSERT(x)  { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }}
50
51/* Print a UChar if possible, in seven characters. */
52void prettyPrintUChar(UChar c)
53{
54  if(  (c <= 0x007F) &&
55       (isgraph(c))  ) {
56    printf(" '%c'   ", (char)(0x00FF&c));
57  } else if ( c > 0x007F ) {
58    char buf[1000];
59    UErrorCode status = U_ZERO_ERROR;
60    int32_t o;
61
62    o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
63    if(U_SUCCESS(status) && (o>0) ) {
64      buf[6] = 0;
65      printf("%7s", buf);
66    } else {
67      printf(" ??????");
68    }
69  } else {
70    switch((char)(c & 0x007F)) {
71    case ' ':
72      printf(" ' '   ");
73      break;
74    case '\t':
75      printf(" \\t    ");
76      break;
77    case '\n':
78      printf(" \\n    ");
79      break;
80    default:
81      printf("  _    ");
82      break;
83    }
84  }
85}
86
87
88void printUChars(const char  *name = "?",
89                 const UChar *uch  = kNone,
90                 int32_t     len   = -1 )
91{
92  int32_t i;
93
94  if( (len == -1) && (uch) ) {
95    len = u_strlen(uch);
96  }
97
98  printf("%5s: ", name);
99  for( i = 0; i <len; i++) {
100    printf("%-6d ", i);
101  }
102  printf("\n");
103
104  printf("%5s: ", "uni");
105  for( i = 0; i <len; i++) {
106    printf("\\u%04X ", (int)uch[i]);
107  }
108  printf("\n");
109
110  printf("%5s:", "ch");
111  for( i = 0; i <len; i++) {
112    prettyPrintUChar(uch[i]);
113  }
114  printf("\n");
115}
116
/*
 * Dump a byte string to stdout as three aligned rows:
 *   <name>: the index of every byte
 *     uni : each byte as \xXX
 *      ch : the byte in single quotes if isgraph() accepts it, else blanks
 *
 * name - label printed in front of the index row
 * uch  - bytes to print
 * len  - number of bytes; -1 means "measure with strlen()"
 *        (only done when uch is non-NULL)
 */
void printBytes(const char  *name = "?",
                 const char *uch  = "",
                 int32_t     len   = -1 )
{
  if (uch != NULL && len == -1) {
    len = strlen(uch);
  }

  /* Row 1: column indices. */
  printf("%5s: ", name);
  for (int32_t col = 0; col < len; ++col) {
    printf("%-4d ", col);
  }
  printf("\n");

  /* Row 2: hex value of each byte, masked to 8 bits. */
  printf("%5s: ", "uni");
  for (int32_t col = 0; col < len; ++col) {
    const int byteValue = 0x00FF & (int)uch[col];
    printf("\\x%02X ", byteValue);
  }
  printf("\n");

  /* Row 3: printable bytes in quotes, everything else left blank. */
  printf("%5s:", "ch");
  for (int32_t col = 0; col < len; ++col) {
    const int byteValue = 0x00FF & (int)uch[col];
    if (!isgraph(byteValue)) {
      printf("     ");
      continue;
    }
    printf(" '%c' ", (char)uch[col]);
  }
  printf("\n");
}
149
150void printUChar(UChar32 ch32)
151{
152    if(ch32 > 0xFFFF) {
153      printf("ch: U+%06X\n", ch32);
154    }
155    else {
156      UChar ch = (UChar)ch32;
157      printUChars("C", &ch, 1);
158    }
159}
160
161/*******************************************************************
162  Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
163  followed by an exclamation mark (!) into the KOI8-R Russian code page.
164
165  This example first creates a UChar String out of the Unicode chars.
166
167  targetSize must be set to the amount of space available in the target
168  buffer. After fromUChars is called,
169  len will contain the number of bytes in target[] which were
170  used in the resulting codepage.  In this case, there is a 1:1 mapping
171  between the input and output characters. The exclamation mark has the
172  same value in both KOI8-R and Unicode.
173
174  src: 0      1      2      3      4      5      6
175  uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
176   ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
177
178 targ:  0    1    2    3    4    5    6
179  uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
180   ch:                                '!'
181
182
183Converting FROM unicode
184  to koi8-r.
185  You must call ucnv_close to clean up the memory used by the
186  converter.
187
188  'len' returns the number of OUTPUT bytes resulting from the
189  conversion.
190 */
191
192UErrorCode convsample_02()
193{
194  printf("\n\n==============================================\n"
195         "Sample 02: C: simple Unicode -> koi8-r conversion\n");
196
197
198  // **************************** START SAMPLE *******************
199  // "cat<cat>OK"
200  UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
201                     0x0430, 0x0021, 0x0000 };
202  char target[100];
203  UErrorCode status = U_ZERO_ERROR;
204  UConverter *conv;
205  int32_t     len;
206
207  // set up the converter
208  //! [ucnv_open]
209  conv = ucnv_open("koi8-r", &status);
210  //! [ucnv_open]
211  assert(U_SUCCESS(status));
212
213  // convert to koi8-r
214  len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
215  assert(U_SUCCESS(status));
216
217  // close the converter
218  ucnv_close(conv);
219
220  // ***************************** END SAMPLE ********************
221
222  // Print it out
223  printUChars("src", source);
224  printf("\n");
225  printBytes("targ", target, len);
226
227  return U_ZERO_ERROR;
228}
229
230
231UErrorCode convsample_03()
232{
233  printf("\n\n==============================================\n"
234         "Sample 03: C: print out all converters\n");
235
236  int32_t count;
237  int32_t i;
238
239  // **************************** START SAMPLE *******************
240  count = ucnv_countAvailable();
241  printf("Available converters: %d\n", count);
242
243  for(i=0;i<count;i++)
244  {
245    printf("%s ", ucnv_getAvailableName(i));
246  }
247
248  // ***************************** END SAMPLE ********************
249
250  printf("\n");
251
252  return U_ZERO_ERROR;
253}
254
255
256
257#define BUFFERSIZE 17 /* make it interesting :) */
258
259/*
260  Converting from a codepage to Unicode in bulk..
261  What is the best way to determine the buffer size?
262
263     The 'buffersize' is in bytes of input.
264    For a given converter, divinding this by the minimum char size
265    give you the maximum number of Unicode characters that could be
266    expected for a given number of input bytes.
267     see: ucnv_getMinCharSize()
268
269     For example, a single byte codepage like 'Latin-3' has a
270    minimum char size of 1. (It takes at least 1 byte to represent
271    each Unicode char.) So the unicode buffer has the same number of
272    UChars as the input buffer has bytes.
273
274     In a strictly double byte codepage such as cp1362 (Windows
275    Korean), the minimum char size is 2. So, only half as many Unicode
276    chars as bytes are needed.
277
278     This work to calculate the buffer size is an optimization. Any
279    size of input and output buffer can be used, as long as the
280    program handles the following cases: If the input buffer is empty,
281    the source pointer will be equal to sourceLimit.  If the output
282    buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
283 */
284
285UErrorCode convsample_05()
286{
287  printf("\n\n==============================================\n"
288         "Sample 05: C: count the number of letters in a UTF-8 document\n");
289
290  FILE *f;
291  int32_t count;
292  char inBuf[BUFFERSIZE];
293  const char *source;
294  const char *sourceLimit;
295  UChar *uBuf;
296  UChar *target;
297  UChar *targetLimit;
298  UChar *p;
299  int32_t uBufSize = 0;
300  UConverter *conv;
301  UErrorCode status = U_ZERO_ERROR;
302  uint32_t letters=0, total=0;
303
304  f = fopen("data01.txt", "r");
305  if(!f)
306  {
307    fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
308    return U_FILE_ACCESS_ERROR;
309  }
310
311  // **************************** START SAMPLE *******************
312  conv = ucnv_open("utf-8", &status);
313  assert(U_SUCCESS(status));
314
315  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
316  printf("input bytes %d / min chars %d = %d UChars\n",
317         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
318  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
319  assert(uBuf!=NULL);
320
321  // grab another buffer's worth
322  while((!feof(f)) &&
323        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
324  {
325    // Convert bytes to unicode
326    source = inBuf;
327    sourceLimit = inBuf + count;
328
329    do
330    {
331        target = uBuf;
332        targetLimit = uBuf + uBufSize;
333
334        ucnv_toUnicode(conv, &target, targetLimit,
335                       &source, sourceLimit, NULL,
336                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
337                                   /* is true (when no more data will come) */
338                       &status);
339
340        if(status == U_BUFFER_OVERFLOW_ERROR)
341        {
342          // simply ran out of space - we'll reset the target ptr the next
343          // time through the loop.
344          status = U_ZERO_ERROR;
345        }
346        else
347        {
348          //  Check other errors here.
349          assert(U_SUCCESS(status));
350          // Break out of the loop (by force)
351        }
352
353        // Process the Unicode
354        // Todo: handle UTF-16/surrogates
355
356        for(p = uBuf; p<target; p++)
357        {
358          if(u_isalpha(*p))
359            letters++;
360          total++;
361        }
362    } while (source < sourceLimit); // while simply out of space
363  }
364
365  printf("%d letters out of %d total UChars.\n", letters, total);
366
367  // ***************************** END SAMPLE ********************
368  ucnv_close(conv);
369
370  printf("\n");
371
372  fclose(f);
373
374  return U_ZERO_ERROR;
375}
376#undef BUFFERSIZE
377
378#define BUFFERSIZE 1024
379typedef struct
380{
381  UChar32  codepoint;
382  uint32_t frequency;
383} CharFreqInfo;
384
385UErrorCode convsample_06()
386{
387  printf("\n\n==============================================\n"
388         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
389
390  FILE *f;
391  int32_t count;
392  char inBuf[BUFFERSIZE];
393  const char *source;
394  const char *sourceLimit;
395  int32_t uBufSize = 0;
396  UConverter *conv;
397  UErrorCode status = U_ZERO_ERROR;
398  uint32_t letters=0, total=0;
399
400  CharFreqInfo   *info;
401  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
402  UChar32   p;
403
404  uint32_t ie = 0;
405  uint32_t gh = 0;
406  UChar32 l = 0;
407
408  f = fopen("data06.txt", "r");
409  if(!f)
410  {
411    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
412    return U_FILE_ACCESS_ERROR;
413  }
414
415  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
416  if(!info)
417  {
418    fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
419  }
420
421  /* reset frequencies */
422  for(p=0;p<charCount;p++)
423  {
424    info[p].codepoint = p;
425    info[p].frequency = 0;
426  }
427
428  // **************************** START SAMPLE *******************
429  conv = ucnv_open("utf-8", &status);
430  assert(U_SUCCESS(status));
431
432  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
433  printf("input bytes %d / min chars %d = %d UChars\n",
434         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
435
436  // grab another buffer's worth
437  while((!feof(f)) &&
438        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
439  {
440    // Convert bytes to unicode
441    source = inBuf;
442    sourceLimit = inBuf + count;
443
444    while(source < sourceLimit)
445    {
446      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
447      if(U_FAILURE(status))
448      {
449        fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
450        status = U_ZERO_ERROR;
451        continue;
452      }
453      U_ASSERT(status);
454      total++;
455
456      if(u_isalpha(p))
457        letters++;
458
459      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
460        ie++;
461
462      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
463        gh++;
464
465      if(p>charCount)
466      {
467        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
468        free(info);
469        fclose(f);
470        ucnv_close(conv);
471        return U_UNSUPPORTED_ERROR;
472      }
473      info[p].frequency++;
474      l = p;
475    }
476  }
477
478  fclose(f);
479  ucnv_close(conv);
480
481  printf("%d letters out of %d total UChars.\n", letters, total);
482  printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
483
484  // now, we could sort it..
485
486  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
487
488  for(p=0;p<charCount;p++)
489  {
490    if(info[p].frequency)
491    {
492      printf("% 5d U+%06X ", info[p].frequency, p);
493      if(p <= 0xFFFF)
494      {
495        prettyPrintUChar((UChar)p);
496      }
497      printf("\n");
498    }
499  }
500  free(info);
501  // ***************************** END SAMPLE ********************
502
503  printf("\n");
504
505  return U_ZERO_ERROR;
506}
507#undef BUFFERSIZE
508
509
510/******************************************************
511  You must call ucnv_close to clean up the memory used by the
512  converter.
513
  'len' returns the number of OUTPUT UChars resulting from the
  conversion.
516 */
517
518UErrorCode convsample_12()
519{
520  printf("\n\n==============================================\n"
521         "Sample 12: C: simple sjis -> unicode conversion\n");
522
523
524  // **************************** START SAMPLE *******************
525
526  char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
527  UChar target[100];
528  UErrorCode status = U_ZERO_ERROR;
529  UConverter *conv;
530  int32_t     len;
531
532  // set up the converter
533  conv = ucnv_open("shift_jis", &status);
534  assert(U_SUCCESS(status));
535
536  // convert to Unicode
537  // Note: we can use strlen, we know it's an 8 bit null terminated codepage
538  target[6] = 0xFDCA;
539  len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
540  U_ASSERT(status);
541  // close the converter
542  ucnv_close(conv);
543
544  // ***************************** END SAMPLE ********************
545
546  // Print it out
547  printBytes("src", source, strlen(source) );
548  printf("\n");
549  printUChars("targ", target, len);
550
551  return U_ZERO_ERROR;
552}
553
554/******************************************************************
555   C: Convert from codepage to Unicode one at a time.
556*/
557
558UErrorCode convsample_13()
559{
560  printf("\n\n==============================================\n"
561         "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
562
563
564  const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
565  //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
566  const char *source, *sourceLimit;
567  UChar32 target;
568  UErrorCode status = U_ZERO_ERROR;
569  UConverter *conv = NULL;
570  int32_t srcCount=0;
571  int32_t dstCount=0;
572
573  srcCount = sizeof(sourceChars);
574
575  conv = ucnv_open("Big5", &status);
576  U_ASSERT(status);
577
578  source = sourceChars;
579  sourceLimit = sourceChars + sizeof(sourceChars);
580
581  // **************************** START SAMPLE *******************
582
583
584  printBytes("src",source,sourceLimit-source);
585
586  while(source < sourceLimit)
587  {
588    puts("");
589    target = ucnv_getNextUChar (conv,
590                                &source,
591                                sourceLimit,
592                                &status);
593
594    //    printBytes("src",source,sourceLimit-source);
595    U_ASSERT(status);
596    printUChar(target);
597    dstCount++;
598  }
599
600
601  // ************************** END SAMPLE *************************
602
603  printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
604  ucnv_close(conv);
605
606  return U_ZERO_ERROR;
607}
608
609
610
611
612UBool convsample_20_didSubstitute(const char *source)
613{
614  UChar uchars[100];
615  char bytes[100];
616  UConverter *conv = NULL;
617  UErrorCode status = U_ZERO_ERROR;
618  uint32_t len, len2;
619  UBool  flagVal;
620
621  FromUFLAGContext * context = NULL;
622
623  printf("\n\n==============================================\n"
624         "Sample 20: C: Test for substitution using callbacks\n");
625
626  /* print out the original source */
627  printBytes("src", source);
628  printf("\n");
629
630  /* First, convert from UTF8 to unicode */
631  conv = ucnv_open("utf-8", &status);
632  U_ASSERT(status);
633
634  len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
635  U_ASSERT(status);
636
637  printUChars("uch", uchars, len);
638  printf("\n");
639
640  /* Now, close the converter */
641  ucnv_close(conv);
642
643  /* Now, convert to windows-1252 */
644  conv = ucnv_open("windows-1252", &status);
645  U_ASSERT(status);
646
647  /* Converter starts out with the SUBSTITUTE callback set. */
648
649  /* initialize our callback */
650  context = flagCB_fromU_openContext();
651
652  /* Set our special callback */
653  ucnv_setFromUCallBack(conv,
654                        flagCB_fromU,
655                        context,
656                        &(context->subCallback),
657                        &(context->subContext),
658                        &status);
659
660  U_ASSERT(status);
661
662  len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
663  U_ASSERT(status);
664
665  flagVal = context->flag;  /* it's about to go away when we close the cnv */
666
667  ucnv_close(conv);
668
669  /* print out the original source */
670  printBytes("bytes", bytes, len2);
671
672  return flagVal; /* true if callback was called */
673}
674
675UErrorCode convsample_20()
676{
677  const char *sample1 = "abc\xdf\xbf";
678  const char *sample2 = "abc_def";
679
680
681  if(convsample_20_didSubstitute(sample1))
682  {
683    printf("DID substitute.\n******\n");
684  }
685  else
686  {
687    printf("Did NOT substitute.\n*****\n");
688  }
689
690  if(convsample_20_didSubstitute(sample2))
691  {
692    printf("DID substitute.\n******\n");
693  }
694  else
695  {
696    printf("Did NOT substitute.\n*****\n");
697  }
698
699  return U_ZERO_ERROR;
700}
701
702// 21  - C, callback, with clone and debug
703
704
705
/*
 * Sample 21 worker: like sample 20, but with a three-stage callback chain
 * (debug -> flag -> debug) and with the conversion performed on a CLONE of
 * the converter after the original has been closed.
 *
 * source - NUL-terminated UTF-8 text
 * Returns the 'flag' member of the clone's flag context as read after the
 * conversion, or FALSE if that context could not be retrieved.
 */
UBool convsample_21_didSubstitute(const char *source)
{
  UChar uchars[100];
  char bytes[100];
  UConverter *conv = NULL, *cloneCnv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t len, len2;
  int32_t  cloneLen;
  UBool  flagVal = FALSE;
  UConverterFromUCallback junkCB;   /* receives the clone's callback pointer; not used further */

  FromUFLAGContext *flagCtx = NULL,
                   *cloneFlagCtx = NULL;

  debugCBContext   *debugCtx1 = NULL,
                   *debugCtx2 = NULL,
                   *cloneDebugCtx = NULL;

  printf("\n\n==============================================\n"
         "Sample 21: C: Test for substitution w/ callbacks & clones \n");

  /* print out the original source */
  printBytes("src", source);
  printf("\n");

  /* First, convert from UTF8 to unicode */
  conv = ucnv_open("utf-8", &status);
  U_ASSERT(status);

  len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
  U_ASSERT(status);

  printUChars("uch", uchars, len);
  printf("\n");

  /* Now, close the converter */
  ucnv_close(conv);

  /* Now, convert to windows-1252 */
  conv = ucnv_open("windows-1252", &status);
  U_ASSERT(status);

  /* Converter starts out with the SUBSTITUTE callback set. */

  /* initialize our callback */
  /* from the 'bottom' innermost, out
   *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */

#if DEBUG_TMI
  printf("flagCB_fromU = %p\n", &flagCB_fromU);
  printf("debugCB_fromU = %p\n", &debugCB_fromU);
#endif

  /* Allocate one context per link in the chain. */
  debugCtx1 = debugCB_openContext();
   flagCtx  = flagCB_fromU_openContext();
  debugCtx2 = debugCB_openContext();

  /* Wire each context to the next callback/context pair in the chain. */
  debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
  debugCtx1->subContext  =  flagCtx;

  flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
  flagCtx->subContext    =  debugCtx2;

  /* End of the chain: the stock substitute callback with no context. */
  debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
  debugCtx2->subContext  = NULL;

  /* Set our special callback */

  /* Install the head of the chain (debug1).  The two address arguments
   * point at debugCtx2's sub-callback fields, so whatever the call stores
   * through them overwrites the values assigned just above.
   * NOTE(review): per the ICU API these are out-parameters for the
   * previously installed callback and context -- confirm. */
  ucnv_setFromUCallBack(conv,
                        debugCB_fromU,
                        debugCtx1,
                        &(debugCtx2->subCallback),
                        &(debugCtx2->subContext),
                        &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
         conv, debugCtx1, debugCtx1->subCallback,
         debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
#endif

  cloneLen = 1; /* but passing in null so it will clone */
  cloneCnv = ucnv_safeClone(conv,  NULL,  &cloneLen, &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
#endif

  /* From here on only the clone is used. */
  ucnv_close(conv);

#if DEBUG_TMI
  printf("%p closed.\n", conv);
#endif

  U_ASSERT(status);
  /* Now, we have to extract the context */
  cloneDebugCtx = NULL;
  cloneFlagCtx  = NULL;

  /* Ask the clone for its own callback context (the head of its chain),
   * then step one link down to reach its flag context. */
  ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
  if(cloneDebugCtx != NULL) {
      cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
  }

  printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
         cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );

  /* Run the actual conversion on the clone. */
  len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
  U_ASSERT(status);

  if(cloneFlagCtx != NULL) {
      flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
  } else {
      printf("** Warning, couldn't get the subcallback \n");
  }

  ucnv_close(cloneCnv);

  /* print out the converted bytes */
  printBytes("bytes", bytes, len2);

  return flagVal; /* true if callback was called */
}
833
834UErrorCode convsample_21()
835{
836  const char *sample1 = "abc\xdf\xbf";
837  const char *sample2 = "abc_def";
838
839  if(convsample_21_didSubstitute(sample1))
840  {
841    printf("DID substitute.\n******\n");
842  }
843  else
844  {
845    printf("Did NOT substitute.\n*****\n");
846  }
847
848  if(convsample_21_didSubstitute(sample2))
849  {
850    printf("DID substitute.\n******\n");
851  }
852  else
853  {
854    printf("Did NOT substitute.\n*****\n");
855  }
856
857  return U_ZERO_ERROR;
858}
859
860
861//  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
862
863#define BUFFERSIZE 17 /* make it interesting :) */
864
865UErrorCode convsample_40()
866{
867  printf("\n\n==============================================\n"
868    "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
869
870  FILE *f;
871  FILE *out;
872  int32_t count;
873  char inBuf[BUFFERSIZE];
874  const char *source;
875  const char *sourceLimit;
876  UChar *uBuf;
877  UChar *target;
878  UChar *targetLimit;
879  int32_t uBufSize = 0;
880  UConverter *conv = NULL;
881  UErrorCode status = U_ZERO_ERROR;
882  uint32_t inbytes=0, total=0;
883
884  f = fopen("data02.bin", "rb");
885  if(!f)
886  {
887    fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
888    return U_FILE_ACCESS_ERROR;
889  }
890
891  out = fopen("data40.utf16", "wb");
892  if(!out)
893  {
894    fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
895    fclose(f);
896    return U_FILE_ACCESS_ERROR;
897  }
898
899  // **************************** START SAMPLE *******************
900  conv = ucnv_openCCSID(37, UCNV_IBM, &status);
901  assert(U_SUCCESS(status));
902
903  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
904  printf("input bytes %d / min chars %d = %d UChars\n",
905         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
906  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
907  assert(uBuf!=NULL);
908
909  // grab another buffer's worth
910  while((!feof(f)) &&
911        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
912  {
913    inbytes += count;
914
915    // Convert bytes to unicode
916    source = inBuf;
917    sourceLimit = inBuf + count;
918
919    do
920    {
921        target = uBuf;
922        targetLimit = uBuf + uBufSize;
923
924        ucnv_toUnicode( conv, &target, targetLimit,
925                       &source, sourceLimit, NULL,
926                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
927                                   /* is true (when no more data will come) */
928                         &status);
929
930        if(status == U_BUFFER_OVERFLOW_ERROR)
931        {
932          // simply ran out of space - we'll reset the target ptr the next
933          // time through the loop.
934          status = U_ZERO_ERROR;
935        }
936        else
937        {
938          //  Check other errors here.
939          assert(U_SUCCESS(status));
940          // Break out of the loop (by force)
941        }
942
943        // Process the Unicode
944        // Todo: handle UTF-16/surrogates
945        assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
946               (size_t)(target-uBuf));
947        total += (target-uBuf);
948    } while (source < sourceLimit); // while simply out of space
949  }
950
951  printf("%d bytes in,  %d UChars out.\n", inbytes, total);
952
953  // ***************************** END SAMPLE ********************
954  ucnv_close(conv);
955
956  fclose(f);
957  fclose(out);
958  printf("\n");
959
960  return U_ZERO_ERROR;
961}
962#undef BUFFERSIZE
963
964
965
966//  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
967
968#define BUFFERSIZE 24 /* make it interesting :) */
969
970UErrorCode convsample_46()
971{
972  printf("\n\n==============================================\n"
973    "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
974
975  FILE *f;
976  FILE *out;
977  int32_t count;
978  UChar inBuf[BUFFERSIZE];
979  const UChar *source;
980  const UChar *sourceLimit;
981  char *buf;
982  char *target;
983  char *targetLimit;
984
985  int32_t bufSize = 0;
986  UConverter *conv = NULL;
987  UErrorCode status = U_ZERO_ERROR;
988  uint32_t inchars=0, total=0;
989
990  f = fopen("data40.utf16", "rb");
991  if(!f)
992  {
993    fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
994    return U_FILE_ACCESS_ERROR;
995  }
996
997  out = fopen("data46.out", "wb");
998  if(!out)
999  {
1000    fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1001    fclose(f);
1002    return U_FILE_ACCESS_ERROR;
1003  }
1004
1005  // **************************** START SAMPLE *******************
1006  conv = ucnv_open( "iso-8859-2", &status);
1007  assert(U_SUCCESS(status));
1008
1009  bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1010  printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1011         BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1012  buf = (char*)malloc(bufSize * sizeof(char));
1013  assert(buf!=NULL);
1014
1015  // grab another buffer's worth
1016  while((!feof(f)) &&
1017        ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1018  {
1019    inchars += count;
1020
1021    // Convert bytes to unicode
1022    source = inBuf;
1023    sourceLimit = inBuf + count;
1024
1025    do
1026    {
1027        target = buf;
1028        targetLimit = buf + bufSize;
1029
1030        ucnv_fromUnicode( conv, &target, targetLimit,
1031                       &source, sourceLimit, NULL,
1032                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1033                                   /* is true (when no more data will come) */
1034                         &status);
1035
1036        if(status == U_BUFFER_OVERFLOW_ERROR)
1037        {
1038          // simply ran out of space - we'll reset the target ptr the next
1039          // time through the loop.
1040          status = U_ZERO_ERROR;
1041        }
1042        else
1043        {
1044          //  Check other errors here.
1045          assert(U_SUCCESS(status));
1046          // Break out of the loop (by force)
1047        }
1048
1049        // Process the Unicode
1050        assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1051               (size_t)(target-buf));
1052        total += (target-buf);
1053    } while (source < sourceLimit); // while simply out of space
1054  }
1055
1056  printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1057
1058  // ***************************** END SAMPLE ********************
1059  ucnv_close(conv);
1060
1061  fclose(f);
1062  fclose(out);
1063  printf("\n");
1064
1065  return U_ZERO_ERROR;
1066}
1067#undef BUFFERSIZE
1068
1069#define BUFFERSIZE 219
1070
1071void convsample_50() {
1072  printf("\n\n==============================================\n"
1073         "Sample 50: C: ucnv_detectUnicodeSignature\n");
1074
1075  //! [ucnv_detectUnicodeSignature]
1076  UErrorCode err = U_ZERO_ERROR;
1077  UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1078  char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1079  int32_t signatureLength = 0;
1080  const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1081  UConverter *conv = NULL;
1082  UChar output[100];
1083  UChar *target = output, *out;
1084  const char *source = input;
1085  if(encoding!=NULL && U_SUCCESS(err)){
1086    // should signature be discarded ?
1087    conv = ucnv_open(encoding, &err);
1088    // do the conversion
1089    ucnv_toUnicode(conv,
1090                   &target, output + sizeof(output)/U_SIZEOF_UCHAR,
1091                   &source, input + sizeof(input),
1092                   NULL, TRUE, &err);
1093    out = output;
1094    if (discardSignature){
1095      ++out; // ignore initial U+FEFF
1096    }
1097    while(out != target) {
1098      printf("%04x ", *out++);
1099    }
1100    puts("");
1101  }
1102  //! [ucnv_detectUnicodeSignature]
1103  puts("");
1104}
1105
1106
1107
1108/* main */
1109
1110int main()
1111{
1112
1113  printf("Default Converter=%s\n", ucnv_getDefaultName() );
1114
1115  convsample_02();  // C  , u->koi8r, conv
1116  convsample_03();  // C,   iterate
1117
1118  convsample_05();  // C,  utf8->u, getNextUChar
1119  convsample_06(); // C freq counter thingy
1120
1121  convsample_12();  // C,  sjis->u, conv
1122  convsample_13();  // C,  big5->u, getNextU
1123
1124  convsample_20();  // C, callback
1125  convsample_21();  // C, callback debug
1126
1127  convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1128
1129  convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1130
1131  convsample_50();  // C, detect unicode signature
1132
1133  printf("End of converter samples.\n");
1134
1135  fflush(stdout);
1136  fflush(stderr);
1137
1138  return 0;
1139}
1140