1/**************************************************************************
2*
3*   Copyright (C) 2000-2013, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*
6***************************************************************************
7*   file name:  convsamp.c
8*   encoding:   ASCII (7-bit)
9*
10*   created on: 2000may30
11*   created by: Steven R. Loomis
12*
13*   Sample code for the ICU conversion routines.
14*
15* Note: Nothing special is needed to build this sample. Link with
16*       the icu UC and icu I18N libraries.
17*
18*       I use 'assert' for error checking, you probably will want
19*       something more flexible.  '***BEGIN SAMPLE***' and
20*       '***END SAMPLE***' mark pieces suitable for stand alone
21*       code snippets.
22*
23*
*  Each test can define its own BUFFERSIZE
25*
26*/
27
28#define DEBUG_TMI 0  /* define to 1 to enable Too Much Information */
29
30#include <stdio.h>
31#include <ctype.h>            /* for isspace, etc.    */
32#include <assert.h>
33#include <string.h>
34#include <stdlib.h>  /* malloc */
35
36#include "unicode/utypes.h"   /* Basic ICU data types */
37#include "unicode/ucnv.h"     /* C   Converter API    */
38#include "unicode/ustring.h"  /* some more string fcns*/
39#include "unicode/uchar.h"    /* char names           */
40#include "unicode/uloc.h"
41#include "unicode/unistr.h"
42
43#include "flagcb.h"
44
45/* Some utility functions */
46
47static const UChar kNone[] = { 0x0000 };
48
/*
 * Sample-local status check: if the UErrorCode 'x' is a failure, flush both
 * output streams, print "<expression> == <error name>" on stderr and then
 * trip assert().
 *
 * The body is wrapped in do { } while(0) so that "U_ASSERT(status);" behaves
 * as a single statement everywhere, including inside an unbraced if/else.
 * Note that 'x' is evaluated more than once; pass a plain variable.
 */
#define U_ASSERT(x) \
  do { \
    if(U_FAILURE(x)) { \
      fflush(stdout); \
      fflush(stderr); \
      fprintf(stderr, #x " == %s\n", u_errorName(x)); \
      assert(U_SUCCESS(x)); \
    } \
  } while(0)
50
51/* Print a UChar if possible, in seven characters. */
52void prettyPrintUChar(UChar c)
53{
54  if(  (c <= 0x007F) &&
55       (isgraph(c))  ) {
56    printf(" '%c'   ", (char)(0x00FF&c));
57  } else if ( c > 0x007F ) {
58    char buf[1000];
59    UErrorCode status = U_ZERO_ERROR;
60    int32_t o;
61
62    o = u_charName(c, U_EXTENDED_CHAR_NAME, buf, 1000, &status);
63    if(U_SUCCESS(status) && (o>0) ) {
64      buf[6] = 0;
65      printf("%7s", buf);
66    } else {
67      printf(" ??????");
68    }
69  } else {
70    switch((char)(c & 0x007F)) {
71    case ' ':
72      printf(" ' '   ");
73      break;
74    case '\t':
75      printf(" \\t    ");
76      break;
77    case '\n':
78      printf(" \\n    ");
79      break;
80    default:
81      printf("  _    ");
82      break;
83    }
84  }
85}
86
87
88void printUChars(const char  *name = "?",
89                 const UChar *uch  = kNone,
90                 int32_t     len   = -1 )
91{
92  int32_t i;
93
94  if( (len == -1) && (uch) ) {
95    len = u_strlen(uch);
96  }
97
98  printf("%5s: ", name);
99  for( i = 0; i <len; i++) {
100    printf("%-6d ", i);
101  }
102  printf("\n");
103
104  printf("%5s: ", "uni");
105  for( i = 0; i <len; i++) {
106    printf("\\u%04X ", (int)uch[i]);
107  }
108  printf("\n");
109
110  printf("%5s:", "ch");
111  for( i = 0; i <len; i++) {
112    prettyPrintUChar(uch[i]);
113  }
114  printf("\n");
115}
116
117void printBytes(const char  *name = "?",
118                 const char *uch  = "",
119                 int32_t     len   = -1 )
120{
121  int32_t i;
122
123  if( (len == -1) && (uch) ) {
124    len = strlen(uch);
125  }
126
127  printf("%5s: ", name);
128  for( i = 0; i <len; i++) {
129    printf("%-4d ", i);
130  }
131  printf("\n");
132
133  printf("%5s: ", "uni");
134  for( i = 0; i <len; i++) {
135    printf("\\x%02X ", 0x00FF & (int)uch[i]);
136  }
137  printf("\n");
138
139  printf("%5s:", "ch");
140  for( i = 0; i <len; i++) {
141    if(isgraph(0x00FF & (int)uch[i])) {
142      printf(" '%c' ", (char)uch[i]);
143    } else {
144      printf("     ");
145    }
146  }
147  printf("\n");
148}
149
150void printUChar(UChar32 ch32)
151{
152    if(ch32 > 0xFFFF) {
153      printf("ch: U+%06X\n", ch32);
154    }
155    else {
156      UChar ch = (UChar)ch32;
157      printUChars("C", &ch, 1);
158    }
159}
160
161/*******************************************************************
162  Very simple C sample to convert the word 'Moscow' in Russian in Unicode,
163  followed by an exclamation mark (!) into the KOI8-R Russian code page.
164
165  This example first creates a UChar String out of the Unicode chars.
166
167  targetSize must be set to the amount of space available in the target
168  buffer. After fromUChars is called,
169  len will contain the number of bytes in target[] which were
170  used in the resulting codepage.  In this case, there is a 1:1 mapping
171  between the input and output characters. The exclamation mark has the
172  same value in both KOI8-R and Unicode.
173
174  src: 0      1      2      3      4      5      6
175  uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021
176   ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL   '!'
177
178 targ:  0    1    2    3    4    5    6
179  uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21
180   ch:                                '!'
181
182
183Converting FROM unicode
184  to koi8-r.
185  You must call ucnv_close to clean up the memory used by the
186  converter.
187
188  'len' returns the number of OUTPUT bytes resulting from the
189  conversion.
190 */
191
192UErrorCode convsample_02()
193{
194  printf("\n\n==============================================\n"
195         "Sample 02: C: simple Unicode -> koi8-r conversion\n");
196
197
198  // **************************** START SAMPLE *******************
199  // "cat<cat>OK"
200  UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432,
201                     0x0430, 0x0021, 0x0000 };
202  char target[100];
203  UErrorCode status = U_ZERO_ERROR;
204  UConverter *conv;
205  int32_t     len;
206
207  // set up the converter
208  //! [ucnv_open]
209  conv = ucnv_open("koi8-r", &status);
210  //! [ucnv_open]
211  assert(U_SUCCESS(status));
212
213  // convert to koi8-r
214  len = ucnv_fromUChars(conv, target, 100, source, -1, &status);
215  assert(U_SUCCESS(status));
216
217  // close the converter
218  ucnv_close(conv);
219
220  // ***************************** END SAMPLE ********************
221
222  // Print it out
223  printUChars("src", source);
224  printf("\n");
225  printBytes("targ", target, len);
226
227  return U_ZERO_ERROR;
228}
229
230
231UErrorCode convsample_03()
232{
233  printf("\n\n==============================================\n"
234         "Sample 03: C: print out all converters\n");
235
236  int32_t count;
237  int32_t i;
238
239  // **************************** START SAMPLE *******************
240  count = ucnv_countAvailable();
241  printf("Available converters: %d\n", count);
242
243  for(i=0;i<count;i++)
244  {
245    printf("%s ", ucnv_getAvailableName(i));
246  }
247
248  // ***************************** END SAMPLE ********************
249
250  printf("\n");
251
252  return U_ZERO_ERROR;
253}
254
255
256
257#define BUFFERSIZE 17 /* make it interesting :) */
258
259/*
260  Converting from a codepage to Unicode in bulk..
261  What is the best way to determine the buffer size?
262
263     The 'buffersize' is in bytes of input.
    For a given converter, dividing this by the minimum char size
    gives you the maximum number of Unicode characters that could be
    expected for a given number of input bytes.
267     see: ucnv_getMinCharSize()
268
269     For example, a single byte codepage like 'Latin-3' has a
270    minimum char size of 1. (It takes at least 1 byte to represent
271    each Unicode char.) So the unicode buffer has the same number of
272    UChars as the input buffer has bytes.
273
274     In a strictly double byte codepage such as cp1362 (Windows
275    Korean), the minimum char size is 2. So, only half as many Unicode
276    chars as bytes are needed.
277
278     This work to calculate the buffer size is an optimization. Any
279    size of input and output buffer can be used, as long as the
280    program handles the following cases: If the input buffer is empty,
281    the source pointer will be equal to sourceLimit.  If the output
282    buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned.
283 */
284
285UErrorCode convsample_05()
286{
287  printf("\n\n==============================================\n"
288         "Sample 05: C: count the number of letters in a UTF-8 document\n");
289
290  FILE *f;
291  int32_t count;
292  char inBuf[BUFFERSIZE];
293  const char *source;
294  const char *sourceLimit;
295  UChar *uBuf;
296  UChar *target;
297  UChar *targetLimit;
298  UChar *p;
299  int32_t uBufSize = 0;
300  UConverter *conv;
301  UErrorCode status = U_ZERO_ERROR;
302  uint32_t letters=0, total=0;
303
304  f = fopen("data01.txt", "r");
305  if(!f)
306  {
307    fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
308    return U_FILE_ACCESS_ERROR;
309  }
310
311  // **************************** START SAMPLE *******************
312  conv = ucnv_open("utf-8", &status);
313  assert(U_SUCCESS(status));
314
315  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
316  printf("input bytes %d / min chars %d = %d UChars\n",
317         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
318  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
319  assert(uBuf!=NULL);
320
321  // grab another buffer's worth
322  while((!feof(f)) &&
323        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
324  {
325    // Convert bytes to unicode
326    source = inBuf;
327    sourceLimit = inBuf + count;
328
329    do
330    {
331        target = uBuf;
332        targetLimit = uBuf + uBufSize;
333
334        ucnv_toUnicode(conv, &target, targetLimit,
335                       &source, sourceLimit, NULL,
336                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
337                                   /* is true (when no more data will come) */
338                       &status);
339
340        if(status == U_BUFFER_OVERFLOW_ERROR)
341        {
342          // simply ran out of space - we'll reset the target ptr the next
343          // time through the loop.
344          status = U_ZERO_ERROR;
345        }
346        else
347        {
348          //  Check other errors here.
349          assert(U_SUCCESS(status));
350          // Break out of the loop (by force)
351        }
352
353        // Process the Unicode
354        // Todo: handle UTF-16/surrogates
355
356        for(p = uBuf; p<target; p++)
357        {
358          if(u_isalpha(*p))
359            letters++;
360          total++;
361        }
362    } while (source < sourceLimit); // while simply out of space
363  }
364
365  printf("%d letters out of %d total UChars.\n", letters, total);
366
367  // ***************************** END SAMPLE ********************
368  ucnv_close(conv);
369
370  printf("\n");
371
372  fclose(f);
373
374  return U_ZERO_ERROR;
375}
376#undef BUFFERSIZE
377
378#define BUFFERSIZE 1024
379typedef struct
380{
381  UChar32  codepoint;
382  uint32_t frequency;
383} CharFreqInfo;
384
385UErrorCode convsample_06()
386{
387  printf("\n\n==============================================\n"
388         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");
389
390  FILE *f;
391  int32_t count;
392  char inBuf[BUFFERSIZE];
393  const char *source;
394  const char *sourceLimit;
395  int32_t uBufSize = 0;
396  UConverter *conv;
397  UErrorCode status = U_ZERO_ERROR;
398  uint32_t letters=0, total=0;
399
400  CharFreqInfo   *info;
401  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
402  UChar32   p;
403
404  uint32_t ie = 0;
405  uint32_t gh = 0;
406  UChar32 l = 0;
407
408  f = fopen("data06.txt", "r");
409  if(!f)
410  {
411    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
412    return U_FILE_ACCESS_ERROR;
413  }
414
415  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
416  if(!info)
417  {
418    fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
419  }
420
421  /* reset frequencies */
422  for(p=0;p<charCount;p++)
423  {
424    info[p].codepoint = p;
425    info[p].frequency = 0;
426  }
427
428  // **************************** START SAMPLE *******************
429  conv = ucnv_open("utf-8", &status);
430  assert(U_SUCCESS(status));
431
432  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
433  printf("input bytes %d / min chars %d = %d UChars\n",
434         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
435
436  // grab another buffer's worth
437  while((!feof(f)) &&
438        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
439  {
440    // Convert bytes to unicode
441    source = inBuf;
442    sourceLimit = inBuf + count;
443
444    while(source < sourceLimit)
445    {
446      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
447      if(U_FAILURE(status))
448      {
449        fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
450        status = U_ZERO_ERROR;
451        continue;
452      }
453      U_ASSERT(status);
454      total++;
455
456      if(u_isalpha(p))
457        letters++;
458
459      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
460        ie++;
461
462      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
463        gh++;
464
465      if(p>charCount)
466      {
467        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
468        free(info);
469        fclose(f);
470        ucnv_close(conv);
471        return U_UNSUPPORTED_ERROR;
472      }
473      info[p].frequency++;
474      l = p;
475    }
476  }
477
478  fclose(f);
479  ucnv_close(conv);
480
481  printf("%d letters out of %d total UChars.\n", letters, total);
482  printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);
483
484  // now, we could sort it..
485
486  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);
487
488  for(p=0;p<charCount;p++)
489  {
490    if(info[p].frequency)
491    {
492      printf("% 5d U+%06X ", info[p].frequency, p);
493      if(p <= 0xFFFF)
494      {
495        prettyPrintUChar((UChar)p);
496      }
497      printf("\n");
498    }
499  }
500  free(info);
501  // ***************************** END SAMPLE ********************
502
503  printf("\n");
504
505  return U_ZERO_ERROR;
506}
507#undef BUFFERSIZE
508
509
510/******************************************************
511  You must call ucnv_close to clean up the memory used by the
512  converter.
513
  'len' returns the number of OUTPUT UChars resulting from the
  conversion.
516 */
517
518UErrorCode convsample_12()
519{
520  printf("\n\n==============================================\n"
521         "Sample 12: C: simple sjis -> unicode conversion\n");
522
523
524  // **************************** START SAMPLE *******************
525
526  char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
527  UChar target[100];
528  UErrorCode status = U_ZERO_ERROR;
529  UConverter *conv;
530  int32_t     len;
531
532  // set up the converter
533  conv = ucnv_open("shift_jis", &status);
534  assert(U_SUCCESS(status));
535
536  // convert to Unicode
537  // Note: we can use strlen, we know it's an 8 bit null terminated codepage
538  target[6] = 0xFDCA;
539  len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status);
540  U_ASSERT(status);
541  // close the converter
542  ucnv_close(conv);
543
544  // ***************************** END SAMPLE ********************
545
546  // Print it out
547  printBytes("src", source, strlen(source) );
548  printf("\n");
549  printUChars("targ", target, len);
550
551  return U_ZERO_ERROR;
552}
553
554/******************************************************************
555   C: Convert from codepage to Unicode one at a time.
556*/
557
558UErrorCode convsample_13()
559{
560  printf("\n\n==============================================\n"
561         "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");
562
563
564  const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
565  //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
566  const char *source, *sourceLimit;
567  UChar32 target;
568  UErrorCode status = U_ZERO_ERROR;
569  UConverter *conv = NULL;
570  int32_t srcCount=0;
571  int32_t dstCount=0;
572
573  srcCount = sizeof(sourceChars);
574
575  conv = ucnv_open("Big5", &status);
576  U_ASSERT(status);
577
578  source = sourceChars;
579  sourceLimit = sourceChars + sizeof(sourceChars);
580
581  // **************************** START SAMPLE *******************
582
583
584  printBytes("src",source,sourceLimit-source);
585
586  while(source < sourceLimit)
587  {
588    puts("");
589    target = ucnv_getNextUChar (conv,
590                                &source,
591                                sourceLimit,
592                                &status);
593
594    //    printBytes("src",source,sourceLimit-source);
595    U_ASSERT(status);
596    printUChar(target);
597    dstCount++;
598  }
599
600
601  // ************************** END SAMPLE *************************
602
603  printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
604  ucnv_close(conv);
605
606  return U_ZERO_ERROR;
607}
608
609
610
611
612UBool convsample_20_didSubstitute(const char *source)
613{
614  UChar uchars[100];
615  char bytes[100];
616  UConverter *conv = NULL;
617  UErrorCode status = U_ZERO_ERROR;
618  uint32_t len, len2;
619  UBool  flagVal;
620
621  FromUFLAGContext * context = NULL;
622
623  printf("\n\n==============================================\n"
624         "Sample 20: C: Test for substitution using callbacks\n");
625
626  /* print out the original source */
627  printBytes("src", source);
628  printf("\n");
629
630  /* First, convert from UTF8 to unicode */
631  conv = ucnv_open("utf-8", &status);
632  U_ASSERT(status);
633
634  len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
635  U_ASSERT(status);
636
637  printUChars("uch", uchars, len);
638  printf("\n");
639
640  /* Now, close the converter */
641  ucnv_close(conv);
642
643  /* Now, convert to windows-1252 */
644  conv = ucnv_open("windows-1252", &status);
645  U_ASSERT(status);
646
647  /* Converter starts out with the SUBSTITUTE callback set. */
648
649  /* initialize our callback */
650  context = flagCB_fromU_openContext();
651
652  /* Set our special callback */
653  ucnv_setFromUCallBack(conv,
654                        flagCB_fromU,
655                        context,
656                        &(context->subCallback),
657                        &(context->subContext),
658                        &status);
659
660  U_ASSERT(status);
661
662  len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status);
663  U_ASSERT(status);
664
665  flagVal = context->flag;  /* it's about to go away when we close the cnv */
666
667  ucnv_close(conv);
668
669  /* print out the original source */
670  printBytes("bytes", bytes, len2);
671
672  return flagVal; /* true if callback was called */
673}
674
675UErrorCode convsample_20()
676{
677  const char *sample1 = "abc\xdf\xbf";
678  const char *sample2 = "abc_def";
679
680
681  if(convsample_20_didSubstitute(sample1))
682  {
683    printf("DID substitute.\n******\n");
684  }
685  else
686  {
687    printf("Did NOT substitute.\n*****\n");
688  }
689
690  if(convsample_20_didSubstitute(sample2))
691  {
692    printf("DID substitute.\n******\n");
693  }
694  else
695  {
696    printf("Did NOT substitute.\n*****\n");
697  }
698
699  return U_ZERO_ERROR;
700}
701
702// 21  - C, callback, with clone and debug
703
704
705
706UBool convsample_21_didSubstitute(const char *source)
707{
708  UChar uchars[100];
709  char bytes[100];
710  UConverter *conv = NULL, *cloneCnv = NULL;
711  UErrorCode status = U_ZERO_ERROR;
712  uint32_t len, len2;
713  int32_t  cloneLen;
714  UBool  flagVal = FALSE;
715  UConverterFromUCallback junkCB;
716
717  FromUFLAGContext *flagCtx = NULL,
718                   *cloneFlagCtx = NULL;
719
720  debugCBContext   *debugCtx1 = NULL,
721                   *debugCtx2 = NULL,
722                   *cloneDebugCtx = NULL;
723
724  printf("\n\n==============================================\n"
725         "Sample 21: C: Test for substitution w/ callbacks & clones \n");
726
727  /* print out the original source */
728  printBytes("src", source);
729  printf("\n");
730
731  /* First, convert from UTF8 to unicode */
732  conv = ucnv_open("utf-8", &status);
733  U_ASSERT(status);
734
735  len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
736  U_ASSERT(status);
737
738  printUChars("uch", uchars, len);
739  printf("\n");
740
741  /* Now, close the converter */
742  ucnv_close(conv);
743
744  /* Now, convert to windows-1252 */
745  conv = ucnv_open("windows-1252", &status);
746  U_ASSERT(status);
747
748  /* Converter starts out with the SUBSTITUTE callback set. */
749
750  /* initialize our callback */
751  /* from the 'bottom' innermost, out
752   *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */
753
754#if DEBUG_TMI
755  printf("flagCB_fromU = %p\n", &flagCB_fromU);
756  printf("debugCB_fromU = %p\n", &debugCB_fromU);
757#endif
758
759  debugCtx1 = debugCB_openContext();
760   flagCtx  = flagCB_fromU_openContext();
761  debugCtx2 = debugCB_openContext();
762
763  debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
764  debugCtx1->subContext  =  flagCtx;
765
766  flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
767  flagCtx->subContext    =  debugCtx2;
768
769  debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
770  debugCtx2->subContext  = NULL;
771
772  /* Set our special callback */
773
774  ucnv_setFromUCallBack(conv,
775                        debugCB_fromU,
776                        debugCtx1,
777                        &(debugCtx2->subCallback),
778                        &(debugCtx2->subContext),
779                        &status);
780
781  U_ASSERT(status);
782
783#if DEBUG_TMI
784  printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
785         conv, debugCtx1, debugCtx1->subCallback,
786         debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
787#endif
788
789  cloneCnv = ucnv_safeClone(conv, NULL, NULL, &status);
790
791  U_ASSERT(status);
792
793#if DEBUG_TMI
794  printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
795#endif
796
797  ucnv_close(conv);
798
799#if DEBUG_TMI
800  printf("%p closed.\n", conv);
801#endif
802
803  U_ASSERT(status);
804  /* Now, we have to extract the context */
805  cloneDebugCtx = NULL;
806  cloneFlagCtx  = NULL;
807
808  ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
809  if(cloneDebugCtx != NULL) {
810      cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
811  }
812
813  printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
814         cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );
815
816  len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
817  U_ASSERT(status);
818
819  if(cloneFlagCtx != NULL) {
820      flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
821  } else {
822      printf("** Warning, couldn't get the subcallback \n");
823  }
824
825  ucnv_close(cloneCnv);
826
827  /* print out the original source */
828  printBytes("bytes", bytes, len2);
829
830  return flagVal; /* true if callback was called */
831}
832
833UErrorCode convsample_21()
834{
835  const char *sample1 = "abc\xdf\xbf";
836  const char *sample2 = "abc_def";
837
838  if(convsample_21_didSubstitute(sample1))
839  {
840    printf("DID substitute.\n******\n");
841  }
842  else
843  {
844    printf("Did NOT substitute.\n*****\n");
845  }
846
847  if(convsample_21_didSubstitute(sample2))
848  {
849    printf("DID substitute.\n******\n");
850  }
851  else
852  {
853    printf("Did NOT substitute.\n*****\n");
854  }
855
856  return U_ZERO_ERROR;
857}
858
859
860//  40-  C, cp37 -> UTF16 [data02.bin -> data40.utf16]
861
862#define BUFFERSIZE 17 /* make it interesting :) */
863
864UErrorCode convsample_40()
865{
866  printf("\n\n==============================================\n"
867    "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");
868
869  FILE *f;
870  FILE *out;
871  int32_t count;
872  char inBuf[BUFFERSIZE];
873  const char *source;
874  const char *sourceLimit;
875  UChar *uBuf;
876  UChar *target;
877  UChar *targetLimit;
878  int32_t uBufSize = 0;
879  UConverter *conv = NULL;
880  UErrorCode status = U_ZERO_ERROR;
881  uint32_t inbytes=0, total=0;
882
883  f = fopen("data02.bin", "rb");
884  if(!f)
885  {
886    fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
887    return U_FILE_ACCESS_ERROR;
888  }
889
890  out = fopen("data40.utf16", "wb");
891  if(!out)
892  {
893    fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
894    fclose(f);
895    return U_FILE_ACCESS_ERROR;
896  }
897
898  // **************************** START SAMPLE *******************
899  conv = ucnv_openCCSID(37, UCNV_IBM, &status);
900  assert(U_SUCCESS(status));
901
902  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
903  printf("input bytes %d / min chars %d = %d UChars\n",
904         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
905  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
906  assert(uBuf!=NULL);
907
908  // grab another buffer's worth
909  while((!feof(f)) &&
910        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
911  {
912    inbytes += count;
913
914    // Convert bytes to unicode
915    source = inBuf;
916    sourceLimit = inBuf + count;
917
918    do
919    {
920        target = uBuf;
921        targetLimit = uBuf + uBufSize;
922
923        ucnv_toUnicode( conv, &target, targetLimit,
924                       &source, sourceLimit, NULL,
925                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
926                                   /* is true (when no more data will come) */
927                         &status);
928
929        if(status == U_BUFFER_OVERFLOW_ERROR)
930        {
931          // simply ran out of space - we'll reset the target ptr the next
932          // time through the loop.
933          status = U_ZERO_ERROR;
934        }
935        else
936        {
937          //  Check other errors here.
938          assert(U_SUCCESS(status));
939          // Break out of the loop (by force)
940        }
941
942        // Process the Unicode
943        // Todo: handle UTF-16/surrogates
944        assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) ==
945               (size_t)(target-uBuf));
946        total += (target-uBuf);
947    } while (source < sourceLimit); // while simply out of space
948  }
949
950  printf("%d bytes in,  %d UChars out.\n", inbytes, total);
951
952  // ***************************** END SAMPLE ********************
953  ucnv_close(conv);
954
955  fclose(f);
956  fclose(out);
957  printf("\n");
958
959  return U_ZERO_ERROR;
960}
961#undef BUFFERSIZE
962
963
964
965//  46-  C, UTF16 -> latin2 [data40.utf16 -> data46.out]
966
967#define BUFFERSIZE 24 /* make it interesting :) */
968
969UErrorCode convsample_46()
970{
971  printf("\n\n==============================================\n"
972    "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");
973
974  FILE *f;
975  FILE *out;
976  int32_t count;
977  UChar inBuf[BUFFERSIZE];
978  const UChar *source;
979  const UChar *sourceLimit;
980  char *buf;
981  char *target;
982  char *targetLimit;
983
984  int32_t bufSize = 0;
985  UConverter *conv = NULL;
986  UErrorCode status = U_ZERO_ERROR;
987  uint32_t inchars=0, total=0;
988
989  f = fopen("data40.utf16", "rb");
990  if(!f)
991  {
992    fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
993    return U_FILE_ACCESS_ERROR;
994  }
995
996  out = fopen("data46.out", "wb");
997  if(!out)
998  {
999    fprintf(stderr, "Couldn't create file 'data46.out'.\n");
1000    fclose(f);
1001    return U_FILE_ACCESS_ERROR;
1002  }
1003
1004  // **************************** START SAMPLE *******************
1005  conv = ucnv_open( "iso-8859-2", &status);
1006  assert(U_SUCCESS(status));
1007
1008  bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
1009  printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
1010         BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
1011  buf = (char*)malloc(bufSize * sizeof(char));
1012  assert(buf!=NULL);
1013
1014  // grab another buffer's worth
1015  while((!feof(f)) &&
1016        ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
1017  {
1018    inchars += count;
1019
1020    // Convert bytes to unicode
1021    source = inBuf;
1022    sourceLimit = inBuf + count;
1023
1024    do
1025    {
1026        target = buf;
1027        targetLimit = buf + bufSize;
1028
1029        ucnv_fromUnicode( conv, &target, targetLimit,
1030                       &source, sourceLimit, NULL,
1031                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
1032                                   /* is true (when no more data will come) */
1033                         &status);
1034
1035        if(status == U_BUFFER_OVERFLOW_ERROR)
1036        {
1037          // simply ran out of space - we'll reset the target ptr the next
1038          // time through the loop.
1039          status = U_ZERO_ERROR;
1040        }
1041        else
1042        {
1043          //  Check other errors here.
1044          assert(U_SUCCESS(status));
1045          // Break out of the loop (by force)
1046        }
1047
1048        // Process the Unicode
1049        assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) ==
1050               (size_t)(target-buf));
1051        total += (target-buf);
1052    } while (source < sourceLimit); // while simply out of space
1053  }
1054
1055  printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total);
1056
1057  // ***************************** END SAMPLE ********************
1058  ucnv_close(conv);
1059
1060  fclose(f);
1061  fclose(out);
1062  printf("\n");
1063
1064  return U_ZERO_ERROR;
1065}
1066#undef BUFFERSIZE
1067
1068#define BUFFERSIZE 219
1069
1070void convsample_50() {
1071  printf("\n\n==============================================\n"
1072         "Sample 50: C: ucnv_detectUnicodeSignature\n");
1073
1074  //! [ucnv_detectUnicodeSignature]
1075  UErrorCode err = U_ZERO_ERROR;
1076  UBool discardSignature = TRUE; /* set to TRUE to throw away the initial U+FEFF */
1077  char input[] = { '\xEF','\xBB', '\xBF','\x41','\x42','\x43' };
1078  int32_t signatureLength = 0;
1079  const char *encoding = ucnv_detectUnicodeSignature(input,sizeof(input),&signatureLength,&err);
1080  UConverter *conv = NULL;
1081  UChar output[100];
1082  UChar *target = output, *out;
1083  const char *source = input;
1084  if(encoding!=NULL && U_SUCCESS(err)){
1085    // should signature be discarded ?
1086    conv = ucnv_open(encoding, &err);
1087    // do the conversion
1088    ucnv_toUnicode(conv,
1089                   &target, output + sizeof(output)/U_SIZEOF_UCHAR,
1090                   &source, input + sizeof(input),
1091                   NULL, TRUE, &err);
1092    out = output;
1093    if (discardSignature){
1094      ++out; // ignore initial U+FEFF
1095    }
1096    while(out != target) {
1097      printf("%04x ", *out++);
1098    }
1099    puts("");
1100  }
1101  //! [ucnv_detectUnicodeSignature]
1102  puts("");
1103}
1104
1105
1106
1107/* main */
1108
1109int main()
1110{
1111
1112  printf("Default Converter=%s\n", ucnv_getDefaultName() );
1113
1114  convsample_02();  // C  , u->koi8r, conv
1115  convsample_03();  // C,   iterate
1116
1117  convsample_05();  // C,  utf8->u, getNextUChar
1118  convsample_06(); // C freq counter thingy
1119
1120  convsample_12();  // C,  sjis->u, conv
1121  convsample_13();  // C,  big5->u, getNextU
1122
1123  convsample_20();  // C, callback
1124  convsample_21();  // C, callback debug
1125
1126  convsample_40();  // C,   cp37 -> UTF16 [data02.bin -> data40.utf16]
1127
1128  convsample_46();  // C,  UTF16 -> latin3 [data41.utf16 -> data46.out]
1129
1130  convsample_50();  // C, detect unicode signature
1131
1132  printf("End of converter samples.\n");
1133
1134  fflush(stdout);
1135  fflush(stderr);
1136
1137  return 0;
1138}
1139