1/*****************************************************************************
2*
3*   Copyright (C) 1999-2008, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*
6******************************************************************************/
7
8/*
9 * uconv(1): an iconv(1)-like converter using ICU.
10 *
 * Original code by Jonas Utterström <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
13 *
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
16 *
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
19 */
20
21#include <unicode/utypes.h>
22#include <unicode/putil.h>
23#include <unicode/ucnv.h>
24#include <unicode/uenum.h>
25#include <unicode/unistr.h>
26#include <unicode/translit.h>
27#include <unicode/uset.h>
28#include <unicode/uclean.h>
29
30#include <stdio.h>
31#include <errno.h>
32#include <string.h>
33#include <stdlib.h>
34
35#include "cmemory.h"
36#include "cstring.h"
37#include "ustrfmt.h"
38
39#include "unicode/uwmsg.h"
40
41U_NAMESPACE_USE
42
43#if (defined(U_WINDOWS) || defined(U_CYGWIN)) && !defined(__STRICT_ANSI__)
44#include <io.h>
45#include <fcntl.h>
46#if defined(U_WINDOWS)
47#define USE_FILENO_BINARY_MODE 1
48/* Windows likes to rename Unix-like functions */
49#ifndef fileno
50#define fileno _fileno
51#endif
52#ifndef setmode
53#define setmode _setmode
54#endif
55#ifndef O_BINARY
56#define O_BINARY _O_BINARY
57#endif
58#endif
59#endif
60
61#ifdef UCONVMSG_LINK
62/* below from the README */
63#include "unicode/utypes.h"
64#include "unicode/udata.h"
65U_CFUNC char uconvmsg_dat[];
66#endif
67
/* Number of elements of a true array (not of a pointer), as an int32_t.
   The whole expansion is parenthesized so that the macro behaves like a
   single primary expression wherever it is used, e.g. after sizeof. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Default buffer size; its users are outside this part of the file. */
#define DEFAULT_BUFSZ   4096
/* Name of the resource bundle that holds this tool's messages. */
#define UCONVMSG "uconvmsg"
72
73static UResourceBundle *gBundle = 0;    /* Bundle containing messages. */
74
75/*
76 * Initialize the message bundle so that message strings can be fetched
77 * by u_wmsg().
78 *
79 */
80
81static void initMsg(const char *pname) {
82    static int ps = 0;
83
84    if (!ps) {
85        char dataPath[2048];        /* XXX Sloppy: should be PATH_MAX. */
86        UErrorCode err = U_ZERO_ERROR;
87
88        ps = 1;
89
90        /* Set up our static data - if any */
91#ifdef UCONVMSG_LINK
92        udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
93        if (U_FAILURE(err)) {
94          fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
95                  pname, u_errorName(err));
96          err = U_ZERO_ERROR; /* It may still fail */
97        }
98#endif
99
100        /* Get messages. */
101        gBundle = u_wmsg_setPath(UCONVMSG, &err);
102        if (U_FAILURE(err)) {
103            fprintf(stderr,
104                    "%s: warning: couldn't open bundle %s: %s\n",
105                    pname, UCONVMSG, u_errorName(err));
106#ifdef UCONVMSG_LINK
107            fprintf(stderr,
108                    "%s: setAppData was called, internal data %s failed to load\n",
109                        pname, UCONVMSG);
110#endif
111
112            err = U_ZERO_ERROR;
113            /* that was try #1, try again with a path */
114            uprv_strcpy(dataPath, u_getDataDirectory());
115            uprv_strcat(dataPath, U_FILE_SEP_STRING);
116            uprv_strcat(dataPath, UCONVMSG);
117
118            gBundle = u_wmsg_setPath(dataPath, &err);
119            if (U_FAILURE(err)) {
120                fprintf(stderr,
121                    "%s: warning: still couldn't open bundle %s: %s\n",
122                    pname, dataPath, u_errorName(err));
123                fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
124            }
125        }
126    }
127}
128
129/* Mapping of callback names to the callbacks passed to the converter
130   API. */
131
132static struct callback_ent {
133    const char *name;
134    UConverterFromUCallback fromu;
135    const void *fromuctxt;
136    UConverterToUCallback tou;
137    const void *touctxt;
138} transcode_callbacks[] = {
139    { "substitute",
140      UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
141      UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
142    { "skip",
143      UCNV_FROM_U_CALLBACK_SKIP, 0,
144      UCNV_TO_U_CALLBACK_SKIP, 0 },
145    { "stop",
146      UCNV_FROM_U_CALLBACK_STOP, 0,
147      UCNV_TO_U_CALLBACK_STOP, 0 },
148    { "escape",
149      UCNV_FROM_U_CALLBACK_ESCAPE, 0,
150      UCNV_TO_U_CALLBACK_ESCAPE, 0},
151    { "escape-icu",
152      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
153      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
154    { "escape-java",
155      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
156      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
157    { "escape-c",
158      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
159      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
160    { "escape-xml",
161      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
162      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
163    { "escape-xml-hex",
164      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
165      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
166    { "escape-xml-dec",
167      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
168      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
169    { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
170      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
171};
172
173/* Return a pointer to a callback record given its name. */
174
175static const struct callback_ent *findCallback(const char *name) {
176    int i, count =
177        sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
178
179    /* We'll do a linear search, there aren't many of them and bsearch()
180       may not be that portable. */
181
182    for (i = 0; i < count; ++i) {
183        if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
184            return &transcode_callbacks[i];
185        }
186    }
187
188    return 0;
189}
190
191/* Print converter information. If lookfor is set, only that converter will
192   be printed, otherwise all converters will be printed. If canon is non
193   zero, tags and aliases for each converter are printed too, in the format
194   expected for convrters.txt(5). */
195
196static int printConverters(const char *pname, const char *lookfor,
197    UBool canon)
198{
199    UErrorCode err = U_ZERO_ERROR;
200    int32_t num;
201    uint16_t num_stds;
202    const char **stds;
203
204    /* If there is a specified name, just handle that now. */
205
206    if (lookfor) {
207        if (!canon) {
208            printf("%s\n", lookfor);
209            return 0;
210        } else {
211        /*  Because we are printing a canonical name, we need the
212            true converter name. We've done that already except for
213            the default name (because we want to print the exact
214            name one would get when calling ucnv_getDefaultName()
215            in non-canon mode). But since we do not know at this
216            point if we have the default name or something else, we
217            need to normalize again to the canonical converter
218            name. */
219
220            const char *truename = ucnv_getAlias(lookfor, 0, &err);
221            if (U_SUCCESS(err)) {
222                lookfor = truename;
223            } else {
224                err = U_ZERO_ERROR;
225            }
226        }
227    }
228
229    /* Print converter names. We come here for one of two reasons: we
230       are printing all the names (lookfor was null), or we have a
231       single converter to print but in canon mode, hence we need to
232       get to it in order to print everything. */
233
234    num = ucnv_countAvailable();
235    if (num <= 0) {
236        initMsg(pname);
237        u_wmsg(stderr, "cantGetNames");
238        return -1;
239    }
240    if (lookfor) {
241        num = 1;                /* We know where we want to be. */
242    }
243
244    num_stds = ucnv_countStandards();
245    stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
246    if (!stds) {
247        u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
248        return -1;
249    } else {
250        uint16_t s;
251
252        if (canon) {
253            printf("{ ");
254        }
255        for (s = 0; s < num_stds; ++s) {
256            stds[s] = ucnv_getStandard(s, &err);
257            if (canon) {
258                printf("%s ", stds[s]);
259            }
260            if (U_FAILURE(err)) {
261                u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
262                goto error_cleanup;
263            }
264        }
265        if (canon) {
266            puts("}");
267        }
268    }
269
270    for (int32_t i = 0; i < num; i++) {
271        const char *name;
272        uint16_t num_aliases;
273
274        /* Set the name either to what we are looking for, or
275        to the current converter name. */
276
277        if (lookfor) {
278            name = lookfor;
279        } else {
280            name = ucnv_getAvailableName(i);
281        }
282
283        /* Get all the aliases associated to the name. */
284
285        err = U_ZERO_ERROR;
286        num_aliases = ucnv_countAliases(name, &err);
287        if (U_FAILURE(err)) {
288            printf("%s", name);
289
290            UnicodeString str(name, "");
291            putchar('\t');
292            u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
293                u_wmsg_errorName(err));
294            goto error_cleanup;
295        } else {
296            uint16_t a, s, t;
297
298            /* Write all the aliases and their tags. */
299
300            for (a = 0; a < num_aliases; ++a) {
301                const char *alias = ucnv_getAlias(name, a, &err);
302
303                if (U_FAILURE(err)) {
304                    UnicodeString str(name, "");
305                    putchar('\t');
306                    u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
307                        u_wmsg_errorName(err));
308                    goto error_cleanup;
309                }
310
311                /* Print the current alias so that it looks right. */
312                printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
313                                 alias,
314                                 (canon ? "" : " "));
315
316                /* Look (slowly, linear searching) for a tag. */
317
318                if (canon) {
319                    /* -1 to skip the last standard */
320                    for (s = t = 0; s < num_stds-1; ++s) {
321                        UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
322                        if (U_SUCCESS(err)) {
323                            /* List the standard tags */
324                            const char *standardName;
325                            UBool isFirst = TRUE;
326                            UErrorCode enumError = U_ZERO_ERROR;
327                            while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
328                                /* See if this alias is supported by this standard. */
329                                if (!strcmp(standardName, alias)) {
330                                    if (!t) {
331                                        printf(" {");
332                                        t = 1;
333                                    }
334                                    /* Print a * after the default standard name */
335                                    printf(" %s%s", stds[s], (isFirst ? "*" : ""));
336                                }
337                                isFirst = FALSE;
338                            }
339                        }
340                    }
341                    if (t) {
342                        printf(" }");
343                    }
344                }
345                /* Terminate this entry. */
346                if (canon) {
347                    puts("");
348                }
349
350                /* Move on. */
351            }
352            /* Terminate this entry. */
353            if (!canon) {
354                puts("");
355            }
356        }
357    }
358
359    /* Free temporary data. */
360
361    uprv_free(stds);
362
363    /* Success. */
364
365    return 0;
366error_cleanup:
367    uprv_free(stds);
368    return -1;
369}
370
371/* Print all available transliterators. If canon is non zero, print
372   one transliterator per line. */
373
374static int printTransliterators(UBool canon)
375{
376#if UCONFIG_NO_TRANSLITERATION
377    printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
378    return 1;
379#else
380    int32_t numtrans = utrans_countAvailableIDs(), i;
381    int buflen = 512;
382    char *buf = (char *) uprv_malloc(buflen);
383    char staticbuf[512];
384
385    char sepchar = canon ? '\n' : ' ';
386
387    if (!buf) {
388        buf = staticbuf;
389        buflen = sizeof(staticbuf);
390    }
391
392    for (i = 0; i < numtrans; ++i) {
393        int32_t len = utrans_getAvailableID(i, buf, buflen);
394        if (len >= buflen - 1) {
395            if (buf != staticbuf) {
396                buflen <<= 1;
397                if (buflen < len) {
398                    buflen = len + 64;
399                }
400                buf = (char *) uprv_realloc(buf, buflen);
401                if (!buf) {
402                    buf = staticbuf;
403                    buflen = sizeof(staticbuf);
404                }
405            }
406            utrans_getAvailableID(i, buf, buflen);
407            if (len >= buflen) {
408                uprv_strcpy(buf + buflen - 4, "..."); /* Truncate the name. */
409            }
410        }
411
412        printf("%s", buf);
413        if (i < numtrans - 1) {
414            putchar(sepchar);
415        }
416    }
417
418    /* Add a terminating newline if needed. */
419
420    if (sepchar != '\n') {
421        putchar('\n');
422    }
423
424    /* Free temporary data. */
425
426    if (buf != staticbuf) {
427        uprv_free(buf);
428    }
429
430    /* Success. */
431
432    return 0;
433#endif
434}
435
// Code points that get special treatment in this file.
enum {
    uSP  = 0x0020,      // U+0020 space
    uCR  = 0x000d,      // U+000D carriage return
    uLF  = 0x000a,      // U+000A line feed
    uNL  = 0x0085,      // U+0085 newline (next line)
    uLS  = 0x2028,      // U+2028 line separator
    uPS  = 0x2029,      // U+2029 paragraph separator
    uSig = 0xfeff       // U+FEFF signature/BOM character
};
445
446static inline int32_t
447getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
448    // find one of
449    // CR, LF, CRLF, NL, LS, PS
450    // for paragraph ends (see UAX #13/Unicode 4)
451    // and include it in the chunk
452    // all of these characters are on the BMP
453    // do not include FF or VT in case they are part of a paragraph
454    // (important for bidi contexts)
455    static const UChar paraEnds[] = {
456        0xd, 0xa, 0x85, 0x2028, 0x2029
457    };
458    enum {
459        iCR, iLF, iNL, iLS, iPS, iCount
460    };
461
462    // first, see if there is a CRLF split between prev and s
463    if (prev.endsWith(paraEnds + iCR, 1)) {
464        if (s.startsWith(paraEnds + iLF, 1)) {
465            return 1; // split CRLF, include the LF
466        } else if (!s.isEmpty()) {
467            return 0; // complete the last chunk
468        } else {
469            return -1; // wait for actual further contents to arrive
470        }
471    }
472
473    const UChar *u = s.getBuffer(), *limit = u + s.length();
474    UChar c;
475
476    while (u < limit) {
477        c = *u++;
478        if (
479            ((c < uSP) && (c == uCR || c == uLF)) ||
480            (c == uNL) ||
481            ((c & uLS) == uLS)
482        ) {
483            if (c == uCR) {
484                // check for CRLF
485                if (u == limit) {
486                    return -1; // LF may be in the next chunk
487                } else if (*u == uLF) {
488                    ++u; // include the LF in this chunk
489                }
490            }
491            return (int32_t)(u - s.getBuffer());
492        }
493    }
494
495    return -1; // continue collecting the chunk
496}
497
// How a converter deals with the U+FEFF Unicode signature character (BOM);
// these are the values returned by cnvSigType().
enum {
    CNV_NO_FEFF   = 0,  // the signature character cannot be converted
    CNV_WITH_FEFF = 1,  // the signature character can be converted
    CNV_ADDS_FEFF = 2   // the signature is added/detected automatically
};
503
504static inline UChar
505nibbleToHex(uint8_t n) {
506    n &= 0xf;
507    return
508        n <= 9 ?
509            (UChar)(0x30 + n) :
510            (UChar)((0x61 - 10) + n);
511}
512
513// check the converter's Unicode signature properties;
514// the fromUnicode side of the converter must be in its initial state
515// and will be reset again if it was used
516static int32_t
517cnvSigType(UConverter *cnv) {
518    UErrorCode err;
519    int32_t result;
520
521    // test if the output charset can convert U+FEFF
522    USet *set = uset_open(1, 0);
523    err = U_ZERO_ERROR;
524    ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
525    if (U_SUCCESS(err) && uset_contains(set, uSig)) {
526        result = CNV_WITH_FEFF;
527    } else {
528        result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
529    }
530    uset_close(set);
531
532    if (result == CNV_WITH_FEFF) {
533        // test if the output charset emits a signature anyway
534        const UChar a[1] = { 0x61 }; // "a"
535        const UChar *in;
536
537        char buffer[20];
538        char *out;
539
540        in = a;
541        out = buffer;
542        err = U_ZERO_ERROR;
543        ucnv_fromUnicode(cnv,
544            &out, buffer + sizeof(buffer),
545            &in, a + 1,
546            NULL, TRUE, &err);
547        ucnv_resetFromUnicode(cnv);
548
549        if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
550            U_SUCCESS(err)
551        ) {
552            result = CNV_ADDS_FEFF;
553        }
554    }
555
556    return result;
557}
558
559class ConvertFile {
560public:
561    ConvertFile() :
562        buf(NULL), outbuf(NULL), fromoffsets(NULL),
563        bufsz(0), signature(0) {}
564
565    void
566    setBufferSize(size_t bufferSize) {
567        bufsz = bufferSize;
568
569        buf = new char[2 * bufsz];
570        outbuf = buf + bufsz;
571
572        // +1 for an added U+FEFF in the intermediate Unicode buffer
573        fromoffsets = new int32_t[bufsz + 1];
574    }
575
576    ~ConvertFile() {
577        delete [] buf;
578        delete [] fromoffsets;
579    }
580
581    UBool convertFile(const char *pname,
582                      const char *fromcpage,
583                      UConverterToUCallback toucallback,
584                      const void *touctxt,
585                      const char *tocpage,
586                      UConverterFromUCallback fromucallback,
587                      const void *fromuctxt,
588                      UBool fallback,
589                      const char *translit,
590                      const char *infilestr,
591                      FILE * outfile, int verbose);
592private:
593    friend int main(int argc, char **argv);
594
595    char *buf, *outbuf;
596    int32_t *fromoffsets;
597
598    size_t bufsz;
599    int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
600};
601
602// Convert a file from one encoding to another
603UBool
604ConvertFile::convertFile(const char *pname,
605                         const char *fromcpage,
606                         UConverterToUCallback toucallback,
607                         const void *touctxt,
608                         const char *tocpage,
609                         UConverterFromUCallback fromucallback,
610                         const void *fromuctxt,
611                         UBool fallback,
612                         const char *translit,
613                         const char *infilestr,
614                         FILE * outfile, int verbose)
615{
616    FILE *infile;
617    UBool ret = TRUE;
618    UConverter *convfrom = 0;
619    UConverter *convto = 0;
620    UErrorCode err = U_ZERO_ERROR;
621    UBool flush;
622    const char *cbufp, *prevbufp;
623    char *bufp;
624
625    uint32_t infoffset = 0, outfoffset = 0;   /* Where we are in the file, for error reporting. */
626
627    const UChar *unibuf, *unibufbp;
628    UChar *unibufp;
629
630    size_t rd, wr;
631
632#if !UCONFIG_NO_TRANSLITERATION
633    Transliterator *t = 0;      // Transliterator acting on Unicode data.
634    UnicodeString chunk;        // One chunk of the text being collected for transformation.
635#endif
636    UnicodeString u;            // String to do the transliteration.
637    int32_t ulen;
638
639    // use conversion offsets for error messages
640    // unless a transliterator is used -
641    // a text transformation will reorder characters in unpredictable ways
642    UBool useOffsets = TRUE;
643
644    // Open the correct input file or connect to stdin for reading input
645
646    if (infilestr != 0 && strcmp(infilestr, "-")) {
647        infile = fopen(infilestr, "rb");
648        if (infile == 0) {
649            UnicodeString str1(infilestr, "");
650            str1.append((UChar32) 0);
651            UnicodeString str2(strerror(errno), "");
652            str2.append((UChar32) 0);
653            initMsg(pname);
654            u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
655            return FALSE;
656        }
657    } else {
658        infilestr = "-";
659        infile = stdin;
660#ifdef USE_FILENO_BINARY_MODE
661        if (setmode(fileno(stdin), O_BINARY) == -1) {
662            initMsg(pname);
663            u_wmsg(stderr, "cantSetInBinMode");
664            return FALSE;
665        }
666#endif
667    }
668
669    if (verbose) {
670        fprintf(stderr, "%s:\n", infilestr);
671    }
672
673#if !UCONFIG_NO_TRANSLITERATION
674    // Create transliterator as needed.
675
676    if (translit != NULL && *translit) {
677        UParseError parse;
678        UnicodeString str(translit), pestr;
679
680        /* Create from rules or by ID as needed. */
681
682        parse.line = -1;
683
684        if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
685            t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
686        } else {
687            t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
688        }
689
690        if (U_FAILURE(err)) {
691            str.append((UChar32) 0);
692            initMsg(pname);
693
694            if (parse.line >= 0) {
695                UChar linebuf[20], offsetbuf[20];
696                uprv_itou(linebuf, 20, parse.line, 10, 0);
697                uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
698                u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
699                    u_wmsg_errorName(err), linebuf, offsetbuf);
700            } else {
701                u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
702                    u_wmsg_errorName(err));
703            }
704
705            if (t) {
706                delete t;
707                t = 0;
708            }
709            goto error_exit;
710        }
711
712        useOffsets = FALSE;
713    }
714#endif
715
716    // Create codepage converter. If the codepage or its aliases weren't
717    // available, it returns NULL and a failure code. We also set the
718    // callbacks, and return errors in the same way.
719
720    convfrom = ucnv_open(fromcpage, &err);
721    if (U_FAILURE(err)) {
722        UnicodeString str(fromcpage, "");
723        initMsg(pname);
724        u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
725            u_wmsg_errorName(err));
726        goto error_exit;
727    }
728    ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
729    if (U_FAILURE(err)) {
730        initMsg(pname);
731        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
732        goto error_exit;
733    }
734
735    convto = ucnv_open(tocpage, &err);
736    if (U_FAILURE(err)) {
737        UnicodeString str(tocpage, "");
738        initMsg(pname);
739        u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
740            u_wmsg_errorName(err));
741        goto error_exit;
742    }
743    ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
744    if (U_FAILURE(err)) {
745        initMsg(pname);
746        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
747        goto error_exit;
748    }
749    ucnv_setFallback(convto, fallback);
750
751    UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
752    int8_t sig;
753
754    // OK, we can convert now.
755    sig = signature;
756    rd = 0;
757
758    do {
759        willexit = FALSE;
760
761        // input file offset at the beginning of the next buffer
762        infoffset += rd;
763
764        rd = fread(buf, 1, bufsz, infile);
765        if (ferror(infile) != 0) {
766            UnicodeString str(strerror(errno));
767            initMsg(pname);
768            u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
769            goto error_exit;
770        }
771
772        // Convert the read buffer into the new encoding via Unicode.
773        // After the call 'unibufp' will be placed behind the last
774        // character that was converted in the 'unibuf'.
775        // Also the 'cbufp' is positioned behind the last converted
776        // character.
777        // At the last conversion in the file, flush should be set to
778        // true so that we get all characters converted.
779        //
780        // The converter must be flushed at the end of conversion so
781        // that characters on hold also will be written.
782
783        cbufp = buf;
784        flush = (UBool)(rd != bufsz);
785
786        // convert until the input is consumed
787        do {
788            // remember the start of the current byte-to-Unicode conversion
789            prevbufp = cbufp;
790
791            unibuf = unibufp = u.getBuffer((int32_t)bufsz);
792
793            // Use bufsz instead of u.getCapacity() for the targetLimit
794            // so that we don't overflow fromoffsets[].
795            ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
796                buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
797
798            ulen = (int32_t)(unibufp - unibuf);
799            u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
800
801            // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
802            // converting all of the input bytes.
803            // It works like this because ucnv_toUnicode() returns only under the
804            // following conditions:
805            // - an error occurred during conversion (an error code is set)
806            // - the target buffer is filled (the error code indicates an overflow)
807            // - the source is consumed
808            // That is, if the error code does not indicate a failure,
809            // not even an overflow, then the source must be consumed entirely.
810            fromSawEndOfBytes = (UBool)U_SUCCESS(err);
811
812            if (err == U_BUFFER_OVERFLOW_ERROR) {
813                err = U_ZERO_ERROR;
814            } else if (U_FAILURE(err)) {
815                char pos[32], errorBytes[32];
816                int8_t i, length, errorLength;
817
818                UErrorCode localError = U_ZERO_ERROR;
819                errorLength = (int8_t)sizeof(errorBytes);
820                ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
821                if (U_FAILURE(localError) || errorLength == 0) {
822                    errorLength = 1;
823                }
824
825                // print the input file offset of the start of the error bytes:
826                // input file offset of the current byte buffer +
827                // length of the just consumed bytes -
828                // length of the error bytes
829                length =
830                    (int8_t)sprintf(pos, "%d",
831                        (int)(infoffset + (cbufp - buf) - errorLength));
832
833                // output the bytes that caused the error
834                UnicodeString str;
835                for (i = 0; i < errorLength; ++i) {
836                    if (i > 0) {
837                        str.append((UChar)uSP);
838                    }
839                    str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
840                    str.append(nibbleToHex((uint8_t)errorBytes[i]));
841                }
842
843                initMsg(pname);
844                u_wmsg(stderr, "problemCvtToU",
845                        UnicodeString(pos, length, "").getTerminatedBuffer(),
846                        str.getTerminatedBuffer(),
847                        u_wmsg_errorName(err));
848
849                willexit = TRUE;
850                err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
851            }
852
853            // Replaced a check for whether the input was consumed by
854            // looping until it is; message key "premEndInput" now obsolete.
855
856            if (ulen == 0) {
857                continue;
858            }
859
860            // remove a U+FEFF Unicode signature character if requested
861            if (sig < 0) {
862                if (u.charAt(0) == uSig) {
863                    u.remove(0, 1);
864
865                    // account for the removed UChar and offset
866                    --ulen;
867
868                    if (useOffsets) {
869                        // remove an offset from fromoffsets[] as well
870                        // to keep the array parallel with the UChars
871                        memmove(fromoffsets, fromoffsets + 1, ulen * 4);
872                    }
873
874                }
875                sig = 0;
876            }
877
878#if !UCONFIG_NO_TRANSLITERATION
879            // Transliterate/transform if needed.
880
881            // For transformation, we use chunking code -
882            // collect Unicode input until, for example, an end-of-line,
883            // then transform and output-convert that and continue collecting.
884            // This makes the transformation result independent of the buffer size
885            // while avoiding the slower keyboard mode.
886            // The end-of-chunk characters are completely included in the
887            // transformed string in case they are to be transformed themselves.
888            if (t != NULL) {
889                UnicodeString out;
890                int32_t chunkLimit;
891
892                do {
893                    chunkLimit = getChunkLimit(chunk, u);
894                    if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
895                        // use all of the rest at the end of the text
896                        chunkLimit = u.length();
897                    }
898                    if (chunkLimit >= 0) {
899                        // complete the chunk and transform it
900                        chunk.append(u, 0, chunkLimit);
901                        u.remove(0, chunkLimit);
902                        t->transliterate(chunk);
903
904                        // append the transformation result to the result and empty the chunk
905                        out.append(chunk);
906                        chunk.remove();
907                    } else {
908                        // continue collecting the chunk
909                        chunk.append(u);
910                        break;
911                    }
912                } while (!u.isEmpty());
913
914                u = out;
915                ulen = u.length();
916            }
917#endif
918
919            // add a U+FEFF Unicode signature character if requested
920            // and possible/necessary
921            if (sig > 0) {
922                if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
923                    u.insert(0, (UChar)uSig);
924
925                    if (useOffsets) {
926                        // insert a pseudo-offset into fromoffsets[] as well
927                        // to keep the array parallel with the UChars
928                        memmove(fromoffsets + 1, fromoffsets, ulen * 4);
929                        fromoffsets[0] = -1;
930                    }
931
932                    // account for the additional UChar and offset
933                    ++ulen;
934                }
935                sig = 0;
936            }
937
938            // Convert the Unicode buffer into the destination codepage
939            // Again 'bufp' will be placed behind the last converted character
940            // And 'unibufp' will be placed behind the last converted unicode character
941            // At the last conversion flush should be set to true to ensure that
942            // all characters left get converted
943
944            unibuf = unibufbp = u.getBuffer();
945
946            do {
947                bufp = outbuf;
948
949                // Use fromSawEndOfBytes in addition to the flush flag -
950                // it indicates whether the intermediate Unicode string
951                // contains the very last UChars for the very last input bytes.
952                ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
953                                 &unibufbp,
954                                 unibuf + ulen,
955                                 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
956
957                // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
958                // converting all of the intermediate UChars.
959                // See comment for fromSawEndOfBytes.
960                toSawEndOfUnicode = (UBool)U_SUCCESS(err);
961
962                if (err == U_BUFFER_OVERFLOW_ERROR) {
963                    err = U_ZERO_ERROR;
964                } else if (U_FAILURE(err)) {
965                    UChar errorUChars[4];
966                    const char *errtag;
967                    char pos[32];
968                    UChar32 c;
969                    int8_t i, length, errorLength;
970
971                    UErrorCode localError = U_ZERO_ERROR;
972                    errorLength = (int8_t)LENGTHOF(errorUChars);
973                    ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
974                    if (U_FAILURE(localError) || errorLength == 0) {
975                        // need at least 1 so that we don't access beyond the length of fromoffsets[]
976                        errorLength = 1;
977                    }
978
979                    int32_t ferroffset;
980
981                    if (useOffsets) {
982                        // Unicode buffer offset of the start of the error UChars
983                        ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
984                        if (ferroffset < 0) {
985                            // approximation - the character started in the previous Unicode buffer
986                            ferroffset = 0;
987                        }
988
989                        // get the corresponding byte offset out of fromoffsets[]
990                        // go back if the offset is not known for some of the UChars
991                        int32_t fromoffset;
992                        do {
993                            fromoffset = fromoffsets[ferroffset];
994                        } while (fromoffset < 0 && --ferroffset >= 0);
995
996                        // total input file offset =
997                        // input file offset of the current byte buffer +
998                        // byte buffer offset of where the current Unicode buffer is converted from +
999                        // fromoffsets[Unicode offset]
1000                        ferroffset = infoffset + (prevbufp - buf) + fromoffset;
1001                        errtag = "problemCvtFromU";
1002                    } else {
1003                        // Do not use fromoffsets if (t != NULL) because the Unicode text may
1004                        // be different from what the offsets refer to.
1005
1006                        // output file offset
1007                        ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
1008                        errtag = "problemCvtFromUOut";
1009                    }
1010
1011                    length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
1012
1013                    // output the code points that caused the error
1014                    UnicodeString str;
1015                    for (i = 0; i < errorLength;) {
1016                        if (i > 0) {
1017                            str.append((UChar)uSP);
1018                        }
1019                        U16_NEXT(errorUChars, i, errorLength, c);
1020                        if (c >= 0x100000) {
1021                            str.append(nibbleToHex((uint8_t)(c >> 20)));
1022                        }
1023                        if (c >= 0x10000) {
1024                            str.append(nibbleToHex((uint8_t)(c >> 16)));
1025                        }
1026                        str.append(nibbleToHex((uint8_t)(c >> 12)));
1027                        str.append(nibbleToHex((uint8_t)(c >> 8)));
1028                        str.append(nibbleToHex((uint8_t)(c >> 4)));
1029                        str.append(nibbleToHex((uint8_t)c));
1030                    }
1031
1032                    initMsg(pname);
1033                    u_wmsg(stderr, errtag,
1034                            UnicodeString(pos, length, "").getTerminatedBuffer(),
1035                            str.getTerminatedBuffer(),
1036                           u_wmsg_errorName(err));
1037                    u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
1038
1039                    willexit = TRUE;
1040                    err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
1041                }
1042
1043                // Replaced a check for whether the intermediate Unicode characters were all consumed by
1044                // looping until they are; message key "premEnd" now obsolete.
1045
1046                // Finally, write the converted buffer to the output file
1047                size_t outlen = (size_t) (bufp - outbuf);
1048                outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
1049                if (wr != outlen) {
1050                    UnicodeString str(strerror(errno));
1051                    initMsg(pname);
1052                    u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
1053                    willexit = TRUE;
1054                }
1055
1056                if (willexit) {
1057                    goto error_exit;
1058                }
1059            } while (!toSawEndOfUnicode);
1060        } while (!fromSawEndOfBytes);
1061    } while (!flush);           // Stop when we have flushed the
1062                                // converters (this means that it's
1063                                // the end of output)
1064
1065    goto normal_exit;
1066
1067error_exit:
1068    ret = FALSE;
1069
1070normal_exit:
1071    // Cleanup.
1072
1073    ucnv_close(convfrom);
1074    ucnv_close(convto);
1075
1076#if !UCONFIG_NO_TRANSLITERATION
1077    delete t;
1078#endif
1079
1080    if (infile != stdin) {
1081        fclose(infile);
1082    }
1083
1084    return ret;
1085}
1086
1087static void usage(const char *pname, int ecode) {
1088    const UChar *msg;
1089    int32_t msgLen;
1090    UErrorCode err = U_ZERO_ERROR;
1091    FILE *fp = ecode ? stderr : stdout;
1092    int res;
1093
1094    initMsg(pname);
1095    msg =
1096        ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
1097                            &msgLen, &err);
1098    UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
1099    UnicodeString mname(msg, msgLen + 1);
1100
1101    res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
1102    if (!ecode) {
1103        if (!res) {
1104            fputc('\n', fp);
1105        }
1106        if (!u_wmsg(fp, "help")) {
1107            /* Now dump callbacks and finish. */
1108
1109            int i, count =
1110                sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
1111            for (i = 0; i < count; ++i) {
1112                fprintf(fp, " %s", transcode_callbacks[i].name);
1113            }
1114            fputc('\n', fp);
1115        }
1116    }
1117
1118    exit(ecode);
1119}
1120
1121extern int
1122main(int argc, char **argv)
1123{
1124    FILE *outfile;
1125    int ret = 0;
1126
1127    size_t bufsz = DEFAULT_BUFSZ;
1128
1129    const char *fromcpage = 0;
1130    const char *tocpage = 0;
1131    const char *translit = 0;
1132    const char *outfilestr = 0;
1133    UBool fallback = FALSE;
1134
1135    UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
1136    const void *fromuctxt = 0;
1137    UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
1138    const void *touctxt = 0;
1139
1140    char **iter, **remainArgv, **remainArgvLimit;
1141    char **end = argv + argc;
1142
1143    const char *pname;
1144
1145    UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
1146    const char *printName = 0;
1147
1148    UBool verbose = FALSE;
1149    UErrorCode status = U_ZERO_ERROR;
1150
1151    ConvertFile cf;
1152
1153    /* Initialize ICU */
1154    u_init(&status);
1155    if (U_FAILURE(status)) {
1156        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
1157            argv[0], u_errorName(status));
1158        exit(1);
1159    }
1160
1161    // Get and prettify pname.
1162    pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
1163#ifdef U_WINDOWS
1164    if (!pname) {
1165        pname = uprv_strrchr(*argv, '/');
1166    }
1167#endif
1168    if (!pname) {
1169        pname = *argv;
1170    } else {
1171        ++pname;
1172    }
1173
1174    // First, get the arguments from command-line
1175    // to know the codepages to convert between
1176
1177    remainArgv = remainArgvLimit = argv + 1;
1178    for (iter = argv + 1; iter != end; iter++) {
1179        // Check for from charset
1180        if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
1181            iter++;
1182            if (iter != end)
1183                fromcpage = *iter;
1184            else
1185                usage(pname, 1);
1186        } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
1187            iter++;
1188            if (iter != end)
1189                tocpage = *iter;
1190            else
1191                usage(pname, 1);
1192        } else if (strcmp("-x", *iter) == 0) {
1193            iter++;
1194            if (iter != end)
1195                translit = *iter;
1196            else
1197                usage(pname, 1);
1198        } else if (!strcmp("--fallback", *iter)) {
1199            fallback = TRUE;
1200        } else if (!strcmp("--no-fallback", *iter)) {
1201            fallback = FALSE;
1202        } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
1203            iter++;
1204            if (iter != end) {
1205                bufsz = atoi(*iter);
1206                if ((int) bufsz <= 0) {
1207                    initMsg(pname);
1208                    UnicodeString str(*iter);
1209                    initMsg(pname);
1210                    u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
1211                    return 3;
1212                }
1213            } else {
1214                usage(pname, 1);
1215            }
1216        } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
1217            if (printTranslits) {
1218                usage(pname, 1);
1219            }
1220            printConvs = TRUE;
1221        } else if (strcmp("--default-code", *iter) == 0) {
1222            if (printTranslits) {
1223                usage(pname, 1);
1224            }
1225            printName = ucnv_getDefaultName();
1226        } else if (strcmp("--list-code", *iter) == 0) {
1227            if (printTranslits) {
1228                usage(pname, 1);
1229            }
1230
1231            iter++;
1232            if (iter != end) {
1233                UErrorCode e = U_ZERO_ERROR;
1234                printName = ucnv_getAlias(*iter, 0, &e);
1235                if (U_FAILURE(e) || !printName) {
1236                    UnicodeString str(*iter);
1237                    initMsg(pname);
1238                    u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
1239                    return 2;
1240                }
1241            } else
1242                usage(pname, 1);
1243        } else if (strcmp("--canon", *iter) == 0) {
1244            printCanon = TRUE;
1245        } else if (strcmp("-L", *iter) == 0
1246            || !strcmp("--list-transliterators", *iter)) {
1247            if (printConvs) {
1248                usage(pname, 1);
1249            }
1250            printTranslits = TRUE;
1251        } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
1252            || !strcmp("--help", *iter)) {
1253            usage(pname, 0);
1254        } else if (!strcmp("-c", *iter)) {
1255            fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
1256        } else if (!strcmp("--to-callback", *iter)) {
1257            iter++;
1258            if (iter != end) {
1259                const struct callback_ent *cbe = findCallback(*iter);
1260                if (cbe) {
1261                    fromucallback = cbe->fromu;
1262                    fromuctxt = cbe->fromuctxt;
1263                } else {
1264                    UnicodeString str(*iter);
1265                    initMsg(pname);
1266                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1267                    return 4;
1268                }
1269            } else {
1270                usage(pname, 1);
1271            }
1272        } else if (!strcmp("--from-callback", *iter)) {
1273            iter++;
1274            if (iter != end) {
1275                const struct callback_ent *cbe = findCallback(*iter);
1276                if (cbe) {
1277                    toucallback = cbe->tou;
1278                    touctxt = cbe->touctxt;
1279                } else {
1280                    UnicodeString str(*iter);
1281                    initMsg(pname);
1282                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1283                    return 4;
1284                }
1285            } else {
1286                usage(pname, 1);
1287            }
1288        } else if (!strcmp("-i", *iter)) {
1289            toucallback = UCNV_TO_U_CALLBACK_SKIP;
1290        } else if (!strcmp("--callback", *iter)) {
1291            iter++;
1292            if (iter != end) {
1293                const struct callback_ent *cbe = findCallback(*iter);
1294                if (cbe) {
1295                    fromucallback = cbe->fromu;
1296                    fromuctxt = cbe->fromuctxt;
1297                    toucallback = cbe->tou;
1298                    touctxt = cbe->touctxt;
1299                } else {
1300                    UnicodeString str(*iter);
1301                    initMsg(pname);
1302                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1303                    return 4;
1304                }
1305            } else {
1306                usage(pname, 1);
1307            }
1308        } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
1309            verbose = FALSE;
1310        } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
1311            verbose = TRUE;
1312        } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
1313            printf("%s v2.1  ICU " U_ICU_VERSION "\n", pname);
1314            return 0;
1315        } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
1316            ++iter;
1317            if (iter != end && !outfilestr) {
1318                outfilestr = *iter;
1319            } else {
1320                usage(pname, 1);
1321            }
1322        } else if (0 == strcmp("--add-signature", *iter)) {
1323            cf.signature = 1;
1324        } else if (0 == strcmp("--remove-signature", *iter)) {
1325            cf.signature = -1;
1326        } else if (**iter == '-' && (*iter)[1]) {
1327            usage(pname, 1);
1328        } else {
1329            // move a non-option up in argv[]
1330            *remainArgvLimit++ = *iter;
1331        }
1332    }
1333
1334    if (printConvs || printName) {
1335        return printConverters(pname, printName, printCanon) ? 2 : 0;
1336    } else if (printTranslits) {
1337        return printTransliterators(printCanon) ? 3 : 0;
1338    }
1339
1340    if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
1341        fromcpage = ucnv_getDefaultName();
1342    }
1343    if (!tocpage || !uprv_strcmp(tocpage, "-")) {
1344        tocpage = ucnv_getDefaultName();
1345    }
1346
1347    // Open the correct output file or connect to stdout for reading input
1348    if (outfilestr != 0 && strcmp(outfilestr, "-")) {
1349        outfile = fopen(outfilestr, "wb");
1350        if (outfile == 0) {
1351            UnicodeString str1(outfilestr, "");
1352            UnicodeString str2(strerror(errno), "");
1353            initMsg(pname);
1354            u_wmsg(stderr, "cantCreateOutputF",
1355                str1.getBuffer(), str2.getBuffer());
1356            return 1;
1357        }
1358    } else {
1359        outfilestr = "-";
1360        outfile = stdout;
1361#ifdef USE_FILENO_BINARY_MODE
1362        if (setmode(fileno(outfile), O_BINARY) == -1) {
1363            u_wmsg(stderr, "cantSetOutBinMode");
1364            exit(-1);
1365        }
1366#endif
1367    }
1368
1369    /* Loop again on the arguments to find all the input files, and
1370    convert them. */
1371
1372    cf.setBufferSize(bufsz);
1373
1374    if(remainArgv < remainArgvLimit) {
1375        for (iter = remainArgv; iter != remainArgvLimit; iter++) {
1376            if (!cf.convertFile(
1377                    pname, fromcpage, toucallback, touctxt, tocpage,
1378                    fromucallback, fromuctxt, fallback, translit, *iter,
1379                    outfile, verbose)
1380            ) {
1381                goto error_exit;
1382            }
1383        }
1384    } else {
1385        if (!cf.convertFile(
1386                pname, fromcpage, toucallback, touctxt, tocpage,
1387                fromucallback, fromuctxt, fallback, translit, 0,
1388                outfile, verbose)
1389        ) {
1390            goto error_exit;
1391        }
1392    }
1393
1394    goto normal_exit;
1395error_exit:
1396    ret = 1;
1397normal_exit:
1398
1399    if (outfile != stdout) {
1400        fclose(outfile);
1401    }
1402
1403    return ret;
1404}
1405
1406
1407/*
1408 * Hey, Emacs, please set the following:
1409 *
1410 * Local Variables:
1411 * indent-tabs-mode: nil
1412 * End:
1413 *
1414 */
1415