1/*****************************************************************************
2*
3*   Copyright (C) 1999-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*
6******************************************************************************/
7
8/*
9 * uconv(1): an iconv(1)-like converter using ICU.
10 *
11 * Original code by Jonas Utterstr&#x00F6;m <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
13 *
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
16 *
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
19 */
20
21#include <unicode/utypes.h>
22#include <unicode/putil.h>
23#include <unicode/ucnv.h>
24#include <unicode/uenum.h>
25#include <unicode/unistr.h>
26#include <unicode/translit.h>
27#include <unicode/uset.h>
28#include <unicode/uclean.h>
29
30#include <stdio.h>
31#include <errno.h>
32#include <string.h>
33#include <stdlib.h>
34
35#include "cmemory.h"
36#include "cstring.h"
37#include "ustrfmt.h"
38
39#include "unicode/uwmsg.h"
40
41U_NAMESPACE_USE
42
/*
 * Platform glue for switching stdin/stdout to binary mode.
 * USE_FILENO_BINARY_MODE is only defined for U_WINDOWS builds that are
 * not compiled in strict ANSI mode.
 */
#if (defined(U_WINDOWS) || defined(U_CYGWIN) || defined(U_MINGW)) && !defined(__STRICT_ANSI__)
#   include <io.h>
#   include <fcntl.h>
#   if defined(U_WINDOWS)
#       define USE_FILENO_BINARY_MODE 1
        /* Windows likes to rename Unix-like functions */
#       ifndef fileno
#           define fileno _fileno
#       endif
#       ifndef setmode
#           define setmode _setmode
#       endif
#       ifndef O_BINARY
#           define O_BINARY _O_BINARY
#       endif
#   endif /* U_WINDOWS */
#endif /* Windows-like platform && !__STRICT_ANSI__ */
60
#if defined(UCONVMSG_LINK)
/*
 * The message data is linked into the executable (recipe taken from the
 * README); initMsg() registers it with udata_setAppData().
 */
#   include "unicode/utypes.h"
#   include "unicode/udata.h"
U_CFUNC char uconvmsg_dat[];
#endif /* UCONVMSG_LINK */
67
/* Number of elements in a true array (not a pointer), as an int32_t.
   The whole expansion is parenthesized so that it stays a single operand
   in any surrounding expression (e.g. `sizeof LENGTHOF(a)`). */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
69
70#define DEFAULT_BUFSZ   4096
71#define UCONVMSG "uconvmsg"
72
73static UResourceBundle *gBundle = 0;    /* Bundle containing messages. */
74
75/*
76 * Initialize the message bundle so that message strings can be fetched
77 * by u_wmsg().
78 *
79 */
80
81static void initMsg(const char *pname) {
82    static int ps = 0;
83
84    if (!ps) {
85        char dataPath[2048];        /* XXX Sloppy: should be PATH_MAX. */
86        UErrorCode err = U_ZERO_ERROR;
87
88        ps = 1;
89
90        /* Set up our static data - if any */
91#ifdef UCONVMSG_LINK
92        udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
93        if (U_FAILURE(err)) {
94          fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
95                  pname, u_errorName(err));
96          err = U_ZERO_ERROR; /* It may still fail */
97        }
98#endif
99
100        /* Get messages. */
101        gBundle = u_wmsg_setPath(UCONVMSG, &err);
102        if (U_FAILURE(err)) {
103            fprintf(stderr,
104                    "%s: warning: couldn't open bundle %s: %s\n",
105                    pname, UCONVMSG, u_errorName(err));
106#ifdef UCONVMSG_LINK
107            fprintf(stderr,
108                    "%s: setAppData was called, internal data %s failed to load\n",
109                        pname, UCONVMSG);
110#endif
111
112            err = U_ZERO_ERROR;
113            /* that was try #1, try again with a path */
114            uprv_strcpy(dataPath, u_getDataDirectory());
115            uprv_strcat(dataPath, U_FILE_SEP_STRING);
116            uprv_strcat(dataPath, UCONVMSG);
117
118            gBundle = u_wmsg_setPath(dataPath, &err);
119            if (U_FAILURE(err)) {
120                fprintf(stderr,
121                    "%s: warning: still couldn't open bundle %s: %s\n",
122                    pname, dataPath, u_errorName(err));
123                fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
124            }
125        }
126    }
127}
128
129/* Mapping of callback names to the callbacks passed to the converter
130   API. */
131
132static struct callback_ent {
133    const char *name;
134    UConverterFromUCallback fromu;
135    const void *fromuctxt;
136    UConverterToUCallback tou;
137    const void *touctxt;
138} transcode_callbacks[] = {
139    { "substitute",
140      UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
141      UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
142    { "skip",
143      UCNV_FROM_U_CALLBACK_SKIP, 0,
144      UCNV_TO_U_CALLBACK_SKIP, 0 },
145    { "stop",
146      UCNV_FROM_U_CALLBACK_STOP, 0,
147      UCNV_TO_U_CALLBACK_STOP, 0 },
148    { "escape",
149      UCNV_FROM_U_CALLBACK_ESCAPE, 0,
150      UCNV_TO_U_CALLBACK_ESCAPE, 0},
151    { "escape-icu",
152      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
153      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
154    { "escape-java",
155      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
156      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
157    { "escape-c",
158      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
159      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
160    { "escape-xml",
161      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
162      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
163    { "escape-xml-hex",
164      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
165      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
166    { "escape-xml-dec",
167      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
168      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
169    { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
170      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
171};
172
173/* Return a pointer to a callback record given its name. */
174
175static const struct callback_ent *findCallback(const char *name) {
176    int i, count =
177        sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
178
179    /* We'll do a linear search, there aren't many of them and bsearch()
180       may not be that portable. */
181
182    for (i = 0; i < count; ++i) {
183        if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
184            return &transcode_callbacks[i];
185        }
186    }
187
188    return 0;
189}
190
191/* Print converter information. If lookfor is set, only that converter will
192   be printed, otherwise all converters will be printed. If canon is non
193   zero, tags and aliases for each converter are printed too, in the format
194   expected for convrters.txt(5). */
195
196static int printConverters(const char *pname, const char *lookfor,
197    UBool canon)
198{
199    UErrorCode err = U_ZERO_ERROR;
200    int32_t num;
201    uint16_t num_stds;
202    const char **stds;
203
204    /* If there is a specified name, just handle that now. */
205
206    if (lookfor) {
207        if (!canon) {
208            printf("%s\n", lookfor);
209            return 0;
210        } else {
211        /*  Because we are printing a canonical name, we need the
212            true converter name. We've done that already except for
213            the default name (because we want to print the exact
214            name one would get when calling ucnv_getDefaultName()
215            in non-canon mode). But since we do not know at this
216            point if we have the default name or something else, we
217            need to normalize again to the canonical converter
218            name. */
219
220            const char *truename = ucnv_getAlias(lookfor, 0, &err);
221            if (U_SUCCESS(err)) {
222                lookfor = truename;
223            } else {
224                err = U_ZERO_ERROR;
225            }
226        }
227    }
228
229    /* Print converter names. We come here for one of two reasons: we
230       are printing all the names (lookfor was null), or we have a
231       single converter to print but in canon mode, hence we need to
232       get to it in order to print everything. */
233
234    num = ucnv_countAvailable();
235    if (num <= 0) {
236        initMsg(pname);
237        u_wmsg(stderr, "cantGetNames");
238        return -1;
239    }
240    if (lookfor) {
241        num = 1;                /* We know where we want to be. */
242    }
243
244    num_stds = ucnv_countStandards();
245    stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
246    if (!stds) {
247        u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
248        return -1;
249    } else {
250        uint16_t s;
251
252        if (canon) {
253            printf("{ ");
254        }
255        for (s = 0; s < num_stds; ++s) {
256            stds[s] = ucnv_getStandard(s, &err);
257            if (canon) {
258                printf("%s ", stds[s]);
259            }
260            if (U_FAILURE(err)) {
261                u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
262                goto error_cleanup;
263            }
264        }
265        if (canon) {
266            puts("}");
267        }
268    }
269
270    for (int32_t i = 0; i < num; i++) {
271        const char *name;
272        uint16_t num_aliases;
273
274        /* Set the name either to what we are looking for, or
275        to the current converter name. */
276
277        if (lookfor) {
278            name = lookfor;
279        } else {
280            name = ucnv_getAvailableName(i);
281        }
282
283        /* Get all the aliases associated to the name. */
284
285        err = U_ZERO_ERROR;
286        num_aliases = ucnv_countAliases(name, &err);
287        if (U_FAILURE(err)) {
288            printf("%s", name);
289
290            UnicodeString str(name, "");
291            putchar('\t');
292            u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
293                u_wmsg_errorName(err));
294            goto error_cleanup;
295        } else {
296            uint16_t a, s, t;
297
298            /* Write all the aliases and their tags. */
299
300            for (a = 0; a < num_aliases; ++a) {
301                const char *alias = ucnv_getAlias(name, a, &err);
302
303                if (U_FAILURE(err)) {
304                    UnicodeString str(name, "");
305                    putchar('\t');
306                    u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
307                        u_wmsg_errorName(err));
308                    goto error_cleanup;
309                }
310
311                /* Print the current alias so that it looks right. */
312                printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
313                                 alias,
314                                 (canon ? "" : " "));
315
316                /* Look (slowly, linear searching) for a tag. */
317
318                if (canon) {
319                    /* -1 to skip the last standard */
320                    for (s = t = 0; s < num_stds-1; ++s) {
321                        UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
322                        if (U_SUCCESS(err)) {
323                            /* List the standard tags */
324                            const char *standardName;
325                            UBool isFirst = TRUE;
326                            UErrorCode enumError = U_ZERO_ERROR;
327                            while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
328                                /* See if this alias is supported by this standard. */
329                                if (!strcmp(standardName, alias)) {
330                                    if (!t) {
331                                        printf(" {");
332                                        t = 1;
333                                    }
334                                    /* Print a * after the default standard name */
335                                    printf(" %s%s", stds[s], (isFirst ? "*" : ""));
336                                }
337                                isFirst = FALSE;
338                            }
339                        }
340                    }
341                    if (t) {
342                        printf(" }");
343                    }
344                }
345                /* Terminate this entry. */
346                if (canon) {
347                    puts("");
348                }
349
350                /* Move on. */
351            }
352            /* Terminate this entry. */
353            if (!canon) {
354                puts("");
355            }
356        }
357    }
358
359    /* Free temporary data. */
360
361    uprv_free(stds);
362
363    /* Success. */
364
365    return 0;
366error_cleanup:
367    uprv_free(stds);
368    return -1;
369}
370
371/* Print all available transliterators. If canon is non zero, print
372   one transliterator per line. */
373
374static int printTransliterators(UBool canon)
375{
376#if UCONFIG_NO_TRANSLITERATION
377    printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
378    return 1;
379#else
380    UErrorCode status = U_ZERO_ERROR;
381    UEnumeration *ids = utrans_openIDs(&status);
382    int32_t i, numtrans = uenum_count(ids, &status);
383
384    char sepchar = canon ? '\n' : ' ';
385
386    for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) {
387    	int32_t len;
388    	const char *nextTrans = uenum_next(ids, &len, &status);
389
390        printf("%s", nextTrans);
391        if (i < numtrans - 1) {
392            putchar(sepchar);
393        }
394    }
395
396    uenum_close(ids);
397
398    /* Add a terminating newline if needed. */
399
400    if (sepchar != '\n') {
401        putchar('\n');
402    }
403
404    /* Success. */
405
406    return 0;
407#endif
408}
409
// Code points that the chunking and signature code compares against.
enum {
    uSP  = 0x0020,  // space
    uCR  = 0x000D,  // carriage return
    uLF  = 0x000A,  // line feed
    uNL  = 0x0085,  // newline
    uLS  = 0x2028,  // line separator
    uPS  = 0x2029,  // paragraph separator
    uSig = 0xFEFF   // signature/BOM character
};
419
420static inline int32_t
421getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
422    // find one of
423    // CR, LF, CRLF, NL, LS, PS
424    // for paragraph ends (see UAX #13/Unicode 4)
425    // and include it in the chunk
426    // all of these characters are on the BMP
427    // do not include FF or VT in case they are part of a paragraph
428    // (important for bidi contexts)
429    static const UChar paraEnds[] = {
430        0xd, 0xa, 0x85, 0x2028, 0x2029
431    };
432    enum {
433        iCR, iLF, iNL, iLS, iPS, iCount
434    };
435
436    // first, see if there is a CRLF split between prev and s
437    if (prev.endsWith(paraEnds + iCR, 1)) {
438        if (s.startsWith(paraEnds + iLF, 1)) {
439            return 1; // split CRLF, include the LF
440        } else if (!s.isEmpty()) {
441            return 0; // complete the last chunk
442        } else {
443            return -1; // wait for actual further contents to arrive
444        }
445    }
446
447    const UChar *u = s.getBuffer(), *limit = u + s.length();
448    UChar c;
449
450    while (u < limit) {
451        c = *u++;
452        if (
453            ((c < uSP) && (c == uCR || c == uLF)) ||
454            (c == uNL) ||
455            ((c & uLS) == uLS)
456        ) {
457            if (c == uCR) {
458                // check for CRLF
459                if (u == limit) {
460                    return -1; // LF may be in the next chunk
461                } else if (*u == uLF) {
462                    ++u; // include the LF in this chunk
463                }
464            }
465            return (int32_t)(u - s.getBuffer());
466        }
467    }
468
469    return -1; // continue collecting the chunk
470}
471
// Result values of cnvSigType(): how a converter relates to the
// U+FEFF Unicode signature character (BOM).
enum {
    CNV_NO_FEFF   = 0,  // cannot convert the U+FEFF Unicode signature character (BOM)
    CNV_WITH_FEFF = 1,  // can convert the U+FEFF signature character
    CNV_ADDS_FEFF = 2   // automatically adds/detects the U+FEFF signature character
};
477
478static inline UChar
479nibbleToHex(uint8_t n) {
480    n &= 0xf;
481    return
482        n <= 9 ?
483            (UChar)(0x30 + n) :
484            (UChar)((0x61 - 10) + n);
485}
486
487// check the converter's Unicode signature properties;
488// the fromUnicode side of the converter must be in its initial state
489// and will be reset again if it was used
490static int32_t
491cnvSigType(UConverter *cnv) {
492    UErrorCode err;
493    int32_t result;
494
495    // test if the output charset can convert U+FEFF
496    USet *set = uset_open(1, 0);
497    err = U_ZERO_ERROR;
498    ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
499    if (U_SUCCESS(err) && uset_contains(set, uSig)) {
500        result = CNV_WITH_FEFF;
501    } else {
502        result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
503    }
504    uset_close(set);
505
506    if (result == CNV_WITH_FEFF) {
507        // test if the output charset emits a signature anyway
508        const UChar a[1] = { 0x61 }; // "a"
509        const UChar *in;
510
511        char buffer[20];
512        char *out;
513
514        in = a;
515        out = buffer;
516        err = U_ZERO_ERROR;
517        ucnv_fromUnicode(cnv,
518            &out, buffer + sizeof(buffer),
519            &in, a + 1,
520            NULL, TRUE, &err);
521        ucnv_resetFromUnicode(cnv);
522
523        if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
524            U_SUCCESS(err)
525        ) {
526            result = CNV_ADDS_FEFF;
527        }
528    }
529
530    return result;
531}
532
533class ConvertFile {
534public:
535    ConvertFile() :
536        buf(NULL), outbuf(NULL), fromoffsets(NULL),
537        bufsz(0), signature(0) {}
538
539    void
540    setBufferSize(size_t bufferSize) {
541        bufsz = bufferSize;
542
543        buf = new char[2 * bufsz];
544        outbuf = buf + bufsz;
545
546        // +1 for an added U+FEFF in the intermediate Unicode buffer
547        fromoffsets = new int32_t[bufsz + 1];
548    }
549
550    ~ConvertFile() {
551        delete [] buf;
552        delete [] fromoffsets;
553    }
554
555    UBool convertFile(const char *pname,
556                      const char *fromcpage,
557                      UConverterToUCallback toucallback,
558                      const void *touctxt,
559                      const char *tocpage,
560                      UConverterFromUCallback fromucallback,
561                      const void *fromuctxt,
562                      UBool fallback,
563                      const char *translit,
564                      const char *infilestr,
565                      FILE * outfile, int verbose);
566private:
567    friend int main(int argc, char **argv);
568
569    char *buf, *outbuf;
570    int32_t *fromoffsets;
571
572    size_t bufsz;
573    int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
574};
575
576// Convert a file from one encoding to another
577UBool
578ConvertFile::convertFile(const char *pname,
579                         const char *fromcpage,
580                         UConverterToUCallback toucallback,
581                         const void *touctxt,
582                         const char *tocpage,
583                         UConverterFromUCallback fromucallback,
584                         const void *fromuctxt,
585                         UBool fallback,
586                         const char *translit,
587                         const char *infilestr,
588                         FILE * outfile, int verbose)
589{
590    FILE *infile;
591    UBool ret = TRUE;
592    UConverter *convfrom = 0;
593    UConverter *convto = 0;
594    UErrorCode err = U_ZERO_ERROR;
595    UBool flush;
596    const char *cbufp, *prevbufp;
597    char *bufp;
598
599    uint32_t infoffset = 0, outfoffset = 0;   /* Where we are in the file, for error reporting. */
600
601    const UChar *unibuf, *unibufbp;
602    UChar *unibufp;
603
604    size_t rd, wr;
605
606#if !UCONFIG_NO_TRANSLITERATION
607    Transliterator *t = 0;      // Transliterator acting on Unicode data.
608    UnicodeString chunk;        // One chunk of the text being collected for transformation.
609#endif
610    UnicodeString u;            // String to do the transliteration.
611    int32_t ulen;
612
613    // use conversion offsets for error messages
614    // unless a transliterator is used -
615    // a text transformation will reorder characters in unpredictable ways
616    UBool useOffsets = TRUE;
617
618    // Open the correct input file or connect to stdin for reading input
619
620    if (infilestr != 0 && strcmp(infilestr, "-")) {
621        infile = fopen(infilestr, "rb");
622        if (infile == 0) {
623            UnicodeString str1(infilestr, "");
624            str1.append((UChar32) 0);
625            UnicodeString str2(strerror(errno), "");
626            str2.append((UChar32) 0);
627            initMsg(pname);
628            u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
629            return FALSE;
630        }
631    } else {
632        infilestr = "-";
633        infile = stdin;
634#ifdef USE_FILENO_BINARY_MODE
635        if (setmode(fileno(stdin), O_BINARY) == -1) {
636            initMsg(pname);
637            u_wmsg(stderr, "cantSetInBinMode");
638            return FALSE;
639        }
640#endif
641    }
642
643    if (verbose) {
644        fprintf(stderr, "%s:\n", infilestr);
645    }
646
647#if !UCONFIG_NO_TRANSLITERATION
648    // Create transliterator as needed.
649
650    if (translit != NULL && *translit) {
651        UParseError parse;
652        UnicodeString str(translit), pestr;
653
654        /* Create from rules or by ID as needed. */
655
656        parse.line = -1;
657
658        if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
659            t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
660        } else {
661            t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
662        }
663
664        if (U_FAILURE(err)) {
665            str.append((UChar32) 0);
666            initMsg(pname);
667
668            if (parse.line >= 0) {
669                UChar linebuf[20], offsetbuf[20];
670                uprv_itou(linebuf, 20, parse.line, 10, 0);
671                uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
672                u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
673                    u_wmsg_errorName(err), linebuf, offsetbuf);
674            } else {
675                u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
676                    u_wmsg_errorName(err));
677            }
678
679            if (t) {
680                delete t;
681                t = 0;
682            }
683            goto error_exit;
684        }
685
686        useOffsets = FALSE;
687    }
688#endif
689
690    // Create codepage converter. If the codepage or its aliases weren't
691    // available, it returns NULL and a failure code. We also set the
692    // callbacks, and return errors in the same way.
693
694    convfrom = ucnv_open(fromcpage, &err);
695    if (U_FAILURE(err)) {
696        UnicodeString str(fromcpage, "");
697        initMsg(pname);
698        u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
699            u_wmsg_errorName(err));
700        goto error_exit;
701    }
702    ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
703    if (U_FAILURE(err)) {
704        initMsg(pname);
705        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
706        goto error_exit;
707    }
708
709    convto = ucnv_open(tocpage, &err);
710    if (U_FAILURE(err)) {
711        UnicodeString str(tocpage, "");
712        initMsg(pname);
713        u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
714            u_wmsg_errorName(err));
715        goto error_exit;
716    }
717    ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
718    if (U_FAILURE(err)) {
719        initMsg(pname);
720        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
721        goto error_exit;
722    }
723    ucnv_setFallback(convto, fallback);
724
725    UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
726    int8_t sig;
727
728    // OK, we can convert now.
729    sig = signature;
730    rd = 0;
731
732    do {
733        willexit = FALSE;
734
735        // input file offset at the beginning of the next buffer
736        infoffset += rd;
737
738        rd = fread(buf, 1, bufsz, infile);
739        if (ferror(infile) != 0) {
740            UnicodeString str(strerror(errno));
741            initMsg(pname);
742            u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
743            goto error_exit;
744        }
745
746        // Convert the read buffer into the new encoding via Unicode.
747        // After the call 'unibufp' will be placed behind the last
748        // character that was converted in the 'unibuf'.
749        // Also the 'cbufp' is positioned behind the last converted
750        // character.
751        // At the last conversion in the file, flush should be set to
752        // true so that we get all characters converted.
753        //
754        // The converter must be flushed at the end of conversion so
755        // that characters on hold also will be written.
756
757        cbufp = buf;
758        flush = (UBool)(rd != bufsz);
759
760        // convert until the input is consumed
761        do {
762            // remember the start of the current byte-to-Unicode conversion
763            prevbufp = cbufp;
764
765            unibuf = unibufp = u.getBuffer((int32_t)bufsz);
766
767            // Use bufsz instead of u.getCapacity() for the targetLimit
768            // so that we don't overflow fromoffsets[].
769            ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
770                buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
771
772            ulen = (int32_t)(unibufp - unibuf);
773            u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
774
775            // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
776            // converting all of the input bytes.
777            // It works like this because ucnv_toUnicode() returns only under the
778            // following conditions:
779            // - an error occurred during conversion (an error code is set)
780            // - the target buffer is filled (the error code indicates an overflow)
781            // - the source is consumed
782            // That is, if the error code does not indicate a failure,
783            // not even an overflow, then the source must be consumed entirely.
784            fromSawEndOfBytes = (UBool)U_SUCCESS(err);
785
786            if (err == U_BUFFER_OVERFLOW_ERROR) {
787                err = U_ZERO_ERROR;
788            } else if (U_FAILURE(err)) {
789                char pos[32], errorBytes[32];
790                int8_t i, length, errorLength;
791
792                UErrorCode localError = U_ZERO_ERROR;
793                errorLength = (int8_t)sizeof(errorBytes);
794                ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
795                if (U_FAILURE(localError) || errorLength == 0) {
796                    errorLength = 1;
797                }
798
799                // print the input file offset of the start of the error bytes:
800                // input file offset of the current byte buffer +
801                // length of the just consumed bytes -
802                // length of the error bytes
803                length =
804                    (int8_t)sprintf(pos, "%d",
805                        (int)(infoffset + (cbufp - buf) - errorLength));
806
807                // output the bytes that caused the error
808                UnicodeString str;
809                for (i = 0; i < errorLength; ++i) {
810                    if (i > 0) {
811                        str.append((UChar)uSP);
812                    }
813                    str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
814                    str.append(nibbleToHex((uint8_t)errorBytes[i]));
815                }
816
817                initMsg(pname);
818                u_wmsg(stderr, "problemCvtToU",
819                        UnicodeString(pos, length, "").getTerminatedBuffer(),
820                        str.getTerminatedBuffer(),
821                        u_wmsg_errorName(err));
822
823                willexit = TRUE;
824                err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
825            }
826
827            // Replaced a check for whether the input was consumed by
828            // looping until it is; message key "premEndInput" now obsolete.
829
830            if (ulen == 0) {
831                continue;
832            }
833
834            // remove a U+FEFF Unicode signature character if requested
835            if (sig < 0) {
836                if (u.charAt(0) == uSig) {
837                    u.remove(0, 1);
838
839                    // account for the removed UChar and offset
840                    --ulen;
841
842                    if (useOffsets) {
843                        // remove an offset from fromoffsets[] as well
844                        // to keep the array parallel with the UChars
845                        memmove(fromoffsets, fromoffsets + 1, ulen * 4);
846                    }
847
848                }
849                sig = 0;
850            }
851
852#if !UCONFIG_NO_TRANSLITERATION
853            // Transliterate/transform if needed.
854
855            // For transformation, we use chunking code -
856            // collect Unicode input until, for example, an end-of-line,
857            // then transform and output-convert that and continue collecting.
858            // This makes the transformation result independent of the buffer size
859            // while avoiding the slower keyboard mode.
860            // The end-of-chunk characters are completely included in the
861            // transformed string in case they are to be transformed themselves.
862            if (t != NULL) {
863                UnicodeString out;
864                int32_t chunkLimit;
865
866                do {
867                    chunkLimit = getChunkLimit(chunk, u);
868                    if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
869                        // use all of the rest at the end of the text
870                        chunkLimit = u.length();
871                    }
872                    if (chunkLimit >= 0) {
873                        // complete the chunk and transform it
874                        chunk.append(u, 0, chunkLimit);
875                        u.remove(0, chunkLimit);
876                        t->transliterate(chunk);
877
878                        // append the transformation result to the result and empty the chunk
879                        out.append(chunk);
880                        chunk.remove();
881                    } else {
882                        // continue collecting the chunk
883                        chunk.append(u);
884                        break;
885                    }
886                } while (!u.isEmpty());
887
888                u = out;
889                ulen = u.length();
890            }
891#endif
892
893            // add a U+FEFF Unicode signature character if requested
894            // and possible/necessary
895            if (sig > 0) {
896                if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
897                    u.insert(0, (UChar)uSig);
898
899                    if (useOffsets) {
900                        // insert a pseudo-offset into fromoffsets[] as well
901                        // to keep the array parallel with the UChars
902                        memmove(fromoffsets + 1, fromoffsets, ulen * 4);
903                        fromoffsets[0] = -1;
904                    }
905
906                    // account for the additional UChar and offset
907                    ++ulen;
908                }
909                sig = 0;
910            }
911
912            // Convert the Unicode buffer into the destination codepage
913            // Again 'bufp' will be placed behind the last converted character
914            // And 'unibufp' will be placed behind the last converted unicode character
915            // At the last conversion flush should be set to true to ensure that
916            // all characters left get converted
917
918            unibuf = unibufbp = u.getBuffer();
919
920            do {
921                bufp = outbuf;
922
923                // Use fromSawEndOfBytes in addition to the flush flag -
924                // it indicates whether the intermediate Unicode string
925                // contains the very last UChars for the very last input bytes.
926                ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
927                                 &unibufbp,
928                                 unibuf + ulen,
929                                 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
930
931                // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
932                // converting all of the intermediate UChars.
933                // See comment for fromSawEndOfBytes.
934                toSawEndOfUnicode = (UBool)U_SUCCESS(err);
935
936                if (err == U_BUFFER_OVERFLOW_ERROR) {
937                    err = U_ZERO_ERROR;
938                } else if (U_FAILURE(err)) {
939                    UChar errorUChars[4];
940                    const char *errtag;
941                    char pos[32];
942                    UChar32 c;
943                    int8_t i, length, errorLength;
944
945                    UErrorCode localError = U_ZERO_ERROR;
946                    errorLength = (int8_t)LENGTHOF(errorUChars);
947                    ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
948                    if (U_FAILURE(localError) || errorLength == 0) {
949                        // need at least 1 so that we don't access beyond the length of fromoffsets[]
950                        errorLength = 1;
951                    }
952
953                    int32_t ferroffset;
954
955                    if (useOffsets) {
956                        // Unicode buffer offset of the start of the error UChars
957                        ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
958                        if (ferroffset < 0) {
959                            // approximation - the character started in the previous Unicode buffer
960                            ferroffset = 0;
961                        }
962
963                        // get the corresponding byte offset out of fromoffsets[]
964                        // go back if the offset is not known for some of the UChars
965                        int32_t fromoffset;
966                        do {
967                            fromoffset = fromoffsets[ferroffset];
968                        } while (fromoffset < 0 && --ferroffset >= 0);
969
970                        // total input file offset =
971                        // input file offset of the current byte buffer +
972                        // byte buffer offset of where the current Unicode buffer is converted from +
973                        // fromoffsets[Unicode offset]
974                        ferroffset = infoffset + (prevbufp - buf) + fromoffset;
975                        errtag = "problemCvtFromU";
976                    } else {
977                        // Do not use fromoffsets if (t != NULL) because the Unicode text may
978                        // be different from what the offsets refer to.
979
980                        // output file offset
981                        ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
982                        errtag = "problemCvtFromUOut";
983                    }
984
985                    length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
986
987                    // output the code points that caused the error
988                    UnicodeString str;
989                    for (i = 0; i < errorLength;) {
990                        if (i > 0) {
991                            str.append((UChar)uSP);
992                        }
993                        U16_NEXT(errorUChars, i, errorLength, c);
994                        if (c >= 0x100000) {
995                            str.append(nibbleToHex((uint8_t)(c >> 20)));
996                        }
997                        if (c >= 0x10000) {
998                            str.append(nibbleToHex((uint8_t)(c >> 16)));
999                        }
1000                        str.append(nibbleToHex((uint8_t)(c >> 12)));
1001                        str.append(nibbleToHex((uint8_t)(c >> 8)));
1002                        str.append(nibbleToHex((uint8_t)(c >> 4)));
1003                        str.append(nibbleToHex((uint8_t)c));
1004                    }
1005
1006                    initMsg(pname);
1007                    u_wmsg(stderr, errtag,
1008                            UnicodeString(pos, length, "").getTerminatedBuffer(),
1009                            str.getTerminatedBuffer(),
1010                           u_wmsg_errorName(err));
1011                    u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
1012
1013                    willexit = TRUE;
1014                    err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
1015                }
1016
1017                // Replaced a check for whether the intermediate Unicode characters were all consumed by
1018                // looping until they are; message key "premEnd" now obsolete.
1019
1020                // Finally, write the converted buffer to the output file
1021                size_t outlen = (size_t) (bufp - outbuf);
1022                outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
1023                if (wr != outlen) {
1024                    UnicodeString str(strerror(errno));
1025                    initMsg(pname);
1026                    u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
1027                    willexit = TRUE;
1028                }
1029
1030                if (willexit) {
1031                    goto error_exit;
1032                }
1033            } while (!toSawEndOfUnicode);
1034        } while (!fromSawEndOfBytes);
1035    } while (!flush);           // Stop when we have flushed the
1036                                // converters (this means that it's
1037                                // the end of output)
1038
1039    goto normal_exit;
1040
1041error_exit:
1042    ret = FALSE;
1043
1044normal_exit:
1045    // Cleanup.
1046
1047    ucnv_close(convfrom);
1048    ucnv_close(convto);
1049
1050#if !UCONFIG_NO_TRANSLITERATION
1051    delete t;
1052#endif
1053
1054    if (infile != stdin) {
1055        fclose(infile);
1056    }
1057
1058    return ret;
1059}
1060
1061static void usage(const char *pname, int ecode) {
1062    const UChar *msg;
1063    int32_t msgLen;
1064    UErrorCode err = U_ZERO_ERROR;
1065    FILE *fp = ecode ? stderr : stdout;
1066    int res;
1067
1068    initMsg(pname);
1069    msg =
1070        ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
1071                            &msgLen, &err);
1072    UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
1073    UnicodeString mname(msg, msgLen + 1);
1074
1075    res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
1076    if (!ecode) {
1077        if (!res) {
1078            fputc('\n', fp);
1079        }
1080        if (!u_wmsg(fp, "help")) {
1081            /* Now dump callbacks and finish. */
1082
1083            int i, count =
1084                sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
1085            for (i = 0; i < count; ++i) {
1086                fprintf(fp, " %s", transcode_callbacks[i].name);
1087            }
1088            fputc('\n', fp);
1089        }
1090    }
1091
1092    exit(ecode);
1093}
1094
1095extern int
1096main(int argc, char **argv)
1097{
1098    FILE *outfile;
1099    int ret = 0;
1100
1101    size_t bufsz = DEFAULT_BUFSZ;
1102
1103    const char *fromcpage = 0;
1104    const char *tocpage = 0;
1105    const char *translit = 0;
1106    const char *outfilestr = 0;
1107    UBool fallback = FALSE;
1108
1109    UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
1110    const void *fromuctxt = 0;
1111    UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
1112    const void *touctxt = 0;
1113
1114    char **iter, **remainArgv, **remainArgvLimit;
1115    char **end = argv + argc;
1116
1117    const char *pname;
1118
1119    UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
1120    const char *printName = 0;
1121
1122    UBool verbose = FALSE;
1123    UErrorCode status = U_ZERO_ERROR;
1124
1125    ConvertFile cf;
1126
1127    /* Initialize ICU */
1128    u_init(&status);
1129    if (U_FAILURE(status)) {
1130        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
1131            argv[0], u_errorName(status));
1132        exit(1);
1133    }
1134
1135    // Get and prettify pname.
1136    pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
1137#ifdef U_WINDOWS
1138    if (!pname) {
1139        pname = uprv_strrchr(*argv, '/');
1140    }
1141#endif
1142    if (!pname) {
1143        pname = *argv;
1144    } else {
1145        ++pname;
1146    }
1147
1148    // First, get the arguments from command-line
1149    // to know the codepages to convert between
1150
1151    remainArgv = remainArgvLimit = argv + 1;
1152    for (iter = argv + 1; iter != end; iter++) {
1153        // Check for from charset
1154        if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
1155            iter++;
1156            if (iter != end)
1157                fromcpage = *iter;
1158            else
1159                usage(pname, 1);
1160        } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
1161            iter++;
1162            if (iter != end)
1163                tocpage = *iter;
1164            else
1165                usage(pname, 1);
1166        } else if (strcmp("-x", *iter) == 0) {
1167            iter++;
1168            if (iter != end)
1169                translit = *iter;
1170            else
1171                usage(pname, 1);
1172        } else if (!strcmp("--fallback", *iter)) {
1173            fallback = TRUE;
1174        } else if (!strcmp("--no-fallback", *iter)) {
1175            fallback = FALSE;
1176        } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
1177            iter++;
1178            if (iter != end) {
1179                bufsz = atoi(*iter);
1180                if ((int) bufsz <= 0) {
1181                    initMsg(pname);
1182                    UnicodeString str(*iter);
1183                    initMsg(pname);
1184                    u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
1185                    return 3;
1186                }
1187            } else {
1188                usage(pname, 1);
1189            }
1190        } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
1191            if (printTranslits) {
1192                usage(pname, 1);
1193            }
1194            printConvs = TRUE;
1195        } else if (strcmp("--default-code", *iter) == 0) {
1196            if (printTranslits) {
1197                usage(pname, 1);
1198            }
1199            printName = ucnv_getDefaultName();
1200        } else if (strcmp("--list-code", *iter) == 0) {
1201            if (printTranslits) {
1202                usage(pname, 1);
1203            }
1204
1205            iter++;
1206            if (iter != end) {
1207                UErrorCode e = U_ZERO_ERROR;
1208                printName = ucnv_getAlias(*iter, 0, &e);
1209                if (U_FAILURE(e) || !printName) {
1210                    UnicodeString str(*iter);
1211                    initMsg(pname);
1212                    u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
1213                    return 2;
1214                }
1215            } else
1216                usage(pname, 1);
1217        } else if (strcmp("--canon", *iter) == 0) {
1218            printCanon = TRUE;
1219        } else if (strcmp("-L", *iter) == 0
1220            || !strcmp("--list-transliterators", *iter)) {
1221            if (printConvs) {
1222                usage(pname, 1);
1223            }
1224            printTranslits = TRUE;
1225        } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
1226            || !strcmp("--help", *iter)) {
1227            usage(pname, 0);
1228        } else if (!strcmp("-c", *iter)) {
1229            fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
1230        } else if (!strcmp("--to-callback", *iter)) {
1231            iter++;
1232            if (iter != end) {
1233                const struct callback_ent *cbe = findCallback(*iter);
1234                if (cbe) {
1235                    fromucallback = cbe->fromu;
1236                    fromuctxt = cbe->fromuctxt;
1237                } else {
1238                    UnicodeString str(*iter);
1239                    initMsg(pname);
1240                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1241                    return 4;
1242                }
1243            } else {
1244                usage(pname, 1);
1245            }
1246        } else if (!strcmp("--from-callback", *iter)) {
1247            iter++;
1248            if (iter != end) {
1249                const struct callback_ent *cbe = findCallback(*iter);
1250                if (cbe) {
1251                    toucallback = cbe->tou;
1252                    touctxt = cbe->touctxt;
1253                } else {
1254                    UnicodeString str(*iter);
1255                    initMsg(pname);
1256                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1257                    return 4;
1258                }
1259            } else {
1260                usage(pname, 1);
1261            }
1262        } else if (!strcmp("-i", *iter)) {
1263            toucallback = UCNV_TO_U_CALLBACK_SKIP;
1264        } else if (!strcmp("--callback", *iter)) {
1265            iter++;
1266            if (iter != end) {
1267                const struct callback_ent *cbe = findCallback(*iter);
1268                if (cbe) {
1269                    fromucallback = cbe->fromu;
1270                    fromuctxt = cbe->fromuctxt;
1271                    toucallback = cbe->tou;
1272                    touctxt = cbe->touctxt;
1273                } else {
1274                    UnicodeString str(*iter);
1275                    initMsg(pname);
1276                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1277                    return 4;
1278                }
1279            } else {
1280                usage(pname, 1);
1281            }
1282        } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
1283            verbose = FALSE;
1284        } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
1285            verbose = TRUE;
1286        } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
1287            printf("%s v2.1  ICU " U_ICU_VERSION "\n", pname);
1288            return 0;
1289        } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
1290            ++iter;
1291            if (iter != end && !outfilestr) {
1292                outfilestr = *iter;
1293            } else {
1294                usage(pname, 1);
1295            }
1296        } else if (0 == strcmp("--add-signature", *iter)) {
1297            cf.signature = 1;
1298        } else if (0 == strcmp("--remove-signature", *iter)) {
1299            cf.signature = -1;
1300        } else if (**iter == '-' && (*iter)[1]) {
1301            usage(pname, 1);
1302        } else {
1303            // move a non-option up in argv[]
1304            *remainArgvLimit++ = *iter;
1305        }
1306    }
1307
1308    if (printConvs || printName) {
1309        return printConverters(pname, printName, printCanon) ? 2 : 0;
1310    } else if (printTranslits) {
1311        return printTransliterators(printCanon) ? 3 : 0;
1312    }
1313
1314    if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
1315        fromcpage = ucnv_getDefaultName();
1316    }
1317    if (!tocpage || !uprv_strcmp(tocpage, "-")) {
1318        tocpage = ucnv_getDefaultName();
1319    }
1320
1321    // Open the correct output file or connect to stdout for reading input
1322    if (outfilestr != 0 && strcmp(outfilestr, "-")) {
1323        outfile = fopen(outfilestr, "wb");
1324        if (outfile == 0) {
1325            UnicodeString str1(outfilestr, "");
1326            UnicodeString str2(strerror(errno), "");
1327            initMsg(pname);
1328            u_wmsg(stderr, "cantCreateOutputF",
1329                str1.getBuffer(), str2.getBuffer());
1330            return 1;
1331        }
1332    } else {
1333        outfilestr = "-";
1334        outfile = stdout;
1335#ifdef USE_FILENO_BINARY_MODE
1336        if (setmode(fileno(outfile), O_BINARY) == -1) {
1337            u_wmsg(stderr, "cantSetOutBinMode");
1338            exit(-1);
1339        }
1340#endif
1341    }
1342
1343    /* Loop again on the arguments to find all the input files, and
1344    convert them. */
1345
1346    cf.setBufferSize(bufsz);
1347
1348    if(remainArgv < remainArgvLimit) {
1349        for (iter = remainArgv; iter != remainArgvLimit; iter++) {
1350            if (!cf.convertFile(
1351                    pname, fromcpage, toucallback, touctxt, tocpage,
1352                    fromucallback, fromuctxt, fallback, translit, *iter,
1353                    outfile, verbose)
1354            ) {
1355                goto error_exit;
1356            }
1357        }
1358    } else {
1359        if (!cf.convertFile(
1360                pname, fromcpage, toucallback, touctxt, tocpage,
1361                fromucallback, fromuctxt, fallback, translit, 0,
1362                outfile, verbose)
1363        ) {
1364            goto error_exit;
1365        }
1366    }
1367
1368    goto normal_exit;
1369error_exit:
1370#if !UCONFIG_NO_LEGACY_CONVERSION
1371    ret = 1;
1372#else
1373    fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
1374#endif
1375normal_exit:
1376
1377    if (outfile != stdout) {
1378        fclose(outfile);
1379    }
1380
1381    return ret;
1382}
1383
1384
1385/*
1386 * Hey, Emacs, please set the following:
1387 *
1388 * Local Variables:
1389 * indent-tabs-mode: nil
1390 * End:
1391 *
1392 */
1393