1/*****************************************************************************
2*
3*   Copyright (C) 1999-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*
6******************************************************************************/
7
8/*
9 * uconv(1): an iconv(1)-like converter using ICU.
10 *
11 * Original code by Jonas Utterstr&#x00F6;m <jonas.utterstrom@vittran.norrnod.se>
12 * contributed in 1999.
13 *
14 * Conversion to the C conversion API and many improvements by
15 * Yves Arrouye <yves@realnames.com>, current maintainer.
16 *
17 * Markus Scherer maintainer from 2003.
18 * See source code repository history for changes.
19 */
20
21#include <unicode/utypes.h>
22#include <unicode/putil.h>
23#include <unicode/ucnv.h>
24#include <unicode/uenum.h>
25#include <unicode/unistr.h>
26#include <unicode/translit.h>
27#include <unicode/uset.h>
28#include <unicode/uclean.h>
29#include <unicode/utf16.h>
30
31#include <stdio.h>
32#include <errno.h>
33#include <string.h>
34#include <stdlib.h>
35
36#include "cmemory.h"
37#include "cstring.h"
38#include "ustrfmt.h"
39
40#include "unicode/uwmsg.h"
41
42U_NAMESPACE_USE
43
44#if U_PLATFORM_USES_ONLY_WIN32_API && !defined(__STRICT_ANSI__)
45#include <io.h>
46#include <fcntl.h>
47#if U_PLATFORM_USES_ONLY_WIN32_API
48#define USE_FILENO_BINARY_MODE 1
49/* Windows likes to rename Unix-like functions */
50#ifndef fileno
51#define fileno _fileno
52#endif
53#ifndef setmode
54#define setmode _setmode
55#endif
56#ifndef O_BINARY
57#define O_BINARY _O_BINARY
58#endif
59#endif
60#endif
61
62#ifdef UCONVMSG_LINK
63/* below from the README */
64#include "unicode/utypes.h"
65#include "unicode/udata.h"
66U_CFUNC char uconvmsg_dat[];
67#endif
68
69#define DEFAULT_BUFSZ   4096
70#define UCONVMSG "uconvmsg"
71
72static UResourceBundle *gBundle = 0;    /* Bundle containing messages. */
73
74/*
75 * Initialize the message bundle so that message strings can be fetched
76 * by u_wmsg().
77 *
78 */
79
80static void initMsg(const char *pname) {
81    static int ps = 0;
82
83    if (!ps) {
84        char dataPath[2048];        /* XXX Sloppy: should be PATH_MAX. */
85        UErrorCode err = U_ZERO_ERROR;
86
87        ps = 1;
88
89        /* Set up our static data - if any */
90#if defined(UCONVMSG_LINK) && U_PLATFORM != U_PF_OS390 /* On z/OS, this is failing. */
91        udata_setAppData(UCONVMSG, (const void*) uconvmsg_dat, &err);
92        if (U_FAILURE(err)) {
93          fprintf(stderr, "%s: warning, problem installing our static resource bundle data uconvmsg: %s - trying anyways.\n",
94                  pname, u_errorName(err));
95          err = U_ZERO_ERROR; /* It may still fail */
96        }
97#endif
98
99        /* Get messages. */
100        gBundle = u_wmsg_setPath(UCONVMSG, &err);
101        if (U_FAILURE(err)) {
102            fprintf(stderr,
103                    "%s: warning: couldn't open bundle %s: %s\n",
104                    pname, UCONVMSG, u_errorName(err));
105#ifdef UCONVMSG_LINK
106            fprintf(stderr,
107                    "%s: setAppData was called, internal data %s failed to load\n",
108                        pname, UCONVMSG);
109#endif
110
111            err = U_ZERO_ERROR;
112            /* that was try #1, try again with a path */
113            uprv_strcpy(dataPath, u_getDataDirectory());
114            uprv_strcat(dataPath, U_FILE_SEP_STRING);
115            uprv_strcat(dataPath, UCONVMSG);
116
117            gBundle = u_wmsg_setPath(dataPath, &err);
118            if (U_FAILURE(err)) {
119                fprintf(stderr,
120                    "%s: warning: still couldn't open bundle %s: %s\n",
121                    pname, dataPath, u_errorName(err));
122                fprintf(stderr, "%s: warning: messages will not be displayed\n", pname);
123            }
124        }
125    }
126}
127
128/* Mapping of callback names to the callbacks passed to the converter
129   API. */
130
131static struct callback_ent {
132    const char *name;
133    UConverterFromUCallback fromu;
134    const void *fromuctxt;
135    UConverterToUCallback tou;
136    const void *touctxt;
137} transcode_callbacks[] = {
138    { "substitute",
139      UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0,
140      UCNV_TO_U_CALLBACK_SUBSTITUTE, 0 },
141    { "skip",
142      UCNV_FROM_U_CALLBACK_SKIP, 0,
143      UCNV_TO_U_CALLBACK_SKIP, 0 },
144    { "stop",
145      UCNV_FROM_U_CALLBACK_STOP, 0,
146      UCNV_TO_U_CALLBACK_STOP, 0 },
147    { "escape",
148      UCNV_FROM_U_CALLBACK_ESCAPE, 0,
149      UCNV_TO_U_CALLBACK_ESCAPE, 0},
150    { "escape-icu",
151      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU,
152      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_ICU },
153    { "escape-java",
154      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA,
155      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_JAVA },
156    { "escape-c",
157      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
158      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C },
159    { "escape-xml",
160      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
161      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
162    { "escape-xml-hex",
163      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX,
164      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_HEX },
165    { "escape-xml-dec",
166      UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC,
167      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC },
168    { "escape-unicode", UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE,
169      UCNV_TO_U_CALLBACK_ESCAPE, UCNV_ESCAPE_UNICODE }
170};
171
172/* Return a pointer to a callback record given its name. */
173
174static const struct callback_ent *findCallback(const char *name) {
175    int i, count =
176        sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
177
178    /* We'll do a linear search, there aren't many of them and bsearch()
179       may not be that portable. */
180
181    for (i = 0; i < count; ++i) {
182        if (!uprv_stricmp(name, transcode_callbacks[i].name)) {
183            return &transcode_callbacks[i];
184        }
185    }
186
187    return 0;
188}
189
190/* Print converter information. If lookfor is set, only that converter will
191   be printed, otherwise all converters will be printed. If canon is non
192   zero, tags and aliases for each converter are printed too, in the format
193   expected for convrters.txt(5). */
194
195static int printConverters(const char *pname, const char *lookfor,
196    UBool canon)
197{
198    UErrorCode err = U_ZERO_ERROR;
199    int32_t num;
200    uint16_t num_stds;
201    const char **stds;
202
203    /* If there is a specified name, just handle that now. */
204
205    if (lookfor) {
206        if (!canon) {
207            printf("%s\n", lookfor);
208            return 0;
209        } else {
210        /*  Because we are printing a canonical name, we need the
211            true converter name. We've done that already except for
212            the default name (because we want to print the exact
213            name one would get when calling ucnv_getDefaultName()
214            in non-canon mode). But since we do not know at this
215            point if we have the default name or something else, we
216            need to normalize again to the canonical converter
217            name. */
218
219            const char *truename = ucnv_getAlias(lookfor, 0, &err);
220            if (U_SUCCESS(err)) {
221                lookfor = truename;
222            } else {
223                err = U_ZERO_ERROR;
224            }
225        }
226    }
227
228    /* Print converter names. We come here for one of two reasons: we
229       are printing all the names (lookfor was null), or we have a
230       single converter to print but in canon mode, hence we need to
231       get to it in order to print everything. */
232
233    num = ucnv_countAvailable();
234    if (num <= 0) {
235        initMsg(pname);
236        u_wmsg(stderr, "cantGetNames");
237        return -1;
238    }
239    if (lookfor) {
240        num = 1;                /* We know where we want to be. */
241    }
242
243    num_stds = ucnv_countStandards();
244    stds = (const char **) uprv_malloc(num_stds * sizeof(*stds));
245    if (!stds) {
246        u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(U_MEMORY_ALLOCATION_ERROR));
247        return -1;
248    } else {
249        uint16_t s;
250
251        if (canon) {
252            printf("{ ");
253        }
254        for (s = 0; s < num_stds; ++s) {
255            stds[s] = ucnv_getStandard(s, &err);
256            if (canon) {
257                printf("%s ", stds[s]);
258            }
259            if (U_FAILURE(err)) {
260                u_wmsg(stderr, "cantGetTag", u_wmsg_errorName(err));
261                goto error_cleanup;
262            }
263        }
264        if (canon) {
265            puts("}");
266        }
267    }
268
269    for (int32_t i = 0; i < num; i++) {
270        const char *name;
271        uint16_t num_aliases;
272
273        /* Set the name either to what we are looking for, or
274        to the current converter name. */
275
276        if (lookfor) {
277            name = lookfor;
278        } else {
279            name = ucnv_getAvailableName(i);
280        }
281
282        /* Get all the aliases associated to the name. */
283
284        err = U_ZERO_ERROR;
285        num_aliases = ucnv_countAliases(name, &err);
286        if (U_FAILURE(err)) {
287            printf("%s", name);
288
289            UnicodeString str(name, "");
290            putchar('\t');
291            u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
292                u_wmsg_errorName(err));
293            goto error_cleanup;
294        } else {
295            uint16_t a, s, t;
296
297            /* Write all the aliases and their tags. */
298
299            for (a = 0; a < num_aliases; ++a) {
300                const char *alias = ucnv_getAlias(name, a, &err);
301
302                if (U_FAILURE(err)) {
303                    UnicodeString str(name, "");
304                    putchar('\t');
305                    u_wmsg(stderr, "cantGetAliases", str.getTerminatedBuffer(),
306                        u_wmsg_errorName(err));
307                    goto error_cleanup;
308                }
309
310                /* Print the current alias so that it looks right. */
311                printf("%s%s%s", (canon ? (a == 0? "" : "\t" ) : "") ,
312                                 alias,
313                                 (canon ? "" : " "));
314
315                /* Look (slowly, linear searching) for a tag. */
316
317                if (canon) {
318                    /* -1 to skip the last standard */
319                    for (s = t = 0; s < num_stds-1; ++s) {
320                        UEnumeration *nameEnum = ucnv_openStandardNames(name, stds[s], &err);
321                        if (U_SUCCESS(err)) {
322                            /* List the standard tags */
323                            const char *standardName;
324                            UBool isFirst = TRUE;
325                            UErrorCode enumError = U_ZERO_ERROR;
326                            while ((standardName = uenum_next(nameEnum, NULL, &enumError))) {
327                                /* See if this alias is supported by this standard. */
328                                if (!strcmp(standardName, alias)) {
329                                    if (!t) {
330                                        printf(" {");
331                                        t = 1;
332                                    }
333                                    /* Print a * after the default standard name */
334                                    printf(" %s%s", stds[s], (isFirst ? "*" : ""));
335                                }
336                                isFirst = FALSE;
337                            }
338                        }
339                    }
340                    if (t) {
341                        printf(" }");
342                    }
343                }
344                /* Terminate this entry. */
345                if (canon) {
346                    puts("");
347                }
348
349                /* Move on. */
350            }
351            /* Terminate this entry. */
352            if (!canon) {
353                puts("");
354            }
355        }
356    }
357
358    /* Free temporary data. */
359
360    uprv_free(stds);
361
362    /* Success. */
363
364    return 0;
365error_cleanup:
366    uprv_free(stds);
367    return -1;
368}
369
370/* Print all available transliterators. If canon is non zero, print
371   one transliterator per line. */
372
373static int printTransliterators(UBool canon)
374{
375#if UCONFIG_NO_TRANSLITERATION
376    printf("no transliterators available because of UCONFIG_NO_TRANSLITERATION, see uconfig.h\n");
377    return 1;
378#else
379    UErrorCode status = U_ZERO_ERROR;
380    UEnumeration *ids = utrans_openIDs(&status);
381    int32_t i, numtrans = uenum_count(ids, &status);
382
383    char sepchar = canon ? '\n' : ' ';
384
385    for (i = 0; U_SUCCESS(status)&& (i < numtrans); ++i) {
386    	int32_t len;
387    	const char *nextTrans = uenum_next(ids, &len, &status);
388
389        printf("%s", nextTrans);
390        if (i < numtrans - 1) {
391            putchar(sepchar);
392        }
393    }
394
395    uenum_close(ids);
396
397    /* Add a terminating newline if needed. */
398
399    if (sepchar != '\n') {
400        putchar('\n');
401    }
402
403    /* Success. */
404
405    return 0;
406#endif
407}
408
// Code points that the conversion code compares against by name.
enum {
    uSP  = 0x0020,      // SPACE
    uCR  = 0x000d,      // CARRIAGE RETURN
    uLF  = 0x000a,      // LINE FEED
    uNL  = 0x0085,      // NEXT LINE (NEL)
    uLS  = 0x2028,      // LINE SEPARATOR
    uPS  = 0x2029,      // PARAGRAPH SEPARATOR
    uSig = 0xfeff       // Unicode signature (BOM) character
};
418
419static inline int32_t
420getChunkLimit(const UnicodeString &prev, const UnicodeString &s) {
421    // find one of
422    // CR, LF, CRLF, NL, LS, PS
423    // for paragraph ends (see UAX #13/Unicode 4)
424    // and include it in the chunk
425    // all of these characters are on the BMP
426    // do not include FF or VT in case they are part of a paragraph
427    // (important for bidi contexts)
428    static const UChar paraEnds[] = {
429        0xd, 0xa, 0x85, 0x2028, 0x2029
430    };
431    enum {
432        iCR, iLF, iNL, iLS, iPS, iCount
433    };
434
435    // first, see if there is a CRLF split between prev and s
436    if (prev.endsWith(paraEnds + iCR, 1)) {
437        if (s.startsWith(paraEnds + iLF, 1)) {
438            return 1; // split CRLF, include the LF
439        } else if (!s.isEmpty()) {
440            return 0; // complete the last chunk
441        } else {
442            return -1; // wait for actual further contents to arrive
443        }
444    }
445
446    const UChar *u = s.getBuffer(), *limit = u + s.length();
447    UChar c;
448
449    while (u < limit) {
450        c = *u++;
451        if (
452            ((c < uSP) && (c == uCR || c == uLF)) ||
453            (c == uNL) ||
454            ((c & uLS) == uLS)
455        ) {
456            if (c == uCR) {
457                // check for CRLF
458                if (u == limit) {
459                    return -1; // LF may be in the next chunk
460                } else if (*u == uLF) {
461                    ++u; // include the LF in this chunk
462                }
463            }
464            return (int32_t)(u - s.getBuffer());
465        }
466    }
467
468    return -1; // continue collecting the chunk
469}
470
// How a converter deals with the U+FEFF Unicode signature character (BOM);
// these are the values returned by cnvSigType().
enum {
    CNV_NO_FEFF   = 0,  // U+FEFF cannot be converted
    CNV_WITH_FEFF = 1,  // U+FEFF can be converted
    CNV_ADDS_FEFF = 2   // the signature is added/detected automatically
};
476
477static inline UChar
478nibbleToHex(uint8_t n) {
479    n &= 0xf;
480    return
481        n <= 9 ?
482            (UChar)(0x30 + n) :
483            (UChar)((0x61 - 10) + n);
484}
485
486// check the converter's Unicode signature properties;
487// the fromUnicode side of the converter must be in its initial state
488// and will be reset again if it was used
489static int32_t
490cnvSigType(UConverter *cnv) {
491    UErrorCode err;
492    int32_t result;
493
494    // test if the output charset can convert U+FEFF
495    USet *set = uset_open(1, 0);
496    err = U_ZERO_ERROR;
497    ucnv_getUnicodeSet(cnv, set, UCNV_ROUNDTRIP_SET, &err);
498    if (U_SUCCESS(err) && uset_contains(set, uSig)) {
499        result = CNV_WITH_FEFF;
500    } else {
501        result = CNV_NO_FEFF; // an error occurred or U+FEFF cannot be converted
502    }
503    uset_close(set);
504
505    if (result == CNV_WITH_FEFF) {
506        // test if the output charset emits a signature anyway
507        const UChar a[1] = { 0x61 }; // "a"
508        const UChar *in;
509
510        char buffer[20];
511        char *out;
512
513        in = a;
514        out = buffer;
515        err = U_ZERO_ERROR;
516        ucnv_fromUnicode(cnv,
517            &out, buffer + sizeof(buffer),
518            &in, a + 1,
519            NULL, TRUE, &err);
520        ucnv_resetFromUnicode(cnv);
521
522        if (NULL != ucnv_detectUnicodeSignature(buffer, (int32_t)(out - buffer), NULL, &err) &&
523            U_SUCCESS(err)
524        ) {
525            result = CNV_ADDS_FEFF;
526        }
527    }
528
529    return result;
530}
531
532class ConvertFile {
533public:
534    ConvertFile() :
535        buf(NULL), outbuf(NULL), fromoffsets(NULL),
536        bufsz(0), signature(0) {}
537
538    void
539    setBufferSize(size_t bufferSize) {
540        bufsz = bufferSize;
541
542        buf = new char[2 * bufsz];
543        outbuf = buf + bufsz;
544
545        // +1 for an added U+FEFF in the intermediate Unicode buffer
546        fromoffsets = new int32_t[bufsz + 1];
547    }
548
549    ~ConvertFile() {
550        delete [] buf;
551        delete [] fromoffsets;
552    }
553
554    UBool convertFile(const char *pname,
555                      const char *fromcpage,
556                      UConverterToUCallback toucallback,
557                      const void *touctxt,
558                      const char *tocpage,
559                      UConverterFromUCallback fromucallback,
560                      const void *fromuctxt,
561                      UBool fallback,
562                      const char *translit,
563                      const char *infilestr,
564                      FILE * outfile, int verbose);
565private:
566    friend int main(int argc, char **argv);
567
568    char *buf, *outbuf;
569    int32_t *fromoffsets;
570
571    size_t bufsz;
572    int8_t signature; // add (1) or remove (-1) a U+FEFF Unicode signature character
573};
574
575// Convert a file from one encoding to another
576UBool
577ConvertFile::convertFile(const char *pname,
578                         const char *fromcpage,
579                         UConverterToUCallback toucallback,
580                         const void *touctxt,
581                         const char *tocpage,
582                         UConverterFromUCallback fromucallback,
583                         const void *fromuctxt,
584                         UBool fallback,
585                         const char *translit,
586                         const char *infilestr,
587                         FILE * outfile, int verbose)
588{
589    FILE *infile;
590    UBool ret = TRUE;
591    UConverter *convfrom = 0;
592    UConverter *convto = 0;
593    UErrorCode err = U_ZERO_ERROR;
594    UBool flush;
595    UBool closeFile = FALSE;
596    const char *cbufp, *prevbufp;
597    char *bufp;
598
599    uint32_t infoffset = 0, outfoffset = 0;   /* Where we are in the file, for error reporting. */
600
601    const UChar *unibuf, *unibufbp;
602    UChar *unibufp;
603
604    size_t rd, wr;
605
606#if !UCONFIG_NO_TRANSLITERATION
607    Transliterator *t = 0;      // Transliterator acting on Unicode data.
608    UnicodeString chunk;        // One chunk of the text being collected for transformation.
609#endif
610    UnicodeString u;            // String to do the transliteration.
611    int32_t ulen;
612
613    // use conversion offsets for error messages
614    // unless a transliterator is used -
615    // a text transformation will reorder characters in unpredictable ways
616    UBool useOffsets = TRUE;
617
618    // Open the correct input file or connect to stdin for reading input
619
620    if (infilestr != 0 && strcmp(infilestr, "-")) {
621        infile = fopen(infilestr, "rb");
622        if (infile == 0) {
623            UnicodeString str1(infilestr, "");
624            str1.append((UChar32) 0);
625            UnicodeString str2(strerror(errno), "");
626            str2.append((UChar32) 0);
627            initMsg(pname);
628            u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
629            return FALSE;
630        }
631        closeFile = TRUE;
632    } else {
633        infilestr = "-";
634        infile = stdin;
635#ifdef USE_FILENO_BINARY_MODE
636        if (setmode(fileno(stdin), O_BINARY) == -1) {
637            initMsg(pname);
638            u_wmsg(stderr, "cantSetInBinMode");
639            return FALSE;
640        }
641#endif
642    }
643
644    if (verbose) {
645        fprintf(stderr, "%s:\n", infilestr);
646    }
647
648#if !UCONFIG_NO_TRANSLITERATION
649    // Create transliterator as needed.
650
651    if (translit != NULL && *translit) {
652        UParseError parse;
653        UnicodeString str(translit), pestr;
654
655        /* Create from rules or by ID as needed. */
656
657        parse.line = -1;
658
659        if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
660            t = Transliterator::createFromRules(UNICODE_STRING_SIMPLE("Uconv"), str, UTRANS_FORWARD, parse, err);
661        } else {
662            t = Transliterator::createInstance(UnicodeString(translit, -1, US_INV), UTRANS_FORWARD, err);
663        }
664
665        if (U_FAILURE(err)) {
666            str.append((UChar32) 0);
667            initMsg(pname);
668
669            if (parse.line >= 0) {
670                UChar linebuf[20], offsetbuf[20];
671                uprv_itou(linebuf, 20, parse.line, 10, 0);
672                uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
673                u_wmsg(stderr, "cantCreateTranslitParseErr", str.getTerminatedBuffer(),
674                    u_wmsg_errorName(err), linebuf, offsetbuf);
675            } else {
676                u_wmsg(stderr, "cantCreateTranslit", str.getTerminatedBuffer(),
677                    u_wmsg_errorName(err));
678            }
679
680            if (t) {
681                delete t;
682                t = 0;
683            }
684            goto error_exit;
685        }
686
687        useOffsets = FALSE;
688    }
689#endif
690
691    // Create codepage converter. If the codepage or its aliases weren't
692    // available, it returns NULL and a failure code. We also set the
693    // callbacks, and return errors in the same way.
694
695    convfrom = ucnv_open(fromcpage, &err);
696    if (U_FAILURE(err)) {
697        UnicodeString str(fromcpage, "");
698        initMsg(pname);
699        u_wmsg(stderr, "cantOpenFromCodeset", str.getTerminatedBuffer(),
700            u_wmsg_errorName(err));
701        goto error_exit;
702    }
703    ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
704    if (U_FAILURE(err)) {
705        initMsg(pname);
706        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
707        goto error_exit;
708    }
709
710    convto = ucnv_open(tocpage, &err);
711    if (U_FAILURE(err)) {
712        UnicodeString str(tocpage, "");
713        initMsg(pname);
714        u_wmsg(stderr, "cantOpenToCodeset", str.getTerminatedBuffer(),
715            u_wmsg_errorName(err));
716        goto error_exit;
717    }
718    ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
719    if (U_FAILURE(err)) {
720        initMsg(pname);
721        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
722        goto error_exit;
723    }
724    ucnv_setFallback(convto, fallback);
725
726    UBool willexit, fromSawEndOfBytes, toSawEndOfUnicode;
727    int8_t sig;
728
729    // OK, we can convert now.
730    sig = signature;
731    rd = 0;
732
733    do {
734        willexit = FALSE;
735
736        // input file offset at the beginning of the next buffer
737        infoffset += rd;
738
739        rd = fread(buf, 1, bufsz, infile);
740        if (ferror(infile) != 0) {
741            UnicodeString str(strerror(errno));
742            initMsg(pname);
743            u_wmsg(stderr, "cantRead", str.getTerminatedBuffer());
744            goto error_exit;
745        }
746
747        // Convert the read buffer into the new encoding via Unicode.
748        // After the call 'unibufp' will be placed behind the last
749        // character that was converted in the 'unibuf'.
750        // Also the 'cbufp' is positioned behind the last converted
751        // character.
752        // At the last conversion in the file, flush should be set to
753        // true so that we get all characters converted.
754        //
755        // The converter must be flushed at the end of conversion so
756        // that characters on hold also will be written.
757
758        cbufp = buf;
759        flush = (UBool)(rd != bufsz);
760
761        // convert until the input is consumed
762        do {
763            // remember the start of the current byte-to-Unicode conversion
764            prevbufp = cbufp;
765
766            unibuf = unibufp = u.getBuffer((int32_t)bufsz);
767
768            // Use bufsz instead of u.getCapacity() for the targetLimit
769            // so that we don't overflow fromoffsets[].
770            ucnv_toUnicode(convfrom, &unibufp, unibuf + bufsz, &cbufp,
771                buf + rd, useOffsets ? fromoffsets : NULL, flush, &err);
772
773            ulen = (int32_t)(unibufp - unibuf);
774            u.releaseBuffer(U_SUCCESS(err) ? ulen : 0);
775
776            // fromSawEndOfBytes indicates that ucnv_toUnicode() is done
777            // converting all of the input bytes.
778            // It works like this because ucnv_toUnicode() returns only under the
779            // following conditions:
780            // - an error occurred during conversion (an error code is set)
781            // - the target buffer is filled (the error code indicates an overflow)
782            // - the source is consumed
783            // That is, if the error code does not indicate a failure,
784            // not even an overflow, then the source must be consumed entirely.
785            fromSawEndOfBytes = (UBool)U_SUCCESS(err);
786
787            if (err == U_BUFFER_OVERFLOW_ERROR) {
788                err = U_ZERO_ERROR;
789            } else if (U_FAILURE(err)) {
790                char pos[32], errorBytes[32];
791                int8_t i, length, errorLength;
792
793                UErrorCode localError = U_ZERO_ERROR;
794                errorLength = (int8_t)sizeof(errorBytes);
795                ucnv_getInvalidChars(convfrom, errorBytes, &errorLength, &localError);
796                if (U_FAILURE(localError) || errorLength == 0) {
797                    errorLength = 1;
798                }
799
800                // print the input file offset of the start of the error bytes:
801                // input file offset of the current byte buffer +
802                // length of the just consumed bytes -
803                // length of the error bytes
804                length =
805                    (int8_t)sprintf(pos, "%d",
806                        (int)(infoffset + (cbufp - buf) - errorLength));
807
808                // output the bytes that caused the error
809                UnicodeString str;
810                for (i = 0; i < errorLength; ++i) {
811                    if (i > 0) {
812                        str.append((UChar)uSP);
813                    }
814                    str.append(nibbleToHex((uint8_t)errorBytes[i] >> 4));
815                    str.append(nibbleToHex((uint8_t)errorBytes[i]));
816                }
817
818                initMsg(pname);
819                u_wmsg(stderr, "problemCvtToU",
820                        UnicodeString(pos, length, "").getTerminatedBuffer(),
821                        str.getTerminatedBuffer(),
822                        u_wmsg_errorName(err));
823
824                willexit = TRUE;
825                err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
826            }
827
828            // Replaced a check for whether the input was consumed by
829            // looping until it is; message key "premEndInput" now obsolete.
830
831            if (ulen == 0) {
832                continue;
833            }
834
835            // remove a U+FEFF Unicode signature character if requested
836            if (sig < 0) {
837                if (u.charAt(0) == uSig) {
838                    u.remove(0, 1);
839
840                    // account for the removed UChar and offset
841                    --ulen;
842
843                    if (useOffsets) {
844                        // remove an offset from fromoffsets[] as well
845                        // to keep the array parallel with the UChars
846                        memmove(fromoffsets, fromoffsets + 1, ulen * 4);
847                    }
848
849                }
850                sig = 0;
851            }
852
853#if !UCONFIG_NO_TRANSLITERATION
854            // Transliterate/transform if needed.
855
856            // For transformation, we use chunking code -
857            // collect Unicode input until, for example, an end-of-line,
858            // then transform and output-convert that and continue collecting.
859            // This makes the transformation result independent of the buffer size
860            // while avoiding the slower keyboard mode.
861            // The end-of-chunk characters are completely included in the
862            // transformed string in case they are to be transformed themselves.
863            if (t != NULL) {
864                UnicodeString out;
865                int32_t chunkLimit;
866
867                do {
868                    chunkLimit = getChunkLimit(chunk, u);
869                    if (chunkLimit < 0 && flush && fromSawEndOfBytes) {
870                        // use all of the rest at the end of the text
871                        chunkLimit = u.length();
872                    }
873                    if (chunkLimit >= 0) {
874                        // complete the chunk and transform it
875                        chunk.append(u, 0, chunkLimit);
876                        u.remove(0, chunkLimit);
877                        t->transliterate(chunk);
878
879                        // append the transformation result to the result and empty the chunk
880                        out.append(chunk);
881                        chunk.remove();
882                    } else {
883                        // continue collecting the chunk
884                        chunk.append(u);
885                        break;
886                    }
887                } while (!u.isEmpty());
888
889                u = out;
890                ulen = u.length();
891            }
892#endif
893
894            // add a U+FEFF Unicode signature character if requested
895            // and possible/necessary
896            if (sig > 0) {
897                if (u.charAt(0) != uSig && cnvSigType(convto) == CNV_WITH_FEFF) {
898                    u.insert(0, (UChar)uSig);
899
900                    if (useOffsets) {
901                        // insert a pseudo-offset into fromoffsets[] as well
902                        // to keep the array parallel with the UChars
903                        memmove(fromoffsets + 1, fromoffsets, ulen * 4);
904                        fromoffsets[0] = -1;
905                    }
906
907                    // account for the additional UChar and offset
908                    ++ulen;
909                }
910                sig = 0;
911            }
912
913            // Convert the Unicode buffer into the destination codepage
914            // Again 'bufp' will be placed behind the last converted character
915            // And 'unibufp' will be placed behind the last converted unicode character
916            // At the last conversion flush should be set to true to ensure that
917            // all characters left get converted
918
919            unibuf = unibufbp = u.getBuffer();
920
921            do {
922                bufp = outbuf;
923
924                // Use fromSawEndOfBytes in addition to the flush flag -
925                // it indicates whether the intermediate Unicode string
926                // contains the very last UChars for the very last input bytes.
927                ucnv_fromUnicode(convto, &bufp, outbuf + bufsz,
928                                 &unibufbp,
929                                 unibuf + ulen,
930                                 NULL, (UBool)(flush && fromSawEndOfBytes), &err);
931
932                // toSawEndOfUnicode indicates that ucnv_fromUnicode() is done
933                // converting all of the intermediate UChars.
934                // See comment for fromSawEndOfBytes.
935                toSawEndOfUnicode = (UBool)U_SUCCESS(err);
936
937                if (err == U_BUFFER_OVERFLOW_ERROR) {
938                    err = U_ZERO_ERROR;
939                } else if (U_FAILURE(err)) {
940                    UChar errorUChars[4];
941                    const char *errtag;
942                    char pos[32];
943                    UChar32 c;
944                    int8_t i, length, errorLength;
945
946                    UErrorCode localError = U_ZERO_ERROR;
947                    errorLength = (int8_t)UPRV_LENGTHOF(errorUChars);
948                    ucnv_getInvalidUChars(convto, errorUChars, &errorLength, &localError);
949                    if (U_FAILURE(localError) || errorLength == 0) {
950                        // need at least 1 so that we don't access beyond the length of fromoffsets[]
951                        errorLength = 1;
952                    }
953
954                    int32_t ferroffset;
955
956                    if (useOffsets) {
957                        // Unicode buffer offset of the start of the error UChars
958                        ferroffset = (int32_t)((unibufbp - unibuf) - errorLength);
959                        if (ferroffset < 0) {
960                            // approximation - the character started in the previous Unicode buffer
961                            ferroffset = 0;
962                        }
963
964                        // get the corresponding byte offset out of fromoffsets[]
965                        // go back if the offset is not known for some of the UChars
966                        int32_t fromoffset;
967                        do {
968                            fromoffset = fromoffsets[ferroffset];
969                        } while (fromoffset < 0 && --ferroffset >= 0);
970
971                        // total input file offset =
972                        // input file offset of the current byte buffer +
973                        // byte buffer offset of where the current Unicode buffer is converted from +
974                        // fromoffsets[Unicode offset]
975                        ferroffset = infoffset + (prevbufp - buf) + fromoffset;
976                        errtag = "problemCvtFromU";
977                    } else {
978                        // Do not use fromoffsets if (t != NULL) because the Unicode text may
979                        // be different from what the offsets refer to.
980
981                        // output file offset
982                        ferroffset = (int32_t)(outfoffset + (bufp - outbuf));
983                        errtag = "problemCvtFromUOut";
984                    }
985
986                    length = (int8_t)sprintf(pos, "%u", (int)ferroffset);
987
988                    // output the code points that caused the error
989                    UnicodeString str;
990                    for (i = 0; i < errorLength;) {
991                        if (i > 0) {
992                            str.append((UChar)uSP);
993                        }
994                        U16_NEXT(errorUChars, i, errorLength, c);
995                        if (c >= 0x100000) {
996                            str.append(nibbleToHex((uint8_t)(c >> 20)));
997                        }
998                        if (c >= 0x10000) {
999                            str.append(nibbleToHex((uint8_t)(c >> 16)));
1000                        }
1001                        str.append(nibbleToHex((uint8_t)(c >> 12)));
1002                        str.append(nibbleToHex((uint8_t)(c >> 8)));
1003                        str.append(nibbleToHex((uint8_t)(c >> 4)));
1004                        str.append(nibbleToHex((uint8_t)c));
1005                    }
1006
1007                    initMsg(pname);
1008                    u_wmsg(stderr, errtag,
1009                            UnicodeString(pos, length, "").getTerminatedBuffer(),
1010                            str.getTerminatedBuffer(),
1011                           u_wmsg_errorName(err));
1012                    u_wmsg(stderr, "errorUnicode", str.getTerminatedBuffer());
1013
1014                    willexit = TRUE;
1015                    err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
1016                }
1017
1018                // Replaced a check for whether the intermediate Unicode characters were all consumed by
1019                // looping until they are; message key "premEnd" now obsolete.
1020
1021                // Finally, write the converted buffer to the output file
1022                size_t outlen = (size_t) (bufp - outbuf);
1023                outfoffset += (int32_t)(wr = fwrite(outbuf, 1, outlen, outfile));
1024                if (wr != outlen) {
1025                    UnicodeString str(strerror(errno));
1026                    initMsg(pname);
1027                    u_wmsg(stderr, "cantWrite", str.getTerminatedBuffer());
1028                    willexit = TRUE;
1029                }
1030
1031                if (willexit) {
1032                    goto error_exit;
1033                }
1034            } while (!toSawEndOfUnicode);
1035        } while (!fromSawEndOfBytes);
1036    } while (!flush);           // Stop when we have flushed the
1037                                // converters (this means that it's
1038                                // the end of output)
1039
1040    goto normal_exit;
1041
1042error_exit:
1043    ret = FALSE;
1044
1045normal_exit:
1046    // Cleanup.
1047
1048    ucnv_close(convfrom);
1049    ucnv_close(convto);
1050
1051#if !UCONFIG_NO_TRANSLITERATION
1052    delete t;
1053#endif
1054
1055    if (closeFile) {
1056        fclose(infile);
1057    }
1058
1059    return ret;
1060}
1061
1062static void usage(const char *pname, int ecode) {
1063    const UChar *msg;
1064    int32_t msgLen;
1065    UErrorCode err = U_ZERO_ERROR;
1066    FILE *fp = ecode ? stderr : stdout;
1067    int res;
1068
1069    initMsg(pname);
1070    msg =
1071        ures_getStringByKey(gBundle, ecode ? "lcUsageWord" : "ucUsageWord",
1072                            &msgLen, &err);
1073    UnicodeString upname(pname, (int32_t)(uprv_strlen(pname) + 1));
1074    UnicodeString mname(msg, msgLen + 1);
1075
1076    res = u_wmsg(fp, "usage", mname.getBuffer(), upname.getBuffer());
1077    if (!ecode) {
1078        if (!res) {
1079            fputc('\n', fp);
1080        }
1081        if (!u_wmsg(fp, "help")) {
1082            /* Now dump callbacks and finish. */
1083
1084            int i, count =
1085                sizeof(transcode_callbacks) / sizeof(*transcode_callbacks);
1086            for (i = 0; i < count; ++i) {
1087                fprintf(fp, " %s", transcode_callbacks[i].name);
1088            }
1089            fputc('\n', fp);
1090        }
1091    }
1092
1093    exit(ecode);
1094}
1095
1096extern int
1097main(int argc, char **argv)
1098{
1099    FILE *outfile;
1100    int ret = 0;
1101
1102    size_t bufsz = DEFAULT_BUFSZ;
1103
1104    const char *fromcpage = 0;
1105    const char *tocpage = 0;
1106    const char *translit = 0;
1107    const char *outfilestr = 0;
1108    UBool fallback = FALSE;
1109
1110    UConverterFromUCallback fromucallback = UCNV_FROM_U_CALLBACK_STOP;
1111    const void *fromuctxt = 0;
1112    UConverterToUCallback toucallback = UCNV_TO_U_CALLBACK_STOP;
1113    const void *touctxt = 0;
1114
1115    char **iter, **remainArgv, **remainArgvLimit;
1116    char **end = argv + argc;
1117
1118    const char *pname;
1119
1120    UBool printConvs = FALSE, printCanon = FALSE, printTranslits = FALSE;
1121    const char *printName = 0;
1122
1123    UBool verbose = FALSE;
1124    UErrorCode status = U_ZERO_ERROR;
1125
1126    ConvertFile cf;
1127
1128    /* Initialize ICU */
1129    u_init(&status);
1130    if (U_FAILURE(status)) {
1131        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
1132            argv[0], u_errorName(status));
1133        exit(1);
1134    }
1135
1136    // Get and prettify pname.
1137    pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR);
1138#if U_PLATFORM_USES_ONLY_WIN32_API
1139    if (!pname) {
1140        pname = uprv_strrchr(*argv, '/');
1141    }
1142#endif
1143    if (!pname) {
1144        pname = *argv;
1145    } else {
1146        ++pname;
1147    }
1148
1149    // First, get the arguments from command-line
1150    // to know the codepages to convert between
1151
1152    remainArgv = remainArgvLimit = argv + 1;
1153    for (iter = argv + 1; iter != end; iter++) {
1154        // Check for from charset
1155        if (strcmp("-f", *iter) == 0 || !strcmp("--from-code", *iter)) {
1156            iter++;
1157            if (iter != end)
1158                fromcpage = *iter;
1159            else
1160                usage(pname, 1);
1161        } else if (strcmp("-t", *iter) == 0 || !strcmp("--to-code", *iter)) {
1162            iter++;
1163            if (iter != end)
1164                tocpage = *iter;
1165            else
1166                usage(pname, 1);
1167        } else if (strcmp("-x", *iter) == 0) {
1168            iter++;
1169            if (iter != end)
1170                translit = *iter;
1171            else
1172                usage(pname, 1);
1173        } else if (!strcmp("--fallback", *iter)) {
1174            fallback = TRUE;
1175        } else if (!strcmp("--no-fallback", *iter)) {
1176            fallback = FALSE;
1177        } else if (strcmp("-b", *iter) == 0 || !strcmp("--block-size", *iter)) {
1178            iter++;
1179            if (iter != end) {
1180                bufsz = atoi(*iter);
1181                if ((int) bufsz <= 0) {
1182                    initMsg(pname);
1183                    UnicodeString str(*iter);
1184                    initMsg(pname);
1185                    u_wmsg(stderr, "badBlockSize", str.getTerminatedBuffer());
1186                    return 3;
1187                }
1188            } else {
1189                usage(pname, 1);
1190            }
1191        } else if (strcmp("-l", *iter) == 0 || !strcmp("--list", *iter)) {
1192            if (printTranslits) {
1193                usage(pname, 1);
1194            }
1195            printConvs = TRUE;
1196        } else if (strcmp("--default-code", *iter) == 0) {
1197            if (printTranslits) {
1198                usage(pname, 1);
1199            }
1200            printName = ucnv_getDefaultName();
1201        } else if (strcmp("--list-code", *iter) == 0) {
1202            if (printTranslits) {
1203                usage(pname, 1);
1204            }
1205
1206            iter++;
1207            if (iter != end) {
1208                UErrorCode e = U_ZERO_ERROR;
1209                printName = ucnv_getAlias(*iter, 0, &e);
1210                if (U_FAILURE(e) || !printName) {
1211                    UnicodeString str(*iter);
1212                    initMsg(pname);
1213                    u_wmsg(stderr, "noSuchCodeset", str.getTerminatedBuffer());
1214                    return 2;
1215                }
1216            } else
1217                usage(pname, 1);
1218        } else if (strcmp("--canon", *iter) == 0) {
1219            printCanon = TRUE;
1220        } else if (strcmp("-L", *iter) == 0
1221            || !strcmp("--list-transliterators", *iter)) {
1222            if (printConvs) {
1223                usage(pname, 1);
1224            }
1225            printTranslits = TRUE;
1226        } else if (strcmp("-h", *iter) == 0 || !strcmp("-?", *iter)
1227            || !strcmp("--help", *iter)) {
1228            usage(pname, 0);
1229        } else if (!strcmp("-c", *iter)) {
1230            fromucallback = UCNV_FROM_U_CALLBACK_SKIP;
1231        } else if (!strcmp("--to-callback", *iter)) {
1232            iter++;
1233            if (iter != end) {
1234                const struct callback_ent *cbe = findCallback(*iter);
1235                if (cbe) {
1236                    fromucallback = cbe->fromu;
1237                    fromuctxt = cbe->fromuctxt;
1238                } else {
1239                    UnicodeString str(*iter);
1240                    initMsg(pname);
1241                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1242                    return 4;
1243                }
1244            } else {
1245                usage(pname, 1);
1246            }
1247        } else if (!strcmp("--from-callback", *iter)) {
1248            iter++;
1249            if (iter != end) {
1250                const struct callback_ent *cbe = findCallback(*iter);
1251                if (cbe) {
1252                    toucallback = cbe->tou;
1253                    touctxt = cbe->touctxt;
1254                } else {
1255                    UnicodeString str(*iter);
1256                    initMsg(pname);
1257                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1258                    return 4;
1259                }
1260            } else {
1261                usage(pname, 1);
1262            }
1263        } else if (!strcmp("-i", *iter)) {
1264            toucallback = UCNV_TO_U_CALLBACK_SKIP;
1265        } else if (!strcmp("--callback", *iter)) {
1266            iter++;
1267            if (iter != end) {
1268                const struct callback_ent *cbe = findCallback(*iter);
1269                if (cbe) {
1270                    fromucallback = cbe->fromu;
1271                    fromuctxt = cbe->fromuctxt;
1272                    toucallback = cbe->tou;
1273                    touctxt = cbe->touctxt;
1274                } else {
1275                    UnicodeString str(*iter);
1276                    initMsg(pname);
1277                    u_wmsg(stderr, "unknownCallback", str.getTerminatedBuffer());
1278                    return 4;
1279                }
1280            } else {
1281                usage(pname, 1);
1282            }
1283        } else if (!strcmp("-s", *iter) || !strcmp("--silent", *iter)) {
1284            verbose = FALSE;
1285        } else if (!strcmp("-v", *iter) || !strcmp("--verbose", *iter)) {
1286            verbose = TRUE;
1287        } else if (!strcmp("-V", *iter) || !strcmp("--version", *iter)) {
1288            printf("%s v2.1  ICU " U_ICU_VERSION "\n", pname);
1289            return 0;
1290        } else if (!strcmp("-o", *iter) || !strcmp("--output", *iter)) {
1291            ++iter;
1292            if (iter != end && !outfilestr) {
1293                outfilestr = *iter;
1294            } else {
1295                usage(pname, 1);
1296            }
1297        } else if (0 == strcmp("--add-signature", *iter)) {
1298            cf.signature = 1;
1299        } else if (0 == strcmp("--remove-signature", *iter)) {
1300            cf.signature = -1;
1301        } else if (**iter == '-' && (*iter)[1]) {
1302            usage(pname, 1);
1303        } else {
1304            // move a non-option up in argv[]
1305            *remainArgvLimit++ = *iter;
1306        }
1307    }
1308
1309    if (printConvs || printName) {
1310        return printConverters(pname, printName, printCanon) ? 2 : 0;
1311    } else if (printTranslits) {
1312        return printTransliterators(printCanon) ? 3 : 0;
1313    }
1314
1315    if (!fromcpage || !uprv_strcmp(fromcpage, "-")) {
1316        fromcpage = ucnv_getDefaultName();
1317    }
1318    if (!tocpage || !uprv_strcmp(tocpage, "-")) {
1319        tocpage = ucnv_getDefaultName();
1320    }
1321
1322    // Open the correct output file or connect to stdout for reading input
1323    if (outfilestr != 0 && strcmp(outfilestr, "-")) {
1324        outfile = fopen(outfilestr, "wb");
1325        if (outfile == 0) {
1326            UnicodeString str1(outfilestr, "");
1327            UnicodeString str2(strerror(errno), "");
1328            initMsg(pname);
1329            u_wmsg(stderr, "cantCreateOutputF",
1330                str1.getBuffer(), str2.getBuffer());
1331            return 1;
1332        }
1333    } else {
1334        outfilestr = "-";
1335        outfile = stdout;
1336#ifdef USE_FILENO_BINARY_MODE
1337        if (setmode(fileno(outfile), O_BINARY) == -1) {
1338            u_wmsg(stderr, "cantSetOutBinMode");
1339            exit(-1);
1340        }
1341#endif
1342    }
1343
1344    /* Loop again on the arguments to find all the input files, and
1345    convert them. */
1346
1347    cf.setBufferSize(bufsz);
1348
1349    if(remainArgv < remainArgvLimit) {
1350        for (iter = remainArgv; iter != remainArgvLimit; iter++) {
1351            if (!cf.convertFile(
1352                    pname, fromcpage, toucallback, touctxt, tocpage,
1353                    fromucallback, fromuctxt, fallback, translit, *iter,
1354                    outfile, verbose)
1355            ) {
1356                goto error_exit;
1357            }
1358        }
1359    } else {
1360        if (!cf.convertFile(
1361                pname, fromcpage, toucallback, touctxt, tocpage,
1362                fromucallback, fromuctxt, fallback, translit, 0,
1363                outfile, verbose)
1364        ) {
1365            goto error_exit;
1366        }
1367    }
1368
1369    goto normal_exit;
1370error_exit:
1371#if !UCONFIG_NO_LEGACY_CONVERSION
1372    ret = 1;
1373#else
1374    fprintf(stderr, "uconv error: UCONFIG_NO_LEGACY_CONVERSION is on. See uconfig.h\n");
1375#endif
1376normal_exit:
1377
1378    if (outfile != stdout) {
1379        fclose(outfile);
1380    }
1381
1382    u_cleanup();
1383
1384    return ret;
1385}
1386
1387
1388/*
1389 * Hey, Emacs, please set the following:
1390 *
1391 * Local Variables:
1392 * indent-tabs-mode: nil
1393 * End:
1394 *
1395 */
1396