1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003-2014, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  icuswap.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003aug08
14*   created by: Markus W. Scherer
15*
16*   This tool takes an ICU data file and "swaps" it, that is, changes its
17*   platform properties between big-/little-endianness and ASCII/EBCDIC charset
18*   families.
19*   The modified data file is written to a new file.
20*   Useful as an install-time tool for shipping only one flavor of ICU data
21*   and preparing data files for the target platform.
22*   Will not work with data DLLs (shared libraries).
23*/
24
25#include "unicode/utypes.h"
26#include "unicode/putil.h"
27#include "unicode/udata.h"
28#include "cmemory.h"
29#include "cstring.h"
30#include "uinvchar.h"
31#include "uarrsort.h"
32#include "ucmndata.h"
33#include "udataswp.h"
34#include "swapimpl.h"
35#include "toolutil.h"
36#include "uoptions.h"
37
38#include <stdio.h>
39#include <stdlib.h>
40#include <string.h>
41
42/* definitions */
43
44#define DEFAULT_PADDING_LENGTH 15
45
46static UOption options[]={
47    UOPTION_HELP_H,
48    UOPTION_HELP_QUESTION_MARK,
49    UOPTION_DEF("type", 't', UOPT_REQUIRES_ARG)
50};
51
52enum {
53    OPT_HELP_H,
54    OPT_HELP_QUESTION_MARK,
55    OPT_OUT_TYPE
56};
57
/**
 * Determine the size of an open file and rewind it to its beginning.
 *
 * @param f an open, seekable stream
 * @return the file size in bytes, or -1 if seeking or telling fails or if
 *         the size does not fit into an int32_t
 */
static int32_t
fileSize(FILE *f) {
    long size;

    if(fseek(f, 0, SEEK_END)!=0) {
        return -1;
    }
    size=ftell(f);
    if(fseek(f, 0, SEEK_SET)!=0) {
        return -1;
    }
    /* ftell() reports errors as -1; larger files cannot be handled by the int32_t-based swapping code */
    if(size<0 || size>(long)INT32_MAX) {
        return -1;
    }
    return (int32_t)size;
}
67
68/**
69 * Swap an ICU .dat package, including swapping of enclosed items.
70 */
71U_CFUNC int32_t U_CALLCONV
72udata_swapPackage(const char *inFilename, const char *outFilename,
73                  const UDataSwapper *ds,
74                  const void *inData, int32_t length, void *outData,
75                  UErrorCode *pErrorCode);
76
77U_CDECL_BEGIN
78static void U_CALLCONV
79printError(void *context, const char *fmt, va_list args) {
80    vfprintf((FILE *)context, fmt, args);
81}
82U_CDECL_END
83
84static int
85printUsage(const char *pname, UBool ishelp) {
86    fprintf(stderr,
87            "%csage: %s [ -h, -?, --help ] -tl|-tb|-te|--type=b|... infilename outfilename\n",
88            ishelp ? 'U' : 'u', pname);
89    if(ishelp) {
90        fprintf(stderr,
91              "\nOptions: -h, -?, --help    print this message and exit\n"
92                "         Read the input file, swap its platform properties according\n"
93                "         to the -t or --type option, and write the result to the output file.\n"
94                "         -tl               change to little-endian/ASCII charset family\n"
95                "         -tb               change to big-endian/ASCII charset family\n"
96                "         -te               change to big-endian/EBCDIC charset family\n");
97    }
98
99    return !ishelp;
100}
101
102extern int
103main(int argc, char *argv[]) {
104    FILE *in, *out;
105    const char *pname;
106    char *data;
107    int32_t length;
108    UBool ishelp;
109    int rc;
110
111    UDataSwapper *ds;
112    const UDataInfo *pInfo;
113    UErrorCode errorCode;
114    uint8_t outCharset;
115    UBool outIsBigEndian;
116
117    U_MAIN_INIT_ARGS(argc, argv);
118
119    fprintf(stderr, "Warning: icuswap is an obsolete tool and it will be removed in the next ICU release.\nPlease use the icupkg tool instead.\n");
120
121    /* get the program basename */
122    pname=strrchr(argv[0], U_FILE_SEP_CHAR);
123    if(pname==NULL) {
124        pname=strrchr(argv[0], '/');
125    }
126    if(pname!=NULL) {
127        ++pname;
128    } else {
129        pname=argv[0];
130    }
131
132    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
133    ishelp=options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur;
134    if(ishelp || argc!=3) {
135        return printUsage(pname, ishelp);
136    }
137
138    /* parse the output type option */
139    data=(char *)options[OPT_OUT_TYPE].value;
140    if(data[0]==0 || data[1]!=0) {
141        /* the type must be exactly one letter */
142        return printUsage(pname, FALSE);
143    }
144    switch(data[0]) {
145    case 'l':
146        outIsBigEndian=FALSE;
147        outCharset=U_ASCII_FAMILY;
148        break;
149    case 'b':
150        outIsBigEndian=TRUE;
151        outCharset=U_ASCII_FAMILY;
152        break;
153    case 'e':
154        outIsBigEndian=TRUE;
155        outCharset=U_EBCDIC_FAMILY;
156        break;
157    default:
158        return printUsage(pname, FALSE);
159    }
160
161    in=out=NULL;
162    data=NULL;
163
164    /* open the input file, get its length, allocate memory for it, read the file */
165    in=fopen(argv[1], "rb");
166    if(in==NULL) {
167        fprintf(stderr, "%s: unable to open input file \"%s\"\n", pname, argv[1]);
168        rc=2;
169        goto done;
170    }
171
172    length=fileSize(in);
173    if(length<DEFAULT_PADDING_LENGTH) {
174        fprintf(stderr, "%s: empty input file \"%s\"\n", pname, argv[1]);
175        rc=2;
176        goto done;
177    }
178
179    /*
180     * +15: udata_swapPackage() may need to add a few padding bytes to the
181     * last item if charset swapping is done,
182     * because the last item may be resorted into the middle and then needs
183     * additional padding bytes
184     */
185    data=(char *)malloc(length+DEFAULT_PADDING_LENGTH);
186    if(data==NULL) {
187        fprintf(stderr, "%s: error allocating memory for \"%s\"\n", pname, argv[1]);
188        rc=2;
189        goto done;
190    }
191
192    /* set the last 15 bytes to the usual padding byte, see udata_swapPackage() */
193    uprv_memset(data+length-DEFAULT_PADDING_LENGTH, 0xaa, DEFAULT_PADDING_LENGTH);
194
195    if(length!=(int32_t)fread(data, 1, length, in)) {
196        fprintf(stderr, "%s: error reading \"%s\"\n", pname, argv[1]);
197        rc=3;
198        goto done;
199    }
200
201    fclose(in);
202    in=NULL;
203
204    /* swap the data in-place */
205    errorCode=U_ZERO_ERROR;
206    ds=udata_openSwapperForInputData(data, length, outIsBigEndian, outCharset, &errorCode);
207    if(U_FAILURE(errorCode)) {
208        fprintf(stderr, "%s: udata_openSwapperForInputData(\"%s\") failed - %s\n",
209                pname, argv[1], u_errorName(errorCode));
210        rc=4;
211        goto done;
212    }
213
214    ds->printError=printError;
215    ds->printErrorContext=stderr;
216
217    /* speculative cast, protected by the following length check */
218    pInfo=(const UDataInfo *)((const char *)data+4);
219
220    if( length>=20 &&
221        pInfo->dataFormat[0]==0x43 &&   /* dataFormat="CmnD" */
222        pInfo->dataFormat[1]==0x6d &&
223        pInfo->dataFormat[2]==0x6e &&
224        pInfo->dataFormat[3]==0x44
225    ) {
226        /*
227         * swap the .dat package
228         * udata_swapPackage() needs to rename ToC name entries from the old package
229         * name to the new one.
230         * We pass it the filenames, and udata_swapPackage() will extract the
231         * package names.
232         */
233        length=udata_swapPackage(argv[1], argv[2], ds, data, length, data, &errorCode);
234        udata_closeSwapper(ds);
235        if(U_FAILURE(errorCode)) {
236            fprintf(stderr, "%s: udata_swapPackage(\"%s\") failed - %s\n",
237                    pname, argv[1], u_errorName(errorCode));
238            rc=4;
239            goto done;
240        }
241    } else {
242        /* swap the data, which is not a .dat package */
243        length=udata_swap(ds, data, length, data, &errorCode);
244        udata_closeSwapper(ds);
245        if(U_FAILURE(errorCode)) {
246            fprintf(stderr, "%s: udata_swap(\"%s\") failed - %s\n",
247                    pname, argv[1], u_errorName(errorCode));
248            rc=4;
249            goto done;
250        }
251    }
252
253    out=fopen(argv[2], "wb");
254    if(out==NULL) {
255        fprintf(stderr, "%s: unable to open output file \"%s\"\n", pname, argv[2]);
256        rc=5;
257        goto done;
258    }
259
260    if(length!=(int32_t)fwrite(data, 1, length, out)) {
261        fprintf(stderr, "%s: error writing \"%s\"\n", pname, argv[2]);
262        rc=6;
263        goto done;
264    }
265
266    fclose(out);
267    out=NULL;
268
269    /* all done */
270    rc=0;
271
272done:
273    if(in!=NULL) {
274        fclose(in);
275    }
276    if(out!=NULL) {
277        fclose(out);
278    }
279    if(data!=NULL) {
280        free(data);
281    }
282    return rc;
283}
284
285/* swap .dat package files -------------------------------------------------- */
286
287static int32_t
288extractPackageName(const UDataSwapper *ds, const char *filename,
289                   char pkg[], int32_t capacity,
290                   UErrorCode *pErrorCode) {
291    const char *basename;
292    int32_t len;
293
294    if(U_FAILURE(*pErrorCode)) {
295        return 0;
296    }
297
298    basename=findBasename(filename);
299    len=(int32_t)uprv_strlen(basename)-4; /* -4: subtract the length of ".dat" */
300
301    if(len<=0 || 0!=uprv_strcmp(basename+len, ".dat")) {
302        udata_printError(ds, "udata_swapPackage(): \"%s\" is not recognized as a package filename (must end with .dat)\n",
303                         basename);
304        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
305        return 0;
306    }
307
308    if(len>=capacity) {
309        udata_printError(ds, "udata_swapPackage(): the package name \"%s\" is too long (>=%ld)\n",
310                         (long)capacity);
311        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
312        return 0;
313    }
314
315    uprv_memcpy(pkg, basename, len);
316    pkg[len]=0;
317    return len;
318}
319
/* Working copy of one package ToC entry, in platform byte order. */
struct ToCEntry {
    uint32_t nameOffset;    /* offset of the item name, as read from the input ToC */
    uint32_t inOffset;      /* offset of the item data in the input package */
    uint32_t outOffset;     /* offset of the item data in the output package */
    uint32_t length;        /* number of bytes of item data, including padding */
};
323
324U_CDECL_BEGIN
325static int32_t U_CALLCONV
326compareToCEntries(const void *context, const void *left, const void *right) {
327    const char *chars=(const char *)context;
328    return (int32_t)uprv_strcmp(chars+((const ToCEntry *)left)->nameOffset,
329                                chars+((const ToCEntry *)right)->nameOffset);
330}
331U_CDECL_END
332
333U_CFUNC int32_t U_CALLCONV
334udata_swapPackage(const char *inFilename, const char *outFilename,
335                  const UDataSwapper *ds,
336                  const void *inData, int32_t length, void *outData,
337                  UErrorCode *pErrorCode) {
338    const UDataInfo *pInfo;
339    int32_t headerSize;
340
341    const uint8_t *inBytes;
342    uint8_t *outBytes;
343
344    uint32_t itemCount, offset, i;
345    int32_t itemLength;
346
347    const UDataOffsetTOCEntry *inEntries;
348    UDataOffsetTOCEntry *outEntries;
349
350    ToCEntry *table;
351
352    char inPkgName[32], outPkgName[32];
353    int32_t inPkgNameLength, outPkgNameLength;
354
355    /* udata_swapDataHeader checks the arguments */
356    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
357    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
358        return 0;
359    }
360
361    /* check data format and format version */
362    pInfo=(const UDataInfo *)((const char *)inData+4);
363    if(!(
364        pInfo->dataFormat[0]==0x43 &&   /* dataFormat="CmnD" */
365        pInfo->dataFormat[1]==0x6d &&
366        pInfo->dataFormat[2]==0x6e &&
367        pInfo->dataFormat[3]==0x44 &&
368        pInfo->formatVersion[0]==1
369    )) {
370        udata_printError(ds, "udata_swapPackage(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as an ICU .dat package\n",
371                         pInfo->dataFormat[0], pInfo->dataFormat[1],
372                         pInfo->dataFormat[2], pInfo->dataFormat[3],
373                         pInfo->formatVersion[0]);
374        *pErrorCode=U_UNSUPPORTED_ERROR;
375        return 0;
376    }
377
378    /*
379     * We need to change the ToC name entries so that they have the correct
380     * package name prefix.
381     * Extract the package names from the in/out filenames.
382     */
383    inPkgNameLength=extractPackageName(
384                        ds, inFilename,
385                        inPkgName, (int32_t)sizeof(inPkgName),
386                        pErrorCode);
387    outPkgNameLength=extractPackageName(
388                        ds, outFilename,
389                        outPkgName, (int32_t)sizeof(outPkgName),
390                        pErrorCode);
391    if(U_FAILURE(*pErrorCode)) {
392        return 0;
393    }
394
395    /*
396     * It is possible to work with inPkgNameLength!=outPkgNameLength,
397     * but then the length of the data file would change more significantly,
398     * which we are not currently prepared for.
399     */
400    if(inPkgNameLength!=outPkgNameLength) {
401        udata_printError(ds, "udata_swapPackage(): the package names \"%s\" and \"%s\" must have the same length\n",
402                         inPkgName, outPkgName);
403        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
404        return 0;
405    }
406
407    inBytes=(const uint8_t *)inData+headerSize;
408    inEntries=(const UDataOffsetTOCEntry *)(inBytes+4);
409
410    if(length<0) {
411        /* preflighting */
412        itemCount=ds->readUInt32(*(const uint32_t *)inBytes);
413        if(itemCount==0) {
414            /* no items: count only the item count and return */
415            return headerSize+4;
416        }
417
418        /* read the last item's offset and preflight it */
419        offset=ds->readUInt32(inEntries[itemCount-1].dataOffset);
420        itemLength=udata_swap(ds, inBytes+offset, -1, NULL, pErrorCode);
421
422        if(U_SUCCESS(*pErrorCode)) {
423            return headerSize+offset+(uint32_t)itemLength;
424        } else {
425            return 0;
426        }
427    } else {
428        /* check that the itemCount fits, then the ToC table, then at least the header of the last item */
429        length-=headerSize;
430        if(length<4) {
431            /* itemCount does not fit */
432            offset=0xffffffff;
433            itemCount=0; /* make compilers happy */
434        } else {
435            itemCount=ds->readUInt32(*(const uint32_t *)inBytes);
436            if(itemCount==0) {
437                offset=4;
438            } else if((uint32_t)length<(4+8*itemCount)) {
439                /* ToC table does not fit */
440                offset=0xffffffff;
441            } else {
442                /* offset of the last item plus at least 20 bytes for its header */
443                offset=20+ds->readUInt32(inEntries[itemCount-1].dataOffset);
444            }
445        }
446        if((uint32_t)length<offset) {
447            udata_printError(ds, "udata_swapPackage(): too few bytes (%d after header) for a .dat package\n",
448                             length);
449            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
450            return 0;
451        }
452
453        outBytes=(uint8_t *)outData+headerSize;
454
455        /* swap the item count */
456        ds->swapArray32(ds, inBytes, 4, outBytes, pErrorCode);
457
458        if(itemCount==0) {
459            /* no items: just return now */
460            return headerSize+4;
461        }
462
463        /* swap the item name strings */
464        offset=4+8*itemCount;
465        itemLength=(int32_t)(ds->readUInt32(inEntries[0].dataOffset)-offset);
466        udata_swapInvStringBlock(ds, inBytes+offset, itemLength, outBytes+offset, pErrorCode);
467        if(U_FAILURE(*pErrorCode)) {
468            udata_printError(ds, "udata_swapPackage() failed to swap the data item name strings\n");
469            return 0;
470        }
471        /* keep offset and itemLength in case we allocate and copy the strings below */
472
473        /* swap the package names into the output charset */
474        if(ds->outCharset!=U_CHARSET_FAMILY) {
475            UDataSwapper *ds2;
476            ds2=udata_openSwapper(TRUE, U_CHARSET_FAMILY, TRUE, ds->outCharset, pErrorCode);
477            ds2->swapInvChars(ds2, inPkgName, inPkgNameLength, inPkgName, pErrorCode);
478            ds2->swapInvChars(ds2, outPkgName, outPkgNameLength, outPkgName, pErrorCode);
479            udata_closeSwapper(ds2);
480            if(U_FAILURE(*pErrorCode)) {
481                udata_printError(ds, "udata_swapPackage() failed to swap the input/output package names\n");
482            }
483        }
484
485        /* change the prefix of each ToC entry name from the old to the new package name */
486        {
487            char *entryName;
488
489            for(i=0; i<itemCount; ++i) {
490                entryName=(char *)inBytes+ds->readUInt32(inEntries[i].nameOffset);
491
492                if(0==uprv_memcmp(entryName, inPkgName, inPkgNameLength)) {
493                    uprv_memcpy(entryName, outPkgName, inPkgNameLength);
494                } else {
495                    udata_printError(ds, "udata_swapPackage() failed: ToC item %ld does not have the input package name as a prefix\n",
496                                     (long)i);
497                    *pErrorCode=U_INVALID_FORMAT_ERROR;
498                    return 0;
499                }
500            }
501        }
502
503        /*
504         * Allocate the ToC table and, if necessary, a temporary buffer for
505         * pseudo-in-place swapping.
506         *
507         * We cannot swap in-place because:
508         *
509         * 1. If the swapping of an item fails mid-way, then in-place swapping
510         * has destroyed its data.
511         * Out-of-place swapping allows us to then copy its original data.
512         *
513         * 2. If swapping changes the charset family, then we must resort
514         * not only the ToC table but also the data items themselves.
515         * This requires a permutation and is best done with separate in/out
516         * buffers.
517         *
518         * We swapped the strings above to avoid the malloc below if string swapping fails.
519         */
520        if(inData==outData) {
521            /* +15: prepare for extra padding of a newly-last item */
522            table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH);
523            if(table!=NULL) {
524                outBytes=(uint8_t *)(table+itemCount);
525
526                /* copy the item count and the swapped strings */
527                uprv_memcpy(outBytes, inBytes, 4);
528                uprv_memcpy(outBytes+offset, inBytes+offset, itemLength);
529            }
530        } else {
531            table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry));
532        }
533        if(table==NULL) {
534            udata_printError(ds, "udata_swapPackage(): out of memory allocating %d bytes\n",
535                             inData==outData ?
536                                 itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH :
537                                 itemCount*sizeof(ToCEntry));
538            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
539            return 0;
540        }
541        outEntries=(UDataOffsetTOCEntry *)(outBytes+4);
542
543        /* read the ToC table */
544        for(i=0; i<itemCount; ++i) {
545            table[i].nameOffset=ds->readUInt32(inEntries[i].nameOffset);
546            table[i].inOffset=ds->readUInt32(inEntries[i].dataOffset);
547            if(i>0) {
548                table[i-1].length=table[i].inOffset-table[i-1].inOffset;
549            }
550        }
551        table[itemCount-1].length=(uint32_t)length-table[itemCount-1].inOffset;
552
553        if(ds->inCharset==ds->outCharset) {
554            /* no charset swapping, no resorting: keep item offsets the same */
555            for(i=0; i<itemCount; ++i) {
556                table[i].outOffset=table[i].inOffset;
557            }
558        } else {
559            /* charset swapping: resort items by their swapped names */
560
561            /*
562             * Before the actual sorting, we need to make sure that each item
563             * has a length that is a multiple of 16 bytes so that all items
564             * are 16-aligned.
565             * Only the old last item may be missing up to 15 padding bytes.
566             * Add padding bytes for it.
567             * Since the icuswap main() function has already allocated enough
568             * input buffer space and set the last 15 bytes there to 0xaa,
569             * we only need to increase the total data length and the length
570             * of the last item here.
571             */
572            if((length&0xf)!=0) {
573                int32_t delta=16-(length&0xf);
574                length+=delta;
575                table[itemCount-1].length+=(uint32_t)delta;
576            }
577
578            /* Save the offset before we sort the TOC. */
579            offset=table[0].inOffset;
580            /* sort the TOC entries */
581            uprv_sortArray(table, (int32_t)itemCount, (int32_t)sizeof(ToCEntry),
582                           compareToCEntries, outBytes, FALSE, pErrorCode);
583
584            /*
585             * Note: Before sorting, the inOffset values were in order.
586             * Now the outOffset values are in order.
587             */
588
589            /* assign outOffset values */
590            for(i=0; i<itemCount; ++i) {
591                table[i].outOffset=offset;
592                offset+=table[i].length;
593            }
594        }
595
596        /* write the output ToC table */
597        for(i=0; i<itemCount; ++i) {
598            ds->writeUInt32(&outEntries[i].nameOffset, table[i].nameOffset);
599            ds->writeUInt32(&outEntries[i].dataOffset, table[i].outOffset);
600        }
601
602        /* swap each data item */
603        for(i=0; i<itemCount; ++i) {
604            /* first copy the item bytes to make sure that unreachable bytes are copied */
605            uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length);
606
607            /* swap the item */
608            udata_swap(ds, inBytes+table[i].inOffset, (int32_t)table[i].length,
609                          outBytes+table[i].outOffset, pErrorCode);
610
611            if(U_FAILURE(*pErrorCode)) {
612                if(ds->outCharset==U_CHARSET_FAMILY) {
613                    udata_printError(ds, "warning: udata_swapPackage() failed to swap item \"%s\"\n"
614                                         "    at inOffset 0x%x length 0x%x - %s\n"
615                                         "    the data item will be copied, not swapped\n\n",
616                                     (char *)outBytes+table[i].nameOffset,
617                                     table[i].inOffset, table[i].length, u_errorName(*pErrorCode));
618                } else {
619                    udata_printError(ds, "warning: udata_swapPackage() failed to swap an item\n"
620                                         "    at inOffset 0x%x length 0x%x - %s\n"
621                                         "    the data item will be copied, not swapped\n\n",
622                                     table[i].inOffset, table[i].length, u_errorName(*pErrorCode));
623                }
624                /* reset the error code, copy the data item, and continue */
625                *pErrorCode=U_ZERO_ERROR;
626                uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length);
627            }
628        }
629
630        if(inData==outData) {
631            /* copy the data from the temporary buffer to the in-place buffer */
632            uprv_memcpy((uint8_t *)outData+headerSize, outBytes, length);
633        }
634        uprv_free(table);
635
636        return headerSize+length;
637    }
638}
639
640/*
641 * Hey, Emacs, please set the following:
642 *
643 * Local Variables:
644 * indent-tabs-mode: nil
645 * End:
646 *
647 */
648