1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003-2007, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  icuswap.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003aug08
14*   created by: Markus W. Scherer
15*
16*   This tool takes an ICU data file and "swaps" it, that is, changes its
17*   platform properties between big-/little-endianness and ASCII/EBCDIC charset
18*   families.
19*   The modified data file is written to a new file.
20*   Useful as an install-time tool for shipping only one flavor of ICU data
21*   and preparing data files for the target platform.
22*   Will not work with data DLLs (shared libraries).
23*/
24
25#include "unicode/utypes.h"
26#include "unicode/putil.h"
27#include "unicode/udata.h"
28#include "cmemory.h"
29#include "cstring.h"
30#include "uinvchar.h"
31#include "uarrsort.h"
32#include "ucmndata.h"
33#include "udataswp.h"
34#include "swapimpl.h"
35#include "toolutil.h"
36#include "uoptions.h"
37
38#include <stdio.h>
39#include <stdlib.h>
40#include <string.h>
41
42/* definitions */
43
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/*
 * Maximum number of padding bytes (16-byte alignment minus one) that
 * udata_swapPackage() may need to append to the last item of a package.
 */
#define DEFAULT_PADDING_LENGTH 15
46
/*
 * Command-line options handed to u_parseArgs() in main().
 * The entries must stay in the same order as the OPT_ enum constants,
 * which are used as indexes into this array.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_DEF("type", 't', UOPT_REQUIRES_ARG)
};
52
/* Indexes into options[]; each value is the position of the matching entry. */
enum {
    OPT_HELP_H=0,               /* -h / --help */
    OPT_HELP_QUESTION_MARK=1,   /* -? */
    OPT_OUT_TYPE=2              /* -t / --type */
};
58
/**
 * Determine the size of an open, seekable file and rewind it to its start.
 *
 * @param f the open stream
 * @return the file size in bytes, or -1 if seeking or telling failed or if
 *         the size does not fit into an int32_t
 */
static int32_t
fileSize(FILE *f) {
    long size;

    if(fseek(f, 0, SEEK_END)!=0) {
        return -1;
    }
    size=ftell(f);
    if(fseek(f, 0, SEEK_SET)!=0) {
        return -1;
    }
    /* ftell() reports errors as -1; also refuse sizes that would be truncated */
    if(size<0 || size>0x7fffffffL) {
        return -1;
    }
    return (int32_t)size;
}
68
/**
 * Swap an ICU .dat package, including swapping of enclosed items.
 * This is only the forward declaration; the function is defined further
 * down in this file, after main(), which calls it.
 */
U_CFUNC int32_t U_CALLCONV
udata_swapPackage(const char *inFilename, const char *outFilename,
                  const UDataSwapper *ds,
                  const void *inData, int32_t length, void *outData,
                  UErrorCode *pErrorCode);
77
78U_CDECL_BEGIN
79static void U_CALLCONV
80printError(void *context, const char *fmt, va_list args) {
81    vfprintf((FILE *)context, fmt, args);
82}
83U_CDECL_END
84
85static int
86printUsage(const char *pname, UBool ishelp) {
87    fprintf(stderr,
88            "%csage: %s [ -h, -?, --help ] -tl|-tb|-te|--type=b|... infilename outfilename\n",
89            ishelp ? 'U' : 'u', pname);
90    if(ishelp) {
91        fprintf(stderr,
92              "\nOptions: -h, -?, --help    print this message and exit\n"
93                "         Read the input file, swap its platform properties according\n"
94                "         to the -t or --type option, and write the result to the output file.\n"
95                "         -tl               change to little-endian/ASCII charset family\n"
96                "         -tb               change to big-endian/ASCII charset family\n"
97                "         -te               change to big-endian/EBCDIC charset family\n");
98    }
99
100    return !ishelp;
101}
102
103extern int
104main(int argc, char *argv[]) {
105    FILE *in, *out;
106    const char *pname;
107    char *data;
108    int32_t length;
109    UBool ishelp;
110    int rc;
111
112    UDataSwapper *ds;
113    const UDataInfo *pInfo;
114    UErrorCode errorCode;
115    uint8_t outCharset;
116    UBool outIsBigEndian;
117
118    U_MAIN_INIT_ARGS(argc, argv);
119
120    fprintf(stderr, "Warning: icuswap is an obsolete tool and it will be removed in the next ICU release.\nPlease use the icupkg tool instead.\n");
121
122    /* get the program basename */
123    pname=strrchr(argv[0], U_FILE_SEP_CHAR);
124    if(pname==NULL) {
125        pname=strrchr(argv[0], '/');
126    }
127    if(pname!=NULL) {
128        ++pname;
129    } else {
130        pname=argv[0];
131    }
132
133    argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
134    ishelp=options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur;
135    if(ishelp || argc!=3) {
136        return printUsage(pname, ishelp);
137    }
138
139    /* parse the output type option */
140    data=(char *)options[OPT_OUT_TYPE].value;
141    if(data[0]==0 || data[1]!=0) {
142        /* the type must be exactly one letter */
143        return printUsage(pname, FALSE);
144    }
145    switch(data[0]) {
146    case 'l':
147        outIsBigEndian=FALSE;
148        outCharset=U_ASCII_FAMILY;
149        break;
150    case 'b':
151        outIsBigEndian=TRUE;
152        outCharset=U_ASCII_FAMILY;
153        break;
154    case 'e':
155        outIsBigEndian=TRUE;
156        outCharset=U_EBCDIC_FAMILY;
157        break;
158    default:
159        return printUsage(pname, FALSE);
160    }
161
162    in=out=NULL;
163    data=NULL;
164
165    /* open the input file, get its length, allocate memory for it, read the file */
166    in=fopen(argv[1], "rb");
167    if(in==NULL) {
168        fprintf(stderr, "%s: unable to open input file \"%s\"\n", pname, argv[1]);
169        rc=2;
170        goto done;
171    }
172
173    length=fileSize(in);
174    if(length<DEFAULT_PADDING_LENGTH) {
175        fprintf(stderr, "%s: empty input file \"%s\"\n", pname, argv[1]);
176        rc=2;
177        goto done;
178    }
179
180    /*
181     * +15: udata_swapPackage() may need to add a few padding bytes to the
182     * last item if charset swapping is done,
183     * because the last item may be resorted into the middle and then needs
184     * additional padding bytes
185     */
186    data=(char *)malloc(length+DEFAULT_PADDING_LENGTH);
187    if(data==NULL) {
188        fprintf(stderr, "%s: error allocating memory for \"%s\"\n", pname, argv[1]);
189        rc=2;
190        goto done;
191    }
192
193    /* set the last 15 bytes to the usual padding byte, see udata_swapPackage() */
194    uprv_memset(data+length-DEFAULT_PADDING_LENGTH, 0xaa, DEFAULT_PADDING_LENGTH);
195
196    if(length!=(int32_t)fread(data, 1, length, in)) {
197        fprintf(stderr, "%s: error reading \"%s\"\n", pname, argv[1]);
198        rc=3;
199        goto done;
200    }
201
202    fclose(in);
203    in=NULL;
204
205    /* swap the data in-place */
206    errorCode=U_ZERO_ERROR;
207    ds=udata_openSwapperForInputData(data, length, outIsBigEndian, outCharset, &errorCode);
208    if(U_FAILURE(errorCode)) {
209        fprintf(stderr, "%s: udata_openSwapperForInputData(\"%s\") failed - %s\n",
210                pname, argv[1], u_errorName(errorCode));
211        rc=4;
212        goto done;
213    }
214
215    ds->printError=printError;
216    ds->printErrorContext=stderr;
217
218    /* speculative cast, protected by the following length check */
219    pInfo=(const UDataInfo *)((const char *)data+4);
220
221    if( length>=20 &&
222        pInfo->dataFormat[0]==0x43 &&   /* dataFormat="CmnD" */
223        pInfo->dataFormat[1]==0x6d &&
224        pInfo->dataFormat[2]==0x6e &&
225        pInfo->dataFormat[3]==0x44
226    ) {
227        /*
228         * swap the .dat package
229         * udata_swapPackage() needs to rename ToC name entries from the old package
230         * name to the new one.
231         * We pass it the filenames, and udata_swapPackage() will extract the
232         * package names.
233         */
234        length=udata_swapPackage(argv[1], argv[2], ds, data, length, data, &errorCode);
235        udata_closeSwapper(ds);
236        if(U_FAILURE(errorCode)) {
237            fprintf(stderr, "%s: udata_swapPackage(\"%s\") failed - %s\n",
238                    pname, argv[1], u_errorName(errorCode));
239            rc=4;
240            goto done;
241        }
242    } else {
243        /* swap the data, which is not a .dat package */
244        length=udata_swap(ds, data, length, data, &errorCode);
245        udata_closeSwapper(ds);
246        if(U_FAILURE(errorCode)) {
247            fprintf(stderr, "%s: udata_swap(\"%s\") failed - %s\n",
248                    pname, argv[1], u_errorName(errorCode));
249            rc=4;
250            goto done;
251        }
252    }
253
254    out=fopen(argv[2], "wb");
255    if(out==NULL) {
256        fprintf(stderr, "%s: unable to open output file \"%s\"\n", pname, argv[2]);
257        rc=5;
258        goto done;
259    }
260
261    if(length!=(int32_t)fwrite(data, 1, length, out)) {
262        fprintf(stderr, "%s: error writing \"%s\"\n", pname, argv[2]);
263        rc=6;
264        goto done;
265    }
266
267    fclose(out);
268    out=NULL;
269
270    /* all done */
271    rc=0;
272
273done:
274    if(in!=NULL) {
275        fclose(in);
276    }
277    if(out!=NULL) {
278        fclose(out);
279    }
280    if(data!=NULL) {
281        free(data);
282    }
283    return rc;
284}
285
286/* swap .dat package files -------------------------------------------------- */
287
288static int32_t
289extractPackageName(const UDataSwapper *ds, const char *filename,
290                   char pkg[], int32_t capacity,
291                   UErrorCode *pErrorCode) {
292    const char *basename;
293    int32_t len;
294
295    if(U_FAILURE(*pErrorCode)) {
296        return 0;
297    }
298
299    basename=findBasename(filename);
300    len=(int32_t)uprv_strlen(basename)-4; /* -4: subtract the length of ".dat" */
301
302    if(len<=0 || 0!=uprv_strcmp(basename+len, ".dat")) {
303        udata_printError(ds, "udata_swapPackage(): \"%s\" is not recognized as a package filename (must end with .dat)\n",
304                         basename);
305        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
306        return 0;
307    }
308
309    if(len>=capacity) {
310        udata_printError(ds, "udata_swapPackage(): the package name \"%s\" is too long (>=%ld)\n",
311                         (long)capacity);
312        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
313        return 0;
314    }
315
316    uprv_memcpy(pkg, basename, len);
317    pkg[len]=0;
318    return len;
319}
320
/*
 * Working copy of one table-of-contents entry of a .dat package,
 * with all values in platform byte order.
 * All offsets are relative to the first byte after the data header.
 */
struct ToCEntry {
    uint32_t nameOffset;    /* offset of the item's name string */
    uint32_t inOffset;      /* offset of the item's data in the input */
    uint32_t outOffset;     /* offset of the item's data in the output */
    uint32_t length;        /* number of bytes of the item's data */
};
324
325U_CDECL_BEGIN
326static int32_t U_CALLCONV
327compareToCEntries(const void *context, const void *left, const void *right) {
328    const char *chars=(const char *)context;
329    return (int32_t)uprv_strcmp(chars+((const ToCEntry *)left)->nameOffset,
330                                chars+((const ToCEntry *)right)->nameOffset);
331}
332U_CDECL_END
333
334U_CFUNC int32_t U_CALLCONV
335udata_swapPackage(const char *inFilename, const char *outFilename,
336                  const UDataSwapper *ds,
337                  const void *inData, int32_t length, void *outData,
338                  UErrorCode *pErrorCode) {
339    const UDataInfo *pInfo;
340    int32_t headerSize;
341
342    const uint8_t *inBytes;
343    uint8_t *outBytes;
344
345    uint32_t itemCount, offset, i;
346    int32_t itemLength;
347
348    const UDataOffsetTOCEntry *inEntries;
349    UDataOffsetTOCEntry *outEntries;
350
351    ToCEntry *table;
352
353    char inPkgName[32], outPkgName[32];
354    int32_t inPkgNameLength, outPkgNameLength;
355
356    /* udata_swapDataHeader checks the arguments */
357    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
358    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
359        return 0;
360    }
361
362    /* check data format and format version */
363    pInfo=(const UDataInfo *)((const char *)inData+4);
364    if(!(
365        pInfo->dataFormat[0]==0x43 &&   /* dataFormat="CmnD" */
366        pInfo->dataFormat[1]==0x6d &&
367        pInfo->dataFormat[2]==0x6e &&
368        pInfo->dataFormat[3]==0x44 &&
369        pInfo->formatVersion[0]==1
370    )) {
371        udata_printError(ds, "udata_swapPackage(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as an ICU .dat package\n",
372                         pInfo->dataFormat[0], pInfo->dataFormat[1],
373                         pInfo->dataFormat[2], pInfo->dataFormat[3],
374                         pInfo->formatVersion[0]);
375        *pErrorCode=U_UNSUPPORTED_ERROR;
376        return 0;
377    }
378
379    /*
380     * We need to change the ToC name entries so that they have the correct
381     * package name prefix.
382     * Extract the package names from the in/out filenames.
383     */
384    inPkgNameLength=extractPackageName(
385                        ds, inFilename,
386                        inPkgName, (int32_t)sizeof(inPkgName),
387                        pErrorCode);
388    outPkgNameLength=extractPackageName(
389                        ds, outFilename,
390                        outPkgName, (int32_t)sizeof(outPkgName),
391                        pErrorCode);
392    if(U_FAILURE(*pErrorCode)) {
393        return 0;
394    }
395
396    /*
397     * It is possible to work with inPkgNameLength!=outPkgNameLength,
398     * but then the length of the data file would change more significantly,
399     * which we are not currently prepared for.
400     */
401    if(inPkgNameLength!=outPkgNameLength) {
402        udata_printError(ds, "udata_swapPackage(): the package names \"%s\" and \"%s\" must have the same length\n",
403                         inPkgName, outPkgName);
404        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
405        return 0;
406    }
407
408    inBytes=(const uint8_t *)inData+headerSize;
409    inEntries=(const UDataOffsetTOCEntry *)(inBytes+4);
410
411    if(length<0) {
412        /* preflighting */
413        itemCount=ds->readUInt32(*(const uint32_t *)inBytes);
414        if(itemCount==0) {
415            /* no items: count only the item count and return */
416            return headerSize+4;
417        }
418
419        /* read the last item's offset and preflight it */
420        offset=ds->readUInt32(inEntries[itemCount-1].dataOffset);
421        itemLength=udata_swap(ds, inBytes+offset, -1, NULL, pErrorCode);
422
423        if(U_SUCCESS(*pErrorCode)) {
424            return headerSize+offset+(uint32_t)itemLength;
425        } else {
426            return 0;
427        }
428    } else {
429        /* check that the itemCount fits, then the ToC table, then at least the header of the last item */
430        length-=headerSize;
431        if(length<4) {
432            /* itemCount does not fit */
433            offset=0xffffffff;
434            itemCount=0; /* make compilers happy */
435        } else {
436            itemCount=ds->readUInt32(*(const uint32_t *)inBytes);
437            if(itemCount==0) {
438                offset=4;
439            } else if((uint32_t)length<(4+8*itemCount)) {
440                /* ToC table does not fit */
441                offset=0xffffffff;
442            } else {
443                /* offset of the last item plus at least 20 bytes for its header */
444                offset=20+ds->readUInt32(inEntries[itemCount-1].dataOffset);
445            }
446        }
447        if((uint32_t)length<offset) {
448            udata_printError(ds, "udata_swapPackage(): too few bytes (%d after header) for a .dat package\n",
449                             length);
450            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
451            return 0;
452        }
453
454        outBytes=(uint8_t *)outData+headerSize;
455
456        /* swap the item count */
457        ds->swapArray32(ds, inBytes, 4, outBytes, pErrorCode);
458
459        if(itemCount==0) {
460            /* no items: just return now */
461            return headerSize+4;
462        }
463
464        /* swap the item name strings */
465        offset=4+8*itemCount;
466        itemLength=(int32_t)(ds->readUInt32(inEntries[0].dataOffset)-offset);
467        udata_swapInvStringBlock(ds, inBytes+offset, itemLength, outBytes+offset, pErrorCode);
468        if(U_FAILURE(*pErrorCode)) {
469            udata_printError(ds, "udata_swapPackage() failed to swap the data item name strings\n");
470            return 0;
471        }
472        /* keep offset and itemLength in case we allocate and copy the strings below */
473
474        /* swap the package names into the output charset */
475        if(ds->outCharset!=U_CHARSET_FAMILY) {
476            UDataSwapper *ds2;
477            ds2=udata_openSwapper(TRUE, U_CHARSET_FAMILY, TRUE, ds->outCharset, pErrorCode);
478            ds2->swapInvChars(ds2, inPkgName, inPkgNameLength, inPkgName, pErrorCode);
479            ds2->swapInvChars(ds2, outPkgName, outPkgNameLength, outPkgName, pErrorCode);
480            udata_closeSwapper(ds2);
481            if(U_FAILURE(*pErrorCode)) {
482                udata_printError(ds, "udata_swapPackage() failed to swap the input/output package names\n");
483            }
484        }
485
486        /* change the prefix of each ToC entry name from the old to the new package name */
487        {
488            char *entryName;
489
490            for(i=0; i<itemCount; ++i) {
491                entryName=(char *)inBytes+ds->readUInt32(inEntries[i].nameOffset);
492
493                if(0==uprv_memcmp(entryName, inPkgName, inPkgNameLength)) {
494                    uprv_memcpy(entryName, outPkgName, inPkgNameLength);
495                } else {
496                    udata_printError(ds, "udata_swapPackage() failed: ToC item %ld does not have the input package name as a prefix\n",
497                                     (long)i);
498                    *pErrorCode=U_INVALID_FORMAT_ERROR;
499                    return 0;
500                }
501            }
502        }
503
504        /*
505         * Allocate the ToC table and, if necessary, a temporary buffer for
506         * pseudo-in-place swapping.
507         *
508         * We cannot swap in-place because:
509         *
510         * 1. If the swapping of an item fails mid-way, then in-place swapping
511         * has destroyed its data.
512         * Out-of-place swapping allows us to then copy its original data.
513         *
514         * 2. If swapping changes the charset family, then we must resort
515         * not only the ToC table but also the data items themselves.
516         * This requires a permutation and is best done with separate in/out
517         * buffers.
518         *
519         * We swapped the strings above to avoid the malloc below if string swapping fails.
520         */
521        if(inData==outData) {
522            /* +15: prepare for extra padding of a newly-last item */
523            table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH);
524            if(table!=NULL) {
525                outBytes=(uint8_t *)(table+itemCount);
526
527                /* copy the item count and the swapped strings */
528                uprv_memcpy(outBytes, inBytes, 4);
529                uprv_memcpy(outBytes+offset, inBytes+offset, itemLength);
530            }
531        } else {
532            table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry));
533        }
534        if(table==NULL) {
535            udata_printError(ds, "udata_swapPackage(): out of memory allocating %d bytes\n",
536                             inData==outData ?
537                                 itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH :
538                                 itemCount*sizeof(ToCEntry));
539            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
540            return 0;
541        }
542        outEntries=(UDataOffsetTOCEntry *)(outBytes+4);
543
544        /* read the ToC table */
545        for(i=0; i<itemCount; ++i) {
546            table[i].nameOffset=ds->readUInt32(inEntries[i].nameOffset);
547            table[i].inOffset=ds->readUInt32(inEntries[i].dataOffset);
548            if(i>0) {
549                table[i-1].length=table[i].inOffset-table[i-1].inOffset;
550            }
551        }
552        table[itemCount-1].length=(uint32_t)length-table[itemCount-1].inOffset;
553
554        if(ds->inCharset==ds->outCharset) {
555            /* no charset swapping, no resorting: keep item offsets the same */
556            for(i=0; i<itemCount; ++i) {
557                table[i].outOffset=table[i].inOffset;
558            }
559        } else {
560            /* charset swapping: resort items by their swapped names */
561
562            /*
563             * Before the actual sorting, we need to make sure that each item
564             * has a length that is a multiple of 16 bytes so that all items
565             * are 16-aligned.
566             * Only the old last item may be missing up to 15 padding bytes.
567             * Add padding bytes for it.
568             * Since the icuswap main() function has already allocated enough
569             * input buffer space and set the last 15 bytes there to 0xaa,
570             * we only need to increase the total data length and the length
571             * of the last item here.
572             */
573            if((length&0xf)!=0) {
574                int32_t delta=16-(length&0xf);
575                length+=delta;
576                table[itemCount-1].length+=(uint32_t)delta;
577            }
578
579            /* Save the offset before we sort the TOC. */
580            offset=table[0].inOffset;
581            /* sort the TOC entries */
582            uprv_sortArray(table, (int32_t)itemCount, (int32_t)sizeof(ToCEntry),
583                           compareToCEntries, outBytes, FALSE, pErrorCode);
584
585            /*
586             * Note: Before sorting, the inOffset values were in order.
587             * Now the outOffset values are in order.
588             */
589
590            /* assign outOffset values */
591            for(i=0; i<itemCount; ++i) {
592                table[i].outOffset=offset;
593                offset+=table[i].length;
594            }
595        }
596
597        /* write the output ToC table */
598        for(i=0; i<itemCount; ++i) {
599            ds->writeUInt32(&outEntries[i].nameOffset, table[i].nameOffset);
600            ds->writeUInt32(&outEntries[i].dataOffset, table[i].outOffset);
601        }
602
603        /* swap each data item */
604        for(i=0; i<itemCount; ++i) {
605            /* first copy the item bytes to make sure that unreachable bytes are copied */
606            uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length);
607
608            /* swap the item */
609            udata_swap(ds, inBytes+table[i].inOffset, (int32_t)table[i].length,
610                          outBytes+table[i].outOffset, pErrorCode);
611
612            if(U_FAILURE(*pErrorCode)) {
613                if(ds->outCharset==U_CHARSET_FAMILY) {
614                    udata_printError(ds, "warning: udata_swapPackage() failed to swap item \"%s\"\n"
615                                         "    at inOffset 0x%x length 0x%x - %s\n"
616                                         "    the data item will be copied, not swapped\n\n",
617                                     (char *)outBytes+table[i].nameOffset,
618                                     table[i].inOffset, table[i].length, u_errorName(*pErrorCode));
619                } else {
620                    udata_printError(ds, "warning: udata_swapPackage() failed to swap an item\n"
621                                         "    at inOffset 0x%x length 0x%x - %s\n"
622                                         "    the data item will be copied, not swapped\n\n",
623                                     table[i].inOffset, table[i].length, u_errorName(*pErrorCode));
624                }
625                /* reset the error code, copy the data item, and continue */
626                *pErrorCode=U_ZERO_ERROR;
627                uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length);
628            }
629        }
630
631        if(inData==outData) {
632            /* copy the data from the temporary buffer to the in-place buffer */
633            uprv_memcpy((uint8_t *)outData+headerSize, outBytes, length);
634        }
635        uprv_free(table);
636
637        return headerSize+length;
638    }
639}
640
641/*
642 * Hey, Emacs, please set the following:
643 *
644 * Local Variables:
645 * indent-tabs-mode: nil
646 * End:
647 *
648 */
649