1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/******************************************************************************
4 *   Copyright (C) 2008-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *******************************************************************************
7 */
8#include "unicode/utypes.h"
9
10#include <stdio.h>
11#include <stdlib.h>
12#include "unicode/utypes.h"
13#include "unicode/putil.h"
14#include "cmemory.h"
15#include "cstring.h"
16#include "filestrm.h"
17#include "toolutil.h"
18#include "unicode/uclean.h"
19#include "unewdata.h"
20#include "putilimp.h"
21#include "pkg_gencmn.h"
22
/* Capacity, in bytes, of the static stringStore[] that allocString() hands out pieces of. */
#define STRING_STORE_SIZE   200000

/* Package name used by createCommonDataFile() when the caller passes name==NULL. */
#define COMMON_DATA_NAME    U_ICUDATA_NAME
/* Data type (file name suffix) used by createCommonDataFile() when the caller passes type==NULL. */
#define DATA_TYPE           "dat"
27
28/* ICU package data file format (.dat files) ------------------------------- ***
29
30Description of the data format after the usual ICU data file header
31(UDataInfo etc.).
32
33Format version 1
34
35A .dat package file contains a simple Table of Contents of item names,
36followed by the items themselves:
37
381. ToC table
39
40uint32_t count; - number of items
41UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item:
42    uint32_t nameOffset; - offset of the item name
43    uint32_t dataOffset; - offset of the item data
44both are byte offsets from the beginning of the data
45
462. item name strings
47
48All item names are stored as char * strings in one block between the ToC table
49and the data items.
50
513. data items
52
53The data items are stored following the item names block.
54Each data item is 16-aligned.
55The data items are stored in the sorted order of their names.
56
Therefore, the end of the name strings block (rounded up to a multiple of 16)
is the offset of the first item, the length of the last item is the difference
between its offset and the .dat file length, and the length of each earlier
item is the difference between its offset and the offset of the next item.
61
62----------------------------------------------------------------------------- */
63
64/* UDataInfo cf. udata.h */
65static const UDataInfo dataInfo={
66    sizeof(UDataInfo),
67    0,
68
69    U_IS_BIG_ENDIAN,
70    U_CHARSET_FAMILY,
71    sizeof(UChar),
72    0,
73
74    {0x43, 0x6d, 0x6e, 0x44},     /* dataFormat="CmnD" */
75    {1, 0, 0, 0},                 /* formatVersion */
76    {3, 0, 0, 0}                  /* dataVersion */
77};
78
79static uint32_t maxSize;
80
81static char stringStore[STRING_STORE_SIZE];
82static uint32_t stringTop=0, basenameTotal=0;
83
84typedef struct {
85    char *pathname, *basename;
86    uint32_t basenameLength, basenameOffset, fileSize, fileOffset;
87} File;
88
89#define CHUNK_FILE_COUNT 256
90static File *files = NULL;
91static uint32_t fileCount=0;
92static uint32_t fileMax = 0;
93
94
95static char *symPrefix = NULL;
96
97#define LINE_BUFFER_SIZE 512
98/* prototypes --------------------------------------------------------------- */
99
100static void
101addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose);
102
103static char *
104allocString(uint32_t length);
105
106static int
107compareFiles(const void *file1, const void *file2);
108
109static char *
110pathToFullPath(const char *path, const char *source);
111
112/* map non-tree separator (such as '\') to tree separator ('/') inplace. */
113static void
114fixDirToTreePath(char *s);
115/* -------------------------------------------------------------------------- */
116
117U_CAPI void U_EXPORT2
118createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight,
119                     const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) {
120    static char buffer[4096];
121    char *line;
122    char *linePtr;
123    char *s = NULL;
124    UErrorCode errorCode=U_ZERO_ERROR;
125    uint32_t i, fileOffset, basenameOffset, length, nread;
126    FileStream *in, *file;
127
128    line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE);
129    if (line == NULL) {
130        fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE);
131        exit(U_MEMORY_ALLOCATION_ERROR);
132    }
133
134    linePtr = line;
135
136    maxSize = max_size;
137
138    if (destDir == NULL) {
139        destDir = u_getDataDirectory();
140    }
141    if (name == NULL) {
142        name = COMMON_DATA_NAME;
143    }
144    if (type == NULL) {
145        type = DATA_TYPE;
146    }
147    if (source == NULL) {
148        source = ".";
149    }
150
151    if (dataFile == NULL) {
152        in = T_FileStream_stdin();
153    } else {
154        in = T_FileStream_open(dataFile, "r");
155        if(in == NULL) {
156            fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile);
157            exit(U_FILE_ACCESS_ERROR);
158        }
159    }
160
161    if (verbose) {
162        if(sourceTOC) {
163            printf("generating %s_%s.c (table of contents source file)\n", name, type);
164        } else {
165            printf("generating %s.%s (common data file with table of contents)\n", name, type);
166        }
167    }
168
169    /* read the list of files and get their lengths */
170    while((s != NULL && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr),
171                                                             LINE_BUFFER_SIZE))!=NULL) {
172        /* remove trailing newline characters and parse space separated items */
173        if (s != NULL && *s != 0) {
174            line=s;
175        } else {
176            s=line;
177        }
178        while(*s!=0) {
179            if(*s==' ') {
180                *s=0;
181                ++s;
182                break;
183            } else if(*s=='\r' || *s=='\n') {
184                *s=0;
185                break;
186            }
187            ++s;
188        }
189
190        /* check for comment */
191
192        if (*line == '#') {
193            continue;
194        }
195
196        /* add the file */
197#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
198        {
199          char *t;
200          while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) {
201            *t = U_FILE_SEP_CHAR;
202          }
203        }
204#endif
205        addFile(getLongPathname(line), name, source, sourceTOC, verbose);
206    }
207
208    uprv_free(linePtr);
209
210    if(in!=T_FileStream_stdin()) {
211        T_FileStream_close(in);
212    }
213
214    if(fileCount==0) {
215        fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == NULL ? "<stdin>" : dataFile);
216        return;
217    }
218
219    /* sort the files by basename */
220    qsort(files, fileCount, sizeof(File), compareFiles);
221
222    if(!sourceTOC) {
223        UNewDataMemory *out;
224
225        /* determine the offsets of all basenames and files in this common one */
226        basenameOffset=4+8*fileCount;
227        fileOffset=(basenameOffset+(basenameTotal+15))&~0xf;
228        for(i=0; i<fileCount; ++i) {
229            files[i].fileOffset=fileOffset;
230            fileOffset+=(files[i].fileSize+15)&~0xf;
231            files[i].basenameOffset=basenameOffset;
232            basenameOffset+=files[i].basenameLength;
233        }
234
235        /* create the output file */
236        out=udata_create(destDir, type, name,
237                         &dataInfo,
238                         copyRight == NULL ? U_COPYRIGHT_STRING : copyRight,
239                         &errorCode);
240        if(U_FAILURE(errorCode)) {
241            fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n",
242                destDir, name, type,
243                u_errorName(errorCode));
244            exit(errorCode);
245        }
246
247        /* write the table of contents */
248        udata_write32(out, fileCount);
249        for(i=0; i<fileCount; ++i) {
250            udata_write32(out, files[i].basenameOffset);
251            udata_write32(out, files[i].fileOffset);
252        }
253
254        /* write the basenames */
255        for(i=0; i<fileCount; ++i) {
256            udata_writeString(out, files[i].basename, files[i].basenameLength);
257        }
258        length=4+8*fileCount+basenameTotal;
259
260        /* copy the files */
261        for(i=0; i<fileCount; ++i) {
262            /* pad to 16-align the next file */
263            length&=0xf;
264            if(length!=0) {
265                udata_writePadding(out, 16-length);
266            }
267
268            if (verbose) {
269                printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
270            }
271
272            /* copy the next file */
273            file=T_FileStream_open(files[i].pathname, "rb");
274            if(file==NULL) {
275                fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname);
276                exit(U_FILE_ACCESS_ERROR);
277            }
278            for(nread = 0;;) {
279                length=T_FileStream_read(file, buffer, sizeof(buffer));
280                if(length <= 0) {
281                    break;
282                }
283                nread += length;
284                udata_writeBlock(out, buffer, length);
285            }
286            T_FileStream_close(file);
287            length=files[i].fileSize;
288
289            if (nread != files[i].fileSize) {
290              fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname,  (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
291                exit(U_FILE_ACCESS_ERROR);
292            }
293        }
294
295        /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */
296        length&=0xf;
297        if(length!=0) {
298            udata_writePadding(out, 16-length);
299        }
300
301        /* finish */
302        udata_finish(out, &errorCode);
303        if(U_FAILURE(errorCode)) {
304            fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode));
305            exit(errorCode);
306        }
307    } else {
308        /* write a .c source file with the table of contents */
309        char *filename;
310        FileStream *out;
311
312        /* create the output filename */
313        filename=s=buffer;
314        uprv_strcpy(filename, destDir);
315        s=filename+uprv_strlen(filename);
316        if(s>filename && *(s-1)!=U_FILE_SEP_CHAR) {
317            *s++=U_FILE_SEP_CHAR;
318        }
319        uprv_strcpy(s, name);
320        if(*(type)!=0) {
321            s+=uprv_strlen(s);
322            *s++='_';
323            uprv_strcpy(s, type);
324        }
325        s+=uprv_strlen(s);
326        uprv_strcpy(s, ".c");
327
328        /* open the output file */
329        out=T_FileStream_open(filename, "w");
330        if (gencmnFileName != NULL) {
331            uprv_strcpy(gencmnFileName, filename);
332        }
333        if(out==NULL) {
334            fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename);
335            exit(U_FILE_ACCESS_ERROR);
336        }
337
338        /* write the source file */
339        sprintf(buffer,
340            "/*\n"
341            " * ICU common data table of contents for %s.%s\n"
342            " * Automatically generated by icu/source/tools/gencmn/gencmn .\n"
343            " */\n\n"
344            "#include \"unicode/utypes.h\"\n"
345            "#include \"unicode/udata.h\"\n"
346            "\n"
347            "/* external symbol declarations for data (%d files) */\n",
348                name, type, fileCount);
349        T_FileStream_writeLine(out, buffer);
350
351        sprintf(buffer, "extern const char\n    %s%s[]", symPrefix?symPrefix:"", files[0].pathname);
352        T_FileStream_writeLine(out, buffer);
353        for(i=1; i<fileCount; ++i) {
354            sprintf(buffer, ",\n    %s%s[]", symPrefix?symPrefix:"", files[i].pathname);
355            T_FileStream_writeLine(out, buffer);
356        }
357        T_FileStream_writeLine(out, ";\n\n");
358
359        sprintf(
360            buffer,
361            "U_EXPORT struct {\n"
362            "    uint16_t headerSize;\n"
363            "    uint8_t magic1, magic2;\n"
364            "    UDataInfo info;\n"
365            "    char padding[%lu];\n"
366            "    uint32_t count, reserved;\n"
367            "    struct {\n"
368            "        const char *name;\n"
369            "        const void *data;\n"
370            "    } toc[%lu];\n"
371            "} U_EXPORT2 %s_dat = {\n"
372            "    32, 0xda, 0x27, {\n"
373            "        %lu, 0,\n"
374            "        %u, %u, %u, 0,\n"
375            "        {0x54, 0x6f, 0x43, 0x50},\n"
376            "        {1, 0, 0, 0},\n"
377            "        {0, 0, 0, 0}\n"
378            "    },\n"
379            "    \"\", %lu, 0, {\n",
380            (unsigned long)32-4-sizeof(UDataInfo),
381            (unsigned long)fileCount,
382            entrypointName,
383            (unsigned long)sizeof(UDataInfo),
384            U_IS_BIG_ENDIAN,
385            U_CHARSET_FAMILY,
386            U_SIZEOF_UCHAR,
387            (unsigned long)fileCount
388        );
389        T_FileStream_writeLine(out, buffer);
390
391        sprintf(buffer, "        { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname);
392        T_FileStream_writeLine(out, buffer);
393        for(i=1; i<fileCount; ++i) {
394            sprintf(buffer, ",\n        { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname);
395            T_FileStream_writeLine(out, buffer);
396        }
397
398        T_FileStream_writeLine(out, "\n    }\n};\n");
399        T_FileStream_close(out);
400
401        uprv_free(symPrefix);
402    }
403}
404
405static void
406addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) {
407    char *s;
408    uint32_t length;
409    char *fullPath = NULL;
410
411    if(fileCount==fileMax) {
412      fileMax += CHUNK_FILE_COUNT;
413      files = uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */
414      if(files==NULL) {
415        fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %d files\n", (unsigned int)(fileMax*sizeof(files[0])), fileCount);
416        exit(U_MEMORY_ALLOCATION_ERROR);
417      }
418    }
419
420    if(!sourceTOC) {
421        FileStream *file;
422
423        if(uprv_pathIsAbsolute(filename)) {
424            fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename);
425            exit(U_ILLEGAL_ARGUMENT_ERROR);
426        }
427        fullPath = pathToFullPath(filename, source);
428        /* store the pathname */
429        length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
430        s=allocString(length);
431        uprv_strcpy(s, name);
432        uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
433        uprv_strcat(s, filename);
434
435        /* get the basename */
436        fixDirToTreePath(s);
437        files[fileCount].basename=s;
438        files[fileCount].basenameLength=length;
439
440        files[fileCount].pathname=fullPath;
441
442        basenameTotal+=length;
443
444        /* try to open the file */
445        file=T_FileStream_open(fullPath, "rb");
446        if(file==NULL) {
447            fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath);
448            exit(U_FILE_ACCESS_ERROR);
449        }
450
451        /* get the file length */
452        length=T_FileStream_size(file);
453        if(T_FileStream_error(file) || length<=20) {
454            fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath);
455            exit(U_FILE_ACCESS_ERROR);
456        }
457
458        T_FileStream_close(file);
459
460        /* do not add files that are longer than maxSize */
461        if(maxSize && length>maxSize) {
462            if (verbose) {
463                printf("%s ignored (size %ld > %ld)\n", fullPath, (long)length, (long)maxSize);
464            }
465            return;
466        }
467        files[fileCount].fileSize=length;
468    } else {
469        char *t;
470        /* get and store the basename */
471        /* need to include the package name */
472        length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
473        s=allocString(length);
474        uprv_strcpy(s, name);
475        uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
476        uprv_strcat(s, filename);
477        fixDirToTreePath(s);
478        files[fileCount].basename=s;
479        /* turn the basename into an entry point name and store in the pathname field */
480        t=files[fileCount].pathname=allocString(length);
481        while(--length>0) {
482            if(*s=='.' || *s=='-' || *s=='/') {
483                *t='_';
484            } else {
485                *t=*s;
486            }
487            ++s;
488            ++t;
489        }
490        *t=0;
491    }
492    ++fileCount;
493}
494
495static char *
496allocString(uint32_t length) {
497    uint32_t top=stringTop+length;
498    char *p;
499
500    if(top>STRING_STORE_SIZE) {
501        fprintf(stderr, "gencmn: out of memory\n");
502        exit(U_MEMORY_ALLOCATION_ERROR);
503    }
504    p=stringStore+stringTop;
505    stringTop=top;
506    return p;
507}
508
509static char *
510pathToFullPath(const char *path, const char *source) {
511    int32_t length;
512    int32_t newLength;
513    char *fullPath;
514    int32_t n;
515
516    length = (uint32_t)(uprv_strlen(path) + 1);
517    newLength = (length + 1 + (int32_t)uprv_strlen(source));
518    fullPath = uprv_malloc(newLength);
519    if(source != NULL) {
520        uprv_strcpy(fullPath, source);
521        uprv_strcat(fullPath, U_FILE_SEP_STRING);
522    } else {
523        fullPath[0] = 0;
524    }
525    n = (int32_t)uprv_strlen(fullPath);
526    fullPath[n] = 0;       /* Suppress compiler warning for unused variable n    */
527                           /*  when conditional code below is not compiled.      */
528    uprv_strcat(fullPath, path);
529
530#if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
531#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR)
532    /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
533    for(;fullPath[n];n++) {
534        if(fullPath[n] == U_FILE_ALT_SEP_CHAR) {
535            fullPath[n] = U_FILE_SEP_CHAR;
536        }
537    }
538#endif
539#endif
540#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
541    /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
542    for(;fullPath[n];n++) {
543        if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) {
544            fullPath[n] = U_FILE_SEP_CHAR;
545        }
546    }
547#endif
548    return fullPath;
549}
550
551static int
552compareFiles(const void *file1, const void *file2) {
553    /* sort by basename */
554    return uprv_strcmp(((File *)file1)->basename, ((File *)file2)->basename);
555}
556
/*
 * Replace, in place, every file separator (and every alternate file separator)
 * in s with the tree separator. On platforms where these characters are all
 * the same, the string is left untouched.
 */
static void
fixDirToTreePath(char *s)
{
#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR))
    char *cursor;

    for(cursor=s; *cursor!=0; ++cursor) {
#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
        if(*cursor == U_FILE_SEP_CHAR) {
            *cursor = U_TREE_ENTRY_SEP_CHAR;
        }
#endif
#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
        if(*cursor == U_FILE_ALT_SEP_CHAR) {
            *cursor = U_TREE_ENTRY_SEP_CHAR;
        }
#endif
    }
#else
    (void)s;   /* nothing to convert on this platform */
#endif
}
574