genbidi.c revision 85bf2e2fbc60a9f938064abc8127d61da7d19882
1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2008, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  genbidi.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004dec30
14*   created by: Markus W. Scherer
15*
16*   This program reads several of the Unicode character database text files,
17*   parses them, and extracts the bidi/shaping properties for each character.
18*   It then writes a binary file containing the properties
19*   that is designed to be used directly for random-access to
20*   the properties of each Unicode character.
21*/
22
23#include <stdio.h>
24#include "unicode/utypes.h"
25#include "unicode/uchar.h"
26#include "unicode/putil.h"
27#include "unicode/uclean.h"
28#include "cmemory.h"
29#include "cstring.h"
30#include "uarrsort.h"
31#include "unewdata.h"
32#include "uoptions.h"
33#include "uparse.h"
34#include "propsvec.h"
35#include "ubidi_props.h"
36#include "genbidi.h"
37
38#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
39
40/* data --------------------------------------------------------------------- */
41
42UPropsVectors *pv;
43
44UBool beVerbose=FALSE, haveCopyright=TRUE;
45
46/* prototypes --------------------------------------------------------------- */
47
48static UBool
49isToken(const char *token, const char *s);
50
51static void
52parseBidiMirroring(const char *filename, UErrorCode *pErrorCode);
53
54static void
55parseDB(const char *filename, UErrorCode *pErrorCode);
56
57/* miscellaneous ------------------------------------------------------------ */
58
59/* TODO: more common code, move functions to uparse.h|c */
60
/*
 * Trim the field [s, limit[ in place.
 * Leading whitespace is skipped with u_skipWhitespace(); trailing spaces and
 * tabs are cut off by writing a NUL terminator at the new end of the field.
 *
 * @param s     start of the field
 * @param limit one past the last character of the field; the character at the
 *              (possibly moved) limit is overwritten with NUL
 * @return pointer to the first character after the leading whitespace
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);
    char *last=limit;

    /* walk backwards over trailing blanks, but never past the trimmed start */
    for(; first<last; --last) {
        char c=*(last-1);
        if(c!=' ' && c!='\t') {
            break;
        }
    }
    *last=0;

    return first;
}
74
75static void
76parseTwoFieldFile(char *filename, char *basename,
77                  const char *ucdFile, const char *suffix,
78                  UParseLineFn *lineFn,
79                  UErrorCode *pErrorCode) {
80    char *fields[2][2];
81
82    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
83        return;
84    }
85
86    writeUCDFilename(basename, ucdFile, suffix);
87
88    u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
89    if(U_FAILURE(*pErrorCode)) {
90        fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
91    }
92}
93
94static void U_CALLCONV
95bidiClassLineFn(void *context,
96                char *fields[][2], int32_t fieldCount,
97                UErrorCode *pErrorCode);
98
99/* parse files with single enumerated properties ---------------------------- */
100
101/* TODO: more common code, move functions to uparse.h|c */
102
103struct SingleEnum {
104    const char *ucdFile, *propName;
105    UProperty prop;
106    int32_t vecWord, vecShift;
107    uint32_t vecMask;
108};
109typedef struct SingleEnum SingleEnum;
110
111static void
112parseSingleEnumFile(char *filename, char *basename, const char *suffix,
113                    const SingleEnum *sen,
114                    UErrorCode *pErrorCode);
115
116static const SingleEnum jtSingleEnum={
117    "DerivedJoiningType", "joining type",
118    UCHAR_JOINING_TYPE,
119    0, UBIDI_JT_SHIFT, UBIDI_JT_MASK
120};
121
122static const SingleEnum jgSingleEnum={
123    "DerivedJoiningGroup", "joining group",
124    UCHAR_JOINING_GROUP,
125    1, 0, 0xff                  /* column 1 bits 7..0 */
126};
127
128static void U_CALLCONV
129singleEnumLineFn(void *context,
130                 char *fields[][2], int32_t fieldCount,
131                 UErrorCode *pErrorCode) {
132    const SingleEnum *sen;
133    char *s;
134    uint32_t start, end, uv;
135    int32_t value;
136
137    sen=(const SingleEnum *)context;
138
139    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
140    if(U_FAILURE(*pErrorCode)) {
141        fprintf(stderr, "genbidi: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
142        exit(*pErrorCode);
143    }
144
145    /* parse property alias */
146    s=trimTerminateField(fields[1][0], fields[1][1]);
147    value=u_getPropertyValueEnum(sen->prop, s);
148    if(value<0) {
149        if(sen->prop==UCHAR_BLOCK) {
150            if(isToken("Greek", s)) {
151                value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
152            } else if(isToken("Combining Marks for Symbols", s)) {
153                value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
154            } else if(isToken("Private Use", s)) {
155                value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
156            }
157        }
158    }
159    if(value<0) {
160        fprintf(stderr, "genbidi error: unknown %s name in %s.txt field 1 at %s\n",
161                        sen->propName, sen->ucdFile, s);
162        exit(U_PARSE_ERROR);
163    }
164
165    uv=(uint32_t)(value<<sen->vecShift);
166    if((uv&sen->vecMask)!=uv) {
167        fprintf(stderr, "genbidi error: %s value overflow (0x%x) at %s\n",
168                        sen->propName, (int)uv, s);
169        exit(U_INTERNAL_PROGRAM_ERROR);
170    }
171
172    upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode);
173    if(U_FAILURE(*pErrorCode)) {
174        fprintf(stderr, "genbidi error: unable to set %s code: %s\n",
175                        sen->propName, u_errorName(*pErrorCode));
176        exit(*pErrorCode);
177    }
178}
179
180static void
181parseSingleEnumFile(char *filename, char *basename, const char *suffix,
182                    const SingleEnum *sen,
183                    UErrorCode *pErrorCode) {
184    char *fields[2][2];
185
186    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
187        return;
188    }
189
190    writeUCDFilename(basename, sen->ucdFile, suffix);
191
192    u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
193    if(U_FAILURE(*pErrorCode)) {
194        fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
195    }
196}
197
198/* parse files with multiple binary properties ------------------------------ */
199
200/* TODO: more common code, move functions to uparse.h|c */
201
202/* TODO: similar to genbidi/props2.c but not the same; same as in gencase/gencase.c */
203
/* one binary property and where its bit is stored in the properties vectors */
typedef struct Binary {
    const char *propName;   /* property name as it is matched against field 1 of the file */
    int32_t vecWord;        /* column argument for upvec_setValue() */
    uint32_t vecValue;      /* value argument for upvec_setValue() */
    uint32_t vecMask;       /* mask argument for upvec_setValue(); must not be 0 */
} Binary;
210
211struct Binaries {
212    const char *ucdFile;
213    const Binary *binaries;
214    int32_t binariesCount;
215};
216typedef struct Binaries Binaries;
217
218static const Binary
219propListNames[]={
220    { "Bidi_Control",                       0, U_MASK(UBIDI_BIDI_CONTROL_SHIFT), U_MASK(UBIDI_BIDI_CONTROL_SHIFT) },
221    { "Join_Control",                       0, U_MASK(UBIDI_JOIN_CONTROL_SHIFT), U_MASK(UBIDI_JOIN_CONTROL_SHIFT) }
222};
223
224static const Binaries
225propListBinaries={
226    "PropList", propListNames, LENGTHOF(propListNames)
227};
228
229static void U_CALLCONV
230binariesLineFn(void *context,
231               char *fields[][2], int32_t fieldCount,
232               UErrorCode *pErrorCode) {
233    const Binaries *bin;
234    char *s;
235    uint32_t start, end;
236    int32_t i;
237
238    bin=(const Binaries *)context;
239
240    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
241    if(U_FAILURE(*pErrorCode)) {
242        fprintf(stderr, "genbidi: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
243        exit(*pErrorCode);
244    }
245
246    /* parse binary property name */
247    s=(char *)u_skipWhitespace(fields[1][0]);
248    for(i=0;; ++i) {
249        if(i==bin->binariesCount) {
250            /* ignore unrecognized properties */
251            return;
252        }
253        if(isToken(bin->binaries[i].propName, s)) {
254            break;
255        }
256    }
257
258    if(bin->binaries[i].vecMask==0) {
259        fprintf(stderr, "genbidi error: mask value %d==0 for %s %s\n",
260                        (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
261        exit(U_INTERNAL_PROGRAM_ERROR);
262    }
263
264    upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode);
265    if(U_FAILURE(*pErrorCode)) {
266        fprintf(stderr, "genbidi error: unable to set %s, code: %s\n",
267                        bin->binaries[i].propName, u_errorName(*pErrorCode));
268        exit(*pErrorCode);
269    }
270}
271
272static void
273parseBinariesFile(char *filename, char *basename, const char *suffix,
274                  const Binaries *bin,
275                  UErrorCode *pErrorCode) {
276    char *fields[2][2];
277
278    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
279        return;
280    }
281
282    writeUCDFilename(basename, bin->ucdFile, suffix);
283
284    u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
285    if(U_FAILURE(*pErrorCode)) {
286        fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
287    }
288}
289
290/* -------------------------------------------------------------------------- */
291
292enum {
293    HELP_H,
294    HELP_QUESTION_MARK,
295    VERBOSE,
296    COPYRIGHT,
297    DESTDIR,
298    SOURCEDIR,
299    UNICODE_VERSION,
300    ICUDATADIR,
301    CSOURCE
302};
303
304/* Keep these values in sync with the above enums */
305static UOption options[]={
306    UOPTION_HELP_H,
307    UOPTION_HELP_QUESTION_MARK,
308    UOPTION_VERBOSE,
309    UOPTION_COPYRIGHT,
310    UOPTION_DESTDIR,
311    UOPTION_SOURCEDIR,
312    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
313    UOPTION_ICUDATADIR,
314    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
315};
316
317extern int
318main(int argc, char* argv[]) {
319    char filename[300];
320    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
321    char *basename=NULL;
322    UErrorCode errorCode=U_ZERO_ERROR;
323
324    U_MAIN_INIT_ARGS(argc, argv);
325
326    /* preset then read command line options */
327    options[DESTDIR].value=u_getDataDirectory();
328    options[SOURCEDIR].value="";
329    options[UNICODE_VERSION].value="";
330    options[ICUDATADIR].value=u_getDataDirectory();
331    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
332
333    /* error handling, printing usage message */
334    if(argc<0) {
335        fprintf(stderr,
336            "error in command line argument \"%s\"\n",
337            argv[-argc]);
338    }
339    if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
340        /*
341         * Broken into chucks because the C89 standard says the minimum
342         * required supported string length is 509 bytes.
343         */
344        fprintf(stderr,
345            "Usage: %s [-options] [suffix]\n"
346            "\n"
347            "read the UnicodeData.txt file and other Unicode properties files and\n"
348            "create a binary file " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE " with the bidi/shaping properties\n"
349            "\n",
350            argv[0]);
351        fprintf(stderr,
352            "Options:\n"
353            "\t-h or -? or --help  this usage text\n"
354            "\t-v or --verbose     verbose output\n"
355            "\t-c or --copyright   include a copyright notice\n"
356            "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
357            "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
358        fprintf(stderr,
359            "\t-d or --destdir     destination directory, followed by the path\n"
360            "\t-s or --sourcedir   source directory, followed by the path\n"
361            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
362            "\t                    followed by path, defaults to %s\n"
363            "\tsuffix              suffix that is to be appended with a '-'\n"
364            "\t                    to the source file basenames before opening;\n"
365            "\t                    'genbidi new' will read UnicodeData-new.txt etc.\n",
366            u_getDataDirectory());
367        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
368    }
369
370    /* get the options values */
371    beVerbose=options[VERBOSE].doesOccur;
372    haveCopyright=options[COPYRIGHT].doesOccur;
373    srcDir=options[SOURCEDIR].value;
374    destDir=options[DESTDIR].value;
375
376    if(argc>=2) {
377        suffix=argv[1];
378    } else {
379        suffix=NULL;
380    }
381
382    if(options[UNICODE_VERSION].doesOccur) {
383        setUnicodeVersion(options[UNICODE_VERSION].value);
384    }
385    /* else use the default dataVersion in store.c */
386
387    if (options[ICUDATADIR].doesOccur) {
388        u_setDataDirectory(options[ICUDATADIR].value);
389    }
390
391    /* prepare the filename beginning with the source dir */
392    uprv_strcpy(filename, srcDir);
393    basename=filename+uprv_strlen(filename);
394    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
395        *basename++=U_FILE_SEP_CHAR;
396    }
397
398    /* initialize */
399    pv=upvec_open(2, &errorCode);
400
401    /* process BidiMirroring.txt */
402    writeUCDFilename(basename, "BidiMirroring", suffix);
403    parseBidiMirroring(filename, &errorCode);
404
405    /* process additional properties files */
406    *basename=0;
407
408    parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
409
410    parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, &errorCode);
411
412    parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, &errorCode);
413
414    /* process UnicodeData.txt */
415    writeUCDFilename(basename, "UnicodeData", suffix);
416    parseDB(filename, &errorCode);
417
418    /* set proper bidi class for unassigned code points (Cn) */
419    parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, &errorCode);
420
421    /* process parsed data */
422    if(U_SUCCESS(errorCode)) {
423        /* write the properties data file */
424        generateData(destDir, options[CSOURCE].doesOccur);
425    }
426
427    u_cleanup();
428    return errorCode;
429}
430
431U_CFUNC void
432writeUCDFilename(char *basename, const char *filename, const char *suffix) {
433    int32_t length=(int32_t)uprv_strlen(filename);
434    uprv_strcpy(basename, filename);
435    if(suffix!=NULL) {
436        basename[length++]='-';
437        uprv_strcpy(basename+length, suffix);
438        length+=(int32_t)uprv_strlen(suffix);
439    }
440    uprv_strcpy(basename+length, ".txt");
441}
442
443/* TODO: move to toolutil */
444static UBool
445isToken(const char *token, const char *s) {
446    const char *z;
447    int32_t j;
448
449    s=u_skipWhitespace(s);
450    for(j=0;; ++j) {
451        if(token[j]!=0) {
452            if(s[j]!=token[j]) {
453                break;
454            }
455        } else {
456            z=u_skipWhitespace(s+j);
457            if(*z==';' || *z==0) {
458                return TRUE;
459            } else {
460                break;
461            }
462        }
463    }
464
465    return FALSE;
466}
467
468/* parser for BidiMirroring.txt --------------------------------------------- */
469
470static void U_CALLCONV
471mirrorLineFn(void *context,
472             char *fields[][2], int32_t fieldCount,
473             UErrorCode *pErrorCode) {
474    char *end;
475    UChar32 src, mirror;
476
477    src=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
478    if(end<=fields[0][0] || end!=fields[0][1]) {
479        fprintf(stderr, "genbidi: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]);
480        *pErrorCode=U_PARSE_ERROR;
481        exit(U_PARSE_ERROR);
482    }
483
484    mirror=(UChar32)uprv_strtoul(fields[1][0], &end, 16);
485    if(end<=fields[1][0] || end!=fields[1][1]) {
486        fprintf(stderr, "genbidi: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]);
487        *pErrorCode=U_PARSE_ERROR;
488        exit(U_PARSE_ERROR);
489    }
490
491    addMirror(src, mirror);
492}
493
494static void
495parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) {
496    char *fields[2][2];
497
498    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
499        return;
500    }
501
502    u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode);
503}
504
505/* parser for UnicodeData.txt ----------------------------------------------- */
506
507static void U_CALLCONV
508unicodeDataLineFn(void *context,
509                  char *fields[][2], int32_t fieldCount,
510                  UErrorCode *pErrorCode) {
511    char *end;
512    UErrorCode errorCode;
513    UChar32 c;
514
515    errorCode=U_ZERO_ERROR;
516
517    /* get the character code, field 0 */
518    c=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
519    if(end<=fields[0][0] || end!=fields[0][1]) {
520        fprintf(stderr, "genbidi: syntax error in field 0 at %s\n", fields[0][0]);
521        *pErrorCode=U_PARSE_ERROR;
522        exit(U_PARSE_ERROR);
523    }
524
525    /* get Mirrored flag, field 9 */
526    if(*fields[9][0]=='Y') {
527        upvec_setValue(pv, c, c, 0, U_MASK(UBIDI_IS_MIRRORED_SHIFT), U_MASK(UBIDI_IS_MIRRORED_SHIFT), &errorCode);
528        if(U_FAILURE(*pErrorCode)) {
529            fprintf(stderr, "genbidi error: unable to set 'is mirrored' for U+%04lx, code: %s\n",
530                            (long)c, u_errorName(errorCode));
531            exit(errorCode);
532        }
533    } else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
534        fprintf(stderr, "genbidi: syntax error in field 9 at U+%04lx\n",
535            (long)c);
536        *pErrorCode=U_PARSE_ERROR;
537        exit(U_PARSE_ERROR);
538    }
539}
540
541static void
542parseDB(const char *filename, UErrorCode *pErrorCode) {
543    /* default Bidi classes for unassigned code points */
544    static const UChar32 defaultBidi[][3]={ /* { start, end, class } */
545        /* R: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF */
546        { 0x0590, 0x05FF, U_RIGHT_TO_LEFT },
547        { 0x07C0, 0x08FF, U_RIGHT_TO_LEFT },
548        { 0xFB1D, 0xFB4F, U_RIGHT_TO_LEFT },
549        { 0x10800, 0x10FFF, U_RIGHT_TO_LEFT },
550
551        /* AL: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE */
552        { 0x0600, 0x07BF, U_RIGHT_TO_LEFT_ARABIC },
553        { 0xFB50, 0xFDCF, U_RIGHT_TO_LEFT_ARABIC },
554        { 0xFDF0, 0xFDFF, U_RIGHT_TO_LEFT_ARABIC },
555        { 0xFE70, 0xFEFE, U_RIGHT_TO_LEFT_ARABIC }
556
557        /* L otherwise */
558    };
559
560    char *fields[15][2];
561    UChar32 start, end;
562    int32_t i;
563
564    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
565        return;
566    }
567
568    /*
569     * Set default Bidi classes for unassigned code points.
570     * See the documentation for Bidi_Class in UCD.html in the Unicode data.
571     * http://www.unicode.org/Public/
572     *
573     * Starting with Unicode 5.0, DerivedBidiClass.txt should (re)set
574     * the Bidi_Class values for all code points including unassigned ones
575     * and including L values for these.
576     * This code becomes unnecesary but harmless. Leave it for now in case
577     * someone uses genbidi on pre-Unicode 5.0 data.
578     */
579    for(i=0; i<LENGTHOF(defaultBidi); ++i) {
580        start=defaultBidi[i][0];
581        end=defaultBidi[i][1];
582        upvec_setValue(pv, start, end, 0, (uint32_t)defaultBidi[i][2], UBIDI_CLASS_MASK, pErrorCode);
583        if(U_FAILURE(*pErrorCode)) {
584            fprintf(stderr, "genbidi error: unable to set default bidi class for U+%04lx..U+%04lx, code: %s\n",
585                            (long)start, (long)end, u_errorName(*pErrorCode));
586            exit(*pErrorCode);
587        }
588    }
589
590    u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
591
592    if(U_FAILURE(*pErrorCode)) {
593        return;
594    }
595}
596
597/* DerivedBidiClass.txt ----------------------------------------------------- */
598
599static void U_CALLCONV
600bidiClassLineFn(void *context,
601                char *fields[][2], int32_t fieldCount,
602                UErrorCode *pErrorCode) {
603    char *s;
604    uint32_t start, end, value;
605
606    /* get the code point range */
607    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
608    if(U_FAILURE(*pErrorCode)) {
609        fprintf(stderr, "genbidi: syntax error in DerivedBidiClass.txt field 0 at %s\n", fields[0][0]);
610        exit(*pErrorCode);
611    }
612
613    /* parse bidi class */
614    s=trimTerminateField(fields[1][0], fields[1][1]);
615    value=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, s);
616    if((int32_t)value<0) {
617        fprintf(stderr, "genbidi error: unknown bidi class in DerivedBidiClass.txt field 1 at %s\n", s);
618        exit(U_PARSE_ERROR);
619    }
620
621    upvec_setValue(pv, start, end, 0, value, UBIDI_CLASS_MASK, pErrorCode);
622    if(U_FAILURE(*pErrorCode)) {
623        fprintf(stderr, "genbidi error: unable to set derived bidi class for U+%04x..U+%04x - %s\n",
624                (int)start, (int)end, u_errorName(*pErrorCode));
625        exit(*pErrorCode);
626    }
627}
628
629/*
630 * Hey, Emacs, please set the following:
631 *
632 * Local Variables:
633 * indent-tabs-mode: nil
634 * End:
635 *
636 */
637