1/*
2*******************************************************************************
3*
4*   Copyright (C) 2001-2005, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  gennorm.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2001may25
14*   created by: Markus W. Scherer
15*
16*   This program reads the Unicode character database text file,
17*   parses it, and extracts the data for normalization.
18*   It then preprocesses it and writes a binary file for efficient use
19*   in various Unicode text normalization processes.
20*/
21
22#include <stdio.h>
23#include <stdlib.h>
24#include "unicode/utypes.h"
25#include "unicode/uchar.h"
26#include "unicode/ustring.h"
27#include "unicode/putil.h"
28#include "unicode/uclean.h"
29#include "unicode/udata.h"
30#include "unicode/uset.h"
31#include "cmemory.h"
32#include "cstring.h"
33#include "unewdata.h"
34#include "uoptions.h"
35#include "uparse.h"
36#include "unormimp.h"
37
38U_CDECL_BEGIN
39#include "gennorm.h"
40U_CDECL_END
41
42UBool beVerbose=FALSE, haveCopyright=TRUE;
43
44/* prototypes --------------------------------------------------------------- */
45
46static void
47parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);
48
49static void
50parseDB(const char *filename, UErrorCode *pErrorCode);
51
52/* -------------------------------------------------------------------------- */
53
54enum {
55    HELP_H,
56    HELP_QUESTION_MARK,
57    VERBOSE,
58    COPYRIGHT,
59    DESTDIR,
60    SOURCEDIR,
61    UNICODE_VERSION,
62    ICUDATADIR,
63    CSOURCE,
64    STORE_FLAGS
65};
66
67static UOption options[]={
68    UOPTION_HELP_H,
69    UOPTION_HELP_QUESTION_MARK,
70    UOPTION_VERBOSE,
71    UOPTION_COPYRIGHT,
72    UOPTION_DESTDIR,
73    UOPTION_SOURCEDIR,
74    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
75    UOPTION_ICUDATADIR,
76    UOPTION_DEF("csource", 'C', UOPT_NO_ARG),
77    UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG)
78};
79
80extern int
81main(int argc, char* argv[]) {
82#if !UCONFIG_NO_NORMALIZATION
83    char filename[300];
84#endif
85    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
86    char *basename=NULL;
87    UErrorCode errorCode=U_ZERO_ERROR;
88
89    U_MAIN_INIT_ARGS(argc, argv);
90
91    /* preset then read command line options */
92    options[4].value=u_getDataDirectory();
93    options[5].value="";
94    options[6].value="3.0.0";
95    options[ICUDATADIR].value=u_getDataDirectory();
96    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
97
98    /* error handling, printing usage message */
99    if(argc<0) {
100        fprintf(stderr,
101            "error in command line argument \"%s\"\n",
102            argv[-argc]);
103    }
104    if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
105        /*
106         * Broken into chucks because the C89 standard says the minimum
107         * required supported string length is 509 bytes.
108         */
109        fprintf(stderr,
110            "Usage: %s [-options] [suffix]\n"
111            "\n"
112            "Read the UnicodeData.txt file and other Unicode properties files and\n"
113            "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
114            "\n",
115            argv[0]);
116        fprintf(stderr,
117            "Options:\n"
118            "\t-h or -? or --help  this usage text\n"
119            "\t-v or --verbose     verbose output\n"
120            "\t-c or --copyright   include a copyright notice\n"
121            "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
122            "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
123        fprintf(stderr,
124            "\t-p or --prune flags Prune for data modularization:\n"
125            "\t                    Determine what data is to be stored.\n"
126            "\t        0 (zero) stores minimal data (only for NFD)\n"
127            "\t        lowercase letters turn off data, uppercase turn on (use with 0)\n");
128        fprintf(stderr,
129            "\t        k: compatibility decompositions (NFKC, NFKD)\n"
130            "\t        c: composition data (NFC, NFKC)\n"
131            "\t        f: FCD data (will be generated at load time)\n"
132            "\t        a: auxiliary data (canonical closure etc.)\n"
133            "\t        x: exclusion sets (Unicode 3.2-level normalization)\n");
134        fprintf(stderr,
135            "\t-d or --destdir     destination directory, followed by the path\n"
136            "\t-s or --sourcedir   source directory, followed by the path\n"
137            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
138            "\t                    followed by path, defaults to <%s>\n"
139            "\tsuffix              suffix that is to be appended with a '-'\n"
140            "\t                    to the source file basenames before opening;\n"
141            "\t                    'gennorm new' will read UnicodeData-new.txt etc.\n",
142            u_getDataDirectory());
143        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
144    }
145
146    /* get the options values */
147    beVerbose=options[2].doesOccur;
148    haveCopyright=options[3].doesOccur;
149    srcDir=options[5].value;
150    destDir=options[4].value;
151
152    if(argc>=2) {
153        suffix=argv[1];
154    } else {
155        suffix=NULL;
156    }
157
158#if UCONFIG_NO_NORMALIZATION
159
160    fprintf(stderr,
161        "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
162        " because UCONFIG_NO_NORMALIZATION is set, \n"
163        "see icu/source/common/unicode/uconfig.h\n");
164    generateData(destDir, options[CSOURCE].doesOccur);
165
166#else
167
168    setUnicodeVersion(options[6].value);
169
170    if (options[ICUDATADIR].doesOccur) {
171        u_setDataDirectory(options[ICUDATADIR].value);
172    }
173
174    if(options[STORE_FLAGS].doesOccur) {
175        const char *s=options[STORE_FLAGS].value;
176        char c;
177
178        while((c=*s++)!=0) {
179            switch(c) {
180            case '0':
181                gStoreFlags=0;  /* store minimal data (only for NFD) */
182                break;
183
184            /* lowercase letters: omit data */
185            case 'k':
186                gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPAT);
187                break;
188            case 'c':
189                gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPOSITION);
190                break;
191            case 'f':
192                gStoreFlags&=~U_MASK(UGENNORM_STORE_FCD);
193                break;
194            case 'a':
195                gStoreFlags&=~U_MASK(UGENNORM_STORE_AUX);
196                break;
197            case 'x':
198                gStoreFlags&=~U_MASK(UGENNORM_STORE_EXCLUSIONS);
199                break;
200
201            /* uppercase letters: include data (use with 0) */
202            case 'K':
203                gStoreFlags|=U_MASK(UGENNORM_STORE_COMPAT);
204                break;
205            case 'C':
206                gStoreFlags|=U_MASK(UGENNORM_STORE_COMPOSITION);
207                break;
208            case 'F':
209                gStoreFlags|=U_MASK(UGENNORM_STORE_FCD);
210                break;
211            case 'A':
212                gStoreFlags|=U_MASK(UGENNORM_STORE_AUX);
213                break;
214            case 'X':
215                gStoreFlags|=U_MASK(UGENNORM_STORE_EXCLUSIONS);
216                break;
217
218            default:
219                fprintf(stderr, "ignoring undefined prune flag '%c'\n", c);
220                break;
221            }
222        }
223    }
224
225    /*
226     * Verify that we can work with properties
227     * but don't call u_init() because that needs unorm.icu which we are just
228     * going to build here.
229     */
230    {
231        U_STRING_DECL(ideo, "[:Ideographic:]", 15);
232        USet *set;
233
234        U_STRING_INIT(ideo, "[:Ideographic:]", 15);
235        set=uset_openPattern(ideo, -1, &errorCode);
236        if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) {
237            fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
238            exit(errorCode);
239        }
240        uset_close(set);
241    }
242
243    /* prepare the filename beginning with the source dir */
244    uprv_strcpy(filename, srcDir);
245    basename=filename+uprv_strlen(filename);
246    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
247        *basename++=U_FILE_SEP_CHAR;
248    }
249
250    /* initialize */
251    init();
252
253    /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
254    if(suffix==NULL) {
255        uprv_strcpy(basename, "DerivedNormalizationProps.txt");
256    } else {
257        uprv_strcpy(basename, "DerivedNormalizationProps");
258        basename[30]='-';
259        uprv_strcpy(basename+31, suffix);
260        uprv_strcat(basename+31, ".txt");
261    }
262    parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
263    if(U_FAILURE(errorCode)) {
264        /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
265        if(suffix==NULL) {
266            uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
267        } else {
268            uprv_strcpy(basename, "DerivedNormalizationProperties");
269            basename[30]='-';
270            uprv_strcpy(basename+31, suffix);
271            uprv_strcat(basename+31, ".txt");
272        }
273        parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
274    }
275
276    /* process UnicodeData.txt */
277    if(suffix==NULL) {
278        uprv_strcpy(basename, "UnicodeData.txt");
279    } else {
280        uprv_strcpy(basename, "UnicodeData");
281        basename[11]='-';
282        uprv_strcpy(basename+12, suffix);
283        uprv_strcat(basename+12, ".txt");
284    }
285    parseDB(filename, &errorCode);
286
287    /* process parsed data */
288    if(U_SUCCESS(errorCode)) {
289        processData();
290
291        /* write the properties data file */
292        generateData(destDir, options[CSOURCE].doesOccur);
293
294        cleanUpData();
295    }
296
297#endif
298
299    return errorCode;
300}
301
302#if !UCONFIG_NO_NORMALIZATION
303
304/* parser for DerivedNormalizationProperties.txt ---------------------------- */
305
306static void U_CALLCONV
307derivedNormalizationPropertiesLineFn(void *context,
308                                     char *fields[][2], int32_t fieldCount,
309                                     UErrorCode *pErrorCode) {
310    UChar string[32];
311    char *s;
312    uint32_t start, end;
313    int32_t count;
314    uint8_t qcFlags;
315
316    /* get code point range */
317    count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
318    if(U_FAILURE(*pErrorCode)) {
319        fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
320        exit(*pErrorCode);
321    }
322
323    /* ignore hangul - handle explicitly */
324    if(start==0xac00) {
325        return;
326    }
327
328    /* get property - ignore unrecognized ones */
329    s=(char *)u_skipWhitespace(fields[1][0]);
330    if(*s=='N' && s[1]=='F') {
331        /* quick check flag */
332        qcFlags=0x11;
333        s+=2;
334        if(*s=='K') {
335            qcFlags<<=1;
336            ++s;
337        }
338
339        if(*s=='C' && s[1]=='_') {
340            s+=2;
341        } else if(*s=='D' && s[1]=='_') {
342            qcFlags<<=2;
343            s+=2;
344        } else {
345            return;
346        }
347
348        if(0==uprv_strncmp(s, "NO", 2)) {
349            qcFlags&=0xf;
350        } else if(0==uprv_strncmp(s, "MAYBE", 5)) {
351            qcFlags&=0x30;
352        } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
353            /*
354             * Unicode 4.0.1:
355             * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
356             */
357            /* start of the field */
358            s=(char *)u_skipWhitespace(s+1);
359            if(*s=='N') {
360                qcFlags&=0xf;
361            } else if(*s=='M') {
362                qcFlags&=0x30;
363            } else {
364                return; /* do nothing for "Yes" because it's the default value */
365            }
366        } else {
367            return; /* do nothing for "Yes" because it's the default value */
368        }
369
370        /* set this flag for all code points in this range */
371        while(start<=end) {
372            setQCFlags(start++, qcFlags);
373        }
374    } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
375        /* full composition exclusion */
376        while(start<=end) {
377            setCompositionExclusion(start++);
378        }
379    } else if(
380        ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
381        (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
382
383    ) {
384        /* FC_NFKC_Closure, parse field 2 to get the string */
385        char *t;
386
387        /* start of the field */
388        s=(char *)u_skipWhitespace(s+1);
389
390        /* find the end of the field */
391        for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
392        *t=0;
393
394        string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
395        if(U_FAILURE(*pErrorCode)) {
396            fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
397            exit(*pErrorCode);
398        }
399        while(start<=end) {
400            setFNC(start++, string);
401        }
402    }
403}
404
405static void
406parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
407    char *fields[2][2];
408
409    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
410        return;
411    }
412
413    u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
414    if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
415        fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
416        exit(*pErrorCode);
417    }
418}
419
420/* parser for UnicodeData.txt ----------------------------------------------- */
421
422static void U_CALLCONV
423unicodeDataLineFn(void *context,
424                  char *fields[][2], int32_t fieldCount,
425                  UErrorCode *pErrorCode) {
426    uint32_t decomp[40];
427    Norm norm;
428    const char *s;
429    char *end;
430    uint32_t code, value;
431    int32_t length;
432    UBool isCompat, something=FALSE;
433
434    /* ignore First and Last entries for ranges */
435    if( *fields[1][0]=='<' &&
436        (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
437        (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
438    ) {
439        return;
440    }
441
442    /* reset the properties */
443    uprv_memset(&norm, 0, sizeof(Norm));
444
445    /*
446     * The combiningIndex must not be initialized to 0 because 0 is the
447     * combiningIndex of the first forward-combining character.
448     */
449    norm.combiningIndex=0xffff;
450
451    /* get the character code, field 0 */
452    code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
453    if(end<=fields[0][0] || end!=fields[0][1]) {
454        fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
455        *pErrorCode=U_PARSE_ERROR;
456        exit(U_PARSE_ERROR);
457    }
458
459    /* get canonical combining class, field 3 */
460    value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
461    if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
462        fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
463        *pErrorCode=U_PARSE_ERROR;
464        exit(U_PARSE_ERROR);
465    }
466    if(value>0) {
467        norm.udataCC=(uint8_t)value;
468        something=TRUE;
469    }
470
471    /* get the decomposition, field 5 */
472    if(fields[5][0]<fields[5][1]) {
473        if(*(s=fields[5][0])=='<') {
474            ++s;
475            isCompat=TRUE;
476
477            /* skip and ignore the compatibility type name */
478            do {
479                if(s==fields[5][1]) {
480                    /* missing '>' */
481                    fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
482                    *pErrorCode=U_PARSE_ERROR;
483                    exit(U_PARSE_ERROR);
484                }
485            } while(*s++!='>');
486        } else {
487            isCompat=FALSE;
488        }
489
490        /* parse the decomposition string */
491        length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
492        if(U_FAILURE(*pErrorCode)) {
493            fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
494                    (long)code, u_errorName(*pErrorCode));
495            exit(*pErrorCode);
496        }
497
498        /* store the string */
499        if(length>0) {
500            something=TRUE;
501            if(isCompat) {
502                norm.lenNFKD=(uint8_t)length;
503                norm.nfkd=decomp;
504            } else {
505                if(length>2) {
506                    fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
507                            (long)code, (long)length);
508                    *pErrorCode=U_PARSE_ERROR;
509                    exit(U_PARSE_ERROR);
510                }
511                norm.lenNFD=(uint8_t)length;
512                norm.nfd=decomp;
513            }
514        }
515    }
516
517    /* check for non-character code points */
518    if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
519        fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
520                (long)code);
521        *pErrorCode=U_PARSE_ERROR;
522        exit(U_PARSE_ERROR);
523    }
524
525    if(something) {
526        /* there are normalization values, so store them */
527#if 0
528        if(beVerbose) {
529            printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
530                   (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
531        }
532#endif
533        storeNorm(code, &norm);
534    }
535}
536
537static void
538parseDB(const char *filename, UErrorCode *pErrorCode) {
539    char *fields[15][2];
540
541    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
542        return;
543    }
544
545    u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
546    if(U_FAILURE(*pErrorCode)) {
547        fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
548        exit(*pErrorCode);
549    }
550}
551
552#endif /* #if !UCONFIG_NO_NORMALIZATION */
553
554/*
555 * Hey, Emacs, please set the following:
556 *
557 * Local Variables:
558 * indent-tabs-mode: nil
559 * End:
560 *
561 */
562