1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  gencase.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004aug28
14*   created by: Markus W. Scherer
15*
16*   This program reads several of the Unicode character database text files,
*   parses them, and extracts the case mapping properties for each character.
18*   It then writes a binary file containing the properties
19*   that is designed to be used directly for random-access to
20*   the properties of each Unicode character.
21*/
22
23#include <stdio.h>
24#include "unicode/utypes.h"
25#include "unicode/uchar.h"
26#include "unicode/uset.h"
27#include "unicode/putil.h"
28#include "unicode/uclean.h"
29#include "cmemory.h"
30#include "cstring.h"
31#include "uarrsort.h"
32#include "unewdata.h"
33#include "uoptions.h"
34#include "uparse.h"
35#include "uprops.h"
36#include "propsvec.h"
37#include "gencase.h"
38
39#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
40
41/* data --------------------------------------------------------------------- */
42
43UPropsVectors *pv;
44
45UBool beVerbose=FALSE, haveCopyright=TRUE;
46
47/*
48 * Unicode set collecting the case-sensitive characters;
49 * see uchar.h UCHAR_CASE_SENSITIVE.
50 * Add code points from case mappings/foldings in
51 * the root locale and with default options.
52 */
53static USet *caseSensitive;
54
55/* prototypes --------------------------------------------------------------- */
56
57static void
58parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
59
60static void
61parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
62
63static void
64parseDB(const char *filename, UErrorCode *pErrorCode);
65
66/* parse files with multiple binary properties ------------------------------ */
67
68/* TODO: more common code, move functions to uparse.h|c */
69
70/* TODO: similar to genprops/props2.c but not the same */
71
/*
 * One binary property: its name as spelled in the UCD file, and the
 * properties-vector word, value and mask passed to upvec_setValue().
 */
typedef struct Binary {
    const char *propName;
    int32_t vecWord;
    uint32_t vecValue;
    uint32_t vecMask;
} Binary;

/*
 * All binary properties that are read from one UCD file.
 * ucdFile is the base name without the ".txt" extension.
 */
typedef struct Binaries {
    const char *ucdFile;
    const Binary *binaries;
    int32_t binariesCount;
} Binaries;
85
86static const Binary
87propListNames[]={
88    { "Soft_Dotted",                        0, UCASE_SOFT_DOTTED,   UCASE_DOT_MASK }
89};
90
91static const Binaries
92propListBinaries={
93    "PropList", propListNames, LENGTHOF(propListNames)
94};
95
96static const Binary
97derCorePropsNames[]={
98    { "Lowercase",                          0, UCASE_LOWER,         UCASE_TYPE_MASK },
99    { "Uppercase",                          0, UCASE_UPPER,         UCASE_TYPE_MASK },
100    /* Unicode 5.2 adds Case_Ignorable as a public property. See comments in store.c. */
101    { "Case_Ignorable",                     1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
102};
103
104static const Binaries
105derCorePropsBinaries={
106    "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
107};
108
109/*
110 * Treat Word_Break=MidLetter and MidNumLet as a single binary property.
111 * We need not distinguish between them because both add to case-ignorable.
112 * We ignore all other Word_Break values.
113 */
114static const Binary
115wordBreakNames[]={
116    { "MidLetter",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) },
117    { "MidNumLet",                          1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }
118};
119
120static const Binaries
121wordBreakBinaries={
122    "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames)
123};
124
125static void U_CALLCONV
126binariesLineFn(void *context,
127               char *fields[][2], int32_t fieldCount,
128               UErrorCode *pErrorCode) {
129    const Binaries *bin;
130    char *s;
131    uint32_t start, end;
132    int32_t i;
133
134    bin=(const Binaries *)context;
135
136    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
137    if(U_FAILURE(*pErrorCode)) {
138        fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
139        exit(*pErrorCode);
140    }
141
142    /* parse binary property name */
143    s=(char *)u_skipWhitespace(fields[1][0]);
144    for(i=0;; ++i) {
145        if(i==bin->binariesCount) {
146            /* ignore unrecognized properties */
147            return;
148        }
149        if(isToken(bin->binaries[i].propName, s)) {
150            break;
151        }
152    }
153
154    if(bin->binaries[i].vecMask==0) {
155        fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
156                        (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
157        exit(U_INTERNAL_PROGRAM_ERROR);
158    }
159
160    upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode);
161    if(U_FAILURE(*pErrorCode)) {
162        fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
163                        bin->binaries[i].propName, u_errorName(*pErrorCode));
164        exit(*pErrorCode);
165    }
166}
167
168static void
169parseBinariesFile(char *filename, char *basename, const char *suffix,
170                  const Binaries *bin,
171                  UErrorCode *pErrorCode) {
172    char *fields[2][2];
173
174    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
175        return;
176    }
177
178    writeUCDFilename(basename, bin->ucdFile, suffix);
179
180    u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
181    if(U_FAILURE(*pErrorCode)) {
182        fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
183    }
184}
185
186/* -------------------------------------------------------------------------- */
187
188enum
189{
190    HELP_H,
191    HELP_QUESTION_MARK,
192    VERBOSE,
193    COPYRIGHT,
194    DESTDIR,
195    SOURCEDIR,
196    UNICODE_VERSION,
197    ICUDATADIR,
198    CSOURCE
199};
200
201/* Keep these values in sync with the above enums */
202static UOption options[]={
203    UOPTION_HELP_H,
204    UOPTION_HELP_QUESTION_MARK,
205    UOPTION_VERBOSE,
206    UOPTION_COPYRIGHT,
207    UOPTION_DESTDIR,
208    UOPTION_SOURCEDIR,
209    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
210    UOPTION_ICUDATADIR,
211    UOPTION_DEF("csource", 'C', UOPT_NO_ARG)
212};
213
214extern int
215main(int argc, char* argv[]) {
216    char filename[300];
217    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
218    char *basename=NULL;
219    UErrorCode errorCode=U_ZERO_ERROR;
220
221    U_MAIN_INIT_ARGS(argc, argv);
222
223    /* preset then read command line options */
224    options[DESTDIR].value=u_getDataDirectory();
225    options[SOURCEDIR].value="";
226    options[UNICODE_VERSION].value="";
227    options[ICUDATADIR].value=u_getDataDirectory();
228    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
229
230    /* error handling, printing usage message */
231    if(argc<0) {
232        fprintf(stderr,
233            "error in command line argument \"%s\"\n",
234            argv[-argc]);
235    }
236    if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
237        /*
238         * Broken into chunks because the C89 standard says the minimum
239         * required supported string length is 509 bytes.
240         */
241        fprintf(stderr,
242            "Usage: %s [-options] [suffix]\n"
243            "\n"
244            "read the UnicodeData.txt file and other Unicode properties files and\n"
245            "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n"
246            "\n",
247            argv[0]);
248        fprintf(stderr,
249            "Options:\n"
250            "\t-h or -? or --help  this usage text\n"
251            "\t-v or --verbose     verbose output\n"
252            "\t-c or --copyright   include a copyright notice\n"
253            "\t-u or --unicode     Unicode version, followed by the version like 3.0.0\n"
254            "\t-C or --csource     generate a .c source file rather than the .icu binary\n");
255        fprintf(stderr,
256            "\t-d or --destdir     destination directory, followed by the path\n"
257            "\t-s or --sourcedir   source directory, followed by the path\n"
258            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
259            "\t                    followed by path, defaults to %s\n"
260            "\tsuffix              suffix that is to be appended with a '-'\n"
261            "\t                    to the source file basenames before opening;\n"
262            "\t                    'gencase new' will read UnicodeData-new.txt etc.\n",
263            u_getDataDirectory());
264        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
265    }
266
267    /* get the options values */
268    beVerbose=options[VERBOSE].doesOccur;
269    haveCopyright=options[COPYRIGHT].doesOccur;
270    srcDir=options[SOURCEDIR].value;
271    destDir=options[DESTDIR].value;
272
273    if(argc>=2) {
274        suffix=argv[1];
275    } else {
276        suffix=NULL;
277    }
278
279    if(options[UNICODE_VERSION].doesOccur) {
280        setUnicodeVersion(options[UNICODE_VERSION].value);
281    }
282    /* else use the default dataVersion in store.c */
283
284    if (options[ICUDATADIR].doesOccur) {
285        u_setDataDirectory(options[ICUDATADIR].value);
286    }
287
288    /* prepare the filename beginning with the source dir */
289    uprv_strcpy(filename, srcDir);
290    basename=filename+uprv_strlen(filename);
291    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
292        *basename++=U_FILE_SEP_CHAR;
293    }
294
295    /* initialize */
296    pv=upvec_open(2, &errorCode);
297    caseSensitive=uset_open(1, 0); /* empty set (start>end) */
298
299    /* process SpecialCasing.txt */
300    writeUCDFilename(basename, "SpecialCasing", suffix);
301    parseSpecialCasing(filename, &errorCode);
302
303    /* process CaseFolding.txt */
304    writeUCDFilename(basename, "CaseFolding", suffix);
305    parseCaseFolding(filename, &errorCode);
306
307    /* process additional properties files */
308    *basename=0;
309
310    parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
311
312    parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
313
314    if(ucdVersion>=UNI_4_1) {
315        parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode);
316    }
317
318    /* process UnicodeData.txt */
319    writeUCDFilename(basename, "UnicodeData", suffix);
320    parseDB(filename, &errorCode);
321
322    /* process parsed data */
323    makeCaseClosure();
324
325    makeExceptions();
326
327    if(U_SUCCESS(errorCode)) {
328        /* write the properties data file */
329        generateData(destDir, options[CSOURCE].doesOccur);
330    }
331
332    u_cleanup();
333    return errorCode;
334}
335
336U_CFUNC void
337writeUCDFilename(char *basename, const char *filename, const char *suffix) {
338    int32_t length=(int32_t)uprv_strlen(filename);
339    uprv_strcpy(basename, filename);
340    if(suffix!=NULL) {
341        basename[length++]='-';
342        uprv_strcpy(basename+length, suffix);
343        length+=(int32_t)uprv_strlen(suffix);
344    }
345    uprv_strcpy(basename+length, ".txt");
346}
347
348/* TODO: move to toolutil */
349U_CFUNC UBool
350isToken(const char *token, const char *s) {
351    const char *z;
352    int32_t j;
353
354    s=u_skipWhitespace(s);
355    for(j=0;; ++j) {
356        if(token[j]!=0) {
357            if(s[j]!=token[j]) {
358                break;
359            }
360        } else {
361            z=u_skipWhitespace(s+j);
362            if(*z==';' || *z==0) {
363                return TRUE;
364            } else {
365                break;
366            }
367        }
368    }
369
370    return FALSE;
371}
372
373static int32_t
374getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
375    const char *t, *z;
376    int32_t i, j;
377
378    s=u_skipWhitespace(s);
379    for(i=0; i<countTokens; ++i) {
380        t=tokens[i];
381        if(t!=NULL) {
382            for(j=0;; ++j) {
383                if(t[j]!=0) {
384                    if(s[j]!=t[j]) {
385                        break;
386                    }
387                } else {
388                    z=u_skipWhitespace(s+j);
389                    if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') {
390                        return i;
391                    } else {
392                        break;
393                    }
394                }
395            }
396        }
397    }
398    return -1;
399}
400
401static void
402_set_addAll(USet *set, const UChar *s, int32_t length) {
403    UChar32 c;
404    int32_t i;
405
406    /* needs length>=0 */
407    for(i=0; i<length; /* U16_NEXT advances i */) {
408        U16_NEXT(s, i, length, c);
409        uset_add(set, c);
410    }
411}
412
413/* parser for SpecialCasing.txt --------------------------------------------- */
414
415#define MAX_SPECIAL_CASING_COUNT 500
416
417static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
418static int32_t specialCasingCount=0;
419
420static void U_CALLCONV
421specialCasingLineFn(void *context,
422                    char *fields[][2], int32_t fieldCount,
423                    UErrorCode *pErrorCode) {
424    char *end;
425
426    /* get code point */
427    specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
428    end=(char *)u_skipWhitespace(end);
429    if(end<=fields[0][0] || end!=fields[0][1]) {
430        fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
431        *pErrorCode=U_PARSE_ERROR;
432        exit(U_PARSE_ERROR);
433    }
434
435    /* is this a complex mapping? */
436    if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
437        /* there is some condition text in the fifth field */
438        specialCasings[specialCasingCount].isComplex=TRUE;
439
440        /* do not store any actual mappings for this */
441        specialCasings[specialCasingCount].lowerCase[0]=0;
442        specialCasings[specialCasingCount].upperCase[0]=0;
443        specialCasings[specialCasingCount].titleCase[0]=0;
444    } else {
445        /* just set the "complex" flag and get the case mappings */
446        specialCasings[specialCasingCount].isComplex=FALSE;
447        specialCasings[specialCasingCount].lowerCase[0]=
448            (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
449        specialCasings[specialCasingCount].upperCase[0]=
450            (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
451        specialCasings[specialCasingCount].titleCase[0]=
452            (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
453        if(U_FAILURE(*pErrorCode)) {
454            fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
455            exit(*pErrorCode);
456        }
457
458        uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
459        _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
460        _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
461        _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
462    }
463
464    if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
465        fprintf(stderr, "gencase: too many special casing mappings\n");
466        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
467        exit(U_INDEX_OUTOFBOUNDS_ERROR);
468    }
469}
470
471static int32_t U_CALLCONV
472compareSpecialCasings(const void *context, const void *left, const void *right) {
473    return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code;
474}
475
476static void
477parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
478    char *fields[5][2];
479    int32_t i, j;
480
481    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
482        return;
483    }
484
485    u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
486
487    /* sort the special casing entries by code point */
488    if(specialCasingCount>0) {
489        uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
490                       compareSpecialCasings, NULL, FALSE, pErrorCode);
491    }
492    if(U_FAILURE(*pErrorCode)) {
493        return;
494    }
495
496    /* replace multiple entries for any code point by one "complex" one */
497    j=0;
498    for(i=1; i<specialCasingCount; ++i) {
499        if(specialCasings[i-1].code==specialCasings[i].code) {
500            /* there is a duplicate code point */
501            specialCasings[i-1].code=0x7fffffff;    /* remove this entry in the following sorting */
502            specialCasings[i].isComplex=TRUE;       /* make the following one complex */
503            specialCasings[i].lowerCase[0]=0;
504            specialCasings[i].upperCase[0]=0;
505            specialCasings[i].titleCase[0]=0;
506            ++j;
507        }
508    }
509
510    /* if some entries just were removed, then re-sort */
511    if(j>0) {
512        uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
513                       compareSpecialCasings, NULL, FALSE, pErrorCode);
514        specialCasingCount-=j;
515    }
516    if(U_FAILURE(*pErrorCode)) {
517        return;
518    }
519
520    /*
521     * Add one complex mapping to caseSensitive that was filtered out above:
522     * Greek final Sigma has a conditional mapping but not locale-sensitive,
523     * and it is taken when lowercasing just U+03A3 alone.
524     * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
525     */
526    uset_add(caseSensitive, 0x3c2);
527}
528
529/* parser for CaseFolding.txt ----------------------------------------------- */
530
531#define MAX_CASE_FOLDING_COUNT 2000
532
533static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
534static int32_t caseFoldingCount=0;
535
536static void U_CALLCONV
537caseFoldingLineFn(void *context,
538                  char *fields[][2], int32_t fieldCount,
539                  UErrorCode *pErrorCode) {
540    char *end;
541    static UChar32 prevCode=0;
542    int32_t count;
543    char status;
544
545    /* get code point */
546    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
547    end=(char *)u_skipWhitespace(end);
548    if(end<=fields[0][0] || end!=fields[0][1]) {
549        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
550        *pErrorCode=U_PARSE_ERROR;
551        exit(U_PARSE_ERROR);
552    }
553
554    /* get the status of this mapping */
555    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
556    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
557        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
558        *pErrorCode=U_PARSE_ERROR;
559        exit(U_PARSE_ERROR);
560    }
561
562    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
563    if(status=='L') {
564        return;
565    }
566
567    /* get the mapping */
568    count=caseFoldings[caseFoldingCount].full[0]=
569        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
570    if(U_FAILURE(*pErrorCode)) {
571        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
572        exit(*pErrorCode);
573    }
574
575    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
576    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
577        caseFoldings[caseFoldingCount].simple=0;
578    }
579
580    /* update the case-sensitive set */
581    if(status!='T') {
582        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
583        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
584    }
585
586    /* check the status */
587    if(status=='S') {
588        /* check if there was a full mapping for this code point before */
589        if( caseFoldingCount>0 &&
590            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
591            caseFoldings[caseFoldingCount-1].status=='F'
592        ) {
593            /* merge the two entries */
594            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
595            return;
596        }
597    } else if(status=='F') {
598        /* check if there was a simple mapping for this code point before */
599        if( caseFoldingCount>0 &&
600            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
601            caseFoldings[caseFoldingCount-1].status=='S'
602        ) {
603            /* merge the two entries */
604            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
605            return;
606        }
607    } else if(status=='I' || status=='T') {
608        /* check if there was a default mapping for this code point before (remove it) */
609        while(caseFoldingCount>0 &&
610              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
611        ) {
612            prevCode=0;
613            --caseFoldingCount;
614        }
615        /* store only a marker for special handling for cases like dotless i */
616        caseFoldings[caseFoldingCount].simple=0;
617        caseFoldings[caseFoldingCount].full[0]=0;
618    }
619
620    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
621    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
622        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
623                (unsigned long)caseFoldings[caseFoldingCount].code,
624                (unsigned long)prevCode);
625        *pErrorCode=U_PARSE_ERROR;
626        exit(U_PARSE_ERROR);
627    }
628    prevCode=caseFoldings[caseFoldingCount].code;
629
630    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
631        fprintf(stderr, "gencase: too many case folding mappings\n");
632        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
633        exit(U_INDEX_OUTOFBOUNDS_ERROR);
634    }
635}
636
637static void
638parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
639    char *fields[3][2];
640
641    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
642        return;
643    }
644
645    u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
646}
647
648/* parser for UnicodeData.txt ----------------------------------------------- */
649
650/* general categories */
651const char *const
652genCategoryNames[U_CHAR_CATEGORY_COUNT]={
653    "Cn",
654    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
655    "Mc", "Nd", "Nl", "No",
656    "Zs", "Zl", "Zp",
657    "Cc", "Cf", "Co", "Cs",
658    "Pd", "Ps", "Pe", "Pc", "Po",
659    "Sm", "Sc", "Sk", "So",
660    "Pi", "Pf"
661};
662
663static int32_t specialCasingIndex=0, caseFoldingIndex=0;
664
665static void U_CALLCONV
666unicodeDataLineFn(void *context,
667                  char *fields[][2], int32_t fieldCount,
668                  UErrorCode *pErrorCode) {
669    Props p;
670    char *end;
671    static UChar32 prevCode=0;
672    UChar32 value;
673    int32_t i;
674
675    /* reset the properties */
676    uprv_memset(&p, 0, sizeof(Props));
677
678    /* get the character code, field 0 */
679    p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
680    if(end<=fields[0][0] || end!=fields[0][1]) {
681        fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
682        *pErrorCode=U_PARSE_ERROR;
683        exit(U_PARSE_ERROR);
684    }
685
686    /* get general category, field 2 */
687    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
688    if(i>=0) {
689        p.gc=(uint8_t)i;
690    } else {
691        fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
692            fields[2][0], (unsigned long)p.code);
693        *pErrorCode=U_PARSE_ERROR;
694        exit(U_PARSE_ERROR);
695    }
696
697    /* get canonical combining class, field 3 */
698    value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
699    if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
700        fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
701        *pErrorCode=U_PARSE_ERROR;
702        exit(U_PARSE_ERROR);
703    }
704    p.cc=(uint8_t)value;
705
706    /* get uppercase mapping, field 12 */
707    value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
708    if(end!=fields[12][1]) {
709        fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
710            (unsigned long)p.code);
711        *pErrorCode=U_PARSE_ERROR;
712        exit(U_PARSE_ERROR);
713    }
714    if(value!=0 && value!=p.code) {
715        p.upperCase=value;
716        uset_add(caseSensitive, p.code);
717        uset_add(caseSensitive, value);
718    }
719
720    /* get lowercase value, field 13 */
721    value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
722    if(end!=fields[13][1]) {
723        fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
724            (unsigned long)p.code);
725        *pErrorCode=U_PARSE_ERROR;
726        exit(U_PARSE_ERROR);
727    }
728    if(value!=0 && value!=p.code) {
729        p.lowerCase=value;
730        uset_add(caseSensitive, p.code);
731        uset_add(caseSensitive, value);
732    }
733
734    /* get titlecase value, field 14 */
735    value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
736    if(end!=fields[14][1]) {
737        fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
738            (unsigned long)p.code);
739        *pErrorCode=U_PARSE_ERROR;
740        exit(U_PARSE_ERROR);
741    }
742    if(value!=0 && value!=p.code) {
743        p.titleCase=value;
744        uset_add(caseSensitive, p.code);
745        uset_add(caseSensitive, value);
746    }
747
748    /* set additional properties from previously parsed files */
749    if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
750        p.specialCasing=specialCasings+specialCasingIndex++;
751    } else {
752        p.specialCasing=NULL;
753    }
754    if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
755        p.caseFolding=caseFoldings+caseFoldingIndex++;
756
757        /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
758        if( p.caseFolding->status=='C' &&
759            p.caseFolding->simple==p.lowerCase
760        ) {
761            p.caseFolding=NULL;
762        }
763    } else {
764        p.caseFolding=NULL;
765    }
766
767    /* check for non-character code points */
768    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
769        fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
770                (unsigned long)p.code);
771        *pErrorCode=U_PARSE_ERROR;
772        exit(U_PARSE_ERROR);
773    }
774
775    /* check that the code points (p.code) are in ascending order */
776    if(p.code<=prevCode && p.code>0) {
777        fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
778                (unsigned long)p.code, (unsigned long)prevCode);
779        *pErrorCode=U_PARSE_ERROR;
780        exit(U_PARSE_ERROR);
781    }
782
783    /* properties for a single code point */
784    setProps(&p);
785
786    prevCode=p.code;
787}
788
789static void
790parseDB(const char *filename, UErrorCode *pErrorCode) {
791    char *fields[15][2];
792    UChar32 start, end;
793    int32_t i;
794
795    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
796        return;
797    }
798
799    u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
800
801    /* are all sub-properties consumed? */
802    if(specialCasingIndex<specialCasingCount) {
803        fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
804        *pErrorCode=U_PARSE_ERROR;
805        exit(U_PARSE_ERROR);
806    }
807    if(caseFoldingIndex<caseFoldingCount) {
808        fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
809        *pErrorCode=U_PARSE_ERROR;
810        exit(U_PARSE_ERROR);
811    }
812
813    if(U_FAILURE(*pErrorCode)) {
814        return;
815    }
816
817    for(i=0;
818        0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
819        ++i
820    ) {
821        addCaseSensitive(start, end);
822    }
823    if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
824        *pErrorCode=U_ZERO_ERROR;
825    }
826}
827
828/*
829 * Hey, Emacs, please set the following:
830 *
831 * Local Variables:
832 * indent-tabs-mode: nil
833 * End:
834 *
835 */
836