1/*
2*******************************************************************************
3*   Copyright (C) 2011-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ppucd.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2011dec11
12*   created by: Markus W. Scherer
13*/
14
15#include "unicode/utypes.h"
16#include "unicode/uchar.h"
17#include "charstr.h"
18#include "cstring.h"
19#include "ppucd.h"
20#include "uassert.h"
21#include "uparse.h"
22
23#include <stdio.h>
24#include <string.h>
25
26U_NAMESPACE_BEGIN
27
28PropertyNames::~PropertyNames() {}
29
30int32_t
31PropertyNames::getPropertyEnum(const char *name) const {
32    return u_getPropertyEnum(name);
33}
34
35int32_t
36PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
37    return u_getPropertyValueEnum((UProperty)property, name);
38}
39
40UniProps::UniProps()
41        : start(U_SENTINEL), end(U_SENTINEL),
42          bmg(U_SENTINEL), bpb(U_SENTINEL),
43          scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
44          digitValue(-1), numericValue(NULL),
45          name(NULL), nameAlias(NULL) {
46    memset(binProps, 0, sizeof(binProps));
47    memset(intProps, 0, sizeof(intProps));
48    memset(age, 0, 4);
49}
50
51UniProps::~UniProps() {}
52
53const int32_t PreparsedUCD::kNumLineBuffers;
54
55PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
56        : icuPnames(new PropertyNames()), pnames(icuPnames),
57          file(NULL),
58          defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
59          lineNumber(0),
60          lineType(NO_LINE),
61          fieldLimit(NULL), lineLimit(NULL) {
62    if(U_FAILURE(errorCode)) { return; }
63
64    if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
65        filename=NULL;
66        file=stdin;
67    } else {
68        file=fopen(filename, "r");
69    }
70    if(file==NULL) {
71        perror("error opening preparsed UCD");
72        fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
73        errorCode=U_FILE_ACCESS_ERROR;
74        return;
75    }
76
77    memset(ucdVersion, 0, 4);
78    lines[0][0]=0;
79}
80
81PreparsedUCD::~PreparsedUCD() {
82    if(file!=stdin) {
83        fclose(file);
84    }
85    delete icuPnames;
86}
87
// Keywords in the first field of a line, indexed by line type.
// Same order as the LineType values.
// The first two entries are placeholders for line types without a keyword;
// readLine() only compares entries starting at index EMPTY_LINE+1.
// Both the strings and the table itself are read-only.
static const char *const lineTypeStrings[]={
    NULL,
    NULL,
    "ucd",
    "property",
    "binary",
    "value",
    "defaults",
    "block",
    "cp",
    "algnamesrange"
};
101
102PreparsedUCD::LineType
103PreparsedUCD::readLine(UErrorCode &errorCode) {
104    if(U_FAILURE(errorCode)) { return NO_LINE; }
105    // Select the next available line buffer.
106    while(!isLineBufferAvailable(lineIndex)) {
107        ++lineIndex;
108        if (lineIndex == kNumLineBuffers) {
109            lineIndex = 0;
110        }
111    }
112    char *line=lines[lineIndex];
113    *line=0;
114    lineLimit=fieldLimit=line;
115    lineType=NO_LINE;
116    char *result=fgets(line, sizeof(lines[0]), file);
117    if(result==NULL) {
118        if(ferror(file)) {
119            perror("error reading preparsed UCD");
120            fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
121            errorCode=U_FILE_ACCESS_ERROR;
122        }
123        return NO_LINE;
124    }
125    ++lineNumber;
126    if(*line=='#') {
127        fieldLimit=strchr(line, 0);
128        return lineType=EMPTY_LINE;
129    }
130    // Remove trailing /r/n.
131    char c;
132    char *limit=strchr(line, 0);
133    while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
134    // Remove trailing white space.
135    while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
136    *limit=0;
137    lineLimit=limit;
138    if(line==limit) {
139        fieldLimit=limit;
140        return lineType=EMPTY_LINE;
141    }
142    // Split by ';'.
143    char *semi=line;
144    while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
145    fieldLimit=strchr(line, 0);
146    // Determine the line type.
147    int32_t type;
148    for(type=EMPTY_LINE+1;; ++type) {
149        if(type==LINE_TYPE_COUNT) {
150            fprintf(stderr,
151                    "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
152                    line, (long)lineNumber);
153            errorCode=U_PARSE_ERROR;
154            return NO_LINE;
155        }
156        if(0==strcmp(line, lineTypeStrings[type])) {
157            break;
158        }
159    }
160    lineType=(LineType)type;
161    if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
162        u_versionFromString(ucdVersion, fieldLimit+1);
163    }
164    return lineType;
165}
166
167const char *
168PreparsedUCD::firstField() {
169    char *field=lines[lineIndex];
170    fieldLimit=strchr(field, 0);
171    return field;
172}
173
174const char *
175PreparsedUCD::nextField() {
176    if(fieldLimit==lineLimit) { return NULL; }
177    char *field=fieldLimit+1;
178    fieldLimit=strchr(field, 0);
179    return field;
180}
181
182const UniProps *
183PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
184    if(U_FAILURE(errorCode)) { return NULL; }
185    newValues.clear();
186    if(!lineHasPropertyValues()) {
187        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
188        return NULL;
189    }
190    firstField();
191    const char *field=nextField();
192    if(field==NULL) {
193        // No range field after the type.
194        fprintf(stderr,
195                "error in preparsed UCD: missing default/block/cp range field "
196                "(no second field) on line %ld\n",
197                (long)lineNumber);
198        errorCode=U_PARSE_ERROR;
199        return NULL;
200    }
201    UChar32 start, end;
202    if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
203    UniProps *props;
204    switch(lineType) {
205    case DEFAULTS_LINE:
206        if(defaultLineIndex>=0) {
207            fprintf(stderr,
208                    "error in preparsed UCD: second line with default properties on line %ld\n",
209                    (long)lineNumber);
210            errorCode=U_PARSE_ERROR;
211            return NULL;
212        }
213        if(start!=0 || end!=0x10ffff) {
214            fprintf(stderr,
215                    "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
216                    field, (long)lineNumber);
217            errorCode=U_PARSE_ERROR;
218            return NULL;
219        }
220        props=&defaultProps;
221        defaultLineIndex=lineIndex;
222        break;
223    case BLOCK_LINE:
224        blockProps=defaultProps;  // Block inherits default properties.
225        props=&blockProps;
226        blockLineIndex=lineIndex;
227        break;
228    case CP_LINE:
229        if(blockProps.start<=start && end<=blockProps.end) {
230            // Code point range fully inside the last block inherits the block properties.
231            cpProps=blockProps;
232        } else if(start>blockProps.end || end<blockProps.start) {
233            // Code point range fully outside the last block inherits the default properties.
234            cpProps=defaultProps;
235        } else {
236            // Code point range partially overlapping with the last block is illegal.
237            fprintf(stderr,
238                    "error in preparsed UCD: cp range %s on line %ld only "
239                    "partially overlaps with block range %04lX..%04lX\n",
240                    field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
241            errorCode=U_PARSE_ERROR;
242            return NULL;
243        }
244        props=&cpProps;
245        break;
246    default:
247        // Will not occur because of the range check above.
248        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
249        return NULL;
250    }
251    props->start=start;
252    props->end=end;
253    while((field=nextField())!=NULL) {
254        if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
255    }
256    return props;
257}
258
259static const struct {
260    const char *name;
261    int32_t prop;
262} ppucdProperties[]={
263    { "Name_Alias", PPUCD_NAME_ALIAS },
264    { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
265    { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
266};
267
268// Returns TRUE for "ok to continue parsing fields".
269UBool
270PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
271                            UErrorCode &errorCode) {
272    CharString pBuffer;
273    const char *p=field;
274    const char *v=strchr(p, '=');
275    int binaryValue;
276    if(*p=='-') {
277        if(v!=NULL) {
278            fprintf(stderr,
279                    "error in preparsed UCD: mix of binary-property-no and "
280                    "enum-property syntax '%s' on line %ld\n",
281                    field, (long)lineNumber);
282            errorCode=U_PARSE_ERROR;
283            return FALSE;
284        }
285        binaryValue=0;
286        ++p;
287    } else if(v==NULL) {
288        binaryValue=1;
289    } else {
290        binaryValue=-1;
291        // Copy out the property name rather than modifying the field (writing a NUL).
292        pBuffer.append(p, (int32_t)(v-p), errorCode);
293        p=pBuffer.data();
294        ++v;
295    }
296    int32_t prop=pnames->getPropertyEnum(p);
297    if(prop<0) {
298        for(int32_t i=0;; ++i) {
299            if(i==UPRV_LENGTHOF(ppucdProperties)) {
300                // Ignore unknown property names.
301                return TRUE;
302            }
303            if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
304                prop=ppucdProperties[i].prop;
305                U_ASSERT(prop>=0);
306                break;
307            }
308        }
309    }
310    if(prop<UCHAR_BINARY_LIMIT) {
311        if(binaryValue>=0) {
312            props.binProps[prop]=(UBool)binaryValue;
313        } else {
314            // No binary value for a binary property.
315            fprintf(stderr,
316                    "error in preparsed UCD: enum-property syntax '%s' "
317                    "for binary property on line %ld\n",
318                    field, (long)lineNumber);
319            errorCode=U_PARSE_ERROR;
320        }
321    } else if(binaryValue>=0) {
322        // Binary value for a non-binary property.
323        fprintf(stderr,
324                "error in preparsed UCD: binary-property syntax '%s' "
325                "for non-binary property on line %ld\n",
326                field, (long)lineNumber);
327        errorCode=U_PARSE_ERROR;
328    } else if (prop < UCHAR_INT_START) {
329        fprintf(stderr,
330                "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
331                prop, (long)lineNumber);
332        errorCode=U_PARSE_ERROR;
333    } else if(prop<UCHAR_INT_LIMIT) {
334        int32_t value=pnames->getPropertyValueEnum(prop, v);
335        if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
336            // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
337            char *end;
338            unsigned long ccc=uprv_strtoul(v, &end, 10);
339            if(v<end && *end==0 && ccc<=254) {
340                value=(int32_t)ccc;
341            }
342        }
343        if(value==UCHAR_INVALID_CODE) {
344            fprintf(stderr,
345                    "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
346                    field, (long)lineNumber);
347            errorCode=U_PARSE_ERROR;
348        } else {
349            props.intProps[prop-UCHAR_INT_START]=value;
350        }
351    } else if(*v=='<') {
352        // Do not parse default values like <code point>, just set null values.
353        switch(prop) {
354        case UCHAR_BIDI_MIRRORING_GLYPH:
355            props.bmg=U_SENTINEL;
356            break;
357        case UCHAR_BIDI_PAIRED_BRACKET:
358            props.bpb=U_SENTINEL;
359            break;
360        case UCHAR_SIMPLE_CASE_FOLDING:
361            props.scf=U_SENTINEL;
362            break;
363        case UCHAR_SIMPLE_LOWERCASE_MAPPING:
364            props.slc=U_SENTINEL;
365            break;
366        case UCHAR_SIMPLE_TITLECASE_MAPPING:
367            props.stc=U_SENTINEL;
368            break;
369        case UCHAR_SIMPLE_UPPERCASE_MAPPING:
370            props.suc=U_SENTINEL;
371            break;
372        case UCHAR_CASE_FOLDING:
373            props.cf.remove();
374            break;
375        case UCHAR_LOWERCASE_MAPPING:
376            props.lc.remove();
377            break;
378        case UCHAR_TITLECASE_MAPPING:
379            props.tc.remove();
380            break;
381        case UCHAR_UPPERCASE_MAPPING:
382            props.uc.remove();
383            break;
384        case UCHAR_SCRIPT_EXTENSIONS:
385            props.scx.clear();
386            break;
387        default:
388            fprintf(stderr,
389                    "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
390                    field, (long)lineNumber);
391            errorCode=U_PARSE_ERROR;
392        }
393    } else {
394        char c;
395        switch(prop) {
396        case UCHAR_NUMERIC_VALUE:
397            props.numericValue=v;
398            c=*v;
399            if('0'<=c && c<='9' && v[1]==0) {
400                props.digitValue=c-'0';
401            } else {
402                props.digitValue=-1;
403            }
404            break;
405        case UCHAR_NAME:
406            props.name=v;
407            break;
408        case UCHAR_AGE:
409            u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
410            break;
411        case UCHAR_BIDI_MIRRORING_GLYPH:
412            props.bmg=parseCodePoint(v, errorCode);
413            break;
414        case UCHAR_BIDI_PAIRED_BRACKET:
415            props.bpb=parseCodePoint(v, errorCode);
416            break;
417        case UCHAR_SIMPLE_CASE_FOLDING:
418            props.scf=parseCodePoint(v, errorCode);
419            break;
420        case UCHAR_SIMPLE_LOWERCASE_MAPPING:
421            props.slc=parseCodePoint(v, errorCode);
422            break;
423        case UCHAR_SIMPLE_TITLECASE_MAPPING:
424            props.stc=parseCodePoint(v, errorCode);
425            break;
426        case UCHAR_SIMPLE_UPPERCASE_MAPPING:
427            props.suc=parseCodePoint(v, errorCode);
428            break;
429        case UCHAR_CASE_FOLDING:
430            parseString(v, props.cf, errorCode);
431            break;
432        case UCHAR_LOWERCASE_MAPPING:
433            parseString(v, props.lc, errorCode);
434            break;
435        case UCHAR_TITLECASE_MAPPING:
436            parseString(v, props.tc, errorCode);
437            break;
438        case UCHAR_UPPERCASE_MAPPING:
439            parseString(v, props.uc, errorCode);
440            break;
441        case PPUCD_NAME_ALIAS:
442            props.nameAlias=v;
443            break;
444        case PPUCD_CONDITIONAL_CASE_MAPPINGS:
445        case PPUCD_TURKIC_CASE_FOLDING:
446            // No need to parse their values: They are hardcoded in the runtime library.
447            break;
448        case UCHAR_SCRIPT_EXTENSIONS:
449            parseScriptExtensions(v, props.scx, errorCode);
450            break;
451        default:
452            // Ignore unhandled properties.
453            return TRUE;
454        }
455    }
456    if(U_SUCCESS(errorCode)) {
457        newValues.add((UChar32)prop);
458        return TRUE;
459    } else {
460        return FALSE;
461    }
462}
463
464UBool
465PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
466    if(U_FAILURE(errorCode)) { return FALSE; }
467    if(lineType!=ALG_NAMES_RANGE_LINE) {
468        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
469        return FALSE;
470    }
471    firstField();
472    const char *field=nextField();
473    if(field==NULL) {
474        // No range field after the type.
475        fprintf(stderr,
476                "error in preparsed UCD: missing algnamesrange range field "
477                "(no second field) on line %ld\n",
478                (long)lineNumber);
479        errorCode=U_PARSE_ERROR;
480        return FALSE;
481    }
482    return parseCodePointRange(field, start, end, errorCode);
483}
484
485UChar32
486PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
487    char *end;
488    uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
489    if(end<=s || *end!=0 || value>=0x110000) {
490        fprintf(stderr,
491                "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
492                s, (long)lineNumber);
493        errorCode=U_PARSE_ERROR;
494        return U_SENTINEL;
495    }
496    return (UChar32)value;
497}
498
499UBool
500PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
501    uint32_t st, e;
502    u_parseCodePointRange(s, &st, &e, &errorCode);
503    if(U_FAILURE(errorCode)) {
504        fprintf(stderr,
505                "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
506                s, (long)lineNumber);
507        return FALSE;
508    }
509    start=(UChar32)st;
510    end=(UChar32)e;
511    return TRUE;
512}
513
514void
515PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
516    UChar *buffer=uni.getBuffer(-1);
517    int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
518    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
519        errorCode=U_ZERO_ERROR;
520        uni.releaseBuffer(0);
521        buffer=uni.getBuffer(length);
522        length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
523    }
524    uni.releaseBuffer(length);
525    if(U_FAILURE(errorCode)) {
526        fprintf(stderr,
527                "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
528                s, (long)lineNumber);
529    }
530}
531
532void
533PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
534    if(U_FAILURE(errorCode)) { return; }
535    scx.clear();
536    CharString scString;
537    for(;;) {
538        const char *scs;
539        const char *scLimit=strchr(s, ' ');
540        if(scLimit!=NULL) {
541            scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
542            if(U_FAILURE(errorCode)) { return; }
543        } else {
544            scs=s;
545        }
546        int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
547        if(script==UCHAR_INVALID_CODE) {
548            fprintf(stderr,
549                    "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
550                    scs, (long)lineNumber);
551            errorCode=U_PARSE_ERROR;
552            return;
553        } else if(scx.contains(script)) {
554            fprintf(stderr,
555                    "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
556                    scs, (long)lineNumber);
557            errorCode=U_PARSE_ERROR;
558            return;
559        } else {
560            scx.add(script);
561        }
562        if(scLimit!=NULL) {
563            s=scLimit+1;
564        } else {
565            break;
566        }
567    }
568    if(scx.isEmpty()) {
569        fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
570        errorCode=U_PARSE_ERROR;
571    }
572}
573
574U_NAMESPACE_END
575