1/*
2*******************************************************************************
3*   Copyright (C) 2011-2013, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ppucd.h
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2011dec11
12*   created by: Markus W. Scherer
13*/
14
15#ifndef __PPUCD_H__
16#define __PPUCD_H__
17
18#include "unicode/utypes.h"
19#include "unicode/uniset.h"
20#include "unicode/unistr.h"
21
22#include <stdio.h>
23
24/** Additions to the uchar.h enum UProperty. */
25enum {
26    /** Name_Alias */
27    PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
28    PPUCD_CONDITIONAL_CASE_MAPPINGS,
29    PPUCD_TURKIC_CASE_FOLDING
30};
31
32U_NAMESPACE_BEGIN
33
/**
 * Maps property names and property value names to integer codes.
 * All members are virtual so that a different implementation can be
 * supplied, for example via PreparsedUCD::setPropertyNames().
 */
class U_TOOLUTIL_API PropertyNames {
public:
    virtual ~PropertyNames();
    /**
     * Returns the integer code for the property with the given name.
     * NOTE(review): the value returned for an unknown name is defined by the
     * implementation, which is not visible in this header.
     */
    virtual int32_t getPropertyEnum(const char *name) const;
    /**
     * Returns the integer code for the named value of the given property.
     * NOTE(review): the value returned for an unknown name is defined by the
     * implementation, which is not visible in this header.
     */
    virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
};
40
41struct U_TOOLUTIL_API UniProps {
42    UniProps();
43    ~UniProps();
44
45    int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
46
47    UChar32 start, end;
48    UBool binProps[UCHAR_BINARY_LIMIT];
49    int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
50    UVersionInfo age;
51    UChar32 bmg, bpb;
52    UChar32 scf, slc, stc, suc;
53    int32_t digitValue;
54    const char *numericValue;
55    const char *name;
56    const char *nameAlias;
57    UnicodeString cf, lc, tc, uc;
58    UnicodeSet scx;
59};
60
61class U_TOOLUTIL_API PreparsedUCD {
62public:
63    enum LineType {
64        /** No line, end of file. */
65        NO_LINE,
66        /** Empty line. (Might contain a comment.) */
67        EMPTY_LINE,
68
69        /** ucd;6.1.0 */
70        UNICODE_VERSION_LINE,
71
72        /** property;Binary;Alpha;Alphabetic */
73        PROPERTY_LINE,
74        /** binary;N;No;F;False */
75        BINARY_LINE,
76        /** value;gc;Zs;Space_Separator */
77        VALUE_LINE,
78
79        /** defaults;0000..10FFFF;age=NA;bc=L;... */
80        DEFAULTS_LINE,
81        /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
82        BLOCK_LINE,
83        /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
84        CP_LINE,
85
86        /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
87        ALG_NAMES_RANGE_LINE,
88
89        LINE_TYPE_COUNT
90    };
91
92    /**
93     * Constructor.
94     * Prepare this object for a new, empty package.
95     */
96    PreparsedUCD(const char *filename, UErrorCode &errorCode);
97
98    /** Destructor. */
99    ~PreparsedUCD();
100
101    /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
102    void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
103
104    /**
105     * Reads a line from the preparsed UCD file.
106     * Splits the line by replacing each ';' with a NUL.
107     */
108    LineType readLine(UErrorCode &errorCode);
109
110    /** Returns the number of the line read by readLine(). */
111    int32_t getLineNumber() const { return lineNumber; }
112
113    /** Returns the line's next field, or NULL. */
114    const char *nextField();
115
116    /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
117    const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
118
119    /** Returns TRUE if the current line has property values. */
120    UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; }
121
122    /**
123     * Parses properties from the current line.
124     * Clears newValues and sets UProperty codes for property values mentioned
125     * on the current line (as opposed to being inherited).
126     * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
127     * The returned UniProps are usable until the next line of the same type is read.
128     */
129    const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
130
131    /**
132     * Returns the code point range for the current algnamesrange line.
133     * Calls & parses nextField().
134     * Further nextField() calls will yield the range's type & prefix string.
135     * Returns U_SUCCESS(errorCode).
136     */
137    UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
138
139private:
140    UBool isLineBufferAvailable(int32_t i) {
141        return defaultLineIndex!=i && blockLineIndex!=i;
142    }
143
144    /** Resets the field iterator and returns the line's first field (the line type field). */
145    const char *firstField();
146
147    UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
148                        UErrorCode &errorCode);
149    UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
150    UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
151    void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
152    void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
153
154    static const int32_t kNumLineBuffers=3;
155
156    PropertyNames *icuPnames;  // owned
157    const PropertyNames *pnames;  // aliased
158    FILE *file;
159    int32_t defaultLineIndex, blockLineIndex, lineIndex;
160    int32_t lineNumber;
161    LineType lineType;
162    char *fieldLimit;
163    char *lineLimit;
164
165    UVersionInfo ucdVersion;
166    UniProps defaultProps, blockProps, cpProps;
167    // Multiple lines so that default and block properties can maintain pointers
168    // into their line buffers.
169    char lines[kNumLineBuffers][4096];
170};
171
172U_NAMESPACE_END
173
174#endif  // __PPUCD_H__
175