/*
*******************************************************************************
*   Copyright (C) 2011-2013, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ppucd.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2011dec11
*   created by: Markus W. Scherer
*/
146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifndef __PPUCD_H__
166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define __PPUCD_H__
176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h"
196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uniset.h"
206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/unistr.h"
216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include <stdio.h>
236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** Additions to the uchar.h enum UProperty. */
256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgenum {
266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /** Name_Alias */
276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    PPUCD_CONDITIONAL_CASE_MAPPINGS,
296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    PPUCD_TURKIC_CASE_FOLDING
306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org};
316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN
336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass U_TOOLUTIL_API PropertyNames {
356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic:
366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    virtual ~PropertyNames();
376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    virtual int32_t getPropertyEnum(const char *name) const;
386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org};
406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgstruct U_TOOLUTIL_API UniProps {
426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UniProps();
436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ~UniProps();
446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar32 start, end;
486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UBool binProps[UCHAR_BINARY_LIMIT];
496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UVersionInfo age;
516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar32 bmg, bpb;
526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar32 scf, slc, stc, suc;
536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t digitValue;
546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *numericValue;
556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *name;
566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *nameAlias;
576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeString cf, lc, tc, uc;
586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UnicodeSet scx;
596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org};
606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass U_TOOLUTIL_API PreparsedUCD {
626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic:
636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    enum LineType {
646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** No line, end of file. */
656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        NO_LINE,
666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** Empty line. (Might contain a comment.) */
676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        EMPTY_LINE,
686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** ucd;6.1.0 */
706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        UNICODE_VERSION_LINE,
716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** property;Binary;Alpha;Alphabetic */
736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        PROPERTY_LINE,
746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** binary;N;No;F;False */
756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        BINARY_LINE,
766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** value;gc;Zs;Space_Separator */
776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        VALUE_LINE,
786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** defaults;0000..10FFFF;age=NA;bc=L;... */
806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        DEFAULTS_LINE,
816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        BLOCK_LINE,
836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        CP_LINE,
856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        ALG_NAMES_RANGE_LINE,
886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        LINE_TYPE_COUNT
906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    };
916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Constructor.
946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Prepare this object for a new, empty package.
956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    PreparsedUCD(const char *filename, UErrorCode &errorCode);
976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /** Destructor. */
996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    ~PreparsedUCD();
1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Reads a line from the preparsed UCD file.
1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Splits the line by replacing each ';' with a NUL.
1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    LineType readLine(UErrorCode &errorCode);
1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /** Returns the number of the line read by readLine(). */
1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t getLineNumber() const { return lineNumber; }
1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /** Returns the line's next field, or NULL. */
1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *nextField();
1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /** Returns TRUE if the current line has property values. */
1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; }
1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Parses properties from the current line.
1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Clears newValues and sets UProperty codes for property values mentioned
1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * on the current line (as opposed to being inherited).
1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * The returned UniProps are usable until the next line of the same type is read.
1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /**
1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Returns the code point range for the current algnamesrange line.
1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Calls & parses nextField().
1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Further nextField() calls will yield the range's type & prefix string.
1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     * Returns U_SUCCESS(errorCode).
1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org     */
1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprivate:
1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UBool isLineBufferAvailable(int32_t i) {
1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org        return defaultLineIndex!=i && blockLineIndex!=i;
1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    }
1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    /** Resets the field iterator and returns the line's first field (the line type field). */
1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const char *firstField();
1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org                        UErrorCode &errorCode);
1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    static const int32_t kNumLineBuffers=3;
1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    PropertyNames *icuPnames;  // owned
1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    const PropertyNames *pnames;  // aliased
1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    FILE *file;
1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t defaultLineIndex, blockLineIndex, lineIndex;
1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    int32_t lineNumber;
1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    LineType lineType;
1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *fieldLimit;
1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char *lineLimit;
1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UVersionInfo ucdVersion;
1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    UniProps defaultProps, blockProps, cpProps;
1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // Multiple lines so that default and block properties can maintain pointers
1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    // into their line buffers.
1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org    char lines[kNumLineBuffers][4096];
1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org};
1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END
1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org
1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif  // __PPUCD_H__
175