1/*
2*******************************************************************************
3*
4*   Copyright (C) 2004-2005, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  xmlparser.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2004jul21
14*   created by: Andy Heninger
15*
16* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
17* Not suitable for production use. Not supported.
18* Not conformant. Not efficient.
19* But very small.
20*/
21
22#ifndef __XMLPARSER_H__
23#define __XMLPARSER_H__
24
25#include "unicode/uobject.h"
26#include "unicode/unistr.h"
27#include "unicode/regex.h"
28#include "uvector.h"
29#include "hash.h"
30
31#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
32
/**
 * Type tag for a child node of a parsed element.
 * Declared before U_NAMESPACE_BEGIN, i.e. not inside the ICU namespace block.
 */
enum UXMLNodeType {
    /** Node type string (text contents), stored as a UnicodeString. */
    UXML_NODE_TYPE_STRING,
    /** Node type element, stored as a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT,
    /** Number of node types above; a sentinel, not a node type itself. */
    UXML_NODE_TYPE_COUNT
};
40
41U_NAMESPACE_BEGIN
42
43class UXMLParser;
44
45/**
46 * This class represents an element node in a parsed XML tree.
47 */
48class U_TOOLUTIL_API UXMLElement : public UObject {
49public:
50    /**
51     * Destructor.
52     */
53    virtual ~UXMLElement();
54
55    /**
56     * Get the tag name of this element.
57     */
58    const UnicodeString &getTagName() const;
59    /**
60     * Get the text contents of the element.
61     * Append the contents of all text child nodes.
62     * @param recurse If TRUE, also recursively appends the contents of all
63     *        text child nodes of element children.
64     * @return The text contents.
65     */
66    UnicodeString getText(UBool recurse) const;
67    /**
68     * Get the number of attributes.
69     */
70    int32_t countAttributes() const;
71    /**
72     * Get the i-th attribute.
73     * @param i Index of the attribute.
74     * @param name Output parameter, receives the attribute name.
75     * @param value Output parameter, receives the attribute value.
76     * @return A pointer to the attribute value (may be &value or a pointer to an
77     *         internal string object), or NULL if i is out of bounds.
78     */
79    const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
80    /**
81     * Get the value of the attribute with the given name.
82     * @param name Attribute name to be looked up.
83     * @return A pointer to the attribute value, or NULL if this element
84     * does not have this attribute.
85     */
86    const UnicodeString *getAttribute(const UnicodeString &name) const;
87    /**
88     * Get the number of child nodes.
89     */
90    int32_t countChildren() const;
91    /**
92     * Get the i-th child node.
93     * @param i Index of the child node.
94     * @param type The child node type.
95     * @return A pointer to the child node object, or NULL if i is out of bounds.
96     */
97    const UObject *getChild(int32_t i, UXMLNodeType &type) const;
98    /**
99     * Get the next child element node, skipping non-element child nodes.
100     * @param i Enumeration index; initialize to 0 before getting the first child element.
101     * @return A pointer to the next child element, or NULL if there is none.
102     */
103    const UXMLElement *nextChildElement(int32_t &i) const;
104    /**
105     * Get the immediate child element with the given name.
106     * If there are multiple child elements with this name, then return
107     * the first one.
108     * @param name Element name to be looked up.
109     * @return A pointer to the element node, or NULL if this element
110     * does not have this immediate child element.
111     */
112    const UXMLElement *getChildElement(const UnicodeString &name) const;
113
114    /**
115     * ICU "poor man's RTTI", returns a UClassID for the actual class.
116     */
117    virtual UClassID getDynamicClassID() const;
118
119    /**
120     * ICU "poor man's RTTI", returns a UClassID for this class.
121     */
122    static UClassID U_EXPORT2 getStaticClassID();
123
124private:
125    // prevent default construction etc.
126    UXMLElement();
127    UXMLElement(const UXMLElement &other);
128    UXMLElement &operator=(const UXMLElement &other);
129
130    void appendText(UnicodeString &text, UBool recurse) const;
131
132    friend class UXMLParser;
133
134    UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
135
136    const UXMLParser *fParser;
137    const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
138    UnicodeString       fContent;        // The text content of this node.  All element content is
139                                         //   concatenated even when there are intervening nested elements
140                                         //   (which doesn't happen with most xml files we care about)
141                                         //   Sections of content containing only white space are dropped,
142                                         //   which gets rid  the bogus white space content from
143                                         //   elements which are primarily containers for nested elements.
144    UVector             fAttNames;       // A vector containing the names of this element's attributes
145                                         //    The names are UnicodeString objects, owned by the UXMLParser.
146    UVector             fAttValues;      // A vector containing the attribute values for
147                                         //    this element's attributes.  The order is the same
148                                         //    as that of the attribute name vector.
149
150    UVector             fChildren;       // The child nodes of this element (a Vector)
151
152    UXMLElement        *fParent;         // A pointer to the parent element of this element.
153};
154
155/**
156 * A simple XML parser; it is neither efficient nor conformant and only useful for
157 * restricted types of XML documents.
158 *
159 * The parse methods parse whole documents and return the parse trees via their
160 * root elements.
161 */
162class U_TOOLUTIL_API UXMLParser : public UObject {
163public:
164    /**
165     * Create an XML parser.
166     */
167    static UXMLParser *createParser(UErrorCode &errorCode);
168    /**
169     * Destructor.
170     */
171    virtual ~UXMLParser();
172
173    /**
174     * Parse an XML document, create the entire document tree, and
175     * return a pointer to the root element of the parsed tree.
176     * The caller must delete the element.
177     */
178    UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
179    /**
180     * Parse an XML file, create the entire document tree, and
181     * return a pointer to the root element of the parsed tree.
182     * The caller must delete the element.
183     */
184    UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
185
186    /**
187     * ICU "poor man's RTTI", returns a UClassID for the actual class.
188     */
189    virtual UClassID getDynamicClassID() const;
190
191    /**
192     * ICU "poor man's RTTI", returns a UClassID for this class.
193     */
194    static UClassID U_EXPORT2 getStaticClassID();
195
196private:
197    // prevent default construction etc.
198    UXMLParser();
199    UXMLParser(const UXMLParser &other);
200    UXMLParser &operator=(const UXMLParser &other);
201
202    // constructor
203    UXMLParser(UErrorCode &status);
204
205    void           parseMisc(UErrorCode &status);
206    UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
207    void           error(const char *message, UErrorCode &status);
208    UnicodeString  scanContent(UErrorCode &status);
209    void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
210
211    const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
212public:
213    // public for UXMLElement only
214    const UnicodeString *findName(const UnicodeString &s) const;
215private:
216
217    // There is one ICU regex matcher for each of the major XML syntax items
218    //  that are recognized.
219    RegexMatcher mXMLDecl;
220    RegexMatcher mXMLComment;
221    RegexMatcher mXMLSP;
222    RegexMatcher mXMLDoctype;
223    RegexMatcher mXMLPI;
224    RegexMatcher mXMLElemStart;
225    RegexMatcher mXMLElemEnd;
226    RegexMatcher mXMLElemEmpty;
227    RegexMatcher mXMLCharData;
228    RegexMatcher mAttrValue;
229    RegexMatcher mAttrNormalizer;
230    RegexMatcher mNewLineNormalizer;
231    RegexMatcher mAmps;
232
233    Hashtable             fNames;           // interned element/attribute name strings
234    UStack                fElementStack;    // Stack holds the parent elements when nested
235                                            //    elements are being parsed.  All items on this
236                                            //    stack are of type UXMLElement.
237    int32_t               fPos;             // String index of the current scan position in
238                                            //    xml source (in fSrc).
239    UnicodeString         fOneLF;
240};
241
242U_NAMESPACE_END
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
244
245#endif
246