1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2004-2010, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  xmlparser.cpp
11*   encoding:   UTF-8
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2004jul21
16*   created by: Andy Heninger
17*/
18
19#include <stdio.h>
20#include "unicode/uchar.h"
21#include "unicode/ucnv.h"
22#include "unicode/regex.h"
23#include "filestrm.h"
24#include "xmlparser.h"
25
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
27
// Character constants: code points of the ASCII characters that the parser
// substitutes for the predefined entities, plus the letter 'l' that is used
// to locate the end of "<?xml" in an XML declaration.
enum {
    x_QUOT=0x22,    // '"'
    x_AMP=0x26,     // '&'
    x_APOS=0x27,    // '\''
    x_LT=0x3c,      // '<'
    x_GT=0x3e,      // '>'
    x_l=0x6c        // 'l'
};
37
// Regular expression fragments, as string literals, that are pasted into the
// patterns compiled by the UXMLParser constructor.

// XML white space: a set matching one space, tab, carriage return or line feed.
#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4: a set matching one character that may start a name.
#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

//  XML #5: a set matching one character that may appear after the first
//          character of a name (name start characters plus - . digits etc.)
#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

//  XML #6: a complete name, one name start character followed by
//          zero or more name characters.
#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
51
52U_NAMESPACE_BEGIN
53
// Expand the ICU RTTI implementation macro for the two classes implemented in
// this file.  (NOTE(review): presumably this defines the class ID accessors
// declared in xmlparser.h - the macro definition is not visible from here.)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
56
57//
58//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
59//                             used for parsing.
60//
61UXMLParser::UXMLParser(UErrorCode &status) :
62      //  XML Declaration.  XML Production #23.
63      //      example:  "<?xml version=1.0 encoding="utf-16" ?>
64      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
65      //            allow for a possible leading BOM.
66      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
67
68      //  XML Comment   production #15
69      //     example:  "<!-- whatever -->
70      //       note, does not detect an illegal "--" within comments
71      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
72
73      //  XML Spaces
74      //      production [3]
75      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
76
77      //  XML Doctype decl  production #28
78      //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
79      //       or      "<!DOCTYPE foo [internal dtd]>
80      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
81      //           Some internal dtd subsets could confuse this simple-minded
82      //           attempt at skipping over them, specifically, occcurences
83      //           of closeing square brackets.  These could appear in comments,
84      //           or in parameter entity declarations, for example.
85      mXMLDoctype(UnicodeString(
86           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
87           ), 0, status),
88
89      //  XML PI     production #16
90      //     example   "<?target stuff?>
91      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
92
93      //  XML Element Start   Productions #40, #41
94      //          example   <foo att1='abc'  att2="d e f" >
95      //      capture #1:  the tag name
96      //
97      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
98          "(?:"
99                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
100                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
101          ")*"                                                             //   * for zero or more attributes.
102          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
103
104      //  XML Element End     production #42
105      //     example   </foo>
106      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
107
108      // XML Element Empty    production #44
109      //     example   <foo att1="abc"   att2="d e f" />
110      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
111          "(?:"
112                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
113                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
114          ")*"                                                             //   * for zero or more attributes.
115          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
116
117
118      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
119      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
120
121      // Attribute name = "value".  XML Productions 10, 40/41
122      //  Capture group 1 is name,
123      //                2 is the attribute value, including the quotes.
124      //
125      //   Note that attributes are scanned twice.  The first time is with
126      //        the regex for an entire element start.  There, the attributes
127      //        are checked syntactically, but not separted out one by one.
128      //        Here, we match a single attribute, and make its name and
129      //        attribute value available to the parser code.
130      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
131         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
132
133
134      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
135
136      // Match any of the new-line sequences in content.
137      //   All are changed to \u000a.
138      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
139
140      // & char references
141      //   We will figure out what we've got based on which capture group has content.
142      //   The last one is a catchall for unrecognized entity references..
143      //             1     2     3      4      5           6                    7          8
144      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
145                0, status),
146
147      fNames(status),
148      fElementStack(status),
149      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
150      {
151      }
152
153UXMLParser *
154UXMLParser::createParser(UErrorCode &errorCode) {
155    if (U_FAILURE(errorCode)) {
156        return NULL;
157    } else {
158        return new UXMLParser(errorCode);
159    }
160}
161
// Destructor.  Has no work of its own; the members clean up after themselves.
UXMLParser::~UXMLParser() {}
163
164UXMLElement *
165UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
166    char bytes[4096], charsetBuffer[100];
167    FileStream *f;
168    const char *charset, *pb;
169    UnicodeString src;
170    UConverter *cnv;
171    UChar *buffer, *pu;
172    int32_t fileLength, bytesLength, length, capacity;
173    UBool flush;
174
175    if(U_FAILURE(errorCode)) {
176        return NULL;
177    }
178
179    f=T_FileStream_open(filename, "rb");
180    if(f==NULL) {
181        errorCode=U_FILE_ACCESS_ERROR;
182        return NULL;
183    }
184
185    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
186    if(bytesLength<(int32_t)sizeof(bytes)) {
187        // we have already read the entire file
188        fileLength=bytesLength;
189    } else {
190        // get the file length
191        fileLength=T_FileStream_size(f);
192    }
193
194    /*
195     * get the charset:
196     * 1. Unicode signature
197     * 2. treat as ISO-8859-1 and read XML encoding="charser"
198     * 3. default to UTF-8
199     */
200    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
201    if(U_SUCCESS(errorCode) && charset!=NULL) {
202        // open converter according to Unicode signature
203        cnv=ucnv_open(charset, &errorCode);
204    } else {
205        // read as Latin-1 and parse the XML declaration and encoding
206        cnv=ucnv_open("ISO-8859-1", &errorCode);
207        if(U_FAILURE(errorCode)) {
208            // unexpected error opening Latin-1 converter
209            goto exit;
210        }
211
212        buffer=toUCharPtr(src.getBuffer(bytesLength));
213        if(buffer==NULL) {
214            // unexpected failure to reserve some string capacity
215            errorCode=U_MEMORY_ALLOCATION_ERROR;
216            goto exit;
217        }
218        pb=bytes;
219        pu=buffer;
220        ucnv_toUnicode(
221            cnv,
222            &pu, buffer+src.getCapacity(),
223            &pb, bytes+bytesLength,
224            NULL, TRUE, &errorCode);
225        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
226        ucnv_close(cnv);
227        cnv=NULL;
228        if(U_FAILURE(errorCode)) {
229            // unexpected error in conversion from Latin-1
230            src.remove();
231            goto exit;
232        }
233
234        // parse XML declaration
235        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
236            int32_t declEnd=mXMLDecl.end(errorCode);
237            // go beyond <?xml
238            int32_t pos=src.indexOf((UChar)x_l)+1;
239
240            mAttrValue.reset(src);
241            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
242                UnicodeString attName  = mAttrValue.group(1, errorCode);
243                UnicodeString attValue = mAttrValue.group(2, errorCode);
244
245                // Trim the quotes from the att value.  These are left over from the original regex
246                //   that parsed the attribue, which couldn't conveniently strip them.
247                attValue.remove(0,1);                    // one char from the beginning
248                attValue.truncate(attValue.length()-1);  // and one from the end.
249
250                if(attName==UNICODE_STRING("encoding", 8)) {
251                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
252                    charset=charsetBuffer;
253                    break;
254                }
255                pos = mAttrValue.end(2, errorCode);
256            }
257
258            if(charset==NULL) {
259                // default to UTF-8
260                charset="UTF-8";
261            }
262            cnv=ucnv_open(charset, &errorCode);
263        }
264    }
265
266    if(U_FAILURE(errorCode)) {
267        // unable to open the converter
268        goto exit;
269    }
270
271    // convert the file contents
272    capacity=fileLength;        // estimated capacity
273    src.getBuffer(capacity);
274    src.releaseBuffer(0);       // zero length
275    flush=FALSE;
276    for(;;) {
277        // convert contents of bytes[bytesLength]
278        pb=bytes;
279        for(;;) {
280            length=src.length();
281            buffer=toUCharPtr(src.getBuffer(capacity));
282            if(buffer==NULL) {
283                // unexpected failure to reserve some string capacity
284                errorCode=U_MEMORY_ALLOCATION_ERROR;
285                goto exit;
286            }
287
288            pu=buffer+length;
289            ucnv_toUnicode(
290                cnv, &pu, buffer+src.getCapacity(),
291                &pb, bytes+bytesLength,
292                NULL, FALSE, &errorCode);
293            src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
294            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
295                errorCode=U_ZERO_ERROR;
296                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
297            } else {
298                break;
299            }
300        }
301
302        if(U_FAILURE(errorCode)) {
303            break; // conversion error
304        }
305
306        if(flush) {
307            break; // completely converted the file
308        }
309
310        // read next block
311        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
312        if(bytesLength==0) {
313            // reached end of file, convert once more to flush the converter
314            flush=TRUE;
315        }
316    };
317
318exit:
319    ucnv_close(cnv);
320    T_FileStream_close(f);
321
322    if(U_SUCCESS(errorCode)) {
323        return parse(src, errorCode);
324    } else {
325        return NULL;
326    }
327}
328
329UXMLElement *
330UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
331    if(U_FAILURE(status)) {
332        return NULL;
333    }
334
335    UXMLElement   *root = NULL;
336    fPos = 0; // TODO use just a local pos variable and pass it into functions
337              // where necessary?
338
339    // set all matchers to work on the input string
340    mXMLDecl.reset(src);
341    mXMLComment.reset(src);
342    mXMLSP.reset(src);
343    mXMLDoctype.reset(src);
344    mXMLPI.reset(src);
345    mXMLElemStart.reset(src);
346    mXMLElemEnd.reset(src);
347    mXMLElemEmpty.reset(src);
348    mXMLCharData.reset(src);
349    mAttrValue.reset(src);
350    mAttrNormalizer.reset(src);
351    mNewLineNormalizer.reset(src);
352    mAmps.reset(src);
353
354    // Consume the XML Declaration, if present.
355    if (mXMLDecl.lookingAt(fPos, status)) {
356        fPos = mXMLDecl.end(status);
357    }
358
359    // Consume "misc" [XML production 27] appearing before DocType
360    parseMisc(status);
361
362    // Consume a DocType declaration, if present.
363    if (mXMLDoctype.lookingAt(fPos, status)) {
364        fPos = mXMLDoctype.end(status);
365    }
366
367    // Consume additional "misc" [XML production 27] appearing after the DocType
368    parseMisc(status);
369
370    // Get the root element
371    if (mXMLElemEmpty.lookingAt(fPos, status)) {
372        // Root is an empty element (no nested elements or content)
373        root = createElement(mXMLElemEmpty, status);
374        fPos = mXMLElemEmpty.end(status);
375    } else {
376        if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
377            error("Root Element expected", status);
378            goto errorExit;
379        }
380        root = createElement(mXMLElemStart, status);
381        UXMLElement  *el = root;
382
383        //
384        // This is the loop that consumes the root element of the document,
385        //      including all nested content.   Nested elements are handled by
386        //      explicit pushes/pops of the element stack; there is no recursion
387        //      in the control flow of this code.
388        //      "el" always refers to the current element, the one to which content
389        //      is being added.  It is above the top of the element stack.
390        for (;;) {
391            // Nested Element Start
392            if (mXMLElemStart.lookingAt(fPos, status)) {
393                UXMLElement *t = createElement(mXMLElemStart, status);
394                el->fChildren.addElement(t, status);
395                t->fParent = el;
396                fElementStack.push(el, status);
397                el = t;
398                continue;
399            }
400
401            // Text Content.  String is concatenated onto the current node's content,
402            //                but only if it contains something other than spaces.
403            UnicodeString s = scanContent(status);
404            if (s.length() > 0) {
405                mXMLSP.reset(s);
406                if (mXMLSP.matches(status) == FALSE) {
407                    // This chunk of text contains something other than just
408                    //  white space. Make a child node for it.
409                    replaceCharRefs(s, status);
410                    el->fChildren.addElement(s.clone(), status);
411                }
412                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
413                continue;
414            }
415
416            // Comments.  Discard.
417            if (mXMLComment.lookingAt(fPos, status)) {
418                fPos = mXMLComment.end(status);
419                continue;
420            }
421
422            // PIs.  Discard.
423            if (mXMLPI.lookingAt(fPos, status)) {
424                fPos = mXMLPI.end(status);
425                continue;
426            }
427
428            // Element End
429            if (mXMLElemEnd.lookingAt(fPos, status)) {
430                fPos = mXMLElemEnd.end(0, status);
431                const UnicodeString name = mXMLElemEnd.group(1, status);
432                if (name != *el->fName) {
433                    error("Element start / end tag mismatch", status);
434                    goto errorExit;
435                }
436                if (fElementStack.empty()) {
437                    // Close of the root element.  We're done with the doc.
438                    el = NULL;
439                    break;
440                }
441                el = (UXMLElement *)fElementStack.pop();
442                continue;
443            }
444
445            // Empty Element.  Stored as a child of the current element, but not stacked.
446            if (mXMLElemEmpty.lookingAt(fPos, status)) {
447                UXMLElement *t = createElement(mXMLElemEmpty, status);
448                el->fChildren.addElement(t, status);
449                continue;
450            }
451
452            // Hit something within the document that doesn't match anything.
453            //   It's an error.
454            error("Unrecognized markup", status);
455            break;
456        }
457
458        if (el != NULL || !fElementStack.empty()) {
459            // We bailed out early, for some reason.
460            error("Root element not closed.", status);
461            goto errorExit;
462        }
463    }
464
465    // Root Element parse is complete.
466    // Consume the annoying xml "Misc" that can appear at the end of the doc.
467    parseMisc(status);
468
469    // We should have reached the end of the input
470    if (fPos != src.length()) {
471        error("Extra content at the end of the document", status);
472        goto errorExit;
473    }
474
475    // Success!
476    return root;
477
478errorExit:
479    delete root;
480    return NULL;
481}
482
483//
484//  createElement
485//      We've just matched an element start tag.  Create and fill in a UXMLElement object
486//      for it.
487//
488UXMLElement *
489UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
490    // First capture group is the element's name.
491    UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
492
493    // Scan for attributes.
494    int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
495
496    while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
497        UnicodeString attName  = mAttrValue.group(1, status);
498        UnicodeString attValue = mAttrValue.group(2, status);
499
500        // Trim the quotes from the att value.  These are left over from the original regex
501        //   that parsed the attribue, which couldn't conveniently strip them.
502        attValue.remove(0,1);                    // one char from the beginning
503        attValue.truncate(attValue.length()-1);  // and one from the end.
504
505        // XML Attribue value normalization.
506        // This is one of the really screwy parts of the XML spec.
507        // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
508        // Note that non-validating parsers must treat all entities as type CDATA
509        //   which simplifies things some.
510
511        // Att normalization step 1:  normalize any newlines in the attribute value
512        mNewLineNormalizer.reset(attValue);
513        attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
514
515        // Next change all xml white space chars to plain \u0020 spaces.
516        mAttrNormalizer.reset(attValue);
517        UnicodeString oneSpace((UChar)0x0020);
518        attValue = mAttrNormalizer.replaceAll(oneSpace, status);
519
520        // Replace character entities.
521        replaceCharRefs(attValue, status);
522
523        // Save the attribute name and value in our document structure.
524        el->fAttNames.addElement((void *)intern(attName, status), status);
525        el->fAttValues.addElement(attValue.clone(), status);
526        pos = mAttrValue.end(2, status);
527    }
528    fPos = mEl.end(0, status);
529    return el;
530}
531
532//
533//  parseMisc
534//     Consume XML "Misc" [production #27]
535//        which is any combination of space, PI and comments
536//      Need to watch end-of-input because xml MISC stuff is allowed after
537//        the document element, so we WILL scan off the end in this function
538//
539void
540UXMLParser::parseMisc(UErrorCode &status)  {
541    for (;;) {
542        if (fPos >= mXMLPI.input().length()) {
543            break;
544        }
545        if (mXMLPI.lookingAt(fPos, status)) {
546            fPos = mXMLPI.end(status);
547            continue;
548        }
549        if (mXMLSP.lookingAt(fPos, status)) {
550            fPos = mXMLSP.end(status);
551            continue;
552        }
553        if (mXMLComment.lookingAt(fPos, status)) {
554            fPos = mXMLComment.end(status);
555            continue;
556        }
557        break;
558    }
559}
560
561//
562//  Scan for document content.
563//
564UnicodeString
565UXMLParser::scanContent(UErrorCode &status) {
566    UnicodeString  result;
567    if (mXMLCharData.lookingAt(fPos, status)) {
568        result = mXMLCharData.group((int32_t)0, status);
569        // Normalize the new-lines.  (Before char ref substitution)
570        mNewLineNormalizer.reset(result);
571        result = mNewLineNormalizer.replaceAll(fOneLF, status);
572
573        // TODO:  handle CDATA
574        fPos = mXMLCharData.end(0, status);
575    }
576
577    return result;
578}
579
580//
581//   replaceCharRefs
582//
583//      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
584//       with the corresponding actual character.
585//
586void
587UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
588    UnicodeString result;
589    UnicodeString replacement;
590    int     i;
591
592    mAmps.reset(s);
593    // See the initialization for the regex matcher mAmps.
594    //    Which entity we've matched is determined by which capture group has content,
595    //      which is flaged by start() of that group not being -1.
596    while (mAmps.find()) {
597        if (mAmps.start(1, status) != -1) {
598            replacement.setTo((UChar)x_AMP);
599        } else if (mAmps.start(2, status) != -1) {
600            replacement.setTo((UChar)x_LT);
601        } else if (mAmps.start(3, status) != -1) {
602            replacement.setTo((UChar)x_GT);
603        } else if (mAmps.start(4, status) != -1) {
604            replacement.setTo((UChar)x_APOS);
605        } else if (mAmps.start(5, status) != -1) {
606            replacement.setTo((UChar)x_QUOT);
607        } else if (mAmps.start(6, status) != -1) {
608            UnicodeString hexString = mAmps.group(6, status);
609            UChar32 val = 0;
610            for (i=0; i<hexString.length(); i++) {
611                val = (val << 4) + u_digit(hexString.charAt(i), 16);
612            }
613            // TODO:  some verification that the character is valid
614            replacement.setTo(val);
615        } else if (mAmps.start(7, status) != -1) {
616            UnicodeString decimalString = mAmps.group(7, status);
617            UChar32 val = 0;
618            for (i=0; i<decimalString.length(); i++) {
619                val = val*10 + u_digit(decimalString.charAt(i), 10);
620            }
621            // TODO:  some verification that the character is valid
622            replacement.setTo(val);
623        } else {
624            // An unrecognized &entity;  Leave it alone.
625            //  TODO:  check that it really looks like an entity, and is not some
626            //         random & in the text.
627            replacement = mAmps.group((int32_t)0, status);
628        }
629        mAmps.appendReplacement(result, replacement, status);
630    }
631    mAmps.appendTail(result);
632    s = result;
633}
634
635void
636UXMLParser::error(const char *message, UErrorCode &status) {
637    // TODO:  something better here...
638    const UnicodeString &src=mXMLDecl.input();
639    int  line = 0;
640    int  ci = 0;
641    while (ci < fPos && ci>=0) {
642        ci = src.indexOf((UChar)0x0a, ci+1);
643        line++;
644    }
645    fprintf(stderr, "Error: %s at line %d\n", message, line);
646    if (U_SUCCESS(status)) {
647        status = U_PARSE_ERROR;
648    }
649}
650
651// intern strings like in Java
652
653const UnicodeString *
654UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
655    const UHashElement *he=fNames.find(s);
656    if(he!=NULL) {
657        // already a known name, return its hashed key pointer
658        return (const UnicodeString *)he->key.pointer;
659    } else {
660        // add this new name and return its hashed key pointer
661        fNames.puti(s, 0, errorCode);
662        he=fNames.find(s);
663        return (const UnicodeString *)he->key.pointer;
664    }
665}
666
667const UnicodeString *
668UXMLParser::findName(const UnicodeString &s) const {
669    const UHashElement *he=fNames.find(s);
670    if(he!=NULL) {
671        // a known name, return its hashed key pointer
672        return (const UnicodeString *)he->key.pointer;
673    } else {
674        // unknown name
675        return NULL;
676    }
677}
678
679// UXMLElement ------------------------------------------------------------- ***
680
// Constructor.
//      parser     stored in fParser; used later to look up names.
//      name       stored in fName as-is (the pointer is kept, the string is not copied).
//      errorCode  passed to the constructors of the three vectors.
// The new element has no parent, attributes or children.
UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   fParser(parser),
   fName(name),
   fAttNames(errorCode),
   fAttValues(errorCode),
   fChildren(errorCode),
   fParent(NULL)
{
}
690
691UXMLElement::~UXMLElement() {
692    int   i;
693    // attribute names are owned by the UXMLParser, don't delete them here
694    for (i=fAttValues.size()-1; i>=0; i--) {
695        delete (UObject *)fAttValues.elementAt(i);
696    }
697    for (i=fChildren.size()-1; i>=0; i--) {
698        delete (UObject *)fChildren.elementAt(i);
699    }
700}
701
// Returns the element's tag name: a reference to the string that fName points to.
const UnicodeString &
UXMLElement::getTagName() const {
    return *fName;
}
706
707UnicodeString
708UXMLElement::getText(UBool recurse) const {
709    UnicodeString text;
710    appendText(text, recurse);
711    return text;
712}
713
714void
715UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
716    const UObject *node;
717    int32_t i, count=fChildren.size();
718    for(i=0; i<count; ++i) {
719        node=(const UObject *)fChildren.elementAt(i);
720        const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
721        if(s!=NULL) {
722            text.append(*s);
723        } else if(recurse) /* must be a UXMLElement */ {
724            ((const UXMLElement *)node)->appendText(text, recurse);
725        }
726    }
727}
728
// Returns the number of attributes, i.e. the number of stored attribute names.
int32_t
UXMLElement::countAttributes() const {
    return fAttNames.size();
}
733
734const UnicodeString *
735UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
736    if(0<=i && i<fAttNames.size()) {
737        name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
738        value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
739        return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
740    } else {
741        return NULL;
742    }
743}
744
745const UnicodeString *
746UXMLElement::getAttribute(const UnicodeString &name) const {
747    // search for the attribute name by comparing the interned pointer,
748    // not the string contents
749    const UnicodeString *p=fParser->findName(name);
750    if(p==NULL) {
751        return NULL; // no such attribute seen by the parser at all
752    }
753
754    int32_t i, count=fAttNames.size();
755    for(i=0; i<count; ++i) {
756        if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
757            return (const UnicodeString *)fAttValues.elementAt(i);
758        }
759    }
760    return NULL;
761}
762
// Returns the number of child nodes (child elements and text strings together).
int32_t
UXMLElement::countChildren() const {
    return fChildren.size();
}
767
768const UObject *
769UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
770    if(0<=i && i<fChildren.size()) {
771        const UObject *node=(const UObject *)fChildren.elementAt(i);
772        if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
773            type=UXML_NODE_TYPE_ELEMENT;
774        } else {
775            type=UXML_NODE_TYPE_STRING;
776        }
777        return node;
778    } else {
779        return NULL;
780    }
781}
782
783const UXMLElement *
784UXMLElement::nextChildElement(int32_t &i) const {
785    if(i<0) {
786        return NULL;
787    }
788
789    const UObject *node;
790    int32_t count=fChildren.size();
791    while(i<count) {
792        node=(const UObject *)fChildren.elementAt(i++);
793        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
794        if(elem!=NULL) {
795            return elem;
796        }
797    }
798    return NULL;
799}
800
801const UXMLElement *
802UXMLElement::getChildElement(const UnicodeString &name) const {
803    // search for the element name by comparing the interned pointer,
804    // not the string contents
805    const UnicodeString *p=fParser->findName(name);
806    if(p==NULL) {
807        return NULL; // no such element seen by the parser at all
808    }
809
810    const UObject *node;
811    int32_t i, count=fChildren.size();
812    for(i=0; i<count; ++i) {
813        node=(const UObject *)fChildren.elementAt(i);
814        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
815        if(elem!=NULL) {
816            if(p==elem->fName) {
817                return elem;
818            }
819        }
820    }
821    return NULL;
822}
823
824U_NAMESPACE_END
825
826#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
827
828