1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* 6* Copyright (C) 2004-2010, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9******************************************************************************* 10* file name: xmlparser.cpp 11* encoding: UTF-8 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2004jul21 16* created by: Andy Heninger 17*/ 18 19#include <stdio.h> 20#include "unicode/uchar.h" 21#include "unicode/ucnv.h" 22#include "unicode/regex.h" 23#include "filestrm.h" 24#include "xmlparser.h" 25 26#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION 27 28// character constants 29enum { 30 x_QUOT=0x22, 31 x_AMP=0x26, 32 x_APOS=0x27, 33 x_LT=0x3c, 34 x_GT=0x3e, 35 x_l=0x6c 36}; 37 38#define XML_SPACES "[ \\u0009\\u000d\\u000a]" 39 40// XML #4 41#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ 42 "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ 43 "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ 44 "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" 45 46// XML #5 47#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" 48 49// XML #6 50#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" 51 52U_NAMESPACE_BEGIN 53 54UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) 55UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) 56 57// 58// UXMLParser constructor. Mostly just initializes the ICU regexes that are 59// used for parsing. 60// 61UXMLParser::UXMLParser(UErrorCode &status) : 62 // XML Declaration. XML Production #23. 63 // example: "<?xml version=1.0 encoding="utf-16" ?> 64 // This is a sloppy implementation - just look for the leading <?xml and the closing ?> 65 // allow for a possible leading BOM. 66 mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), 67 68 // XML Comment production #15 69 // example: "<!-- whatever --> 70 // note, does not detect an illegal "--" within comments 71 mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), 72 73 // XML Spaces 74 // production [3] 75 mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), 76 77 // XML Doctype decl production #28 78 // example "<!DOCTYPE foo SYSTEM "somewhere" > 79 // or "<!DOCTYPE foo [internal dtd]> 80 // TODO: we don't actually parse the DOCTYPE or internal subsets. 81 // Some internal dtd subsets could confuse this simple-minded 82 // attempt at skipping over them, specifically, occcurences 83 // of closeing square brackets. These could appear in comments, 84 // or in parameter entity declarations, for example. 85 mXMLDoctype(UnicodeString( 86 "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV 87 ), 0, status), 88 89 // XML PI production #16 90 // example "<?target stuff?> 91 mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), 92 93 // XML Element Start Productions #40, #41 94 // example <foo att1='abc' att2="d e f" > 95 // capture #1: the tag name 96 // 97 mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" 98 "(?:" 99 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " 100 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' 101 ")*" // * for zero or more attributes. 102 XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" 103 104 // XML Element End production #42 105 // example </foo> 106 mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), 107 108 // XML Element Empty production #44 109 // example <foo att1="abc" att2="d e f" /> 110 mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" 111 "(?:" 112 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " 113 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' 114 ")*" // * for zero or more attributes. 115 XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" 116 117 118 // XMLCharData. Everything but '<'. Note that & will be dealt with later. 119 mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), 120 121 // Attribute name = "value". XML Productions 10, 40/41 122 // Capture group 1 is name, 123 // 2 is the attribute value, including the quotes. 124 // 125 // Note that attributes are scanned twice. The first time is with 126 // the regex for an entire element start. There, the attributes 127 // are checked syntactically, but not separted out one by one. 128 // Here, we match a single attribute, and make its name and 129 // attribute value available to the parser code. 130 mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" 131 "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), 132 133 134 mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), 135 136 // Match any of the new-line sequences in content. 137 // All are changed to \u000a. 138 mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), 139 140 // & char references 141 // We will figure out what we've got based on which capture group has content. 142 // The last one is a catchall for unrecognized entity references.. 143 // 1 2 3 4 5 6 7 8 144 mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), 145 0, status), 146 147 fNames(status), 148 fElementStack(status), 149 fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. 150 { 151 } 152 153UXMLParser * 154UXMLParser::createParser(UErrorCode &errorCode) { 155 if (U_FAILURE(errorCode)) { 156 return NULL; 157 } else { 158 return new UXMLParser(errorCode); 159 } 160} 161 162UXMLParser::~UXMLParser() {} 163 164UXMLElement * 165UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { 166 char bytes[4096], charsetBuffer[100]; 167 FileStream *f; 168 const char *charset, *pb; 169 UnicodeString src; 170 UConverter *cnv; 171 UChar *buffer, *pu; 172 int32_t fileLength, bytesLength, length, capacity; 173 UBool flush; 174 175 if(U_FAILURE(errorCode)) { 176 return NULL; 177 } 178 179 f=T_FileStream_open(filename, "rb"); 180 if(f==NULL) { 181 errorCode=U_FILE_ACCESS_ERROR; 182 return NULL; 183 } 184 185 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); 186 if(bytesLength<(int32_t)sizeof(bytes)) { 187 // we have already read the entire file 188 fileLength=bytesLength; 189 } else { 190 // get the file length 191 fileLength=T_FileStream_size(f); 192 } 193 194 /* 195 * get the charset: 196 * 1. Unicode signature 197 * 2. treat as ISO-8859-1 and read XML encoding="charser" 198 * 3. default to UTF-8 199 */ 200 charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); 201 if(U_SUCCESS(errorCode) && charset!=NULL) { 202 // open converter according to Unicode signature 203 cnv=ucnv_open(charset, &errorCode); 204 } else { 205 // read as Latin-1 and parse the XML declaration and encoding 206 cnv=ucnv_open("ISO-8859-1", &errorCode); 207 if(U_FAILURE(errorCode)) { 208 // unexpected error opening Latin-1 converter 209 goto exit; 210 } 211 212 buffer=toUCharPtr(src.getBuffer(bytesLength)); 213 if(buffer==NULL) { 214 // unexpected failure to reserve some string capacity 215 errorCode=U_MEMORY_ALLOCATION_ERROR; 216 goto exit; 217 } 218 pb=bytes; 219 pu=buffer; 220 ucnv_toUnicode( 221 cnv, 222 &pu, buffer+src.getCapacity(), 223 &pb, bytes+bytesLength, 224 NULL, TRUE, &errorCode); 225 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); 226 ucnv_close(cnv); 227 cnv=NULL; 228 if(U_FAILURE(errorCode)) { 229 // unexpected error in conversion from Latin-1 230 src.remove(); 231 goto exit; 232 } 233 234 // parse XML declaration 235 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { 236 int32_t declEnd=mXMLDecl.end(errorCode); 237 // go beyond <?xml 238 int32_t pos=src.indexOf((UChar)x_l)+1; 239 240 mAttrValue.reset(src); 241 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. 242 UnicodeString attName = mAttrValue.group(1, errorCode); 243 UnicodeString attValue = mAttrValue.group(2, errorCode); 244 245 // Trim the quotes from the att value. These are left over from the original regex 246 // that parsed the attribue, which couldn't conveniently strip them. 247 attValue.remove(0,1); // one char from the beginning 248 attValue.truncate(attValue.length()-1); // and one from the end. 249 250 if(attName==UNICODE_STRING("encoding", 8)) { 251 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); 252 charset=charsetBuffer; 253 break; 254 } 255 pos = mAttrValue.end(2, errorCode); 256 } 257 258 if(charset==NULL) { 259 // default to UTF-8 260 charset="UTF-8"; 261 } 262 cnv=ucnv_open(charset, &errorCode); 263 } 264 } 265 266 if(U_FAILURE(errorCode)) { 267 // unable to open the converter 268 goto exit; 269 } 270 271 // convert the file contents 272 capacity=fileLength; // estimated capacity 273 src.getBuffer(capacity); 274 src.releaseBuffer(0); // zero length 275 flush=FALSE; 276 for(;;) { 277 // convert contents of bytes[bytesLength] 278 pb=bytes; 279 for(;;) { 280 length=src.length(); 281 buffer=toUCharPtr(src.getBuffer(capacity)); 282 if(buffer==NULL) { 283 // unexpected failure to reserve some string capacity 284 errorCode=U_MEMORY_ALLOCATION_ERROR; 285 goto exit; 286 } 287 288 pu=buffer+length; 289 ucnv_toUnicode( 290 cnv, &pu, buffer+src.getCapacity(), 291 &pb, bytes+bytesLength, 292 NULL, FALSE, &errorCode); 293 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); 294 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 295 errorCode=U_ZERO_ERROR; 296 capacity=(3*src.getCapacity())/2; // increase capacity by 50% 297 } else { 298 break; 299 } 300 } 301 302 if(U_FAILURE(errorCode)) { 303 break; // conversion error 304 } 305 306 if(flush) { 307 break; // completely converted the file 308 } 309 310 // read next block 311 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); 312 if(bytesLength==0) { 313 // reached end of file, convert once more to flush the converter 314 flush=TRUE; 315 } 316 }; 317 318exit: 319 ucnv_close(cnv); 320 T_FileStream_close(f); 321 322 if(U_SUCCESS(errorCode)) { 323 return parse(src, errorCode); 324 } else { 325 return NULL; 326 } 327} 328 329UXMLElement * 330UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { 331 if(U_FAILURE(status)) { 332 return NULL; 333 } 334 335 UXMLElement *root = NULL; 336 fPos = 0; // TODO use just a local pos variable and pass it into functions 337 // where necessary? 338 339 // set all matchers to work on the input string 340 mXMLDecl.reset(src); 341 mXMLComment.reset(src); 342 mXMLSP.reset(src); 343 mXMLDoctype.reset(src); 344 mXMLPI.reset(src); 345 mXMLElemStart.reset(src); 346 mXMLElemEnd.reset(src); 347 mXMLElemEmpty.reset(src); 348 mXMLCharData.reset(src); 349 mAttrValue.reset(src); 350 mAttrNormalizer.reset(src); 351 mNewLineNormalizer.reset(src); 352 mAmps.reset(src); 353 354 // Consume the XML Declaration, if present. 355 if (mXMLDecl.lookingAt(fPos, status)) { 356 fPos = mXMLDecl.end(status); 357 } 358 359 // Consume "misc" [XML production 27] appearing before DocType 360 parseMisc(status); 361 362 // Consume a DocType declaration, if present. 363 if (mXMLDoctype.lookingAt(fPos, status)) { 364 fPos = mXMLDoctype.end(status); 365 } 366 367 // Consume additional "misc" [XML production 27] appearing after the DocType 368 parseMisc(status); 369 370 // Get the root element 371 if (mXMLElemEmpty.lookingAt(fPos, status)) { 372 // Root is an empty element (no nested elements or content) 373 root = createElement(mXMLElemEmpty, status); 374 fPos = mXMLElemEmpty.end(status); 375 } else { 376 if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { 377 error("Root Element expected", status); 378 goto errorExit; 379 } 380 root = createElement(mXMLElemStart, status); 381 UXMLElement *el = root; 382 383 // 384 // This is the loop that consumes the root element of the document, 385 // including all nested content. Nested elements are handled by 386 // explicit pushes/pops of the element stack; there is no recursion 387 // in the control flow of this code. 388 // "el" always refers to the current element, the one to which content 389 // is being added. It is above the top of the element stack. 390 for (;;) { 391 // Nested Element Start 392 if (mXMLElemStart.lookingAt(fPos, status)) { 393 UXMLElement *t = createElement(mXMLElemStart, status); 394 el->fChildren.addElement(t, status); 395 t->fParent = el; 396 fElementStack.push(el, status); 397 el = t; 398 continue; 399 } 400 401 // Text Content. String is concatenated onto the current node's content, 402 // but only if it contains something other than spaces. 403 UnicodeString s = scanContent(status); 404 if (s.length() > 0) { 405 mXMLSP.reset(s); 406 if (mXMLSP.matches(status) == FALSE) { 407 // This chunk of text contains something other than just 408 // white space. Make a child node for it. 409 replaceCharRefs(s, status); 410 el->fChildren.addElement(s.clone(), status); 411 } 412 mXMLSP.reset(src); // The matchers need to stay set to the main input string. 413 continue; 414 } 415 416 // Comments. Discard. 417 if (mXMLComment.lookingAt(fPos, status)) { 418 fPos = mXMLComment.end(status); 419 continue; 420 } 421 422 // PIs. Discard. 423 if (mXMLPI.lookingAt(fPos, status)) { 424 fPos = mXMLPI.end(status); 425 continue; 426 } 427 428 // Element End 429 if (mXMLElemEnd.lookingAt(fPos, status)) { 430 fPos = mXMLElemEnd.end(0, status); 431 const UnicodeString name = mXMLElemEnd.group(1, status); 432 if (name != *el->fName) { 433 error("Element start / end tag mismatch", status); 434 goto errorExit; 435 } 436 if (fElementStack.empty()) { 437 // Close of the root element. We're done with the doc. 438 el = NULL; 439 break; 440 } 441 el = (UXMLElement *)fElementStack.pop(); 442 continue; 443 } 444 445 // Empty Element. Stored as a child of the current element, but not stacked. 446 if (mXMLElemEmpty.lookingAt(fPos, status)) { 447 UXMLElement *t = createElement(mXMLElemEmpty, status); 448 el->fChildren.addElement(t, status); 449 continue; 450 } 451 452 // Hit something within the document that doesn't match anything. 453 // It's an error. 454 error("Unrecognized markup", status); 455 break; 456 } 457 458 if (el != NULL || !fElementStack.empty()) { 459 // We bailed out early, for some reason. 460 error("Root element not closed.", status); 461 goto errorExit; 462 } 463 } 464 465 // Root Element parse is complete. 466 // Consume the annoying xml "Misc" that can appear at the end of the doc. 467 parseMisc(status); 468 469 // We should have reached the end of the input 470 if (fPos != src.length()) { 471 error("Extra content at the end of the document", status); 472 goto errorExit; 473 } 474 475 // Success! 476 return root; 477 478errorExit: 479 delete root; 480 return NULL; 481} 482 483// 484// createElement 485// We've just matched an element start tag. Create and fill in a UXMLElement object 486// for it. 487// 488UXMLElement * 489UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { 490 // First capture group is the element's name. 491 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); 492 493 // Scan for attributes. 494 int32_t pos = mEl.end(1, status); // The position after the end of the tag name 495 496 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. 497 UnicodeString attName = mAttrValue.group(1, status); 498 UnicodeString attValue = mAttrValue.group(2, status); 499 500 // Trim the quotes from the att value. These are left over from the original regex 501 // that parsed the attribue, which couldn't conveniently strip them. 502 attValue.remove(0,1); // one char from the beginning 503 attValue.truncate(attValue.length()-1); // and one from the end. 504 505 // XML Attribue value normalization. 506 // This is one of the really screwy parts of the XML spec. 507 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize 508 // Note that non-validating parsers must treat all entities as type CDATA 509 // which simplifies things some. 510 511 // Att normalization step 1: normalize any newlines in the attribute value 512 mNewLineNormalizer.reset(attValue); 513 attValue = mNewLineNormalizer.replaceAll(fOneLF, status); 514 515 // Next change all xml white space chars to plain \u0020 spaces. 516 mAttrNormalizer.reset(attValue); 517 UnicodeString oneSpace((UChar)0x0020); 518 attValue = mAttrNormalizer.replaceAll(oneSpace, status); 519 520 // Replace character entities. 521 replaceCharRefs(attValue, status); 522 523 // Save the attribute name and value in our document structure. 524 el->fAttNames.addElement((void *)intern(attName, status), status); 525 el->fAttValues.addElement(attValue.clone(), status); 526 pos = mAttrValue.end(2, status); 527 } 528 fPos = mEl.end(0, status); 529 return el; 530} 531 532// 533// parseMisc 534// Consume XML "Misc" [production #27] 535// which is any combination of space, PI and comments 536// Need to watch end-of-input because xml MISC stuff is allowed after 537// the document element, so we WILL scan off the end in this function 538// 539void 540UXMLParser::parseMisc(UErrorCode &status) { 541 for (;;) { 542 if (fPos >= mXMLPI.input().length()) { 543 break; 544 } 545 if (mXMLPI.lookingAt(fPos, status)) { 546 fPos = mXMLPI.end(status); 547 continue; 548 } 549 if (mXMLSP.lookingAt(fPos, status)) { 550 fPos = mXMLSP.end(status); 551 continue; 552 } 553 if (mXMLComment.lookingAt(fPos, status)) { 554 fPos = mXMLComment.end(status); 555 continue; 556 } 557 break; 558 } 559} 560 561// 562// Scan for document content. 563// 564UnicodeString 565UXMLParser::scanContent(UErrorCode &status) { 566 UnicodeString result; 567 if (mXMLCharData.lookingAt(fPos, status)) { 568 result = mXMLCharData.group((int32_t)0, status); 569 // Normalize the new-lines. (Before char ref substitution) 570 mNewLineNormalizer.reset(result); 571 result = mNewLineNormalizer.replaceAll(fOneLF, status); 572 573 // TODO: handle CDATA 574 fPos = mXMLCharData.end(0, status); 575 } 576 577 return result; 578} 579 580// 581// replaceCharRefs 582// 583// replace the char entities < & { ካ etc. in a string 584// with the corresponding actual character. 585// 586void 587UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { 588 UnicodeString result; 589 UnicodeString replacement; 590 int i; 591 592 mAmps.reset(s); 593 // See the initialization for the regex matcher mAmps. 594 // Which entity we've matched is determined by which capture group has content, 595 // which is flaged by start() of that group not being -1. 596 while (mAmps.find()) { 597 if (mAmps.start(1, status) != -1) { 598 replacement.setTo((UChar)x_AMP); 599 } else if (mAmps.start(2, status) != -1) { 600 replacement.setTo((UChar)x_LT); 601 } else if (mAmps.start(3, status) != -1) { 602 replacement.setTo((UChar)x_GT); 603 } else if (mAmps.start(4, status) != -1) { 604 replacement.setTo((UChar)x_APOS); 605 } else if (mAmps.start(5, status) != -1) { 606 replacement.setTo((UChar)x_QUOT); 607 } else if (mAmps.start(6, status) != -1) { 608 UnicodeString hexString = mAmps.group(6, status); 609 UChar32 val = 0; 610 for (i=0; i<hexString.length(); i++) { 611 val = (val << 4) + u_digit(hexString.charAt(i), 16); 612 } 613 // TODO: some verification that the character is valid 614 replacement.setTo(val); 615 } else if (mAmps.start(7, status) != -1) { 616 UnicodeString decimalString = mAmps.group(7, status); 617 UChar32 val = 0; 618 for (i=0; i<decimalString.length(); i++) { 619 val = val*10 + u_digit(decimalString.charAt(i), 10); 620 } 621 // TODO: some verification that the character is valid 622 replacement.setTo(val); 623 } else { 624 // An unrecognized &entity; Leave it alone. 625 // TODO: check that it really looks like an entity, and is not some 626 // random & in the text. 627 replacement = mAmps.group((int32_t)0, status); 628 } 629 mAmps.appendReplacement(result, replacement, status); 630 } 631 mAmps.appendTail(result); 632 s = result; 633} 634 635void 636UXMLParser::error(const char *message, UErrorCode &status) { 637 // TODO: something better here... 638 const UnicodeString &src=mXMLDecl.input(); 639 int line = 0; 640 int ci = 0; 641 while (ci < fPos && ci>=0) { 642 ci = src.indexOf((UChar)0x0a, ci+1); 643 line++; 644 } 645 fprintf(stderr, "Error: %s at line %d\n", message, line); 646 if (U_SUCCESS(status)) { 647 status = U_PARSE_ERROR; 648 } 649} 650 651// intern strings like in Java 652 653const UnicodeString * 654UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { 655 const UHashElement *he=fNames.find(s); 656 if(he!=NULL) { 657 // already a known name, return its hashed key pointer 658 return (const UnicodeString *)he->key.pointer; 659 } else { 660 // add this new name and return its hashed key pointer 661 fNames.puti(s, 0, errorCode); 662 he=fNames.find(s); 663 return (const UnicodeString *)he->key.pointer; 664 } 665} 666 667const UnicodeString * 668UXMLParser::findName(const UnicodeString &s) const { 669 const UHashElement *he=fNames.find(s); 670 if(he!=NULL) { 671 // a known name, return its hashed key pointer 672 return (const UnicodeString *)he->key.pointer; 673 } else { 674 // unknown name 675 return NULL; 676 } 677} 678 679// UXMLElement ------------------------------------------------------------- *** 680 681UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : 682 fParser(parser), 683 fName(name), 684 fAttNames(errorCode), 685 fAttValues(errorCode), 686 fChildren(errorCode), 687 fParent(NULL) 688{ 689} 690 691UXMLElement::~UXMLElement() { 692 int i; 693 // attribute names are owned by the UXMLParser, don't delete them here 694 for (i=fAttValues.size()-1; i>=0; i--) { 695 delete (UObject *)fAttValues.elementAt(i); 696 } 697 for (i=fChildren.size()-1; i>=0; i--) { 698 delete (UObject *)fChildren.elementAt(i); 699 } 700} 701 702const UnicodeString & 703UXMLElement::getTagName() const { 704 return *fName; 705} 706 707UnicodeString 708UXMLElement::getText(UBool recurse) const { 709 UnicodeString text; 710 appendText(text, recurse); 711 return text; 712} 713 714void 715UXMLElement::appendText(UnicodeString &text, UBool recurse) const { 716 const UObject *node; 717 int32_t i, count=fChildren.size(); 718 for(i=0; i<count; ++i) { 719 node=(const UObject *)fChildren.elementAt(i); 720 const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); 721 if(s!=NULL) { 722 text.append(*s); 723 } else if(recurse) /* must be a UXMLElement */ { 724 ((const UXMLElement *)node)->appendText(text, recurse); 725 } 726 } 727} 728 729int32_t 730UXMLElement::countAttributes() const { 731 return fAttNames.size(); 732} 733 734const UnicodeString * 735UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { 736 if(0<=i && i<fAttNames.size()) { 737 name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); 738 value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); 739 return &value; // or return (UnicodeString *)fAttValues.elementAt(i); 740 } else { 741 return NULL; 742 } 743} 744 745const UnicodeString * 746UXMLElement::getAttribute(const UnicodeString &name) const { 747 // search for the attribute name by comparing the interned pointer, 748 // not the string contents 749 const UnicodeString *p=fParser->findName(name); 750 if(p==NULL) { 751 return NULL; // no such attribute seen by the parser at all 752 } 753 754 int32_t i, count=fAttNames.size(); 755 for(i=0; i<count; ++i) { 756 if(p==(const UnicodeString *)fAttNames.elementAt(i)) { 757 return (const UnicodeString *)fAttValues.elementAt(i); 758 } 759 } 760 return NULL; 761} 762 763int32_t 764UXMLElement::countChildren() const { 765 return fChildren.size(); 766} 767 768const UObject * 769UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { 770 if(0<=i && i<fChildren.size()) { 771 const UObject *node=(const UObject *)fChildren.elementAt(i); 772 if(dynamic_cast<const UXMLElement *>(node)!=NULL) { 773 type=UXML_NODE_TYPE_ELEMENT; 774 } else { 775 type=UXML_NODE_TYPE_STRING; 776 } 777 return node; 778 } else { 779 return NULL; 780 } 781} 782 783const UXMLElement * 784UXMLElement::nextChildElement(int32_t &i) const { 785 if(i<0) { 786 return NULL; 787 } 788 789 const UObject *node; 790 int32_t count=fChildren.size(); 791 while(i<count) { 792 node=(const UObject *)fChildren.elementAt(i++); 793 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); 794 if(elem!=NULL) { 795 return elem; 796 } 797 } 798 return NULL; 799} 800 801const UXMLElement * 802UXMLElement::getChildElement(const UnicodeString &name) const { 803 // search for the element name by comparing the interned pointer, 804 // not the string contents 805 const UnicodeString *p=fParser->findName(name); 806 if(p==NULL) { 807 return NULL; // no such element seen by the parser at all 808 } 809 810 const UObject *node; 811 int32_t i, count=fChildren.size(); 812 for(i=0; i<count; ++i) { 813 node=(const UObject *)fChildren.elementAt(i); 814 const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); 815 if(elem!=NULL) { 816 if(p==elem->fName) { 817 return elem; 818 } 819 } 820 } 821 return NULL; 822} 823 824U_NAMESPACE_END 825 826#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 827 828